From e8d277081293b6fb2a5d469616baaa7a06f52496 Mon Sep 17 00:00:00 2001 From: qinxialei Date: Thu, 29 Oct 2020 11:26:59 +0800 Subject: Import Upstream version 0.16.0 --- .gitignore | 2 + AUTHORS | 6 + CMakeLists.txt | 126 + CONTRIBUTING.md | 27 + LICENSE | 202 ++ README.md | 168 ++ cmake/libgav1-config.cmake.template | 2 + cmake/libgav1.pc.template | 11 + cmake/libgav1_build_definitions.cmake | 150 ++ cmake/libgav1_cpu_detection.cmake | 49 + cmake/libgav1_flags.cmake | 251 ++ cmake/libgav1_helpers.cmake | 134 + cmake/libgav1_install.cmake | 60 + cmake/libgav1_intrinsics.cmake | 135 + cmake/libgav1_options.cmake | 55 + cmake/libgav1_sanitizer.cmake | 45 + cmake/libgav1_targets.cmake | 347 +++ cmake/libgav1_variables.cmake | 78 + cmake/toolchains/aarch64-linux-gnu.cmake | 28 + cmake/toolchains/android.cmake | 53 + cmake/toolchains/arm-linux-gnueabihf.cmake | 29 + codereview.settings | 4 + examples/file_reader.cc | 186 ++ examples/file_reader.h | 100 + examples/file_reader_constants.cc | 23 + examples/file_reader_constants.h | 39 + examples/file_reader_factory.cc | 51 + examples/file_reader_factory.h | 51 + examples/file_reader_interface.h | 63 + examples/file_writer.cc | 183 ++ examples/file_writer.h | 102 + examples/gav1_decode.cc | 452 ++++ examples/gav1_decode_cv_pixel_buffer_pool.cc | 278 ++ examples/gav1_decode_cv_pixel_buffer_pool.h | 73 + examples/ivf_parser.cc | 96 + examples/ivf_parser.h | 57 + examples/libgav1_examples.cmake | 63 + examples/logging.h | 65 + src/buffer_pool.cc | 218 ++ src/buffer_pool.h | 399 +++ src/decoder.cc | 119 + src/decoder_impl.cc | 1661 ++++++++++++ src/decoder_impl.h | 266 ++ src/decoder_settings.cc | 33 + src/decoder_state.h | 89 + src/dsp/arm/average_blend_neon.cc | 146 + src/dsp/arm/average_blend_neon.h | 36 + src/dsp/arm/cdef_neon.cc | 697 +++++ src/dsp/arm/cdef_neon.h | 38 + src/dsp/arm/common_neon.h | 777 ++++++ src/dsp/arm/convolve_neon.cc | 3105 ++++++++++++++++++++++ src/dsp/arm/convolve_neon.h | 50 + src/dsp/arm/distance_weighted_blend_neon.cc | 203 ++ src/dsp/arm/distance_weighted_blend_neon.h | 39 + src/dsp/arm/film_grain_neon.cc | 1188 +++++++++ src/dsp/arm/film_grain_neon.h | 47 + src/dsp/arm/intra_edge_neon.cc | 301 +++ src/dsp/arm/intra_edge_neon.h | 39 + src/dsp/arm/intrapred_cfl_neon.cc | 479 ++++ src/dsp/arm/intrapred_directional_neon.cc | 926 +++++++ src/dsp/arm/intrapred_filter_intra_neon.cc | 176 ++ src/dsp/arm/intrapred_neon.cc | 1144 ++++++++ src/dsp/arm/intrapred_neon.h | 418 +++ src/dsp/arm/intrapred_smooth_neon.cc | 616 +++++ src/dsp/arm/inverse_transform_neon.cc | 3128 ++++++++++++++++++++++ src/dsp/arm/inverse_transform_neon.h | 52 + src/dsp/arm/loop_filter_neon.cc | 1190 +++++++++ src/dsp/arm/loop_filter_neon.h | 53 + src/dsp/arm/loop_restoration_neon.cc | 1901 +++++++++++++ src/dsp/arm/loop_restoration_neon.h | 40 + src/dsp/arm/mask_blend_neon.cc | 444 ++++ src/dsp/arm/mask_blend_neon.h | 41 + src/dsp/arm/motion_field_projection_neon.cc | 393 +++ src/dsp/arm/motion_field_projection_neon.h | 39 + src/dsp/arm/motion_vector_search_neon.cc | 267 ++ src/dsp/arm/motion_vector_search_neon.h | 39 + src/dsp/arm/obmc_neon.cc | 392 +++ src/dsp/arm/obmc_neon.h | 38 + src/dsp/arm/super_res_neon.cc | 166 ++ src/dsp/arm/super_res_neon.h | 37 + src/dsp/arm/warp_neon.cc | 453 ++++ src/dsp/arm/warp_neon.h | 37 + src/dsp/arm/weight_mask_neon.cc | 463 ++++ src/dsp/arm/weight_mask_neon.h | 52 + src/dsp/average_blend.cc | 101 + src/dsp/average_blend.h | 47 + src/dsp/cdef.cc | 306 +++ src/dsp/cdef.h | 47 + src/dsp/cdef.inc | 29 + src/dsp/common.h | 
82 + src/dsp/constants.cc | 103 + src/dsp/constants.h | 71 + src/dsp/convolve.cc | 876 ++++++ src/dsp/convolve.h | 49 + src/dsp/convolve.inc | 50 + src/dsp/distance_weighted_blend.cc | 101 + src/dsp/distance_weighted_blend.h | 47 + src/dsp/dsp.cc | 150 ++ src/dsp/dsp.h | 910 +++++++ src/dsp/film_grain.cc | 870 ++++++ src/dsp/film_grain.h | 39 + src/dsp/film_grain_common.h | 78 + src/dsp/intra_edge.cc | 115 + src/dsp/intra_edge.h | 48 + src/dsp/intrapred.cc | 2911 ++++++++++++++++++++ src/dsp/intrapred.h | 49 + src/dsp/inverse_transform.cc | 1636 ++++++++++++ src/dsp/inverse_transform.h | 47 + src/dsp/inverse_transform.inc | 64 + src/dsp/libgav1_dsp.cmake | 176 ++ src/dsp/loop_filter.cc | 616 +++++ src/dsp/loop_filter.h | 47 + src/dsp/loop_restoration.cc | 936 +++++++ src/dsp/loop_restoration.h | 85 + src/dsp/mask_blend.cc | 207 ++ src/dsp/mask_blend.h | 49 + src/dsp/motion_field_projection.cc | 138 + src/dsp/motion_field_projection.h | 48 + src/dsp/motion_vector_search.cc | 211 ++ src/dsp/motion_vector_search.h | 49 + src/dsp/obmc.cc | 125 + src/dsp/obmc.h | 47 + src/dsp/obmc.inc | 32 + src/dsp/super_res.cc | 109 + src/dsp/super_res.h | 47 + src/dsp/warp.cc | 475 ++++ src/dsp/warp.h | 47 + src/dsp/weight_mask.cc | 227 ++ src/dsp/weight_mask.h | 47 + src/dsp/x86/average_blend_sse4.cc | 156 ++ src/dsp/x86/average_blend_sse4.h | 41 + src/dsp/x86/cdef_sse4.cc | 728 +++++ src/dsp/x86/cdef_sse4.h | 45 + src/dsp/x86/common_avx2.h | 138 + src/dsp/x86/common_sse4.h | 265 ++ src/dsp/x86/convolve_avx2.cc | 534 ++++ src/dsp/x86/convolve_avx2.h | 43 + src/dsp/x86/convolve_sse4.cc | 2830 ++++++++++++++++++++ src/dsp/x86/convolve_sse4.h | 75 + src/dsp/x86/distance_weighted_blend_sse4.cc | 230 ++ src/dsp/x86/distance_weighted_blend_sse4.h | 41 + src/dsp/x86/intra_edge_sse4.cc | 270 ++ src/dsp/x86/intra_edge_sse4.h | 46 + src/dsp/x86/intrapred_cfl_sse4.cc | 976 +++++++ src/dsp/x86/intrapred_smooth_sse4.cc | 2662 +++++++++++++++++++ src/dsp/x86/intrapred_sse4.cc | 3535 +++++++++++++++++++++++++ src/dsp/x86/intrapred_sse4.h | 1060 ++++++++ src/dsp/x86/inverse_transform_sse4.cc | 3086 +++++++++++++++++++++ src/dsp/x86/inverse_transform_sse4.h | 89 + src/dsp/x86/loop_filter_sse4.cc | 2256 ++++++++++++++++ src/dsp/x86/loop_filter_sse4.h | 119 + src/dsp/x86/loop_restoration_10bit_avx2.cc | 592 +++++ src/dsp/x86/loop_restoration_10bit_sse4.cc | 551 ++++ src/dsp/x86/loop_restoration_avx2.cc | 2902 ++++++++++++++++++++ src/dsp/x86/loop_restoration_avx2.h | 52 + src/dsp/x86/loop_restoration_sse4.cc | 2549 ++++++++++++++++++ src/dsp/x86/loop_restoration_sse4.h | 52 + src/dsp/x86/mask_blend_sse4.cc | 447 ++++ src/dsp/x86/mask_blend_sse4.h | 60 + src/dsp/x86/motion_field_projection_sse4.cc | 397 +++ src/dsp/x86/motion_field_projection_sse4.h | 41 + src/dsp/x86/motion_vector_search_sse4.cc | 262 ++ src/dsp/x86/motion_vector_search_sse4.h | 41 + src/dsp/x86/obmc_sse4.cc | 329 +++ src/dsp/x86/obmc_sse4.h | 43 + src/dsp/x86/super_res_sse4.cc | 166 ++ src/dsp/x86/super_res_sse4.h | 38 + src/dsp/x86/transpose_sse4.h | 307 +++ src/dsp/x86/warp_sse4.cc | 525 ++++ src/dsp/x86/warp_sse4.h | 44 + src/dsp/x86/weight_mask_sse4.cc | 464 ++++ src/dsp/x86/weight_mask_sse4.h | 104 + src/film_grain.cc | 817 ++++++ src/film_grain.h | 193 ++ src/frame_buffer.cc | 151 ++ src/frame_buffer_utils.h | 78 + src/frame_scratch_buffer.h | 113 + src/gav1/decoder.h | 148 ++ src/gav1/decoder_buffer.h | 279 ++ src/gav1/decoder_settings.h | 144 + src/gav1/frame_buffer.h | 177 ++ src/gav1/status_code.h | 118 + src/gav1/symbol_visibility.h | 88 + 
src/gav1/version.h | 71 + src/inter_intra_masks.inc | 581 ++++ src/internal_frame_buffer_list.cc | 122 + src/internal_frame_buffer_list.h | 81 + src/libgav1_decoder.cmake | 157 ++ src/loop_restoration_info.cc | 240 ++ src/loop_restoration_info.h | 104 + src/motion_vector.cc | 1001 +++++++ src/motion_vector.h | 59 + src/obu_parser.cc | 2885 ++++++++++++++++++++ src/obu_parser.h | 406 +++ src/post_filter.h | 565 ++++ src/post_filter/cdef.cc | 660 +++++ src/post_filter/deblock.cc | 523 ++++ src/post_filter/deblock_thresholds.inc | 85 + src/post_filter/loop_restoration.cc | 172 ++ src/post_filter/post_filter.cc | 601 +++++ src/post_filter/super_res.cc | 199 ++ src/prediction_mask.cc | 236 ++ src/prediction_mask.h | 41 + src/quantizer.cc | 269 ++ src/quantizer.h | 74 + src/quantizer_tables.inc | 3080 +++++++++++++++++++++ src/reconstruction.cc | 190 ++ src/reconstruction.h | 54 + src/residual_buffer_pool.cc | 142 + src/residual_buffer_pool.h | 203 ++ src/scan_tables.inc | 440 +++ src/status_code.cc | 57 + src/symbol_decoder_context.cc | 322 +++ src/symbol_decoder_context.h | 301 +++ src/symbol_decoder_context_cdfs.inc | 2509 ++++++++++++++++++ src/threading_strategy.cc | 222 ++ src/threading_strategy.h | 131 + src/tile.h | 914 +++++++ src/tile/bitstream/mode_info.cc | 1303 +++++++++ src/tile/bitstream/palette.cc | 319 +++ src/tile/bitstream/partition.cc | 148 ++ src/tile/bitstream/transform_size.cc | 222 ++ src/tile/prediction.cc | 1361 ++++++++++ src/tile/tile.cc | 2573 ++++++++++++++++++ src/tile_scratch_buffer.cc | 26 + src/tile_scratch_buffer.h | 160 ++ src/utils/array_2d.h | 131 + src/utils/bit_mask_set.h | 79 + src/utils/bit_reader.cc | 117 + src/utils/bit_reader.h | 49 + src/utils/block_parameters_holder.cc | 107 + src/utils/block_parameters_holder.h | 85 + src/utils/blocking_counter.h | 97 + src/utils/common.h | 534 ++++ src/utils/compiler_attributes.h | 181 ++ src/utils/constants.cc | 874 ++++++ src/utils/constants.h | 744 ++++++ src/utils/cpu.cc | 84 + src/utils/cpu.h | 107 + src/utils/dynamic_buffer.h | 82 + src/utils/entropy_decoder.cc | 1117 ++++++++ src/utils/entropy_decoder.h | 123 + src/utils/executor.cc | 21 + src/utils/executor.h | 36 + src/utils/libgav1_utils.cmake | 72 + src/utils/logging.cc | 65 + src/utils/logging.h | 85 + src/utils/memory.h | 237 ++ src/utils/parameter_tree.cc | 133 + src/utils/parameter_tree.h | 113 + src/utils/queue.h | 105 + src/utils/raw_bit_reader.cc | 224 ++ src/utils/raw_bit_reader.h | 78 + src/utils/reference_info.h | 92 + src/utils/segmentation.cc | 31 + src/utils/segmentation.h | 32 + src/utils/segmentation_map.cc | 49 + src/utils/segmentation_map.h | 71 + src/utils/stack.h | 59 + src/utils/threadpool.cc | 323 +++ src/utils/threadpool.h | 167 ++ src/utils/types.h | 525 ++++ src/utils/unbounded_queue.h | 245 ++ src/utils/vector.h | 352 +++ src/version.cc | 39 + src/warp_prediction.cc | 244 ++ src/warp_prediction.h | 40 + src/yuv_buffer.cc | 201 ++ src/yuv_buffer.h | 183 ++ tests/fuzzer/decoder_fuzzer.cc | 87 + tests/fuzzer/decoder_fuzzer_frame_parallel.cc | 139 + tests/fuzzer/fuzzer_temp_file.h | 148 ++ tests/fuzzer/obu_parser_fuzzer.cc | 89 + 273 files changed, 102925 insertions(+) create mode 100644 .gitignore create mode 100644 AUTHORS create mode 100644 CMakeLists.txt create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 cmake/libgav1-config.cmake.template create mode 100644 cmake/libgav1.pc.template create mode 100644 cmake/libgav1_build_definitions.cmake create mode 100644 
cmake/libgav1_cpu_detection.cmake create mode 100644 cmake/libgav1_flags.cmake create mode 100644 cmake/libgav1_helpers.cmake create mode 100644 cmake/libgav1_install.cmake create mode 100644 cmake/libgav1_intrinsics.cmake create mode 100644 cmake/libgav1_options.cmake create mode 100644 cmake/libgav1_sanitizer.cmake create mode 100644 cmake/libgav1_targets.cmake create mode 100644 cmake/libgav1_variables.cmake create mode 100644 cmake/toolchains/aarch64-linux-gnu.cmake create mode 100644 cmake/toolchains/android.cmake create mode 100644 cmake/toolchains/arm-linux-gnueabihf.cmake create mode 100644 codereview.settings create mode 100644 examples/file_reader.cc create mode 100644 examples/file_reader.h create mode 100644 examples/file_reader_constants.cc create mode 100644 examples/file_reader_constants.h create mode 100644 examples/file_reader_factory.cc create mode 100644 examples/file_reader_factory.h create mode 100644 examples/file_reader_interface.h create mode 100644 examples/file_writer.cc create mode 100644 examples/file_writer.h create mode 100644 examples/gav1_decode.cc create mode 100644 examples/gav1_decode_cv_pixel_buffer_pool.cc create mode 100644 examples/gav1_decode_cv_pixel_buffer_pool.h create mode 100644 examples/ivf_parser.cc create mode 100644 examples/ivf_parser.h create mode 100644 examples/libgav1_examples.cmake create mode 100644 examples/logging.h create mode 100644 src/buffer_pool.cc create mode 100644 src/buffer_pool.h create mode 100644 src/decoder.cc create mode 100644 src/decoder_impl.cc create mode 100644 src/decoder_impl.h create mode 100644 src/decoder_settings.cc create mode 100644 src/decoder_state.h create mode 100644 src/dsp/arm/average_blend_neon.cc create mode 100644 src/dsp/arm/average_blend_neon.h create mode 100644 src/dsp/arm/cdef_neon.cc create mode 100644 src/dsp/arm/cdef_neon.h create mode 100644 src/dsp/arm/common_neon.h create mode 100644 src/dsp/arm/convolve_neon.cc create mode 100644 src/dsp/arm/convolve_neon.h create mode 100644 src/dsp/arm/distance_weighted_blend_neon.cc create mode 100644 src/dsp/arm/distance_weighted_blend_neon.h create mode 100644 src/dsp/arm/film_grain_neon.cc create mode 100644 src/dsp/arm/film_grain_neon.h create mode 100644 src/dsp/arm/intra_edge_neon.cc create mode 100644 src/dsp/arm/intra_edge_neon.h create mode 100644 src/dsp/arm/intrapred_cfl_neon.cc create mode 100644 src/dsp/arm/intrapred_directional_neon.cc create mode 100644 src/dsp/arm/intrapred_filter_intra_neon.cc create mode 100644 src/dsp/arm/intrapred_neon.cc create mode 100644 src/dsp/arm/intrapred_neon.h create mode 100644 src/dsp/arm/intrapred_smooth_neon.cc create mode 100644 src/dsp/arm/inverse_transform_neon.cc create mode 100644 src/dsp/arm/inverse_transform_neon.h create mode 100644 src/dsp/arm/loop_filter_neon.cc create mode 100644 src/dsp/arm/loop_filter_neon.h create mode 100644 src/dsp/arm/loop_restoration_neon.cc create mode 100644 src/dsp/arm/loop_restoration_neon.h create mode 100644 src/dsp/arm/mask_blend_neon.cc create mode 100644 src/dsp/arm/mask_blend_neon.h create mode 100644 src/dsp/arm/motion_field_projection_neon.cc create mode 100644 src/dsp/arm/motion_field_projection_neon.h create mode 100644 src/dsp/arm/motion_vector_search_neon.cc create mode 100644 src/dsp/arm/motion_vector_search_neon.h create mode 100644 src/dsp/arm/obmc_neon.cc create mode 100644 src/dsp/arm/obmc_neon.h create mode 100644 src/dsp/arm/super_res_neon.cc create mode 100644 src/dsp/arm/super_res_neon.h create mode 100644 src/dsp/arm/warp_neon.cc create 
mode 100644 src/dsp/arm/warp_neon.h create mode 100644 src/dsp/arm/weight_mask_neon.cc create mode 100644 src/dsp/arm/weight_mask_neon.h create mode 100644 src/dsp/average_blend.cc create mode 100644 src/dsp/average_blend.h create mode 100644 src/dsp/cdef.cc create mode 100644 src/dsp/cdef.h create mode 100644 src/dsp/cdef.inc create mode 100644 src/dsp/common.h create mode 100644 src/dsp/constants.cc create mode 100644 src/dsp/constants.h create mode 100644 src/dsp/convolve.cc create mode 100644 src/dsp/convolve.h create mode 100644 src/dsp/convolve.inc create mode 100644 src/dsp/distance_weighted_blend.cc create mode 100644 src/dsp/distance_weighted_blend.h create mode 100644 src/dsp/dsp.cc create mode 100644 src/dsp/dsp.h create mode 100644 src/dsp/film_grain.cc create mode 100644 src/dsp/film_grain.h create mode 100644 src/dsp/film_grain_common.h create mode 100644 src/dsp/intra_edge.cc create mode 100644 src/dsp/intra_edge.h create mode 100644 src/dsp/intrapred.cc create mode 100644 src/dsp/intrapred.h create mode 100644 src/dsp/inverse_transform.cc create mode 100644 src/dsp/inverse_transform.h create mode 100644 src/dsp/inverse_transform.inc create mode 100644 src/dsp/libgav1_dsp.cmake create mode 100644 src/dsp/loop_filter.cc create mode 100644 src/dsp/loop_filter.h create mode 100644 src/dsp/loop_restoration.cc create mode 100644 src/dsp/loop_restoration.h create mode 100644 src/dsp/mask_blend.cc create mode 100644 src/dsp/mask_blend.h create mode 100644 src/dsp/motion_field_projection.cc create mode 100644 src/dsp/motion_field_projection.h create mode 100644 src/dsp/motion_vector_search.cc create mode 100644 src/dsp/motion_vector_search.h create mode 100644 src/dsp/obmc.cc create mode 100644 src/dsp/obmc.h create mode 100644 src/dsp/obmc.inc create mode 100644 src/dsp/super_res.cc create mode 100644 src/dsp/super_res.h create mode 100644 src/dsp/warp.cc create mode 100644 src/dsp/warp.h create mode 100644 src/dsp/weight_mask.cc create mode 100644 src/dsp/weight_mask.h create mode 100644 src/dsp/x86/average_blend_sse4.cc create mode 100644 src/dsp/x86/average_blend_sse4.h create mode 100644 src/dsp/x86/cdef_sse4.cc create mode 100644 src/dsp/x86/cdef_sse4.h create mode 100644 src/dsp/x86/common_avx2.h create mode 100644 src/dsp/x86/common_sse4.h create mode 100644 src/dsp/x86/convolve_avx2.cc create mode 100644 src/dsp/x86/convolve_avx2.h create mode 100644 src/dsp/x86/convolve_sse4.cc create mode 100644 src/dsp/x86/convolve_sse4.h create mode 100644 src/dsp/x86/distance_weighted_blend_sse4.cc create mode 100644 src/dsp/x86/distance_weighted_blend_sse4.h create mode 100644 src/dsp/x86/intra_edge_sse4.cc create mode 100644 src/dsp/x86/intra_edge_sse4.h create mode 100644 src/dsp/x86/intrapred_cfl_sse4.cc create mode 100644 src/dsp/x86/intrapred_smooth_sse4.cc create mode 100644 src/dsp/x86/intrapred_sse4.cc create mode 100644 src/dsp/x86/intrapred_sse4.h create mode 100644 src/dsp/x86/inverse_transform_sse4.cc create mode 100644 src/dsp/x86/inverse_transform_sse4.h create mode 100644 src/dsp/x86/loop_filter_sse4.cc create mode 100644 src/dsp/x86/loop_filter_sse4.h create mode 100644 src/dsp/x86/loop_restoration_10bit_avx2.cc create mode 100644 src/dsp/x86/loop_restoration_10bit_sse4.cc create mode 100644 src/dsp/x86/loop_restoration_avx2.cc create mode 100644 src/dsp/x86/loop_restoration_avx2.h create mode 100644 src/dsp/x86/loop_restoration_sse4.cc create mode 100644 src/dsp/x86/loop_restoration_sse4.h create mode 100644 src/dsp/x86/mask_blend_sse4.cc create mode 100644 
src/dsp/x86/mask_blend_sse4.h create mode 100644 src/dsp/x86/motion_field_projection_sse4.cc create mode 100644 src/dsp/x86/motion_field_projection_sse4.h create mode 100644 src/dsp/x86/motion_vector_search_sse4.cc create mode 100644 src/dsp/x86/motion_vector_search_sse4.h create mode 100644 src/dsp/x86/obmc_sse4.cc create mode 100644 src/dsp/x86/obmc_sse4.h create mode 100644 src/dsp/x86/super_res_sse4.cc create mode 100644 src/dsp/x86/super_res_sse4.h create mode 100644 src/dsp/x86/transpose_sse4.h create mode 100644 src/dsp/x86/warp_sse4.cc create mode 100644 src/dsp/x86/warp_sse4.h create mode 100644 src/dsp/x86/weight_mask_sse4.cc create mode 100644 src/dsp/x86/weight_mask_sse4.h create mode 100644 src/film_grain.cc create mode 100644 src/film_grain.h create mode 100644 src/frame_buffer.cc create mode 100644 src/frame_buffer_utils.h create mode 100644 src/frame_scratch_buffer.h create mode 100644 src/gav1/decoder.h create mode 100644 src/gav1/decoder_buffer.h create mode 100644 src/gav1/decoder_settings.h create mode 100644 src/gav1/frame_buffer.h create mode 100644 src/gav1/status_code.h create mode 100644 src/gav1/symbol_visibility.h create mode 100644 src/gav1/version.h create mode 100644 src/inter_intra_masks.inc create mode 100644 src/internal_frame_buffer_list.cc create mode 100644 src/internal_frame_buffer_list.h create mode 100644 src/libgav1_decoder.cmake create mode 100644 src/loop_restoration_info.cc create mode 100644 src/loop_restoration_info.h create mode 100644 src/motion_vector.cc create mode 100644 src/motion_vector.h create mode 100644 src/obu_parser.cc create mode 100644 src/obu_parser.h create mode 100644 src/post_filter.h create mode 100644 src/post_filter/cdef.cc create mode 100644 src/post_filter/deblock.cc create mode 100644 src/post_filter/deblock_thresholds.inc create mode 100644 src/post_filter/loop_restoration.cc create mode 100644 src/post_filter/post_filter.cc create mode 100644 src/post_filter/super_res.cc create mode 100644 src/prediction_mask.cc create mode 100644 src/prediction_mask.h create mode 100644 src/quantizer.cc create mode 100644 src/quantizer.h create mode 100644 src/quantizer_tables.inc create mode 100644 src/reconstruction.cc create mode 100644 src/reconstruction.h create mode 100644 src/residual_buffer_pool.cc create mode 100644 src/residual_buffer_pool.h create mode 100644 src/scan_tables.inc create mode 100644 src/status_code.cc create mode 100644 src/symbol_decoder_context.cc create mode 100644 src/symbol_decoder_context.h create mode 100644 src/symbol_decoder_context_cdfs.inc create mode 100644 src/threading_strategy.cc create mode 100644 src/threading_strategy.h create mode 100644 src/tile.h create mode 100644 src/tile/bitstream/mode_info.cc create mode 100644 src/tile/bitstream/palette.cc create mode 100644 src/tile/bitstream/partition.cc create mode 100644 src/tile/bitstream/transform_size.cc create mode 100644 src/tile/prediction.cc create mode 100644 src/tile/tile.cc create mode 100644 src/tile_scratch_buffer.cc create mode 100644 src/tile_scratch_buffer.h create mode 100644 src/utils/array_2d.h create mode 100644 src/utils/bit_mask_set.h create mode 100644 src/utils/bit_reader.cc create mode 100644 src/utils/bit_reader.h create mode 100644 src/utils/block_parameters_holder.cc create mode 100644 src/utils/block_parameters_holder.h create mode 100644 src/utils/blocking_counter.h create mode 100644 src/utils/common.h create mode 100644 src/utils/compiler_attributes.h create mode 100644 src/utils/constants.cc create mode 100644 
src/utils/constants.h create mode 100644 src/utils/cpu.cc create mode 100644 src/utils/cpu.h create mode 100644 src/utils/dynamic_buffer.h create mode 100644 src/utils/entropy_decoder.cc create mode 100644 src/utils/entropy_decoder.h create mode 100644 src/utils/executor.cc create mode 100644 src/utils/executor.h create mode 100644 src/utils/libgav1_utils.cmake create mode 100644 src/utils/logging.cc create mode 100644 src/utils/logging.h create mode 100644 src/utils/memory.h create mode 100644 src/utils/parameter_tree.cc create mode 100644 src/utils/parameter_tree.h create mode 100644 src/utils/queue.h create mode 100644 src/utils/raw_bit_reader.cc create mode 100644 src/utils/raw_bit_reader.h create mode 100644 src/utils/reference_info.h create mode 100644 src/utils/segmentation.cc create mode 100644 src/utils/segmentation.h create mode 100644 src/utils/segmentation_map.cc create mode 100644 src/utils/segmentation_map.h create mode 100644 src/utils/stack.h create mode 100644 src/utils/threadpool.cc create mode 100644 src/utils/threadpool.h create mode 100644 src/utils/types.h create mode 100644 src/utils/unbounded_queue.h create mode 100644 src/utils/vector.h create mode 100644 src/version.cc create mode 100644 src/warp_prediction.cc create mode 100644 src/warp_prediction.h create mode 100644 src/yuv_buffer.cc create mode 100644 src/yuv_buffer.h create mode 100644 tests/fuzzer/decoder_fuzzer.cc create mode 100644 tests/fuzzer/decoder_fuzzer_frame_parallel.cc create mode 100644 tests/fuzzer/fuzzer_temp_file.h create mode 100644 tests/fuzzer/obu_parser_fuzzer.cc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..87ccf24 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/build +/third_party diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..d92ea0a --- /dev/null +++ b/AUTHORS @@ -0,0 +1,6 @@ +# This is the list of libgav1 authors for copyright purposes. +# +# This does not necessarily list everyone who has contributed code, since in +# some cases, their employer may be the copyright holder. To see the full list +# of contributors, see the revision history in source control. +Google LLC diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..5d00ae6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,126 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# libgav1 requires modern CMake. +cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR) + +# libgav1 requires C++11. 
+set(CMAKE_CXX_STANDARD 11) +set(ABSL_CXX_STANDARD 11) + +project(libgav1 CXX) + +set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}") +set(libgav1_build "${CMAKE_BINARY_DIR}") + +if("${libgav1_root}" STREQUAL "${libgav1_build}") + message( + FATAL_ERROR + "Building from within the libgav1 source tree is not supported.\n" + "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n" + "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n" + "And re-run CMake from the libgav1_build directory.") +endif() + +set(libgav1_examples "${libgav1_root}/examples") +set(libgav1_source "${libgav1_root}/src") + +include(FindThreads) + +include("${libgav1_examples}/libgav1_examples.cmake") +include("${libgav1_root}/cmake/libgav1_build_definitions.cmake") +include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake") +include("${libgav1_root}/cmake/libgav1_flags.cmake") +include("${libgav1_root}/cmake/libgav1_helpers.cmake") +include("${libgav1_root}/cmake/libgav1_install.cmake") +include("${libgav1_root}/cmake/libgav1_intrinsics.cmake") +include("${libgav1_root}/cmake/libgav1_options.cmake") +include("${libgav1_root}/cmake/libgav1_sanitizer.cmake") +include("${libgav1_root}/cmake/libgav1_targets.cmake") +include("${libgav1_root}/cmake/libgav1_variables.cmake") +include("${libgav1_source}/dsp/libgav1_dsp.cmake") +include("${libgav1_source}/libgav1_decoder.cmake") +include("${libgav1_source}/utils/libgav1_utils.cmake") + +libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING + "Enables optimized code." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING + "Enables avx2 optimizations." VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations." + VALUE ON) +libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING + "Enables sse4.1 optimizations." VALUE ON) +libgav1_option( + NAME LIBGAV1_VERBOSE HELPSTRING + "Enables verbose build system output. Higher numbers are more verbose." VALUE + OFF) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +libgav1_optimization_detect() +libgav1_set_build_definitions() +libgav1_set_cxx_flags() +libgav1_configure_sanitizer() + +# Supported bit depth. +libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH) + +# C++ and linker flags. +libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS) +libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS) + +# Sanitizer integration. +libgav1_track_configuration_variable(LIBGAV1_SANITIZE) + +# Generated source file directory. +libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY) + +# Controls use of std::mutex and absl::Mutex in ThreadPool. +libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX) + +if(LIBGAV1_VERBOSE) + libgav1_dump_cmake_flag_variables() + libgav1_dump_tracked_configuration_variables() + libgav1_dump_options() +endif() + +set(libgav1_abseil_build "${libgav1_build}/abseil") +set(libgav1_gtest_build "${libgav1_build}/gtest") + +# Compiler/linker flags must be lists, but come in from the environment as +# strings. 
Break them up: +if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "") + separate_arguments(LIBGAV1_CXX_FLAGS) +endif() +if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "") + separate_arguments(LIBGAV1_EXE_LINKER_FLAGS) +endif() + +add_subdirectory("${libgav1_root}/third_party/abseil-cpp" + "${libgav1_abseil_build}" EXCLUDE_FROM_ALL) + +libgav1_reset_target_lists() +libgav1_add_dsp_targets() +libgav1_add_decoder_targets() +libgav1_add_examples_targets() +libgav1_add_utils_targets() +libgav1_setup_install_target() + +if(LIBGAV1_VERBOSE) + libgav1_dump_cmake_flag_variables() + libgav1_dump_tracked_configuration_variables() + libgav1_dump_options() +endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..69140ff --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,27 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use a [Gerrit](https://www.gerritcodereview.com) instance hosted at +https://chromium-review.googlesource.com for this purpose. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types.
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
diff --git a/README.md b/README.md new file mode 100644 index 0000000..8ab8eab --- /dev/null +++ b/README.md @@ -0,0 +1,168 @@ +# libgav1 -- an AV1 decoder + +libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More +information on the AV1 video format can be found at +[aomedia.org](https://aomedia.org). + +[TOC] + +## Building + +### Prerequisites + +1. A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are + recommended. + +2. [CMake >= 3.7.1](https://cmake.org/download/) + +3. [Abseil](https://abseil.io) + + From within the libgav1 directory: + + ```shell + $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp + ``` + +### Compile + +```shell + $ mkdir build && cd build + $ cmake -G "Unix Makefiles" .. + $ make +``` + +Configuration options: + +* `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10; + default: 10). +* `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable + [symbol reduction](#symbol-reduction) in an optimized build to keep all + versions of dsp functions available. Automatically defined in + `src/dsp/dsp.h` if unset. +* `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2 + optimizations. Automatically defined in `src/utils/cpu.h` if unset. +* `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON + optimizations. Automatically defined in `src/utils/cpu.h` if unset. +* `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1 + optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note + setting this to 0 will also disable AVX2. +* `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging. + Automatically defined in `src/utils/logging.h` if unset. +* `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in + the examples. Automatically defined in `examples/logging.h` if unset. +* `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform + coefficient range checks. +* `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum + LogSeverity` in `src/utils/logging.h`. Automatically defined in + `src/utils/logging.cc` if unset. +* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and + absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil + dependency from the core library. Automatically defined in + `src/utils/threadpool.h` if unset. +* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is + allowed to create. Has to be an integer > 0. Otherwise this is ignored. + The default value is 128. +* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that + is used to determine when to use frame parallel decoding. Frame parallel + decoding will be used if |threads| > |tile_count| * this multiplier. Has to + be an integer > 0. The default value is 4. This is an advanced setting + intended for testing purposes. + +For additional options see: + +```shell + $ cmake .. -LH +```
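The options above are ordinary CMake cache variables, so they can be passed with `-D` at configure time. As a worked example (not part of the upstream README; the option names are the ones documented above, the values are illustrative), an 8-bit-only build with AVX2 disabled and no Abseil mutex dependency could be configured like this:

```shell
  # Illustrative configure line, run from an out-of-tree build directory.
  $ cmake .. -DCMAKE_BUILD_TYPE=Release \
      -DLIBGAV1_MAX_BITDEPTH=8 \
      -DLIBGAV1_ENABLE_AVX2=0 \
      -DLIBGAV1_THREADPOOL_USE_STD_MUTEX=1
  $ make
```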
+ +## Testing + +* `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for + options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to + convert other container formats to IVF.
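As a concrete round trip for the note above (a sketch, not taken from the upstream README): FFmpeg can rewrap an AV1 stream into IVF without re-encoding, and the result can then be fed to `gav1_decode`. The FFmpeg flags are standard stream-copy options; any `gav1_decode` output or threading flags should be taken from `gav1_decode --help` rather than from this sketch.

```shell
  # Rewrap the AV1 video stream from a WebM/MP4 file into IVF (no re-encode).
  $ ffmpeg -i input.webm -map 0:v -c:v copy output.ivf
  # Decode the IVF file; see --help for the available options.
  $ gav1_decode output.ivf
```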
+ +## Development + +### Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches. + +### Style + +libgav1 follows the +[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with +formatting enforced by `clang-format`. + +### Comments + +Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the +spec`' reference the relevant section(s) in the +[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf). + +### DSP structure + +* `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`. + This handles cpu-detection and initializing each logical unit which populates + the `libgav1::dsp::Dsp` function tables. +* `src/dsp/dsp.h` contains function and type definitions for all logical units + (e.g., intra-predictors) +* `src/utils/cpu.h` contains definitions for cpu-detection +* base implementations are located in `src/dsp/*.{h,cc}` with platform + specific optimizations in sub-folders +* unit tests define `DISABLED_Speed` test(s) to allow timing of individual + functions + +#### Symbol reduction + +Based on the build configuration, unneeded lesser optimizations are removed using +a hierarchical include and define system. Each logical unit in `src/dsp` should +include all platform specific headers in descending order to allow higher level +optimizations to disable lower level ones. See `src/dsp/loop_filter.h` for an +example. + +Each function receives a new define which can be checked in platform specific +headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or +`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g., +`LIBGAV1_Dsp8bpp_AverageBlend`, +`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of +the form `Dsp<bitdepth>bpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for +bits per pixel). The indices correspond to enum values used as lookups with +leading 'k' removed. Platform specific headers then should first check if the +symbol is defined and if not set the value to the corresponding +`LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`. + +``` + #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc + #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 + #endif +``` + +Within each module the code should check if the symbol is defined to its +specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before +defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to +simplify this check for optimized code. + +``` + #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc) + ... + + // In unoptimized code use the following structure; there's no equivalent + // define for LIBGAV1_CPU_C as it would require duplicating the function + // defines used in optimized code for only a small benefit to this + // boilerplate. + #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + ... + #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill + ... +``` + +## Bugs + +Please report all bugs to the issue tracker: +https://issuetracker.google.com/issues/new?component=750480&template=1355007 + +## Discussion + +Email: gav1-devel@googlegroups.com + +Web: https://groups.google.com/forum/#!forum/gav1-devel
diff --git a/cmake/libgav1-config.cmake.template b/cmake/libgav1-config.cmake.template new file mode 100644 index 0000000..dc253d3 --- /dev/null +++ b/cmake/libgav1-config.cmake.template @@ -0,0 +1,2 @@ +set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@") +set(LIBGAV1_LIBRARIES "gav1")
diff --git a/cmake/libgav1.pc.template b/cmake/libgav1.pc.template new file mode 100644 index 0000000..c571a43 --- /dev/null +++ b/cmake/libgav1.pc.template @@ -0,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: @PROJECT_NAME@ +Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit). +Version: @LIBGAV1_VERSION@ +Cflags: -I${includedir} +Libs: -L${libdir} -lgav1 +Libs.private: @CMAKE_THREAD_LIBS_INIT@
diff --git a/cmake/libgav1_build_definitions.cmake b/cmake/libgav1_build_definitions.cmake new file mode 100644 index 0000000..b170e7e --- /dev/null +++ b/cmake/libgav1_build_definitions.cmake @@ -0,0 +1,150 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1) + +macro(libgav1_set_build_definitions) + string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase) + + libgav1_load_version_info() + set(LIBGAV1_SOVERSION 0) + + list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src" + "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp") + list(APPEND libgav1_gtest_include_paths + "third_party/googletest/googlemock/include" + "third_party/googletest/googletest/include" + "third_party/googletest/googletest") + list(APPEND libgav1_test_include_paths ${libgav1_include_paths} + ${libgav1_gtest_include_paths}) + list(APPEND libgav1_defines "LIBGAV1_CMAKE=1" + "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\"" + "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"") + + if(MSVC OR WIN32) + list(APPEND libgav1_defines "_CRT_SECURE_NO_DEPRECATE=1" "NOMINMAX=1") + endif() + + if(ANDROID) + if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") + set(CMAKE_ANDROID_ARM_MODE ON) + endif() + + if(build_type_lowercase MATCHES "rel") + list(APPEND libgav1_base_cxx_flags "-fno-stack-protector") + endif() + endif() + + list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations" + "-Wno-sign-compare" "-fvisibility=hidden" + "-fvisibility-inlines-hidden") + + if(BUILD_SHARED_LIBS) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(libgav1_dependency libgav1_shared) + else() + set(libgav1_dependency libgav1_static) + endif() + + list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes" + "-Wshorten-64-to-32") + + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6") + # Quiet warnings in copy-list-initialization where {} elision has always + # been allowed. + list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces") + endif() + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8) + list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt") + endif() + endif() + + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7") + # Quiet warnings due to potential snprintf() truncation in threadpool.cc. + list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation") + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7") + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + list(APPEND libgav1_base_cxx_flags "-Wno-psabi") + list(APPEND ABSL_GCC_FLAGS "-Wno-psabi") + endif() + endif() + endif() + + if(build_type_lowercase MATCHES "rel") + # TODO(tomfinegan): this value is only a concern for the core library and + # can be made smaller if the test targets are avoided. + list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608") + endif() + + list(APPEND libgav1_msvc_cxx_flags + # Warning level 3. 
+ "/W3" + # Disable warning C4018: + # '' signed/unsigned mismatch + "/wd4018" + # Disable warning C4244: + # 'argument': conversion from '' to + # '', possible loss of data + "/wd4244" + # Disable warning C4267: + # '=': conversion from '' to + # '', possible loss of data + "/wd4267" + # Disable warning C4309: + # 'argument': truncation of constant value + "/wd4309" + # Disable warning C4551: + # function call missing argument list + "/wd4551") + + if(BUILD_SHARED_LIBS) + list(APPEND libgav1_msvc_cxx_flags + # Disable warning C4251: + # 'libgav1::DecoderImpl class member' needs to have + # dll-interface to be used by clients of class + # 'libgav1::Decoder'. + "/wd4251") + endif() + + if(NOT LIBGAV1_MAX_BITDEPTH) + set(LIBGAV1_MAX_BITDEPTH 10) + elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10) + libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.") + endif() + + list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}") + + if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX) + if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0 + AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1) + libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.") + endif() + + list(APPEND libgav1_defines + "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}") + endif() + + # Source file names ending in these suffixes will have the appropriate + # compiler flags added to their compile commands to enable intrinsics. + set(libgav1_avx2_source_file_suffix "avx2.cc") + set(libgav1_neon_source_file_suffix "neon.cc") + set(libgav1_sse4_source_file_suffix "sse4.cc") +endmacro() diff --git a/cmake/libgav1_cpu_detection.cmake b/cmake/libgav1_cpu_detection.cmake new file mode 100644 index 0000000..e17e27c --- /dev/null +++ b/cmake/libgav1_cpu_detection.cmake @@ -0,0 +1,49 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1) + +# Detect optimizations available for the current target CPU. 
+macro(libgav1_optimization_detect) + if(LIBGAV1_ENABLE_OPTIMIZATIONS) + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase) + if(cpu_lowercase MATCHES "^arm|^aarch64") + set(libgav1_have_neon ON) + elseif(cpu_lowercase MATCHES "^x86|amd64") + set(libgav1_have_avx2 ON) + set(libgav1_have_sse4 ON) + endif() + endif() + + if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2) + list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1") + else() + list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0") + endif() + + if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON) + list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1") + else() + list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0") + endif() + + if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1) + list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1") + else() + list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0") + endif() +endmacro()
diff --git a/cmake/libgav1_flags.cmake b/cmake/libgav1_flags.cmake new file mode 100644 index 0000000..2d8d9a6 --- /dev/null +++ b/cmake/libgav1_flags.cmake @@ -0,0 +1,251 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1) + +include(CheckCXXCompilerFlag) +include(CheckCXXSourceCompiles) + +# Adds compiler flags specified by FLAGS to the sources specified by SOURCES: +# +# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>) +macro(libgav1_set_compiler_flags_for_sources) + unset(compiler_SOURCES) + unset(compiler_FLAGS) + unset(optional_args) + unset(single_value_args) + set(multi_value_args SOURCES FLAGS) + cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT (compiler_SOURCES AND compiler_FLAGS)) + libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and " + "FLAGS required.") + endif() + + set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS + ${compiler_FLAGS}) + + if(LIBGAV1_VERBOSE GREATER 1) + foreach(source ${compiler_SOURCES}) + foreach(flag ${compiler_FLAGS}) + message("libgav1_set_compiler_flags_for_sources: source:${source} " + "flag:${flag}") + endforeach() + endforeach() + endif() +endmacro() + +# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds +# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if +# FLAG_REQUIRED is specified and any flag check fails.
+# +# ~~~ +# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>> +# [FLAG_REQUIRED]) +# ~~~ +macro(libgav1_test_cxx_flag) + unset(cxx_test_FLAG_LIST_VAR_NAMES) + unset(cxx_test_FLAG_REQUIRED) + unset(single_value_args) + set(optional_args FLAG_REQUIRED) + set(multi_value_args FLAG_LIST_VAR_NAMES) + cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT cxx_test_FLAG_LIST_VAR_NAMES) + libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required") + endif() + + unset(cxx_flags) + foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES}) + if(LIBGAV1_VERBOSE) + message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags") + endif() + list(APPEND cxx_flags ${${list_var}}) + endforeach() + + if(LIBGAV1_VERBOSE) + message("CXX test: all flags: ${cxx_flags}") + endif() + + unset(all_cxx_flags) + list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags}) + + # Turn off output from check_cxx_source_compiles. Print status directly + # instead since the logging messages from check_cxx_source_compiles can be + # quite confusing. + set(CMAKE_REQUIRED_QUIET TRUE) + + # Run the actual compile test. + unset(libgav1_all_cxx_flags_pass CACHE) + message("--- Running combined CXX flags test, flags: ${all_cxx_flags}") + check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass) + + if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass) + libgav1_die("Flag test failed for required flag(s): " + "${all_cxx_flags} and FLAG_REQUIRED specified.") + endif() + + if(libgav1_all_cxx_flags_pass) + # Test passed: update the global flag list used by the libgav1 target + # creation wrappers. + set(LIBGAV1_CXX_FLAGS ${cxx_flags}) + list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS) + + if(LIBGAV1_VERBOSE) + message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}") + endif() + + message("--- Passed combined CXX flags test") + else() + message("--- Failed combined CXX flags test, testing flags individually.") + + if(cxx_flags) + message("--- Testing flags from $cxx_flags: " "${cxx_flags}") + foreach(cxx_flag ${cxx_flags}) + # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal + # variable at parent scope while check_cxx_source_compiles() continues + # to set an internal cache variable, so we unset both to avoid the + # failure / success state persisting between checks. See + # https://gitlab.kitware.com/cmake/cmake/-/issues/21207. + unset(cxx_flag_test_passed) + unset(cxx_flag_test_passed CACHE) + message("--- Testing flag: ${cxx_flag}") + check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed) + + if(cxx_flag_test_passed) + message("--- Passed test for ${cxx_flag}") + else() + list(REMOVE_ITEM cxx_flags ${cxx_flag}) + message("--- Failed test for ${cxx_flag}, flag removed.") + endif() + endforeach() + + set(LIBGAV1_CXX_FLAGS ${cxx_flags}) + endif() + endif() + + if(LIBGAV1_CXX_FLAGS) + list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS) + endif() +endmacro() + +# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME, +# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates +# configuration when flag check fails. libgav1_set_cxx_flags() must be called +# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only +# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME <flag list variable name>)
macro(libgav1_test_exe_linker_flag)
+  unset(link_FLAG_LIST_VAR_NAME)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args FLAG_LIST_VAR_NAME)
+  cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT link_FLAG_LIST_VAR_NAME)
+    libgav1_die("libgav1_test_exe_linker_flag: FLAG_LIST_VAR_NAME required")
+  endif()
+
+  libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+                            ${link_FLAG_LIST_VAR_NAME})
+
+  if(LIBGAV1_VERBOSE)
+    message("EXE LINKER test: all flags: ${linker_flags}")
+  endif()
+
+  # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+  # linker test.
+  libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+                            LIBGAV1_CXX_FLAGS)
+
+  # Cache the global exe linker flags.
+  if(CMAKE_EXE_LINKER_FLAGS)
+    set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+    libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+                              ${linker_flags})
+  endif()
+
+  libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+                            ${CMAKE_EXE_LINKER_FLAGS})
+
+  # Turn off output from check_cxx_source_compiles. Print status directly
+  # instead since the logging messages from check_cxx_source_compiles can be
+  # quite confusing.
+  set(CMAKE_REQUIRED_QUIET TRUE)
+
+  message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+  unset(linker_flag_test_passed CACHE)
+  set(libgav1_cxx_main "\nint main() { return 0; }")
+  check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+  if(NOT linker_flag_test_passed)
+    libgav1_die("EXE LINKER test failed.")
+  endif()
+
+  message("--- Passed EXE LINKER flag test.")
+
+  # Restore cached global exe linker flags.
+  if(cached_CMAKE_EXE_LINKER_FLAGS)
+    set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
+  else()
+    unset(CMAKE_EXE_LINKER_FLAGS)
+  endif()
+endmacro()
+
+# Runs the libgav1 compiler tests. This macro builds up the list of list var(s)
+# that is passed to libgav1_test_cxx_flag().
+#
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+  unset(cxx_flag_lists)
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+    list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+  endif()
+
+  # Append clang flags after the base set to allow -Wno* overrides to take
+  # effect. Some of the base flags may enable a large set of warnings, e.g.,
+  # -Wall.
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+  endif()
+
+  if(MSVC)
+    list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+  endif()
+
+  if(LIBGAV1_VERBOSE)
+    if(cxx_flag_lists)
+      libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+      message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+    endif()
+  endif()
+
+  if(LIBGAV1_CXX_FLAGS)
+    list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+    if(LIBGAV1_VERBOSE)
+      message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+    endif()
+  endif()
+
+  libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
diff --git a/cmake/libgav1_helpers.cmake b/cmake/libgav1_helpers.cmake
new file mode 100644
index 0000000..76d8d67
--- /dev/null
+++ b/cmake/libgav1_helpers.cmake
@@ -0,0 +1,134 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Kills build generation using message(FATAL_ERROR) and outputs all data passed
+# to the console via use of $ARGN.
+macro(libgav1_die)
+  message(FATAL_ERROR ${ARGN})
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to string. Output is written
+# to variable supplied via the DEST parameter. Input is from an expanded
+# variable referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+  set(optional_args)
+  set(single_value_args DEST SOURCE_VAR)
+  set(multi_value_args SOURCE SOURCE_VARS)
+  cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+    libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+                "SOURCE_VARS required.")
+  endif()
+
+  unset(${sas_DEST})
+
+  if(sas_SOURCE)
+    # $sas_SOURCE is one or more expanded variables, just copy the values to
+    # $sas_DEST.
+    set(${sas_DEST} "${sas_SOURCE}")
+  endif()
+
+  if(sas_SOURCE_VARS)
+    # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+    # variable and appends it to $sas_DEST.
+    foreach(source_var ${sas_SOURCE_VARS})
+      set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+    endforeach()
+
+    # Because $sas_DEST can be empty when entering this scope leading
+    # whitespace can be introduced to $sas_DEST on the first iteration of the
+    # above loop. Remove it:
+    string(STRIP "${${sas_DEST}}" ${sas_DEST})
+  endif()
+
+  # Lists in CMake are simply semicolon delimited strings, so stringification
+  # is just a find and replace of the semicolon.
+  string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+  endif()
+endmacro()
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(TARGET <target> BASENAME <basename>
+#                                  [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+  set(optional_args)
+  set(single_value_args TARGET BASENAME LISTVAR)
+  set(multi_value_args)
+  cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+    libgav1_die(
+      "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+  endif()
+
+  if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+    set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src")
+  endif()
+
+  set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+  set(dummy_source_file
+      "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+  set(dummy_source_code
+      "// Generated file. DO NOT EDIT!\n"
+      "// C++ source file created for target ${cdsf_TARGET}.\n"
\n" + "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n" + "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n") + file(WRITE "${dummy_source_file}" "${dummy_source_code}") + + target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file}) + + if(cdsf_LISTVAR) + list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}") + endif() +endmacro() + +# Loads the version components from $libgav1_source/gav1/version.h and sets the +# corresponding CMake variables: +# - LIBGAV1_MAJOR_VERSION +# - LIBGAV1_MINOR_VERSION +# - LIBGAV1_PATCH_VERSION +# - LIBGAV1_VERSION, which is: +# - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION +macro(libgav1_load_version_info) + file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings) + foreach(str ${version_file_strings}) + if(str MATCHES "#define LIBGAV1_") + if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ") + string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION + "${str}") + elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ") + string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION + "${str}") + elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ") + string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION + "${str}") + endif() + endif() + endforeach() + set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}") + set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}") +endmacro() diff --git a/cmake/libgav1_install.cmake b/cmake/libgav1_install.cmake new file mode 100644 index 0000000..b7f6006 --- /dev/null +++ b/cmake/libgav1_install.cmake @@ -0,0 +1,60 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1) + +# Sets up the Libgav1 install targets. Must be called after the static library +# target is created. 
+macro(libgav1_setup_install_target) + if(NOT (MSVC OR XCODE)) + include(GNUInstallDirs) + + # pkg-config: libgav1.pc + set(prefix "${CMAKE_INSTALL_PREFIX}") + set(exec_prefix "\${prefix}") + set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}") + set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") + set(libgav1_lib_name "libgav1") + + configure_file("${libgav1_root}/cmake/libgav1.pc.template" + "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX) + install(FILES "${libgav1_build}/libgav1.pc" + DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig") + + # CMake config: libgav1-config.cmake + set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}") + configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template" + "${libgav1_build}/libgav1-config.cmake" @ONLY + NEWLINE_STYLE UNIX) + install( + FILES "${libgav1_build}/libgav1-config.cmake" + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake") + + install( + FILES ${libgav1_api_includes} + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1") + + install(TARGETS gav1_decode DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + install(TARGETS libgav1_static DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + if(BUILD_SHARED_LIBS) + install(TARGETS libgav1_shared DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + endif() + endif() +endmacro() diff --git a/cmake/libgav1_intrinsics.cmake b/cmake/libgav1_intrinsics.cmake new file mode 100644 index 0000000..a2e9ddb --- /dev/null +++ b/cmake/libgav1_intrinsics.cmake @@ -0,0 +1,135 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix>
+#                                        VARIABLE <variable name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+  unset(intrinsics_SUFFIX)
+  unset(intrinsics_VARIABLE)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args SUFFIX VARIABLE)
+  cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+                        "VARIABLE required.")
+  endif()
+
+  if(intrinsics_SUFFIX MATCHES "neon")
+    if(NOT MSVC)
+      set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "avx2")
+    if(MSVC)
+      set(${intrinsics_VARIABLE} "/arch:AVX2")
+    else()
+      set(${intrinsics_VARIABLE} "-mavx2")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "sse4")
+    if(NOT MSVC)
+      set(${intrinsics_VARIABLE} "-msse4.1")
+    endif()
+  else()
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "intrinsics suffix: ${intrinsics_SUFFIX}")
+  endif()
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_get_intrinsics_flag_for_suffix: "
+            "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+  endif()
+endmacro()
+
+# Processes source files specified by SOURCES and adds intrinsics flags as
+# necessary:
+# libgav1_process_intrinsics_sources(TARGET <target> SOURCES <sources>)
+#
+# Detects requirement for intrinsics flags using source file name suffix.
+# Currently supports NEON, AVX2, and SSE4.1.
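+#
+# Editor's note: an illustrative call (not from the original file; the target
+# and file names are hypothetical, and assume the *_source_file_suffix
+# variables match "_sse4.cc"-style names):
+#
+# ~~~
+# libgav1_process_intrinsics_sources(TARGET libgav1_dsp
+#                                    SOURCES dsp.cc dsp_sse4.cc dsp_avx2.cc)
+# ~~~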
+macro(libgav1_process_intrinsics_sources)
+  unset(arg_TARGET)
+  unset(arg_SOURCES)
+  unset(optional_args)
+  set(single_value_args TARGET)
+  set(multi_value_args SOURCES)
+  cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+  if(NOT (arg_TARGET AND arg_SOURCES))
+    message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+                        "SOURCES required.")
+  endif()
+
+  if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+    unset(avx2_sources)
+    list(APPEND avx2_sources ${arg_SOURCES})
+
+    list(FILTER avx2_sources INCLUDE REGEX
+         "${libgav1_avx2_source_file_suffix}$")
+
+    if(avx2_sources)
+      unset(avx2_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_avx2_source_file_suffix}
+                                             VARIABLE avx2_flags)
+      if(avx2_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+                                               ${avx2_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+    unset(sse4_sources)
+    list(APPEND sse4_sources ${arg_SOURCES})
+
+    list(FILTER sse4_sources INCLUDE REGEX
+         "${libgav1_sse4_source_file_suffix}$")
+
+    if(sse4_sources)
+      unset(sse4_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_sse4_source_file_suffix}
+                                             VARIABLE sse4_flags)
+      if(sse4_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+                                               ${sse4_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+    unset(neon_sources)
+    list(APPEND neon_sources ${arg_SOURCES})
+    list(FILTER neon_sources INCLUDE REGEX
+         "${libgav1_neon_source_file_suffix}$")
+
+    if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+      unset(neon_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_neon_source_file_suffix}
+                                             VARIABLE neon_flags)
+      if(neon_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+                                               ${neon_flags})
+      endif()
+    endif()
+  endif()
+endmacro()
diff --git a/cmake/libgav1_options.cmake b/cmake/libgav1_options.cmake
new file mode 100644
index 0000000..6327bee
--- /dev/null
+++ b/cmake/libgav1_options.cmake
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
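+#
+# Editor's note: a representative declaration (illustrative; mirrors the
+# LIBGAV1_ENABLE_* options used elsewhere in this build):
+#
+# ~~~
+# libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables NEON." VALUE ON)
+# ~~~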
+macro(libgav1_option) + unset(option_NAME) + unset(option_HELPSTRING) + unset(option_VALUE) + unset(optional_args) + unset(multi_value_args) + set(single_value_args NAME HELPSTRING VALUE) + cmake_parse_arguments(option "${optional_args}" "${single_value_args}" + "${multi_value_args}" ${ARGN}) + + if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE)) + message(FATAL_ERROR "libgav1_option: NAME HELPSTRING and VALUE required.") + endif() + + option(${option_NAME} ${option_HELPSTRING} ${option_VALUE}) + + if(LIBGAV1_VERBOSE GREATER 2) + message("--------- libgav1_option ---------\n" + "option_NAME=${option_NAME}\n" + "option_HELPSTRING=${option_HELPSTRING}\n" + "option_VALUE=${option_VALUE}\n" + "------------------------------------------\n") + endif() + + list(APPEND libgav1_options ${option_NAME}) + list(REMOVE_DUPLICATES libgav1_options) +endmacro() + +# Dumps the $libgav1_options list via CMake message command. +macro(libgav1_dump_options) + foreach(option_name ${libgav1_options}) + message("${option_name}: ${${option_name}}") + endforeach() +endmacro() diff --git a/cmake/libgav1_sanitizer.cmake b/cmake/libgav1_sanitizer.cmake new file mode 100644 index 0000000..4bb2263 --- /dev/null +++ b/cmake/libgav1_sanitizer.cmake @@ -0,0 +1,45 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1) + +macro(libgav1_configure_sanitizer) + if(LIBGAV1_SANITIZE AND NOT MSVC) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if(LIBGAV1_SANITIZE MATCHES "cfi") + list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi") + list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi" + "-fuse-ld=gold") + endif() + + if(${CMAKE_SIZEOF_VOID_P} EQUAL 4 + AND LIBGAV1_SANITIZE MATCHES "integer|undefined") + list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s") + endif() + endif() + + list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}") + list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}") + + # Make sanitizer callstacks accurate. + list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer" + "-fno-optimize-sibling-calls") + + libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED) + libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS) + endif() +endmacro() diff --git a/cmake/libgav1_targets.cmake b/cmake/libgav1_targets.cmake new file mode 100644 index 0000000..78b4865 --- /dev/null +++ b/cmake/libgav1_targets.cmake @@ -0,0 +1,347 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+# Resets list variables used to track libgav1 targets.
+macro(libgav1_reset_target_lists)
+  unset(libgav1_targets)
+  unset(libgav1_exe_targets)
+  unset(libgav1_lib_targets)
+  unset(libgav1_objlib_targets)
+  unset(libgav1_sources)
+  unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#   NAME.
+# - TEST: Flag. Presence means treat executable as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: List of include directories for the target.
+# - COMPILE_FLAGS: List of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added to $libgav1_targets. When TEST
+# is specified targets are also added to list $libgav1_test_targets. Otherwise
+# targets are added to $libgav1_exe_targets.
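+#
+# Editor's note: an illustrative invocation (not from the original file; the
+# names are hypothetical):
+#
+# ~~~
+# libgav1_add_executable(NAME gav1_decode
+#                        SOURCES gav1_decode.cc
+#                        LIB_DEPS libgav1_static)
+# ~~~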
+macro(libgav1_add_executable)
+  unset(exe_TEST)
+  unset(exe_TEST_DEFINES_MAIN)
+  unset(exe_NAME)
+  unset(exe_OUTPUT_NAME)
+  unset(exe_SOURCES)
+  unset(exe_DEFINES)
+  unset(exe_INCLUDES)
+  unset(exe_COMPILE_FLAGS)
+  unset(exe_LINK_FLAGS)
+  unset(exe_OBJLIB_DEPS)
+  unset(exe_LIB_DEPS)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+                       OBJLIB_DEPS LIB_DEPS)
+
+  cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_executable ---------\n"
+            "exe_TEST=${exe_TEST}\n"
+            "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+            "exe_NAME=${exe_NAME}\n"
+            "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+            "exe_SOURCES=${exe_SOURCES}\n"
+            "exe_DEFINES=${exe_DEFINES}\n"
+            "exe_INCLUDES=${exe_INCLUDES}\n"
+            "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+            "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+            "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+            "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+            "------------------------------------------\n")
+  endif()
+
+  if(NOT (exe_NAME AND exe_SOURCES))
+    message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${exe_NAME})
+  if(exe_TEST)
+    list(APPEND libgav1_test_targets ${exe_NAME})
+    list(APPEND libgav1_test_sources ${exe_SOURCES})
+  else()
+    list(APPEND libgav1_exe_targets ${exe_NAME})
+    list(APPEND libgav1_sources ${exe_SOURCES})
+  endif()
+
+  add_executable(${exe_NAME} ${exe_SOURCES})
+
+  if(exe_OUTPUT_NAME)
+    set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+  endif()
+
+  libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+  if(exe_DEFINES)
+    target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+  endif()
+
+  if(exe_INCLUDES)
+    target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+  endif()
+
+  if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+    target_compile_options(${exe_NAME}
+                           PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+    set_target_properties(${exe_NAME}
+                          PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
+                                     ${LIBGAV1_EXE_LINKER_FLAGS})
+  endif()
+
+  if(exe_OBJLIB_DEPS)
+    foreach(objlib_dep ${exe_OBJLIB_DEPS})
+      target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+  endif()
+
+  if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+  endif()
+
+  if(exe_LIB_DEPS)
+    unset(exe_static)
+    if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+      set(exe_static ON)
+    endif()
+
+    if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+      # Third party dependencies can introduce dependencies on system and test
+      # libraries. Since the target created here is an executable, and CMake
+      # does not provide a method of controlling order of link dependencies,
+      # wrap all of the dependencies of this target in start/end group flags to
+      # ensure that dependencies of third party targets can be resolved when
+      # those dependencies happen to be resolved by dependencies of the current
+      # target.
+      list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+      list(APPEND exe_LIB_DEPS -Wl,--end-group)
+    endif()
+    target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+  endif()
+endmacro()
+
+# Creates a library target of the specified type.
+# The target name is passed as a parameter to the NAME argument, the type as a
+# parameter to the TYPE argument, and the sources passed as a parameter to the
+# SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources>
+#                     [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#   NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+#   is generating a build for which MSVC or WIN32 are true. This is to avoid
+#   output basename collisions with DLL import libraries.
+# - TEST: Flag. Presence means treat library as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: List of include directories for the target.
+# - COMPILE_FLAGS: List of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+# - When TEST is specified sources are added to $libgav1_test_sources.
+# - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+# - Targets are always added to $libgav1_targets.
+# - When the TEST flag is specified, targets are added to
+#   $libgav1_test_targets.
+# - When TEST is not specified:
+#   - Libraries of type SHARED are added to $libgav1_dylib_targets.
+#   - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+#   - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
+macro(libgav1_add_library)
+  unset(lib_TEST)
+  unset(lib_NAME)
+  unset(lib_OUTPUT_NAME)
+  unset(lib_TYPE)
+  unset(lib_SOURCES)
+  unset(lib_DEFINES)
+  unset(lib_INCLUDES)
+  unset(lib_COMPILE_FLAGS)
+  unset(lib_LINK_FLAGS)
+  unset(lib_OBJLIB_DEPS)
+  unset(lib_LIB_DEPS)
+  unset(lib_PUBLIC_INCLUDES)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME TYPE)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+                       OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+  cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_library ---------\n"
+            "lib_TEST=${lib_TEST}\n"
+            "lib_NAME=${lib_NAME}\n"
+            "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+            "lib_TYPE=${lib_TYPE}\n"
+            "lib_SOURCES=${lib_SOURCES}\n"
+            "lib_DEFINES=${lib_DEFINES}\n"
+            "lib_INCLUDES=${lib_INCLUDES}\n"
+            "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+            "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+            "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+            "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+            "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+            "---------------------------------------\n")
+  endif()
+
+  if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+    message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${lib_NAME})
+  if(lib_TEST)
+    list(APPEND libgav1_test_targets ${lib_NAME})
+    list(APPEND libgav1_test_sources ${lib_SOURCES})
+  else()
+    list(APPEND libgav1_sources ${lib_SOURCES})
+    if(lib_TYPE STREQUAL OBJECT)
+      list(APPEND libgav1_objlib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL SHARED)
+      list(APPEND libgav1_dylib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL STATIC)
+      list(APPEND libgav1_lib_targets ${lib_NAME})
+    else()
+      message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+    endif()
+  endif()
+
+  add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+  libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+  if(lib_OUTPUT_NAME)
+    if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+      set_target_properties(${lib_NAME}
+                            PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+    endif()
+  endif()
+
+  if(lib_DEFINES)
+    target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+  endif()
+
+  if(lib_INCLUDES)
+    target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+  endif()
+
+  if(lib_PUBLIC_INCLUDES)
+    target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+  endif()
+
+  if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+    target_compile_options(${lib_NAME}
+                           PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(lib_LINK_FLAGS)
+    set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+  endif()
+
+  if(lib_OBJLIB_DEPS)
+    foreach(objlib_dep ${lib_OBJLIB_DEPS})
+      target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(lib_LIB_DEPS)
+    if(lib_TYPE STREQUAL STATIC)
+      set(link_type PUBLIC)
+    else()
+      set(link_type PRIVATE)
+      if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+        # The libgav1 shared object uses the static libgav1 as input to turn it
+        # into a shared object. Include everything from the static library in
+        # the shared object.
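+        # (Editor's note: -Wl,-force_load is the Apple ld spelling of this
+        # request; GNU ld instead brackets the archive with --whole-archive /
+        # --no-whole-archive, as below.)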
+ if(APPLE) + list(INSERT lib_LIB_DEPS 0 -Wl,-force_load) + else() + list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive) + list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive) + endif() + endif() + endif() + target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS}) + endif() + + if(NOT MSVC AND lib_NAME MATCHES "^lib") + # Non-MSVC generators prepend lib to static lib target file names. Libgav1 + # already includes lib in its name. Avoid naming output files liblib*. + set_target_properties(${lib_NAME} PROPERTIES PREFIX "") + endif() + + if(lib_TYPE STREQUAL SHARED AND NOT MSVC) + set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION}) + endif() + + if(BUILD_SHARED_LIBS AND (MSVC OR WIN32)) + if(lib_TYPE STREQUAL SHARED) + target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1") + else() + target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") + endif() + endif() + + # Determine if $lib_NAME is a header only target. + set(sources_list ${lib_SOURCES}) + list(FILTER sources_list INCLUDE REGEX cc$) + if(NOT sources_list) + if(NOT XCODE) + # This is a header only target. Tell CMake the link language. + set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX) + else() + # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file. + libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME}) + endif() + endif() +endmacro() diff --git a/cmake/libgav1_variables.cmake b/cmake/libgav1_variables.cmake new file mode 100644 index 0000000..0dd0f37 --- /dev/null +++ b/cmake/libgav1_variables.cmake @@ -0,0 +1,78 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ +set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1) + +# Halts generation when $variable_name does not refer to a directory that +# exists. +macro(libgav1_variable_must_be_directory variable_name) + if("${variable_name}" STREQUAL "") + message( + FATAL_ERROR + "Empty variable_name passed to libgav1_variable_must_be_directory.") + endif() + + if("${${variable_name}}" STREQUAL "") + message( + FATAL_ERROR + "Empty variable ${variable_name} is required to build libgav1.") + endif() + + if(NOT IS_DIRECTORY "${${variable_name}}") + message( + FATAL_ERROR + "${variable_name}, which is ${${variable_name}}, does not refer to a\n" + "directory.") + endif() +endmacro() + +# Adds $var_name to the tracked variables list. +macro(libgav1_track_configuration_variable var_name) + if(LIBGAV1_VERBOSE GREATER 2) + message("---- libgav1_track_configuration_variable ----\n" + "var_name=${var_name}\n" + "----------------------------------------------\n") + endif() + + list(APPEND libgav1_configuration_variables ${var_name}) + list(REMOVE_DUPLICATES libgav1_configuration_variables) +endmacro() + +# Logs current C++ and executable linker flags via CMake's message command. 
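+#
+# Editor's note: each tracked variable is printed as a "name:value" pair, one
+# per line, e.g. (illustrative values only):
+#
+# ~~~
+# CMAKE_CXX_FLAGS:-Wall -Wextra
+# ~~~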
+macro(libgav1_dump_cmake_flag_variables) + unset(flag_variables) + list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS" + "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS") + if(CMAKE_BUILD_TYPE) + list(APPEND flag_variables "CMAKE_BUILD_TYPE" + "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT" + "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}" + "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT" + "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}") + endif() + foreach(flag_variable ${flag_variables}) + message("${flag_variable}:${${flag_variable}}") + endforeach() +endmacro() + +# Dumps the variables tracked in $libgav1_configuration_variables via CMake's +# message command. +macro(libgav1_dump_tracked_configuration_variables) + foreach(config_variable ${libgav1_configuration_variables}) + message("${config_variable}:${${config_variable}}") + endforeach() +endmacro() diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake new file mode 100644 index 0000000..7ffe397 --- /dev/null +++ b/cmake/toolchains/aarch64-linux-gnu.cmake @@ -0,0 +1,28 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ +set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1) + +set(CMAKE_SYSTEM_NAME "Linux") + +if("${CROSS}" STREQUAL "") + set(CROSS aarch64-linux-gnu-) +endif() + +set(CMAKE_CXX_COMPILER ${CROSS}g++) +set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a") +set(CMAKE_SYSTEM_PROCESSOR "aarch64") diff --git a/cmake/toolchains/android.cmake b/cmake/toolchains/android.cmake new file mode 100644 index 0000000..492957b --- /dev/null +++ b/cmake/toolchains/android.cmake @@ -0,0 +1,53 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_ + +# Additional ANDROID_* settings are available, see: +# https://developer.android.com/ndk/guides/cmake#variables + +if(NOT ANDROID_PLATFORM) + set(ANDROID_PLATFORM android-21) +endif() + +# Choose target architecture with: +# +# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64} +if(NOT ANDROID_ABI) + set(ANDROID_ABI arm64-v8a) +endif() + +# Force arm mode for 32-bit targets (instead of the default thumb) to improve +# performance. 
+if(NOT ANDROID_ARM_MODE) + set(ANDROID_ARM_MODE arm) +endif() + +# Toolchain files don't have access to cached variables: +# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate +# environment variable when loaded the first time. +if(LIBGAV1_ANDROID_NDK_PATH) + set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}") +else() + set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}") +endif() + +if(NOT LIBGAV1_ANDROID_NDK_PATH) + message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.") + return() +endif() + +include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake") diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake new file mode 100644 index 0000000..8051f0d --- /dev/null +++ b/cmake/toolchains/arm-linux-gnueabihf.cmake @@ -0,0 +1,29 @@ +# Copyright 2019 The libgav1 Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_) + return() +endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ +set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1) + +set(CMAKE_SYSTEM_NAME "Linux") + +if("${CROSS}" STREQUAL "") + set(CROSS arm-linux-gnueabihf-) +endif() + +set(CMAKE_CXX_COMPILER ${CROSS}g++) +set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm") +set(CMAKE_SYSTEM_PROCESSOR "armv7") +set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon") diff --git a/codereview.settings b/codereview.settings new file mode 100644 index 0000000..ccba2ee --- /dev/null +++ b/codereview.settings @@ -0,0 +1,4 @@ +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True +CODE_REVIEW_SERVER: chromium-review.googlesource.com +GERRIT_SQUASH_UPLOADS: False diff --git a/examples/file_reader.cc b/examples/file_reader.cc new file mode 100644 index 0000000..b096722 --- /dev/null +++ b/examples/file_reader.cc @@ -0,0 +1,186 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+}  // namespace
+
+bool FileReader::registered_in_factory_ =
+    FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+  if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReader> FileReader::Open(
+    const std::string& file_name, const bool error_tolerant) {
+  if (file_name.empty()) return nullptr;
+
+  FILE* raw_file_ptr;
+
+  bool owns_file = true;
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdin);
+    owns_file = false;  // stdin is owned by the Standard C Library.
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "rb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    return nullptr;
+  }
+
+  std::unique_ptr<FileReader> file(
+      new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    if (owns_file) fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (!file->ReadIvfFileHeader()) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+    return nullptr;
+  }
+
+  return file;
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    size of frame in bytes (not including the 12-byte header)
+// bytes 4-11   64-bit presentation timestamp
+// bytes 12..   frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+                                  int64_t* const timestamp) {
+  if (tu_data == nullptr) return false;
+  tu_data->clear();
+
+  uint8_t header_buffer[kIvfFrameHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+  if (IsEndOfFile()) {
+    if (num_read != 0) {
+      LIBGAV1_EXAMPLES_LOG_ERROR(
+          "Cannot read IVF frame header: Not enough data available");
+      return false;
+    }
+
+    return true;
+  }
+
+  IvfFrameHeader ivf_frame_header;
+  if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+    if (error_tolerant_) {
+      ivf_frame_header.frame_size =
+          std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+    } else {
+      return false;
+    }
+  }
+
+  if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+  tu_data->resize(ivf_frame_header.frame_size);
+  const size_t size_read =
+      fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+  if (size_read != ivf_frame_header.frame_size) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Unexpected EOF or I/O error reading frame data");
+    if (error_tolerant_) {
+      tu_data->resize(size_read);
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    signature: 'DKIF'
+// bytes 4-5    version (should be 0)
+// bytes 6-7    length of header in bytes
+// bytes 8-11   codec FourCC (e.g., 'VP80')
+// bytes 12-13  width in pixels
+// bytes 14-15  height in pixels
+// bytes 16-19  frame rate
+// bytes 20-23  time scale
+// bytes 24-27  number of frames in file
+// bytes 28-31  unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19  frame rate  timebase.den  framerate.numerator
+// bytes 20-23  time scale  timebase.num  framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+  uint8_t header_buffer[kIvfFileHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+  if (num_read != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Cannot read IVF header: Not enough data available");
+    return false;
+  }
+
+  IvfFileHeader ivf_file_header;
+  if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+    if (error_tolerant_) {
+      ivf_file_header = {};
+    } else {
+      return false;
+    }
+  }
+
+  width_ = ivf_file_header.width;
+  height_ = ivf_file_header.height;
+  frame_rate_ = ivf_file_header.frame_rate_numerator;
+  time_scale_ = ivf_file_header.frame_rate_denominator;
+  type_ = kFileTypeIvf;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader.h b/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+  enum FileType {
+    kFileTypeUnknown,
+    kFileTypeIvf,
+  };
+
+  // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true format and read errors are ignored,
+  // ReadTemporalUnit() may return truncated data.
+  // Returns nullptr when the file does not exist, cannot be read, or is not
+  // an IVF file.
+  static std::unique_ptr<FileReader> Open(const std::string& file_name,
+                                          bool error_tolerant = false);
+
+  FileReader() = delete;
+  FileReader(const FileReader&) = delete;
+  FileReader& operator=(const FileReader&) = delete;
+
+  // Closes |file_|.
+  ~FileReader() override;
+
+  // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+  // Returns true when:
+  // - A temporal unit is read successfully, or
+  // - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp from the IVF frame header.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+    return feof(file_) != 0;
+  }
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from the IVF file header.
+  size_t width() const override { return width_; }
+  size_t height() const override { return height_; }
+  size_t frame_rate() const override { return frame_rate_; }
+  size_t time_scale() const override { return time_scale_; }
+
+ private:
+  FileReader(FILE* file, bool owns_file, bool error_tolerant)
+      : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+  bool ReadIvfFileHeader();
+
+  FILE* file_ = nullptr;
+  size_t width_ = 0;
+  size_t height_ = 0;
+  size_t frame_rate_ = 0;
+  size_t time_scale_ = 0;
+  FileType type_ = kFileTypeUnknown;
+  // True if this object owns file_ and is responsible for closing it when
+  // done.
+  const bool owns_file_;
+  const bool error_tolerant_;
+
+  static bool registered_in_factory_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_H_
diff --git a/examples/file_reader_constants.cc b/examples/file_reader_constants.cc
new file mode 100644
index 0000000..8439071
--- /dev/null
+++ b/examples/file_reader_constants.cc
@@ -0,0 +1,23 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_constants.h"
+
+namespace libgav1 {
+
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
+
+}  // namespace libgav1
diff --git a/examples/file_reader_constants.h b/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+  kIvfHeaderVersion = 0,
+  kIvfFrameHeaderSize = 12,
+  kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  kMaxTemporalUnitSize = 512 * 1024,
+#else
+  kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/examples/file_reader_factory.cc b/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+  static auto* open_functions =
+      new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+  return open_functions;
+}
+
+}  // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+  if (open_function == nullptr) return false;
+  auto* open_functions = GetFileReaderOpenFunctions();
+  const size_t num_readers = open_functions->size();
+  open_functions->push_back(open_function);
+  return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+    const std::string& file_name, const bool error_tolerant /*= false*/) {
+  for (auto* open_function : *GetFileReaderOpenFunctions()) {
+    auto reader = open_function(file_name, error_tolerant);
+    if (reader == nullptr) continue;
+    return reader;
+  }
+  LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+  return nullptr;
+}
+
+}  // namespace libgav1
diff --git a/examples/file_reader_factory.h b/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+  using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+      const std::string& file_name, bool error_tolerant);
+
+  FileReaderFactory() = delete;
+  FileReaderFactory(const FileReaderFactory&) = delete;
+  FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+  ~FileReaderFactory() = default;
+
+  // Registers the OpenFunction for a FileReaderInterface and returns true when
+  // registration succeeds.
+  static bool RegisterReader(OpenFunction open_function);
+
+  // Passes |file_name| to each OpenFunction until one succeeds. Returns
+  // nullptr when no reader is found for |file_name|. Otherwise a
+  // FileReaderInterface is returned. If |error_tolerant| is true and the
+  // reader supports it, some format and read errors may be ignored and
+  // partial data returned.
+  static std::unique_ptr<FileReaderInterface> OpenReader(
+      const std::string& file_name, bool error_tolerant = false);
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
diff --git a/examples/file_reader_interface.h b/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+  FileReaderInterface() = default;
+  FileReaderInterface(const FileReaderInterface&) = delete;
+  FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+  FileReaderInterface(FileReaderInterface&&) = default;
+  FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+  // Closes the file.
+  virtual ~FileReaderInterface() = default;
+
+  // Reads a temporal unit from the file and writes the data to |tu_data|.
+  // Returns true when:
+  // - A temporal unit is read successfully, or
+  // - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp of the temporal unit.
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from file.
+  virtual size_t width() const = 0;
+  virtual size_t height() const = 0;
+  virtual size_t frame_rate() const = 0;
+  virtual size_t time_scale() const = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/examples/file_writer.cc b/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+std::string GetY4mColorSpaceString(
+    const FileWriter::Y4mParameters& y4m_parameters) {
+  std::string color_space_string;
+  switch (y4m_parameters.image_format) {
+    case kImageFormatMonochrome400:
+      color_space_string = "mono";
+      break;
+    case kImageFormatYuv420:
+      if (y4m_parameters.bitdepth == 8) {
+        if (y4m_parameters.chroma_sample_position ==
+            kChromaSamplePositionVertical) {
+          color_space_string = "420mpeg2";
+        } else if (y4m_parameters.chroma_sample_position ==
+                   kChromaSamplePositionColocated) {
+          color_space_string = "420";
+        } else {
+          color_space_string = "420jpeg";
+        }
+      } else {
+        color_space_string = "420";
+      }
+      break;
+    case kImageFormatYuv422:
+      color_space_string = "422";
+      break;
+    case kImageFormatYuv444:
+      color_space_string = "444";
+      break;
+  }
+
+  if (y4m_parameters.bitdepth > 8) {
+    const bool monochrome =
+        y4m_parameters.image_format == kImageFormatMonochrome400;
+    if (!monochrome) color_space_string += "p";
+    color_space_string += std::to_string(y4m_parameters.bitdepth);
+  }
+
+  return color_space_string;
+}
+
+}  // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+    const std::string& file_name, FileType file_type,
+    const Y4mParameters* const y4m_parameters) {
+  if (file_name.empty() ||
+      (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+      (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+    return nullptr;
+  }
+
+  FILE* raw_file_ptr;
+
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdout);
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "wb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+    return nullptr;
+  }
+
+  std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+    return nullptr;
+  }
+
+  file->file_type_ = file_type;
+  return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+  if (file_type_ == kFileTypeY4m) {
+    const char kY4mFrameHeader[] = "FRAME\n";
+    if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+        strlen(kY4mFrameHeader)) {
+      LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+      return false;
+    }
+  }
+
+  const size_t pixel_size =
+      (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+       ++plane_index) {
+    const int height = frame_buffer.displayed_height[plane_index];
+    const int width = frame_buffer.displayed_width[plane_index];
+    const int stride = frame_buffer.stride[plane_index];
+    const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+    for (int row = 0; row < height; ++row) {
+      const uint8_t* const row_pointer = &plane_pointer[row * stride];
+      if (fwrite(row_pointer, pixel_size, width, file_) !=
+          static_cast<size_t>(width)) {
+        char error_string[256];
+        snprintf(error_string, sizeof(error_string),
+                 "File write failed: %s (errno=%d)", strerror(errno), errno);
+        LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+  std::string y4m_header = "YUV4MPEG2";
+  y4m_header += " W" + std::to_string(y4m_parameters.width);
+  y4m_header += " H" + std::to_string(y4m_parameters.height);
+  y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+                ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+  y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+  y4m_header += "\n";
+  return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+         y4m_header.length();
+}
+
+}  // namespace libgav1
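
// Worked example of the header this writer emits: for a 352x288 8-bit 4:2:0
// stream with unknown chroma sample position and the default 30:1 frame
// rate, WriteY4mFileHeader() produces the single line
// "YUV4MPEG2 W352 H288 F30:1 Ip C420jpeg\n", and each frame that follows is
// prefixed with "FRAME\n". A minimal usage sketch (parameter values are
// illustrative):
//
//   libgav1::FileWriter::Y4mParameters y4m;
//   y4m.width = 352;
//   y4m.height = 288;  // 8-bit 4:2:0 defaults are already set.
//   auto writer = libgav1::FileWriter::Open(
//       "out.y4m", libgav1::FileWriter::kFileTypeY4m, &y4m);
//   // For each DecoderBuffer* buffer dequeued from the decoder:
//   //   if (!writer->WriteFrame(*buffer)) { /* handle write failure */ }
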
diff --git a/examples/file_writer.h b/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+  enum FileType : uint8_t {
+    kFileTypeRaw,
+    kFileTypeY4m,
+  };
+
+  struct Y4mParameters {
+    Y4mParameters() = default;
+    Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+                  size_t frame_rate_denominator,
+                  ChromaSamplePosition chroma_sample_position,
+                  ImageFormat image_format, size_t bitdepth)
+        : width(width),
+          height(height),
+          frame_rate_numerator(frame_rate_numerator),
+          frame_rate_denominator(frame_rate_denominator),
+          chroma_sample_position(chroma_sample_position),
+          image_format(image_format),
+          bitdepth(bitdepth) {}
+
+    Y4mParameters(const Y4mParameters& rhs) = default;
+    Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+    Y4mParameters(Y4mParameters&& rhs) = default;
+    Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+    size_t width = 0;
+    size_t height = 0;
+    size_t frame_rate_numerator = 30;
+    size_t frame_rate_denominator = 1;
+    ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+    ImageFormat image_format = kImageFormatYuv420;
+    size_t bitdepth = 8;
+  };
+
+  // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+  // written out to |file_| before this method returns.
+  //
+  // Returns a FileWriter instance after the file is opened successfully for
+  // kFileTypeRaw files, and after the Y4M file header bytes are written for
+  // kFileTypeY4m files. Returns nullptr upon failure.
+  static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+                                          FileType type,
+                                          const Y4mParameters* y4m_parameters);
+
+  FileWriter() = delete;
+  FileWriter(const FileWriter&) = delete;
+  FileWriter& operator=(const FileWriter&) = delete;
+
+  FileWriter(FileWriter&&) = default;
+  FileWriter& operator=(FileWriter&&) = default;
+
+  // Closes |file_|.
+  ~FileWriter();
+
+  // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+  // successful write of |frame_buffer| data.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(const DecoderBuffer& frame_buffer);
+
+ private:
+  explicit FileWriter(FILE* file) : file_(file) {}
+
+  bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+  FILE* file_ = nullptr;
+  FileType file_type_ = kFileTypeRaw;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_WRITER_H_
diff --git a/examples/gav1_decode.cc b/examples/gav1_decode.cc
new file mode 100644
index 0000000..4de0ba2
--- /dev/null
+++ b/examples/gav1_decode.cc
@@ -0,0 +1,452 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+  const char* input_file_name = nullptr;
+  const char* output_file_name = nullptr;
+  const char* frame_timing_file_name = nullptr;
+  libgav1::FileWriter::FileType output_file_type =
+      libgav1::FileWriter::kFileTypeRaw;
+  uint8_t post_filter_mask = 0x1f;
+  int threads = 1;
+  bool frame_parallel = false;
+  bool output_all_layers = false;
+  int operating_point = 0;
+  int limit = 0;
+  int skip = 0;
+  int verbose = 0;
+};
+
+struct Timing {
+  absl::Duration input;
+  absl::Duration dequeue;
+};
+
+struct FrameTiming {
+  absl::Time enqueue;
+  absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+  fprintf(fout,
+          "Usage: gav1_decode [options] <input file>"
+          " [-o <output file>]\n");
+  fprintf(fout, "\n");
+  fprintf(fout, "Options:\n");
+  fprintf(fout, "  -h, --help This help message.\n");
+  fprintf(fout, "  --threads <positive number> (Default 1).\n");
+  fprintf(fout, "  --frame_parallel.\n");
+  fprintf(fout,
+          "  --limit <n> Stop decoding after N frames (0 = all).\n");
+  fprintf(fout, "  --skip <n> Skip initial N frames (Default 0).\n");
+  fprintf(fout, "  --version.\n");
+  fprintf(fout, "  --y4m (Default false).\n");
+  fprintf(fout, "  --raw (Default true).\n");
+  fprintf(fout, "  -v logging verbosity, can be used multiple times.\n");
+  fprintf(fout, "  --all_layers.\n");
+  fprintf(fout,
+          "  --operating_point <operating point> (Default 0).\n");
+  fprintf(fout,
+          "  --frame_timing <file> Output per-frame timing to <file> in tsv"
+          " format.\n   Yields meaningful results only when frame parallel is"
+          " off.\n");
+  fprintf(fout, "\nAdvanced settings:\n");
+  fprintf(fout, "  --post_filter_mask <mask> (Default 0x1f).\n");
+  fprintf(fout,
+          "   Mask indicating which post filters should be applied to the"
+          " reconstructed\n   frame. This may be given as octal, decimal or"
+          " hexadecimal. From LSB:\n");
+  fprintf(fout, "   Bit 0: Loop filter (deblocking filter)\n");
+  fprintf(fout, "   Bit 1: Cdef\n");
+  fprintf(fout, "   Bit 2: SuperRes\n");
+  fprintf(fout, "   Bit 3: Loop Restoration\n");
+  fprintf(fout, "   Bit 4: Film Grain Synthesis\n");
+}
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+  for (int i = 1; i < argc; ++i) {
+    int32_t value;
+    if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+      PrintHelp(stdout);
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-o") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '-o'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->output_file_name = argv[i];
+    } else if (strcmp(argv[i], "--frame_timing") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '--frame_timing'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->frame_timing_file_name = argv[i];
+    } else if (strcmp(argv[i], "--version") == 0) {
+      printf("gav1_decode, a libgav1 based AV1 decoder\n");
+      printf("libgav1 %s\n", libgav1::GetVersionString());
+      printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+      printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-v") == 0) {
+      ++options->verbose;
+    } else if (strcmp(argv[i], "--raw") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+    } else if (strcmp(argv[i], "--y4m") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+    } else if (strcmp(argv[i], "--threads") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+        fprintf(stderr, "Missing/Invalid value for --threads.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->threads = value;
+    } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+      options->frame_parallel = true;
+    } else if (strcmp(argv[i], "--all_layers") == 0) {
+      options->output_all_layers = true;
+    } else if (strcmp(argv[i], "--operating_point") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+          value >= 32) {
+        fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->operating_point = value;
+    } else if (strcmp(argv[i], "--limit") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --limit.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->limit = value;
+    } else if (strcmp(argv[i], "--skip") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --skip.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->skip = value;
+    } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+      errno = 0;
+      char* endptr = nullptr;
+      value = (++i >= argc) ? -1
+                            // NOLINTNEXTLINE(runtime/deprecated_fn)
+                            : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+      // Only the last 5 bits of the mask can be set.
+      if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+        fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->post_filter_mask = value;
+    } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+      fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    } else {
+      if (options->input_file_name == nullptr) {
+        options->input_file_name = argv[i];
+      } else {
+        fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+
+  if (argc < 2 || options->input_file_name == nullptr) {
+    fprintf(stderr, "Input file is required!\n");
+    PrintHelp(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
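
// strtol() with base 0 auto-detects the radix, which is why the help text can
// promise that the mask may be given as octal, decimal or hexadecimal. A
// quick self-contained check of that parsing rule (sketch):
//
//   #include <cstdlib>
//
//   int main() {
//     // All three spellings parse to 31, i.e. every post filter enabled:
//     // bit 0 deblocking, bit 1 CDEF, bit 2 SuperRes, bit 3 loop
//     // restoration, bit 4 film grain synthesis.
//     const long hex = strtol("0x1f", nullptr, 0);  // hexadecimal prefix 0x
//     const long oct = strtol("037", nullptr, 0);   // octal, leading 0
//     const long dec = strtol("31", nullptr, 0);    // decimal
//     return (hex == 31 && oct == 31 && dec == 31) ? 0 : 1;
//   }
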
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+  ~InputBuffers() {
+    for (auto buffer : free_buffers_) {
+      delete buffer;
+    }
+  }
+  InputBuffer* GetFreeBuffer() {
+    if (free_buffers_.empty()) {
+      auto* const buffer = new (std::nothrow) InputBuffer();
+      if (buffer == nullptr) {
+        fprintf(stderr, "Failed to create input buffer.\n");
+        return nullptr;
+      }
+      free_buffers_.push_back(buffer);
+    }
+    InputBuffer* const buffer = free_buffers_.front();
+    free_buffers_.pop_front();
+    return buffer;
+  }
+
+  void ReleaseInputBuffer(InputBuffer* buffer) {
+    free_buffers_.push_back(buffer);
+  }
+
+ private:
+  std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+                        void* buffer_private_data) {
+  auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+  input_buffers->ReleaseInputBuffer(
+      static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+}  // namespace
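
// The free list above exists to avoid reallocating a vector per temporal
// unit: a buffer handed to EnqueueFrame() travels through the decoder as
// |buffer_private_data| and comes back through the release_input_buffer
// callback once the decoder no longer needs the bytes. A condensed sketch of
// that round trip (decoder setup elided; see main() below for the real
// wiring):
//
//   InputBuffers input_buffers;
//   // settings.release_input_buffer = ReleaseInputBuffer;
//   // settings.callback_private_data = &input_buffers;
//
//   InputBuffer* buffer = input_buffers.GetFreeBuffer();
//   // ... fill *buffer with one temporal unit, then:
//   // decoder.EnqueueFrame(buffer->data(), buffer->size(),
//   //                      /*user_private_data=*/0,
//   //                      /*buffer_private_data=*/buffer);
//   // When done with the data, the decoder calls ReleaseInputBuffer(),
//   // which returns |buffer| to free_buffers_ for reuse.
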
+
+int main(int argc, char* argv[]) {
+  Options options;
+  ParseOptions(argc, argv, &options);
+
+  auto file_reader =
+      libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+  if (file_reader == nullptr) {
+    fprintf(stderr, "Cannot open input file!\n");
+    return EXIT_FAILURE;
+  }
+
+  std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+                                                                &CloseFile);
+  if (options.frame_timing_file_name != nullptr) {
+    frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+    if (frame_timing_file == nullptr) {
+      fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+              options.frame_timing_file_name);
+      return EXIT_FAILURE;
+    }
+  }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  // Reference frames + 1 scratch frame (for either the current frame or the
+  // film grain frame).
+  constexpr int kNumBuffers = 8 + 1;
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+      Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+  if (cv_pixel_buffers == nullptr) {
+    fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+    return EXIT_FAILURE;
+  }
+#endif
+
+  InputBuffers input_buffers;
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings;
+  settings.post_filter_mask = options.post_filter_mask;
+  settings.threads = options.threads;
+  settings.frame_parallel = options.frame_parallel;
+  settings.output_all_layers = options.output_all_layers;
+  settings.operating_point = options.operating_point;
+  settings.blocking_dequeue = true;
+  settings.callback_private_data = &input_buffers;
+  settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+  settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+  settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+  settings.callback_private_data = cv_pixel_buffers.get();
+  settings.release_input_buffer = nullptr;
+  // TODO(vigneshv): Support frame parallel mode to be used with
+  // CVPixelBufferPool.
+  settings.frame_parallel = false;
+#endif
+  libgav1::StatusCode status = decoder.Init(&settings);
+  if (status != libgav1::kStatusOk) {
+    fprintf(stderr, "Error initializing decoder: %s\n",
+            libgav1::GetErrorString(status));
+    return EXIT_FAILURE;
+  }
+
+  fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+  if (options.verbose > 0 && options.skip > 0) {
+    fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+  }
+
+  int input_frames = 0;
+  int decoded_frames = 0;
+  Timing timing = {};
+  std::vector<FrameTiming> frame_timing;
+  const bool record_frame_timing = frame_timing_file != nullptr;
+  std::unique_ptr<libgav1::FileWriter> file_writer;
+  InputBuffer* input_buffer = nullptr;
+  bool limit_reached = false;
+  bool dequeue_finished = false;
+  const absl::Time decode_loop_start = absl::Now();
+  do {
+    if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+        !limit_reached) {
+      input_buffer = input_buffers.GetFreeBuffer();
+      if (input_buffer == nullptr) return EXIT_FAILURE;
+      const absl::Time read_start = absl::Now();
+      if (!file_reader->ReadTemporalUnit(input_buffer,
+                                         /*timestamp=*/nullptr)) {
+        fprintf(stderr, "Error reading input file.\n");
+        return EXIT_FAILURE;
+      }
+      timing.input += absl::Now() - read_start;
+    }
+
+    if (++input_frames <= options.skip) {
+      input_buffers.ReleaseInputBuffer(input_buffer);
+      input_buffer = nullptr;
+      continue;
+    }
+
+    if (input_buffer != nullptr) {
+      if (input_buffer->empty()) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+        input_buffer = nullptr;
+        continue;
+      }
+
+      const absl::Time enqueue_start = absl::Now();
+      status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+                                    static_cast<int64_t>(frame_timing.size()),
+                                    /*buffer_private_data=*/input_buffer);
+      if (status == libgav1::kStatusOk) {
+        if (options.verbose > 1) {
+          fprintf(stderr, "enqueue frame (length %zu)\n",
+                  input_buffer->size());
+        }
+        if (record_frame_timing) {
+          FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+          frame_timing.emplace_back(enqueue_time);
+        }
+
+        input_buffer = nullptr;
+        // Continue to enqueue frames until we get a kStatusTryAgain status.
+        continue;
+      }
+      if (status != libgav1::kStatusTryAgain) {
+        fprintf(stderr, "Unable to enqueue frame: %s\n",
+                libgav1::GetErrorString(status));
+        return EXIT_FAILURE;
+      }
+    }
+
+    const libgav1::DecoderBuffer* buffer;
+    status = decoder.DequeueFrame(&buffer);
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+      continue;
+    }
+    if (status != libgav1::kStatusOk) {
+      fprintf(stderr, "Unable to dequeue frame: %s\n",
+              libgav1::GetErrorString(status));
+      return EXIT_FAILURE;
+    }
+    dequeue_finished = false;
+    if (buffer == nullptr) continue;
+    ++decoded_frames;
+    if (options.verbose > 1) {
+      fprintf(stderr, "buffer dequeued\n");
+    }
+
+    if (record_frame_timing) {
+      frame_timing[static_cast<size_t>(buffer->user_private_data)].dequeue =
+          absl::Now();
+    }
+
+    if (options.output_file_name != nullptr && file_writer == nullptr) {
+      libgav1::FileWriter::Y4mParameters y4m_parameters;
+      y4m_parameters.width = buffer->displayed_width[0];
+      y4m_parameters.height = buffer->displayed_height[0];
+      y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+      y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+      y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+      y4m_parameters.image_format = buffer->image_format;
+      y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+      file_writer = libgav1::FileWriter::Open(
+          options.output_file_name, options.output_file_type, &y4m_parameters);
+      if (file_writer == nullptr) {
+        fprintf(stderr, "Cannot open output file!\n");
+        return EXIT_FAILURE;
+      }
+    }
+
+    if (!limit_reached && file_writer != nullptr &&
+        !file_writer->WriteFrame(*buffer)) {
+      fprintf(stderr, "Error writing output file.\n");
+      return EXIT_FAILURE;
+    }
+    if (options.limit > 0 && options.limit == decoded_frames) {
+      limit_reached = true;
+      if (input_buffer != nullptr) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+      }
+      input_buffer = nullptr;
+    }
+  } while (input_buffer != nullptr ||
+           (!file_reader->IsEndOfFile() && !limit_reached) ||
+           !dequeue_finished);
+  timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+  if (record_frame_timing) {
+    // Note timing for frame parallel will be skewed by the time spent queueing
+    // additional frames and in the output queue waiting for previous frames,
+    // the values reported won't be that meaningful.
+    fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+    for (size_t i = 0; i < frame_timing.size(); ++i) {
+      const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+          frame_timing[i].dequeue - frame_timing[i].enqueue));
+      fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+    }
+  }
+
+  if (options.verbose > 0) {
+    fprintf(stderr, "time to read input: %d us\n",
+            static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+    const int decode_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+    const double decode_fps =
+        (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+    fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+            decode_time_us, decoded_frames, decode_fps);
+  }
+
+  return EXIT_SUCCESS;
+}
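
// Stripped of timing, frame skipping and film-grain pooling, the loop above
// reduces to the intended enqueue/dequeue protocol: keep enqueueing until
// kStatusTryAgain, dequeue until kStatusNothingToDequeue, and stop once the
// input is exhausted and a final dequeue pass found nothing. A minimal sketch
// for the default single-threaded, non-frame-parallel settings, where the
// input buffer may be reused after the resulting frames are dequeued
// (|reader| is a FileReaderInterface, |decoder| an initialized
// libgav1::Decoder):
//
//   std::vector<uint8_t> tu;
//   while (!reader->IsEndOfFile()) {
//     if (!reader->ReadTemporalUnit(&tu, /*timestamp=*/nullptr)) return false;
//     if (tu.empty()) continue;  // true + empty data signals end of file.
//     if (decoder.EnqueueFrame(tu.data(), tu.size(), /*user_private_data=*/0,
//                              /*buffer_private_data=*/nullptr) !=
//         libgav1::kStatusOk) {
//       return false;
//     }
//     const libgav1::DecoderBuffer* buffer;
//     while (decoder.DequeueFrame(&buffer) == libgav1::kStatusOk) {
//       if (buffer == nullptr) continue;  // non-displayable output
//       // consume |buffer| here, e.g. file_writer->WriteFrame(*buffer).
//     }
//   }
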
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.cc b/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644
index 0000000..6aa4e61
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.cc
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+  void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+    std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+    std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+}  // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border,
+    int stride_alignment) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->OnCVPixelBufferSizeChanged(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->GetCVPixelBuffer(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                    void* buffer_private_data) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+}  // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+      new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+  return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+    : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+  CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+    int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment) {
+  if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+                        image_format != libgav1::kImageFormatMonochrome400)) {
+    fprintf(stderr,
+            "Only bitdepth 8, 4:2:0 videos are supported: bitdepth %d, "
+            "image_format: %d.\n",
+            bitdepth, image_format);
+    return libgav1::kStatusUnimplemented;
+  }
+
+  // stride_alignment must be a power of 2.
+  assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+  // The possible keys for CVPixelBufferPool are:
+  //   kCVPixelBufferPoolMinimumBufferCountKey
+  //   kCVPixelBufferPoolMaximumBufferAgeKey
+  //   kCVPixelBufferPoolAllocationThresholdKey
+  const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+  const int min_buffer_count = 10;
+  UniqueCFNumberRef cf_min_buffer_count(CFNumberCreate(
+      kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+  if (cf_min_buffer_count == nullptr) {
+    fprintf(stderr, "CFNumberCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  const void* pool_values[] = {cf_min_buffer_count.get()};
+  UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+      nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+      &kCFTypeDictionaryValueCallBacks));
+  if (pool_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+
+  // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+  // null and must contain the pixel format, width, and height, otherwise
+  // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+  // (-6682).
+
+  // I420: kCVPixelFormatType_420YpCbCr8Planar (video range).
+  const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+                               ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+                               : kCVPixelFormatType_OneComponent8;
+  UniqueCFNumberRef cf_pixel_format(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+  UniqueCFNumberRef cf_width(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+  UniqueCFNumberRef cf_height(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+  UniqueCFNumberRef cf_left_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+  UniqueCFNumberRef cf_right_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+  UniqueCFNumberRef cf_top_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+  UniqueCFNumberRef cf_bottom_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+  UniqueCFNumberRef cf_stride_alignment(CFNumberCreate(
+      kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+  const void* buffer_keys[] = {
+      kCVPixelBufferPixelFormatTypeKey,
+      kCVPixelBufferWidthKey,
+      kCVPixelBufferHeightKey,
+      kCVPixelBufferExtendedPixelsLeftKey,
+      kCVPixelBufferExtendedPixelsRightKey,
+      kCVPixelBufferExtendedPixelsTopKey,
+      kCVPixelBufferExtendedPixelsBottomKey,
+      kCVPixelBufferBytesPerRowAlignmentKey,
+  };
+  const void* buffer_values[] = {
+      cf_pixel_format.get(),  cf_width.get(),
+      cf_height.get(),        cf_left_border.get(),
+      cf_right_border.get(),  cf_top_border.get(),
+      cf_bottom_border.get(), cf_stride_alignment.get(),
+  };
+  UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+      kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+      &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+  if (buffer_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  CVPixelBufferPoolRef cv_pool;
+  CVReturn ret = CVPixelBufferPoolCreate(
+      /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+      &cv_pool);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+            static_cast<int>(ret));
+    return libgav1::kStatusOutOfMemory;
+  }
+  CVPixelBufferPoolRelease(pool_);
+  pool_ = cv_pool;
+  return libgav1::kStatusOk;
+}
fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n", + static_cast(ret)); + return libgav1::kStatusOutOfMemory; + } + CVPixelBufferPoolRelease(pool_); + pool_ = cv_pool; + return libgav1::kStatusOk; +} + +libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer( + int bitdepth, libgav1::ImageFormat image_format, int /*width*/, + int /*height*/, int /*left_border*/, int /*right_border*/, + int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/, + libgav1::FrameBuffer* frame_buffer) { + static_cast(bitdepth); + assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 || + image_format == libgav1::kImageFormatMonochrome400)); + const bool is_monochrome = + (image_format == libgav1::kImageFormatMonochrome400); + + // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey, + // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with + // kCVReturnWouldExceedAllocationThreshold (-6689). + UniqueCFNumberRef cf_num_buffers( + CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_)); + + const void* buffer_keys[] = { + kCVPixelBufferPoolAllocationThresholdKey, + }; + const void* buffer_values[] = { + cf_num_buffers.get(), + }; + UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate( + kCFAllocatorDefault, buffer_keys, buffer_values, 1, + &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks)); + if (aux_attributes == nullptr) { + fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n"); + return libgav1::kStatusUnknownError; + } + + CVPixelBufferRef pixel_buffer; + CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes( + /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer); + if (ret != kCVReturnSuccess) { + fprintf(stderr, + "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n", + static_cast(ret)); + return libgav1::kStatusOutOfMemory; + } + + ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0); + if (ret != kCVReturnSuccess) { + fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n", + static_cast(ret)); + CFRelease(pixel_buffer); + return libgav1::kStatusUnknownError; + } + + // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel + // buffer is nonplanar (CVPixelBufferIsPlanar returns false and + // CVPixelBufferGetPlaneCount returns 0), but + // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane + // still work for plane index 0, even though the documentation says they + // return NULL for nonplanar pixel buffers. 
+  frame_buffer->stride[0] =
+      static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+  frame_buffer->plane[0] = static_cast<uint8_t*>(
+      CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+  if (is_monochrome) {
+    frame_buffer->stride[1] = 0;
+    frame_buffer->stride[2] = 0;
+    frame_buffer->plane[1] = nullptr;
+    frame_buffer->plane[2] = nullptr;
+  } else {
+    frame_buffer->stride[1] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+    frame_buffer->stride[2] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+    frame_buffer->plane[1] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+    frame_buffer->plane[2] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+  }
+  frame_buffer->private_data = pixel_buffer;
+
+  return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+    void* buffer_private_data) {
+  auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+  CVReturn ret =
+      CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+            __FILE__, __LINE__, static_cast<int>(ret));
+    abort();
+  }
+  CFRelease(pixel_buffer);
+}
diff --git a/examples/gav1_decode_cv_pixel_buffer_pool.h b/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644
index 0000000..7aee324
--- /dev/null
+++ b/examples/gav1_decode_cv_pixel_buffer_pool.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                               void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+  static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+      size_t num_buffers);
+
+  // Not copyable or movable.
+  Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+  Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+      delete;
+
+  ~Gav1DecodeCVPixelBufferPool();
+
+  libgav1::StatusCode OnCVPixelBufferSizeChanged(
+      int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border,
+      int stride_alignment);
+
+  libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+                                       libgav1::ImageFormat image_format,
+                                       int width, int height, int left_border,
+                                       int right_border, int top_border,
+                                       int bottom_border, int stride_alignment,
+                                       libgav1::FrameBuffer* frame_buffer);
+  void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+  Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+  CVPixelBufferPoolRef pool_ = nullptr;
+  const int num_buffers_;
+};
+
+#endif  // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/examples/ivf_parser.cc b/examples/ivf_parser.cc
new file mode 100644
index 0000000..f8adb14
--- /dev/null
+++ b/examples/ivf_parser.cc
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+  size_t value = buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+  size_t value = buffer[3] << 24;
+  value |= buffer[2] << 16;
+  value |= buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+}  // namespace
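
// IVF stores all multi-byte fields little endian; the helpers above assemble
// values byte by byte so the parser is independent of host endianness. A
// worked example for a 640x480 header, which stores width and height at byte
// offsets 12 and 14:
//
//   #include <cassert>
//   #include <cstdint>
//
//   const uint8_t bytes[] = {0x80, 0x02, 0xE0, 0x01};
//   assert(ReadLittleEndian16(&bytes[0]) == 640);  // 0x02 << 8 | 0x80 = 0x280
//   assert(ReadLittleEndian16(&bytes[2]) == 480);  // 0x01 << 8 | 0xE0 = 0x1E0
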
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+                        IvfFileHeader* const ivf_file_header) {
+  if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+  if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+    return false;
+  }
+
+  // Verify header version and length.
+  const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+  if (ivf_header_version != kIvfHeaderVersion) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+  }
+
+  const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+  if (ivf_header_size != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+    return false;
+  }
+
+  if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+      memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+    return false;
+  }
+
+  ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+  ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+  ivf_file_header->frame_rate_numerator =
+      ReadLittleEndian32(&header_buffer[16]);
+  ivf_file_header->frame_rate_denominator =
+      ReadLittleEndian32(&header_buffer[20]);
+
+  return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+                         IvfFrameHeader* const ivf_frame_header) {
+  if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+  ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+  if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+    return false;
+  }
+
+  ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+  const uint64_t timestamp_hi =
+      static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+  ivf_frame_header->timestamp |= timestamp_hi;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/examples/ivf_parser.h b/examples/ivf_parser.h
new file mode 100644
index 0000000..b6bbc59
--- /dev/null
+++ b/examples/ivf_parser.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+  IvfFileHeader() = default;
+  IvfFileHeader(const IvfFileHeader& rhs) = default;
+  IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+  IvfFileHeader(IvfFileHeader&& rhs) = default;
+  IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+  size_t width = 0;
+  size_t height = 0;
+  size_t frame_rate_numerator = 0;
+  size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+  IvfFrameHeader() = default;
+  IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+  IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+  size_t frame_size = 0;
+  int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+                        IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+                         IvfFrameHeader* ivf_frame_header);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/examples/libgav1_examples.cmake b/examples/libgav1_examples.cmake
new file mode 100644
index 0000000..1f949f3
--- /dev/null
+++ b/examples/libgav1_examples.cmake
@@ -0,0 +1,63 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+  return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+                                "${libgav1_examples}/file_reader.h"
+                                "${libgav1_examples}/file_reader_constants.cc"
+                                "${libgav1_examples}/file_reader_constants.h"
+                                "${libgav1_examples}/file_reader_factory.cc"
+                                "${libgav1_examples}/file_reader_factory.h"
+                                "${libgav1_examples}/file_reader_interface.h"
+                                "${libgav1_examples}/ivf_parser.cc"
+                                "${libgav1_examples}/ivf_parser.h"
+                                "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+                                "${libgav1_examples}/file_writer.h"
+                                "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+  libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+                      ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+                      ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_executable(NAME
+                         gav1_decode
+                         SOURCES
+                         ${libgav1_decode_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_include_paths}
+                         ${libgav1_gtest_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_file_reader
+                         libgav1_file_writer
+                         LIB_DEPS
+                         absl::strings
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_dependency})
+endmacro()
diff --git a/examples/logging.h b/examples/logging.h
new file mode 100644
index 0000000..c0bcad7
--- /dev/null
+++ b/examples/logging.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
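
// Because Basename() is a constexpr recursion, the trimming of __FILE__ in
// the macro below happens entirely at compile time; no path scanning occurs
// at run time. A small compile-time check of the behavior:
//
//   constexpr const char kPath[] = "examples/logging.h";
//   constexpr const char* kBase =
//       libgav1::examples::Basename(kPath, sizeof(kPath) - 1);
//   // The returned pointer addresses the "logging.h" suffix of the literal.
//   static_assert(kBase[0] == 'l' && kBase[7] == '.',
//                 "basename starts after the last slash");
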
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string)                              \
+  do {                                                                        \
+    constexpr const char* libgav1_examples_basename =                         \
+        ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1);        \
+    fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+            __func__, error_string);                                          \
+  } while (false)
+
+#else  // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+  do {                                           \
+  } while (false)
+
+#endif  // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+}  // namespace examples
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_LOGGING_H_
diff --git a/src/buffer_pool.cc b/src/buffer_pool.cc
new file mode 100644
index 0000000..c1a5606
--- /dev/null
+++ b/src/buffer_pool.cc
@@ -0,0 +1,218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/buffer_pool.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+namespace {
+
+// Copies the feature_enabled, feature_data, segment_id_pre_skip, and
+// last_active_segment_id fields of Segmentation.
+void CopySegmentationParameters(const Segmentation& from, Segmentation* to) {
+  memcpy(to->feature_enabled, from.feature_enabled,
+         sizeof(to->feature_enabled));
+  memcpy(to->feature_data, from.feature_data, sizeof(to->feature_data));
+  to->segment_id_pre_skip = from.segment_id_pre_skip;
+  to->last_active_segment_id = from.last_active_segment_id;
+}
+
+}  // namespace
+
+RefCountedBuffer::RefCountedBuffer() = default;
+
+RefCountedBuffer::~RefCountedBuffer() = default;
+
+bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
+                               int height, int subsampling_x, int subsampling_y,
+                               int left_border, int right_border,
+                               int top_border, int bottom_border) {
+  // The YuvBuffer::Realloc() could call the get frame buffer callback which
+  // will need to be thread safe. So we ensure that we only call Realloc() once
+  // at any given time.
+  std::lock_guard<std::mutex> lock(pool_->mutex_);
+  assert(!buffer_private_data_valid_);
+  if (!yuv_buffer_.Realloc(
+          bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+          left_border, right_border, top_border, bottom_border,
+          pool_->get_frame_buffer_, pool_->callback_private_data_,
+          &buffer_private_data_)) {
+    return false;
+  }
+  buffer_private_data_valid_ = true;
+  return true;
+}
+
+bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
+  upscaled_width_ = frame_header.upscaled_width;
+  frame_width_ = frame_header.width;
+  frame_height_ = frame_header.height;
+  render_width_ = frame_header.render_width;
+  render_height_ = frame_header.render_height;
+  rows4x4_ = frame_header.rows4x4;
+  columns4x4_ = frame_header.columns4x4;
+  if (frame_header.refresh_frame_flags != 0 &&
+      !IsIntraFrame(frame_header.frame_type)) {
+    const int rows4x4_half = DivideBy2(rows4x4_);
+    const int columns4x4_half = DivideBy2(columns4x4_);
+    if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+      return false;
+    }
+  }
+  return segmentation_map_.Allocate(rows4x4_, columns4x4_);
+}
+
+void RefCountedBuffer::SetGlobalMotions(
+    const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions) {
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    static_assert(sizeof(global_motion_[ref].params) ==
+                      sizeof(global_motions[ref].params),
+                  "");
+    memcpy(global_motion_[ref].params, global_motions[ref].params,
+           sizeof(global_motion_[ref].params));
+  }
+}
+
+void RefCountedBuffer::SetFrameContext(const SymbolDecoderContext& context) {
+  frame_context_ = context;
+  frame_context_.ResetIntraFrameYModeCdf();
+  frame_context_.ResetCounters();
+}
+
+void RefCountedBuffer::GetSegmentationParameters(
+    Segmentation* segmentation) const {
+  CopySegmentationParameters(/*from=*/segmentation_, /*to=*/segmentation);
+}
+
+void RefCountedBuffer::SetSegmentationParameters(
+    const Segmentation& segmentation) {
+  CopySegmentationParameters(/*from=*/segmentation, /*to=*/&segmentation_);
+}
+
+void RefCountedBuffer::SetBufferPool(BufferPool* pool) { pool_ = pool; }
+
+void RefCountedBuffer::ReturnToBufferPool(RefCountedBuffer* ptr) {
+  ptr->pool_->ReturnUnusedBuffer(ptr);
+}
+
+BufferPool::BufferPool(
+    FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+    GetFrameBufferCallback get_frame_buffer,
+    ReleaseFrameBufferCallback release_frame_buffer,
+    void* callback_private_data) {
+  if (get_frame_buffer != nullptr) {
+    // on_frame_buffer_size_changed may be null.
+    assert(release_frame_buffer != nullptr);
+    on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+    get_frame_buffer_ = get_frame_buffer;
+    release_frame_buffer_ = release_frame_buffer;
+    callback_private_data_ = callback_private_data;
+  } else {
+    on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
+    get_frame_buffer_ = GetInternalFrameBuffer;
+    release_frame_buffer_ = ReleaseInternalFrameBuffer;
+    callback_private_data_ = &internal_frame_buffers_;
+  }
+}
+
+BufferPool::~BufferPool() {
+  for (const auto* buffer : buffers_) {
+    if (buffer->in_use_) {
+      assert(false && "RefCountedBuffer still in use at destruction time.");
+      LIBGAV1_DLOG(ERROR,
+                   "RefCountedBuffer still in use at destruction time.");
+    }
+    delete buffer;
+  }
+}
+
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+                                          Libgav1ImageFormat image_format,
+                                          int width, int height,
+                                          int left_border, int right_border,
+                                          int top_border, int bottom_border) {
+  if (on_frame_buffer_size_changed_ == nullptr) return true;
+  return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+                                       image_format, width, height,
+                                       left_border, right_border, top_border,
+                                       bottom_border,
+                                       /*stride_alignment=*/16) == kStatusOk;
+}
+
+RefCountedBufferPtr BufferPool::GetFreeBuffer() {
+  // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all
+  // happen from the same thread serially, but the GetFreeBuffer() call in
+  // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
+  // time. So this function has to be thread safe.
+  // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
+  // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
+  // need not be thread safe.
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (!buffer->in_use_) {
+      buffer->in_use_ = true;
+      buffer->progress_row_ = -1;
+      buffer->frame_state_ = kFrameStateUnknown;
+      lock.unlock();
+      return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+    }
+  }
+  lock.unlock();
+  auto* const buffer = new (std::nothrow) RefCountedBuffer();
+  if (buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+    return RefCountedBufferPtr();
+  }
+  buffer->SetBufferPool(this);
+  buffer->in_use_ = true;
+  buffer->progress_row_ = -1;
+  buffer->frame_state_ = kFrameStateUnknown;
+  lock.lock();
+  const bool ok = buffers_.push_back(buffer);
+  lock.unlock();
+  if (!ok) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "Failed to push the new reference counted buffer into the vector.");
+    delete buffer;
+    return RefCountedBufferPtr();
+  }
+  return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
+
+void BufferPool::Abort() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (buffer->in_use_) {
+      buffer->Abort();
+    }
+  }
+}
+
+void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  assert(buffer->in_use_);
+  buffer->in_use_ = false;
+  if (buffer->buffer_private_data_valid_) {
+    release_frame_buffer_(callback_private_data_,
+                          buffer->buffer_private_data_);
+    buffer->buffer_private_data_valid_ = false;
+  }
+}
+
+}  // namespace libgav1
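
// GetFreeBuffer() hands ownership out as a shared_ptr whose deleter is
// RefCountedBuffer::ReturnToBufferPool, so dropping the last reference
// recycles the buffer through ReturnUnusedBuffer() instead of deleting it.
// A sketch of that lifetime, assuming a BufferPool constructed with null
// callbacks (i.e. using the internal frame buffer list):
//
//   libgav1::BufferPool pool(nullptr, nullptr, nullptr, nullptr);
//   libgav1::RefCountedBufferPtr frame = pool.GetFreeBuffer();
//   if (frame == nullptr) { /* allocation failed */ }
//   libgav1::RefCountedBufferPtr alias = frame;  // refcount 2
//   frame = nullptr;  // refcount 1, buffer still in use
//   alias = nullptr;  // refcount 0 -> ReturnToBufferPool -> back in the pool
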
diff --git a/src/buffer_pool.h b/src/buffer_pool.h
new file mode 100644
index 0000000..f35a633
--- /dev/null
+++ b/src/buffer_pool.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_BUFFER_POOL_H_
+#define LIBGAV1_SRC_BUFFER_POOL_H_
+
+#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+class BufferPool;
+
+enum FrameState : uint8_t {
+  kFrameStateUnknown,
+  kFrameStateStarted,
+  kFrameStateParsed,
+  kFrameStateDecoded
+};
+
+// A reference-counted frame buffer. Clients should access it via
+// RefCountedBufferPtr, which manages reference counting transparently.
+class RefCountedBuffer {
+ public:
+  // Not copyable or movable.
+  RefCountedBuffer(const RefCountedBuffer&) = delete;
+  RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
+
+  // Allocates the YUV buffer. Returns true on success. Returns false on
+  // failure. This function ensures the thread safety of the
+  // |get_frame_buffer_| call (i.e.) only one |get_frame_buffer_| call will
+  // happen at a given time.
+  // TODO(b/142583029): In frame parallel mode, we can require the callbacks
+  // to be thread safe so that we can remove the thread safety of this
+  // function and applications can have fine grained locks.
+  //
+  // * |width| and |height| are the image dimensions in pixels.
+  // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+  //   subsampling of the width and height of the chroma planes, respectively.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively.
+  //
+  // NOTE: The strides are a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+  bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+               int subsampling_x, int subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border);
+
+  YuvBuffer* buffer() { return &yuv_buffer_; }
+
+  // Returns the buffer private data set by the get frame buffer callback when
+  // it allocated the YUV buffer.
+  void* buffer_private_data() const {
+    assert(buffer_private_data_valid_);
+    return buffer_private_data_;
+  }
+
+  // NOTE: In the current frame, this is the frame_type syntax element in the
+  // frame header. In a reference frame, this implements the RefFrameType
+  // array in the spec.
+  FrameType frame_type() const { return frame_type_; }
+  void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
+
+  // The sample position for subsampled streams. This is the
+  // chroma_sample_position syntax element in the sequence header.
+  //
+  // NOTE: The decoder does not use chroma_sample_position, but it needs to be
+  // passed on to the client in DecoderBuffer.
+  ChromaSamplePosition chroma_sample_position() const {
+    return chroma_sample_position_;
+  }
+  void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
+    chroma_sample_position_ = chroma_sample_position;
+  }
+
+  // Whether the frame can be used as show existing frame in the future.
+  bool showable_frame() const { return showable_frame_; }
+  void set_showable_frame(bool value) { showable_frame_ = value; }
+
+  // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
+  // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
+  // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+  // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+  // success, false on failure.
+  bool SetFrameDimensions(const ObuFrameHeader& frame_header);
+
+  int32_t upscaled_width() const { return upscaled_width_; }
+  int32_t frame_width() const { return frame_width_; }
+  int32_t frame_height() const { return frame_height_; }
+  // RenderWidth() and RenderHeight() return the render size, which is a hint
+  // to the application about the desired display size.
+  int32_t render_width() const { return render_width_; }
+  int32_t render_height() const { return render_height_; }
+  int32_t rows4x4() const { return rows4x4_; }
+  int32_t columns4x4() const { return columns4x4_; }
+
+  int spatial_id() const { return spatial_id_; }
+  void set_spatial_id(int value) { spatial_id_ = value; }
+  int temporal_id() const { return temporal_id_; }
+  void set_temporal_id(int value) { temporal_id_ = value; }
+
+  SegmentationMap* segmentation_map() { return &segmentation_map_; }
+  const SegmentationMap* segmentation_map() const {
+    return &segmentation_map_;
+  }
+
+  // Only the |params| field of each GlobalMotion struct should be used.
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& GlobalMotions()
+      const {
+    return global_motion_;
+  }
+  // Saves the GlobalMotion array. Only the |params| field of each
+  // GlobalMotion struct is saved.
+  void SetGlobalMotions(
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motions);
+
+  // Returns the saved CDF tables.
+  const SymbolDecoderContext& FrameContext() const { return frame_context_; }
+  // Saves the CDF tables. The intra_frame_y_mode_cdf table is reset to the
+  // default. The last entry in each table, representing the symbol count for
+  // that context, is set to 0.
+  void SetFrameContext(const SymbolDecoderContext& context);
+
+  const std::array<int8_t, kNumReferenceFrameTypes>& loop_filter_ref_deltas()
+      const {
+    return loop_filter_ref_deltas_;
+  }
+  const std::array<int8_t, kLoopFilterMaxModeDeltas>& loop_filter_mode_deltas()
+      const {
+    return loop_filter_mode_deltas_;
+  }
+  // Saves the ref_deltas and mode_deltas arrays in loop_filter.
+  void SetLoopFilterDeltas(const LoopFilter& loop_filter) {
+    loop_filter_ref_deltas_ = loop_filter.ref_deltas;
+    loop_filter_mode_deltas_ = loop_filter.mode_deltas;
+  }
+
+  // Copies the saved values of the following fields to the Segmentation
+  // struct: feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id. The other fields are left unchanged.
+  void GetSegmentationParameters(Segmentation* segmentation) const;
+  // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct.
+  // Saves the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct.
+  void SetSegmentationParameters(const Segmentation& segmentation);
+
+  const FilmGrainParams& film_grain_params() const {
+    return film_grain_params_;
+  }
+  void set_film_grain_params(const FilmGrainParams& params) {
+    film_grain_params_ = params;
+  }
+
+  const ReferenceInfo* reference_info() const { return &reference_info_; }
+  ReferenceInfo* reference_info() { return &reference_info_; }
+
+  // This will wake up the WaitUntil*() functions and make them return false.
+  void Abort() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      abort_ = true;
+    }
+    parsed_condvar_.notify_all();
+    decoded_condvar_.notify_all();
+    progress_row_condvar_.notify_all();
+  }
+
+  void SetFrameState(FrameState frame_state) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      frame_state_ = frame_state;
+    }
+    if (frame_state == kFrameStateParsed) {
+      parsed_condvar_.notify_all();
+    } else if (frame_state == kFrameStateDecoded) {
+      decoded_condvar_.notify_all();
+      progress_row_condvar_.notify_all();
+    }
+  }
+
+  // Sets the progress of this frame to |progress_row| and notifies any
+  // threads that may be waiting on rows <= |progress_row|.
+  void SetProgress(int progress_row) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (progress_row_ >= progress_row) return;
+      progress_row_ = progress_row;
+    }
+    progress_row_condvar_.notify_all();
+  }
+
+  void MarkFrameAsStarted() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (frame_state_ != kFrameStateUnknown) return;
+    frame_state_ = kFrameStateStarted;
+  }
+
+  // All the WaitUntil* functions will return true if the desired wait state
+  // was reached successfully. If the return value is false, then the caller
+  // must assume that the wait was not successful and try to stop whatever
+  // they are doing as early as possible.
+
+  // Waits until the frame has been parsed.
+  bool WaitUntilParsed() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ < kFrameStateParsed && !abort_) {
+      parsed_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+  // Waits until the |progress_row| has been decoded (as indicated either by
+  // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+  // nullptr and will be populated with the value of |progress_row_| after
+  // the wait.
+  //
+  // Typical usage of |progress_row_cache| is as follows:
+  // * Initialize |*progress_row_cache| to INT_MIN.
+  // * Call WaitUntil() only if |*progress_row_cache| < |progress_row|.
+  bool WaitUntil(int progress_row, int* progress_row_cache) {
+    // If |progress_row| is negative, it means that the wait is on the top
+    // border to be available. The top border will be available when row 0
+    // has been decoded. So we can simply wait on row 0 instead.
+    progress_row = std::max(progress_row, 0);
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (progress_row_ < progress_row &&
+           frame_state_ != kFrameStateDecoded && !abort_) {
+      progress_row_condvar_.wait(lock);
+    }
+    // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+    // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+    // case.
+    *progress_row_cache =
+        (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+    return !abort_;
+  }
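+  // An illustration of the caching pattern described above, as a
+  // hypothetical consumer of a reference frame might write it:
+  //
+  //   int progress_row_cache = INT_MIN;
+  //   ...
+  //   if (progress_row_cache < row &&
+  //       !reference_frame->WaitUntil(row, &progress_row_cache)) {
+  //     return false;  // Aborted; stop work as early as possible.
+  //   }
+  //
+  // Subsequent waits on rows <= |progress_row_cache| can then skip the lock
+  // acquisition entirely.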
+  // Waits until the entire frame has been decoded.
+  bool WaitUntilDecoded() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ != kFrameStateDecoded && !abort_) {
+      decoded_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+ private:
+  friend class BufferPool;
+
+  // Methods for BufferPool:
+  RefCountedBuffer();
+  ~RefCountedBuffer();
+  void SetBufferPool(BufferPool* pool);
+  static void ReturnToBufferPool(RefCountedBuffer* ptr);
+
+  BufferPool* pool_ = nullptr;
+  bool buffer_private_data_valid_ = false;
+  void* buffer_private_data_ = nullptr;
+  YuvBuffer yuv_buffer_;
+  bool in_use_ = false;  // Only used by BufferPool.
+
+  std::mutex mutex_;
+  FrameState frame_state_ LIBGAV1_GUARDED_BY(mutex_) = kFrameStateUnknown;
+  int progress_row_ LIBGAV1_GUARDED_BY(mutex_) = -1;
+  // Signaled when progress_row_ is updated or when frame_state_ is set to
+  // kFrameStateDecoded.
+  std::condition_variable progress_row_condvar_;
+  // Signaled when the frame state is set to kFrameStateParsed.
+  std::condition_variable parsed_condvar_;
+  // Signaled when the frame state is set to kFrameStateDecoded.
+  std::condition_variable decoded_condvar_;
+  bool abort_ LIBGAV1_GUARDED_BY(mutex_) = false;
+
+  FrameType frame_type_ = kFrameKey;
+  ChromaSamplePosition chroma_sample_position_ =
+      kChromaSamplePositionUnknown;
+  bool showable_frame_ = false;
+
+  int32_t upscaled_width_ = 0;
+  int32_t frame_width_ = 0;
+  int32_t frame_height_ = 0;
+  int32_t render_width_ = 0;
+  int32_t render_height_ = 0;
+  int32_t columns4x4_ = 0;
+  int32_t rows4x4_ = 0;
+  int spatial_id_ = 0;
+  int temporal_id_ = 0;
+
+  // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
+  SegmentationMap segmentation_map_;
+
+  // Only the |params| field of each GlobalMotion struct is used.
+  // global_motion_[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion_ = {};
+  SymbolDecoderContext frame_context_;
+  std::array<int8_t, kNumReferenceFrameTypes> loop_filter_ref_deltas_;
+  std::array<int8_t, kLoopFilterMaxModeDeltas> loop_filter_mode_deltas_;
+  // Only the feature_enabled, feature_data, segment_id_pre_skip, and
+  // last_active_segment_id fields of the Segmentation struct are used.
+  //
+  // Note: The spec only requires that we save feature_enabled and
+  // feature_data. Since segment_id_pre_skip and last_active_segment_id
+  // depend on feature_enabled only, we also save their values as an
+  // optimization.
+  Segmentation segmentation_ = {};
+  FilmGrainParams film_grain_params_ = {};
+  ReferenceInfo reference_info_;
+};
+
+// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
+//
+// Note: For simplicity, RefCountedBufferPtr is implemented as a
+// std::shared_ptr<RefCountedBuffer>. This requires a heap allocation of the
+// control block for std::shared_ptr. To avoid that heap allocation, we can
+// add a |ref_count_| field to RefCountedBuffer and implement a custom
+// RefCountedBufferPtr class.
+using RefCountedBufferPtr = std::shared_ptr<RefCountedBuffer>;
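+
+// One plausible way these pointers are handed out, sketched here for
+// context (the actual implementation is in buffer_pool.cc, which this
+// excerpt does not show): BufferPool::GetFreeBuffer(), as a friend of
+// RefCountedBuffer, can attach the private ReturnToBufferPool() as a custom
+// deleter so that a buffer goes back to the pool, rather than being freed,
+// when the last reference is dropped:
+//
+//   RefCountedBufferPtr ptr(raw_buffer,
+//                           RefCountedBuffer::ReturnToBufferPool);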
+
+// BufferPool maintains a pool of RefCountedBuffers.
+class BufferPool {
+ public:
+  BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+             GetFrameBufferCallback get_frame_buffer,
+             ReleaseFrameBufferCallback release_frame_buffer,
+             void* callback_private_data);
+
+  // Not copyable or movable.
+  BufferPool(const BufferPool&) = delete;
+  BufferPool& operator=(const BufferPool&) = delete;
+
+  ~BufferPool();
+
+  LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+      int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border);
+
+  // Finds a free buffer in the buffer pool and returns a reference to the
+  // free buffer. If there is no free buffer, returns a null pointer. This
+  // function is thread safe.
+  RefCountedBufferPtr GetFreeBuffer();
+
+  // Aborts all the buffers that are in use.
+  void Abort();
+
+ private:
+  friend class RefCountedBuffer;
+
+  // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
+  // only. This function is thread safe.
+  void ReturnUnusedBuffer(RefCountedBuffer* buffer);
+
+  // Used to make the following functions thread safe: GetFreeBuffer(),
+  // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+  std::mutex mutex_;
+
+  // Storing a RefCountedBuffer object in a Vector is complicated because of
+  // the copy/move semantics. So the simplest way around that is to store a
+  // list of pointers in the vector.
+  Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  InternalFrameBufferList internal_frame_buffers_;
+
+  // Frame buffer callbacks.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
+  GetFrameBufferCallback get_frame_buffer_;
+  ReleaseFrameBufferCallback release_frame_buffer_;
+  // Private data associated with the frame buffer callbacks.
+  void* callback_private_data_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_BUFFER_POOL_H_
diff --git a/src/decoder.cc b/src/decoder.cc
new file mode 100644
index 0000000..b9e43e0
--- /dev/null
+++ b/src/decoder.cc
@@ -0,0 +1,119 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/gav1/decoder.h" + +#include +#include + +#include "src/decoder_impl.h" + +extern "C" { + +Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings, + Libgav1Decoder** decoder_out) { + std::unique_ptr cxx_decoder(new (std::nothrow) + libgav1::Decoder()); + if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory; + + libgav1::DecoderSettings cxx_settings; + cxx_settings.threads = settings->threads; + cxx_settings.frame_parallel = settings->frame_parallel != 0; + cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0; + cxx_settings.on_frame_buffer_size_changed = + settings->on_frame_buffer_size_changed; + cxx_settings.get_frame_buffer = settings->get_frame_buffer; + cxx_settings.release_frame_buffer = settings->release_frame_buffer; + cxx_settings.release_input_buffer = settings->release_input_buffer; + cxx_settings.callback_private_data = settings->callback_private_data; + cxx_settings.output_all_layers = settings->output_all_layers != 0; + cxx_settings.operating_point = settings->operating_point; + cxx_settings.post_filter_mask = settings->post_filter_mask; + + const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings); + if (status == kLibgav1StatusOk) { + *decoder_out = reinterpret_cast(cxx_decoder.release()); + } + return status; +} + +void Libgav1DecoderDestroy(Libgav1Decoder* decoder) { + auto* cxx_decoder = reinterpret_cast(decoder); + delete cxx_decoder; +} + +Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder, + const uint8_t* data, size_t size, + int64_t user_private_data, + void* buffer_private_data) { + auto* cxx_decoder = reinterpret_cast(decoder); + return cxx_decoder->EnqueueFrame(data, size, user_private_data, + buffer_private_data); +} + +Libgav1StatusCode Libgav1DecoderDequeueFrame( + Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) { + auto* cxx_decoder = reinterpret_cast(decoder); + return cxx_decoder->DequeueFrame(out_ptr); +} + +Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) { + auto* cxx_decoder = reinterpret_cast(decoder); + return cxx_decoder->SignalEOS(); +} + +int Libgav1DecoderGetMaxBitdepth() { + return libgav1::Decoder::GetMaxBitdepth(); +} + +} // extern "C" + +namespace libgav1 { + +Decoder::Decoder() = default; + +Decoder::~Decoder() = default; + +StatusCode Decoder::Init(const DecoderSettings* const settings) { + if (impl_ != nullptr) return kStatusAlready; + if (settings != nullptr) settings_ = *settings; + return DecoderImpl::Create(&settings_, &impl_); +} + +StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size, + int64_t user_private_data, + void* buffer_private_data) { + if (impl_ == nullptr) return kStatusNotInitialized; + return impl_->EnqueueFrame(data, size, user_private_data, + buffer_private_data); +} + +StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) { + if (impl_ == nullptr) return kStatusNotInitialized; + return impl_->DequeueFrame(out_ptr); +} + +StatusCode Decoder::SignalEOS() { + if (impl_ == nullptr) return kStatusNotInitialized; + // In non-frame-parallel mode, we have to release all the references. This + // simply means replacing the |impl_| with a new instance so that all the + // existing references are released and the state is cleared. + impl_ = nullptr; + return DecoderImpl::Create(&settings_, &impl_); +} + +// static. 
+// static.
+int Decoder::GetMaxBitdepth() { return DecoderImpl::GetMaxBitdepth(); }
+
+}  // namespace libgav1
diff --git a/src/decoder_impl.cc b/src/decoder_impl.cc
new file mode 100644
index 0000000..751671d
--- /dev/null
+++ b/src/decoder_impl.cc
@@ -0,0 +1,1661 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/decoder_impl.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <iterator>
+#include <new>
+#include <utility>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/prediction_mask.h"
+#include "src/threading_strategy.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaxBlockWidth4x4 = 32;
+constexpr int kMaxBlockHeight4x4 = 32;
+
+// Computes the bottom border size in pixels. If CDEF, loop restoration, or
+// SuperRes is enabled, adds extra border pixels to facilitate those steps to
+// happen nearly in-place (a few extra rows instead of an entire frame
+// buffer). The logic in this function should match the corresponding logic
+// for |vertical_shift| in the PostFilter constructor.
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+                          const bool do_superres, const int subsampling_y) {
+  int extra_border = 0;
+  if (do_cdef) {
+    extra_border += kCdefBorder;
+  } else if (do_restoration) {
+    // If CDEF is enabled, loop restoration is safe without extra border.
+    extra_border += kRestorationVerticalBorder;
+  }
+  if (do_superres) extra_border += kSuperResVerticalBorder;
+  // Double the number of extra bottom border pixels if the bottom border
+  // will be subsampled.
+  extra_border <<= subsampling_y;
+  return Align(kBorderPixels + extra_border, 2);  // Must be a multiple of 2.
+}
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding
+// on to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+                            int count) {
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    frame_scratch_buffer->tile_decoding_failed = true;
+  }
+  std::condition_variable* const condvars =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  for (int i = 0; i < count; ++i) {
+    condvars[i].notify_one();
+  }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
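+// Typical use, as in DecodeFrame() and DecodeTemporalUnit() below: declare
+// one of these immediately after acquiring a buffer from the pool, so that
+// every return path hands the buffer back, e.g.
+//
+//   std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+//       frame_scratch_buffer_pool_.Get();
+//   FrameScratchBufferReleaser releaser(&frame_scratch_buffer_pool_,
+//                                       &frame_scratch_buffer);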
+class FrameScratchBufferReleaser { + public: + FrameScratchBufferReleaser( + FrameScratchBufferPool* frame_scratch_buffer_pool, + std::unique_ptr* frame_scratch_buffer) + : frame_scratch_buffer_pool_(frame_scratch_buffer_pool), + frame_scratch_buffer_(frame_scratch_buffer) {} + ~FrameScratchBufferReleaser() { + frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_)); + } + + private: + FrameScratchBufferPool* const frame_scratch_buffer_pool_; + std::unique_ptr* const frame_scratch_buffer_; +}; + +// Sets the |frame|'s segmentation map for two cases. The third case is handled +// in Tile::DecodeBlock(). +void SetSegmentationMap(const ObuFrameHeader& frame_header, + const SegmentationMap* prev_segment_ids, + RefCountedBuffer* const frame) { + if (!frame_header.segmentation.enabled) { + // All segment_id's are 0. + frame->segmentation_map()->Clear(); + } else if (!frame_header.segmentation.update_map) { + // Copy from prev_segment_ids. + if (prev_segment_ids == nullptr) { + // Treat a null prev_segment_ids pointer as if it pointed to a + // segmentation map containing all 0s. + frame->segmentation_map()->Clear(); + } else { + frame->segmentation_map()->CopyFrom(*prev_segment_ids); + } + } +} + +StatusCode DecodeTilesNonFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector>& tiles, + FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter) { + // Decode in superblock row order. + const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16; + std::unique_ptr tile_scratch_buffer = + frame_scratch_buffer->tile_scratch_buffer_pool.Get(); + if (tile_scratch_buffer == nullptr) return kLibgav1StatusOutOfMemory; + for (int row4x4 = 0; row4x4 < frame_header.rows4x4; + row4x4 += block_width4x4) { + for (const auto& tile_ptr : tiles) { + if (!tile_ptr->ProcessSuperBlockRow( + row4x4, tile_scratch_buffer.get())) { + return kLibgav1StatusUnknownError; + } + } + post_filter->ApplyFilteringForOneSuperBlockRow( + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/true); + } + frame_scratch_buffer->tile_scratch_buffer_pool.Release( + std::move(tile_scratch_buffer)); + return kStatusOk; +} + +StatusCode DecodeTilesThreadedNonFrameParallel( + const Vector>& tiles, + FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, + BlockingCounterWithStatus* const pending_tiles) { + ThreadingStrategy& threading_strategy = + frame_scratch_buffer->threading_strategy; + const int num_workers = threading_strategy.tile_thread_count(); + BlockingCounterWithStatus pending_workers(num_workers); + std::atomic tile_counter(0); + const int tile_count = static_cast(tiles.size()); + bool tile_decoding_failed = false; + // Submit tile decoding jobs to the thread pool. + for (int i = 0; i < num_workers; ++i) { + threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count, + &tile_counter, + &pending_workers, + &pending_tiles]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->ParseAndDecode()) { + LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); + failed = true; + } + } else { + pending_tiles->Decrement(false); + } + } + pending_workers.Decrement(!failed); + }); + } + // Have the current thread partake in tile decoding. 
+ int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!tile_decoding_failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->ParseAndDecode()) { + LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); + tile_decoding_failed = true; + } + } else { + pending_tiles->Decrement(false); + } + } + // Wait until all the workers are done. This ensures that all the tiles have + // been parsed. + tile_decoding_failed |= !pending_workers.Wait(); + // Wait until all the tiles have been decoded. + tile_decoding_failed |= !pending_tiles->Wait(); + if (tile_decoding_failed) return kStatusUnknownError; + assert(threading_strategy.post_filter_thread_pool() != nullptr); + post_filter->ApplyFilteringThreaded(); + return kStatusOk; +} + +StatusCode DecodeTilesFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector>& tiles, + const SymbolDecoderContext& saved_symbol_decoder_context, + const SegmentationMap* const prev_segment_ids, + FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, RefCountedBuffer* const current_frame) { + // Parse the frame. + for (const auto& tile : tiles) { + if (!tile->Parse()) { + LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number()); + return kStatusUnknownError; + } + } + if (frame_header.enable_frame_end_update_cdf) { + frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context; + } + current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context); + SetSegmentationMap(frame_header, prev_segment_ids, current_frame); + // Mark frame as parsed. + current_frame->SetFrameState(kFrameStateParsed); + std::unique_ptr tile_scratch_buffer = + frame_scratch_buffer->tile_scratch_buffer_pool.Get(); + if (tile_scratch_buffer == nullptr) { + return kStatusOutOfMemory; + } + const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16; + // Decode in superblock row order (inter prediction in the Tile class will + // block until the required superblocks in the reference frame are decoded). + for (int row4x4 = 0; row4x4 < frame_header.rows4x4; + row4x4 += block_width4x4) { + for (const auto& tile_ptr : tiles) { + if (!tile_ptr->ProcessSuperBlockRow( + row4x4, tile_scratch_buffer.get())) { + LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n", + tile_ptr->number()); + return kStatusUnknownError; + } + } + const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow( + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/true); + if (progress_row >= 0) { + current_frame->SetProgress(progress_row); + } + } + // Mark frame as decoded (we no longer care about row-level progress since the + // entire frame has been decoded). + current_frame->SetFrameState(kFrameStateDecoded); + frame_scratch_buffer->tile_scratch_buffer_pool.Release( + std::move(tile_scratch_buffer)); + return kStatusOk; +} + +// Helper function used by DecodeTilesThreadedFrameParallel. Applies the +// deblocking filter for tile boundaries for the superblock row at |row4x4|. +void ApplyDeblockingFilterForTileBoundaries( + PostFilter* const post_filter, const std::unique_ptr* tile_row_base, + const ObuFrameHeader& frame_header, int row4x4, int block_width4x4, + int tile_columns, bool decode_entire_tiles_in_worker_threads) { + // Apply vertical deblock filtering for the first 64 columns of each tile. 
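+  // (kNum4x4InLoopFilterUnit is 16 4x4 blocks, i.e. 64 luma pixels, so each
+  // call below covers exactly one loop filter unit at the left edge of a
+  // tile. For example, a tile whose column4x4_start() is 448 is filtered
+  // over columns4x4 [448, 464).)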
+ for (int tile_column = 0; tile_column < tile_columns; ++tile_column) { + const Tile& tile = *tile_row_base[tile_column]; + post_filter->ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, tile.column4x4_start(), + tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + } + if (decode_entire_tiles_in_worker_threads && + row4x4 == tile_row_base[0]->row4x4_start()) { + // This is the first superblock row of a tile row. In this case, apply + // horizontal deblock filtering for the entire superblock row. + post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0, + frame_header.columns4x4, block_width4x4); + } else { + // Apply horizontal deblock filtering for the first 64 columns of the + // first tile. + const Tile& first_tile = *tile_row_base[0]; + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(), + first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + // Apply horizontal deblock filtering for the last 64 columns of the + // previous tile and the first 64 columns of the current tile. + for (int tile_column = 1; tile_column < tile_columns; ++tile_column) { + const Tile& tile = *tile_row_base[tile_column]; + // If the previous tile has more than 64 columns, then include those + // for the horizontal deblock. + const Tile& previous_tile = *tile_row_base[tile_column - 1]; + const int column4x4_start = + tile.column4x4_start() - + ((tile.column4x4_start() - kNum4x4InLoopFilterUnit != + previous_tile.column4x4_start()) + ? kNum4x4InLoopFilterUnit + : 0); + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, column4x4_start, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4); + } + // Apply horizontal deblock filtering for the last 64 columns of the + // last tile. + const Tile& last_tile = *tile_row_base[tile_columns - 1]; + // Identify the last column4x4 value and do horizontal filtering for + // that column4x4. The value of last column4x4 is the nearest multiple + // of 16 that is before tile.column4x4_end(). + const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15; + // If column4x4_start is the same as tile.column4x4_start() then it + // means that the last tile has <= 64 columns. So there is nothing left + // to deblock (since it was already deblocked in the loop above). + if (column4x4_start != last_tile.column4x4_start()) { + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, column4x4_start, + last_tile.column4x4_end(), block_width4x4); + } + } +} + +// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the +// superblock row starting at |row4x4| for tile at index |tile_index| in the +// list of tiles |tiles|. If the decoding is successful, then it does the +// following: +// * Schedule the next superblock row in the current tile column for decoding +// (the next superblock row may be in a different tile than the current +// one). +// * If an entire superblock row of the frame has been decoded, it notifies +// the waiters (if there are any). 
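+// The hop to the next superblock row relies on |tiles| storing the
+// tile_rows x tile_columns tiles in row-major order: when |next_row4x4|
+// falls outside the current tile, |tile_index += tile_columns| (see the
+// function body) moves to the tile directly below in the same tile column.
+// For example, with tile_columns = 4, tile 1 hands off to tile 5.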
+void DecodeSuperBlockRowInTile( + const Vector>& tiles, size_t tile_index, int row4x4, + const int superblock_size4x4, const int tile_columns, + const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, BlockingCounter* const pending_jobs) { + std::unique_ptr scratch_buffer = + frame_scratch_buffer->tile_scratch_buffer_pool.Get(); + if (scratch_buffer == nullptr) { + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + return; + } + Tile& tile = *tiles[tile_index]; + const bool ok = tile.ProcessSuperBlockRow( + row4x4, scratch_buffer.get()); + frame_scratch_buffer->tile_scratch_buffer_pool.Release( + std::move(scratch_buffer)); + if (!ok) { + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + return; + } + if (post_filter->DoDeblock()) { + // Apply vertical deblock filtering for all the columns in this tile except + // for the first 64 columns. + post_filter->ApplyDeblockFilter( + kLoopFilterTypeVertical, row4x4, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(), + superblock_size4x4); + // Apply horizontal deblock filtering for all the columns in this tile + // except for the first and the last 64 columns. + // Note about the last tile of each row: For the last tile, column4x4_end + // may not be a multiple of 16. In that case it is still okay to simply + // subtract 16 since ApplyDeblockFilter() will only do the filters in + // increments of 64 columns (or 32 columns for chroma with subsampling). + post_filter->ApplyDeblockFilter( + kLoopFilterTypeHorizontal, row4x4, + tile.column4x4_start() + kNum4x4InLoopFilterUnit, + tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4); + } + const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4); + const int index = row4x4 >> superblock_size4x4_log2; + int* const superblock_row_progress = + frame_scratch_buffer->superblock_row_progress.get(); + std::condition_variable* const superblock_row_progress_condvar = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + bool notify; + { + std::lock_guard lock( + frame_scratch_buffer->superblock_row_mutex); + notify = ++superblock_row_progress[index] == tile_columns; + } + if (notify) { + // We are done decoding this superblock row. Notify the post filtering + // thread. + superblock_row_progress_condvar[index].notify_one(); + } + // Schedule the next superblock row (if one exists). + ThreadPool& thread_pool = + *frame_scratch_buffer->threading_strategy.thread_pool(); + const int next_row4x4 = row4x4 + superblock_size4x4; + if (!tile.IsRow4x4Inside(next_row4x4)) { + tile_index += tile_columns; + } + if (tile_index >= tiles.size()) return; + pending_jobs->IncrementBy(1); + thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4, + tile_columns, superblock_rows, frame_scratch_buffer, + post_filter, pending_jobs]() { + DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4, + superblock_size4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, pending_jobs); + pending_jobs->Decrement(); + }); +} + +StatusCode DecodeTilesThreadedFrameParallel( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector>& tiles, + const SymbolDecoderContext& saved_symbol_decoder_context, + const SegmentationMap* const prev_segment_ids, + FrameScratchBuffer* const frame_scratch_buffer, + PostFilter* const post_filter, RefCountedBuffer* const current_frame) { + // Parse the frame. 
+ ThreadPool& thread_pool = + *frame_scratch_buffer->threading_strategy.thread_pool(); + std::atomic tile_counter(0); + const int tile_count = static_cast(tiles.size()); + const int num_workers = thread_pool.num_threads(); + BlockingCounterWithStatus parse_workers(num_workers); + // Submit tile parsing jobs to the thread pool. + for (int i = 0; i < num_workers; ++i) { + thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Parse()) { + LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number()); + failed = true; + } + } + } + parse_workers.Decrement(!failed); + }); + } + + // Have the current thread participate in parsing. + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (!failed) { + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Parse()) { + LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number()); + failed = true; + } + } + } + + // Wait until all the parse workers are done. This ensures that all the tiles + // have been parsed. + if (!parse_workers.Wait() || failed) { + return kLibgav1StatusUnknownError; + } + if (frame_header.enable_frame_end_update_cdf) { + frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context; + } + current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context); + SetSegmentationMap(frame_header, prev_segment_ids, current_frame); + current_frame->SetFrameState(kFrameStateParsed); + + // Decode the frame. + const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16; + const int block_width4x4_log2 = + sequence_header.use_128x128_superblock ? 5 : 4; + const int superblock_rows = + (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2; + if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) || + !frame_scratch_buffer->superblock_row_progress_condvar.Resize( + superblock_rows)) { + return kLibgav1StatusOutOfMemory; + } + int* const superblock_row_progress = + frame_scratch_buffer->superblock_row_progress.get(); + memset(superblock_row_progress, 0, + superblock_rows * sizeof(superblock_row_progress[0])); + frame_scratch_buffer->tile_decoding_failed = false; + const int tile_columns = frame_header.tile_info.tile_columns; + const bool decode_entire_tiles_in_worker_threads = + num_workers >= tile_columns; + BlockingCounter pending_jobs( + decode_entire_tiles_in_worker_threads ? num_workers : tile_columns); + if (decode_entire_tiles_in_worker_threads) { + // Submit tile decoding jobs to the thread pool. 
+ tile_counter = 0; + for (int i = 0; i < num_workers; ++i) { + thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs, + frame_scratch_buffer, superblock_rows]() { + bool failed = false; + int index; + while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) < + tile_count) { + if (failed) continue; + const auto& tile_ptr = tiles[index]; + if (!tile_ptr->Decode( + &frame_scratch_buffer->superblock_row_mutex, + frame_scratch_buffer->superblock_row_progress.get(), + frame_scratch_buffer->superblock_row_progress_condvar + .get())) { + LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number()); + failed = true; + SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows); + } + } + pending_jobs.Decrement(); + }); + } + } else { + // Schedule the jobs for first tile row. + for (int tile_index = 0; tile_index < tile_columns; ++tile_index) { + thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns, + superblock_rows, frame_scratch_buffer, post_filter, + &pending_jobs]() { + DecodeSuperBlockRowInTile( + tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows, + frame_scratch_buffer, post_filter, &pending_jobs); + pending_jobs.Decrement(); + }); + } + } + + // Current thread will do the post filters. + std::condition_variable* const superblock_row_progress_condvar = + frame_scratch_buffer->superblock_row_progress_condvar.get(); + const std::unique_ptr* tile_row_base = &tiles[0]; + for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4; + row4x4 += block_width4x4, ++index) { + if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) { + tile_row_base += tile_columns; + } + { + std::unique_lock lock( + frame_scratch_buffer->superblock_row_mutex); + while (superblock_row_progress[index] != tile_columns && + !frame_scratch_buffer->tile_decoding_failed) { + superblock_row_progress_condvar[index].wait(lock); + } + if (frame_scratch_buffer->tile_decoding_failed) break; + } + if (post_filter->DoDeblock()) { + // Apply deblocking filter for the tile boundaries of this superblock row. + // The deblocking filter for the internal blocks will be applied in the + // tile worker threads. In this thread, we will only have to apply + // deblocking filter for the tile boundaries. + ApplyDeblockingFilterForTileBoundaries( + post_filter, tile_row_base, frame_header, row4x4, block_width4x4, + tile_columns, decode_entire_tiles_in_worker_threads); + } + // Apply all the post filters other than deblocking. + const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow( + row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4, + /*do_deblock=*/false); + if (progress_row >= 0) { + current_frame->SetProgress(progress_row); + } + } + // Wait until all the pending jobs are done. This ensures that all the tiles + // have been decoded and wrapped up. 
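+  // (BlockingCounter protocol, as used here: |pending_jobs| was initialized
+  // to the number of jobs scheduled above; each job calls Decrement() when
+  // it finishes, after first calling IncrementBy(1) for any follow-on row it
+  // scheduled, and Wait() blocks until the count reaches zero.)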
+ pending_jobs.Wait(); + { + std::lock_guard lock( + frame_scratch_buffer->superblock_row_mutex); + if (frame_scratch_buffer->tile_decoding_failed) { + return kLibgav1StatusUnknownError; + } + } + + current_frame->SetFrameState(kFrameStateDecoded); + return kStatusOk; +} + +} // namespace + +// static +StatusCode DecoderImpl::Create(const DecoderSettings* settings, + std::unique_ptr* output) { + if (settings->threads <= 0) { + LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads); + return kStatusInvalidArgument; + } + if (settings->frame_parallel) { + if (settings->release_input_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, + "release_input_buffer callback must not be null when " + "frame_parallel is true."); + return kStatusInvalidArgument; + } + } + std::unique_ptr impl(new (std::nothrow) DecoderImpl(settings)); + if (impl == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl."); + return kStatusOutOfMemory; + } + const StatusCode status = impl->Init(); + if (status != kStatusOk) return status; + *output = std::move(impl); + return kStatusOk; +} + +DecoderImpl::DecoderImpl(const DecoderSettings* settings) + : buffer_pool_(settings->on_frame_buffer_size_changed, + settings->get_frame_buffer, settings->release_frame_buffer, + settings->callback_private_data), + settings_(*settings) { + dsp::DspInit(); +} + +DecoderImpl::~DecoderImpl() { + // Clean up and wait until all the threads have stopped. We just have to pass + // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the + // path that clears all the threads and structs. + SignalFailure(kStatusUnknownError); + // Release any other frame buffer references that we may be holding on to. + ReleaseOutputFrame(); + output_frame_queue_.Clear(); + for (auto& reference_frame : state_.reference_frame) { + reference_frame = nullptr; + } +} + +StatusCode DecoderImpl::Init() { + if (!GenerateWedgeMask(&wedge_masks_)) { + LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed."); + return kStatusOutOfMemory; + } + if (!output_frame_queue_.Init(kMaxLayers)) { + LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; +} + +StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue( + const uint8_t* data, size_t size) { + is_frame_parallel_ = false; + if (settings_.frame_parallel) { + DecoderState state; + std::unique_ptr obu(new (std::nothrow) ObuParser( + data, size, settings_.operating_point, &buffer_pool_, &state)); + if (obu == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser."); + return kStatusOutOfMemory; + } + RefCountedBufferPtr current_frame; + const StatusCode status = obu->ParseOneFrame(¤t_frame); + if (status != kStatusOk) { + LIBGAV1_DLOG(ERROR, "Failed to parse OBU."); + return status; + } + current_frame = nullptr; + // We assume that the first frame that was parsed will contain the frame + // header. This assumption is usually true in practice. So we will simply + // not use frame parallel mode if this is not the case. + if (settings_.threads > 1 && + !InitializeThreadPoolsForFrameParallel( + settings_.threads, obu->frame_header().tile_info.tile_count, + obu->frame_header().tile_info.tile_columns, &frame_thread_pool_, + &frame_scratch_buffer_pool_)) { + return kStatusOutOfMemory; + } + } + const int max_allowed_frames = + (frame_thread_pool_ != nullptr) ? 
frame_thread_pool_->num_threads() : 1; + assert(max_allowed_frames > 0); + if (!temporal_units_.Init(max_allowed_frames)) { + LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed."); + return kStatusOutOfMemory; + } + is_frame_parallel_ = frame_thread_pool_ != nullptr; + return kStatusOk; +} + +StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size, + int64_t user_private_data, + void* buffer_private_data) { + if (data == nullptr || size == 0) return kStatusInvalidArgument; + if (HasFailure()) return kStatusUnknownError; + if (!seen_first_frame_) { + seen_first_frame_ = true; + const StatusCode status = + InitializeFrameThreadPoolAndTemporalUnitQueue(data, size); + if (status != kStatusOk) { + return SignalFailure(status); + } + } + if (temporal_units_.Full()) { + return kStatusTryAgain; + } + if (is_frame_parallel_) { + return ParseAndSchedule(data, size, user_private_data, buffer_private_data); + } + TemporalUnit temporal_unit(data, size, user_private_data, + buffer_private_data); + temporal_units_.Push(std::move(temporal_unit)); + return kStatusOk; +} + +StatusCode DecoderImpl::SignalFailure(StatusCode status) { + if (status == kStatusOk || status == kStatusTryAgain) return status; + // Set the |failure_status_| first so that any pending jobs in + // |frame_thread_pool_| will exit right away when the thread pool is being + // released below. + { + std::lock_guard lock(mutex_); + failure_status_ = status; + } + // Make sure all waiting threads exit. + buffer_pool_.Abort(); + frame_thread_pool_ = nullptr; + while (!temporal_units_.Empty()) { + if (settings_.release_input_buffer != nullptr) { + settings_.release_input_buffer( + settings_.callback_private_data, + temporal_units_.Front().buffer_private_data); + } + temporal_units_.Pop(); + } + return status; +} + +// DequeueFrame() follows the following policy to avoid holding unnecessary +// frame buffer references in output_frame_: output_frame_ must be null when +// DequeueFrame() returns false. +StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) { + if (out_ptr == nullptr) { + LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr."); + return kStatusInvalidArgument; + } + // We assume a call to DequeueFrame() indicates that the caller is no longer + // using the previous output frame, so we can release it. + ReleaseOutputFrame(); + if (temporal_units_.Empty()) { + // No input frames to decode. + *out_ptr = nullptr; + return kStatusNothingToDequeue; + } + TemporalUnit& temporal_unit = temporal_units_.Front(); + if (!is_frame_parallel_) { + // If |output_frame_queue_| is not empty, then return the first frame from + // that queue. + if (!output_frame_queue_.Empty()) { + RefCountedBufferPtr frame = std::move(output_frame_queue_.Front()); + output_frame_queue_.Pop(); + buffer_.user_private_data = temporal_unit.user_private_data; + if (output_frame_queue_.Empty()) { + temporal_units_.Pop(); + } + const StatusCode status = CopyFrameToOutputBuffer(frame); + if (status != kStatusOk) { + return status; + } + *out_ptr = &buffer_; + return kStatusOk; + } + // Decode the next available temporal unit and return. + const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr); + if (status != kStatusOk) { + // In case of failure, discard all the output frames that we may be + // holding on references to. 
+ output_frame_queue_.Clear(); + } + if (settings_.release_input_buffer != nullptr) { + settings_.release_input_buffer(settings_.callback_private_data, + temporal_unit.buffer_private_data); + } + if (output_frame_queue_.Empty()) { + temporal_units_.Pop(); + } + return status; + } + { + std::unique_lock lock(mutex_); + if (settings_.blocking_dequeue) { + while (!temporal_unit.decoded && failure_status_ == kStatusOk) { + decoded_condvar_.wait(lock); + } + } else { + if (!temporal_unit.decoded && failure_status_ == kStatusOk) { + return kStatusTryAgain; + } + } + if (failure_status_ != kStatusOk) { + const StatusCode failure_status = failure_status_; + lock.unlock(); + return SignalFailure(failure_status); + } + } + if (settings_.release_input_buffer != nullptr && + !temporal_unit.released_input_buffer) { + temporal_unit.released_input_buffer = true; + settings_.release_input_buffer(settings_.callback_private_data, + temporal_unit.buffer_private_data); + } + if (temporal_unit.status != kStatusOk) { + temporal_units_.Pop(); + return SignalFailure(temporal_unit.status); + } + if (!temporal_unit.has_displayable_frame) { + *out_ptr = nullptr; + temporal_units_.Pop(); + return kStatusOk; + } + assert(temporal_unit.output_layer_count > 0); + StatusCode status = CopyFrameToOutputBuffer( + temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame); + temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame = + nullptr; + if (status != kStatusOk) { + temporal_units_.Pop(); + return SignalFailure(status); + } + buffer_.user_private_data = temporal_unit.user_private_data; + *out_ptr = &buffer_; + if (--temporal_unit.output_layer_count == 0) { + temporal_units_.Pop(); + } + return kStatusOk; +} + +StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size, + int64_t user_private_data, + void* buffer_private_data) { + TemporalUnit temporal_unit(data, size, user_private_data, + buffer_private_data); + std::unique_ptr obu(new (std::nothrow) ObuParser( + temporal_unit.data, temporal_unit.size, settings_.operating_point, + &buffer_pool_, &state_)); + if (obu == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser."); + return kStatusOutOfMemory; + } + if (has_sequence_header_) { + obu->set_sequence_header(sequence_header_); + } + StatusCode status; + int position_in_temporal_unit = 0; + while (obu->HasData()) { + RefCountedBufferPtr current_frame; + status = obu->ParseOneFrame(¤t_frame); + if (status != kStatusOk) { + LIBGAV1_DLOG(ERROR, "Failed to parse OBU."); + return status; + } + if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) { + LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed."); + return kStatusOutOfMemory; + } + if (IsNewSequenceHeader(*obu)) { + const ObuSequenceHeader& sequence_header = obu->sequence_header(); + const Libgav1ImageFormat image_format = + ComposeImageFormat(sequence_header.color_config.is_monochrome, + sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_y); + const int max_bottom_border = GetBottomBorderPixels( + /*do_cdef=*/true, /*do_restoration=*/true, + /*do_superres=*/true, sequence_header.color_config.subsampling_y); + // TODO(vigneshv): This may not be the right place to call this callback + // for the frame parallel case. Investigate and fix it. 
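+      // As a worked example of |max_bottom_border|, assuming the constants
+      // have the values kBorderPixels = 64, kCdefBorder = 2 and
+      // kSuperResVerticalBorder = 1: for a 4:2:0 stream (subsampling_y = 1),
+      // GetBottomBorderPixels() computes extra_border = (2 + 1) << 1 = 6 and
+      // returns Align(64 + 6, 2) = 70.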
+ if (!buffer_pool_.OnFrameBufferSizeChanged( + sequence_header.color_config.bitdepth, image_format, + sequence_header.max_frame_width, sequence_header.max_frame_height, + kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) { + LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed."); + return kStatusUnknownError; + } + } + // This can happen when there are multiple spatial/temporal layers and if + // all the layers are outside the current operating point. + if (current_frame == nullptr) { + continue; + } + // Note that we cannot set EncodedFrame.temporal_unit here. It will be set + // in the code below after |temporal_unit| is std::move'd into the + // |temporal_units_| queue. + if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame, + position_in_temporal_unit++)) { + LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed."); + return kStatusOutOfMemory; + } + state_.UpdateReferenceFrames(current_frame, + obu->frame_header().refresh_frame_flags); + } + // This function cannot fail after this point. So it is okay to move the + // |temporal_unit| into |temporal_units_| queue. + temporal_units_.Push(std::move(temporal_unit)); + if (temporal_units_.Back().frames.empty()) { + std::lock_guard lock(mutex_); + temporal_units_.Back().has_displayable_frame = false; + temporal_units_.Back().decoded = true; + return kStatusOk; + } + for (auto& frame : temporal_units_.Back().frames) { + EncodedFrame* const encoded_frame = &frame; + encoded_frame->temporal_unit = &temporal_units_.Back(); + frame_thread_pool_->Schedule([this, encoded_frame]() { + if (HasFailure()) return; + const StatusCode status = DecodeFrame(encoded_frame); + encoded_frame->state = {}; + encoded_frame->frame = nullptr; + TemporalUnit& temporal_unit = *encoded_frame->temporal_unit; + std::lock_guard lock(mutex_); + if (failure_status_ != kStatusOk) return; + // temporal_unit's status defaults to kStatusOk. So we need to set it only + // on error. If |failure_status_| is not kStatusOk at this point, it means + // that there has already been a failure. So we don't care about this + // subsequent failure. We will simply return the error code of the first + // failure. + if (status != kStatusOk) { + temporal_unit.status = status; + if (failure_status_ == kStatusOk) { + failure_status_ = status; + } + } + temporal_unit.decoded = + ++temporal_unit.decoded_count == temporal_unit.frames.size(); + if (temporal_unit.decoded && settings_.output_all_layers && + temporal_unit.output_layer_count > 1) { + std::sort( + temporal_unit.output_layers, + temporal_unit.output_layers + temporal_unit.output_layer_count); + } + if (temporal_unit.decoded || failure_status_ != kStatusOk) { + decoded_condvar_.notify_one(); + } + }); + } + return kStatusOk; +} + +StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) { + const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header; + const ObuFrameHeader& frame_header = encoded_frame->frame_header; + RefCountedBufferPtr current_frame = std::move(encoded_frame->frame); + + std::unique_ptr frame_scratch_buffer = + frame_scratch_buffer_pool_.Get(); + if (frame_scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); + return kStatusOutOfMemory; + } + // |frame_scratch_buffer| will be released when this local variable goes out + // of scope (i.e.) on any return path in this function. 
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser( + &frame_scratch_buffer_pool_, &frame_scratch_buffer); + + StatusCode status; + if (!frame_header.show_existing_frame) { + if (encoded_frame->tile_buffers.empty()) { + // This means that the last call to ParseOneFrame() did not actually + // have any tile groups. This could happen in rare cases (for example, + // if there is a Metadata OBU after the TileGroup OBU). We currently do + // not have a reason to handle those cases, so we simply continue. + return kStatusOk; + } + status = DecodeTiles(sequence_header, frame_header, + encoded_frame->tile_buffers, encoded_frame->state, + frame_scratch_buffer.get(), current_frame.get()); + if (status != kStatusOk) { + return status; + } + } else { + if (!current_frame->WaitUntilDecoded()) { + return kStatusUnknownError; + } + } + if (!frame_header.show_frame && !frame_header.show_existing_frame) { + // This frame is not displayable. Not an error. + return kStatusOk; + } + RefCountedBufferPtr film_grain_frame; + status = ApplyFilmGrain( + sequence_header, frame_header, current_frame, &film_grain_frame, + frame_scratch_buffer->threading_strategy.thread_pool()); + if (status != kStatusOk) { + return status; + } + + TemporalUnit& temporal_unit = *encoded_frame->temporal_unit; + std::lock_guard lock(mutex_); + if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) { + assert(temporal_unit.output_frame_position >= 0); + // A displayable frame was already found in this temporal unit. This can + // happen if there are multiple spatial/temporal layers. Since + // |settings_.output_all_layers| is false, we will output only the last + // displayable frame. + if (temporal_unit.output_frame_position > + encoded_frame->position_in_temporal_unit) { + return kStatusOk; + } + // Replace any output frame that we may have seen before with the current + // frame. + assert(temporal_unit.output_layer_count == 1); + --temporal_unit.output_layer_count; + } + temporal_unit.has_displayable_frame = true; + temporal_unit.output_layers[temporal_unit.output_layer_count].frame = + std::move(film_grain_frame); + temporal_unit.output_layers[temporal_unit.output_layer_count] + .position_in_temporal_unit = encoded_frame->position_in_temporal_unit; + ++temporal_unit.output_layer_count; + temporal_unit.output_frame_position = + encoded_frame->position_in_temporal_unit; + return kStatusOk; +} + +StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit, + const DecoderBuffer** out_ptr) { + std::unique_ptr obu(new (std::nothrow) ObuParser( + temporal_unit.data, temporal_unit.size, settings_.operating_point, + &buffer_pool_, &state_)); + if (obu == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser."); + return kStatusOutOfMemory; + } + if (has_sequence_header_) { + obu->set_sequence_header(sequence_header_); + } + StatusCode status; + std::unique_ptr frame_scratch_buffer = + frame_scratch_buffer_pool_.Get(); + if (frame_scratch_buffer == nullptr) { + LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer."); + return kStatusOutOfMemory; + } + // |frame_scratch_buffer| will be released when this local variable goes out + // of scope (i.e.) on any return path in this function. 
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser( + &frame_scratch_buffer_pool_, &frame_scratch_buffer); + + while (obu->HasData()) { + RefCountedBufferPtr current_frame; + status = obu->ParseOneFrame(¤t_frame); + if (status != kStatusOk) { + LIBGAV1_DLOG(ERROR, "Failed to parse OBU."); + return status; + } + if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) { + LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed."); + return kStatusOutOfMemory; + } + if (IsNewSequenceHeader(*obu)) { + const ObuSequenceHeader& sequence_header = obu->sequence_header(); + const Libgav1ImageFormat image_format = + ComposeImageFormat(sequence_header.color_config.is_monochrome, + sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_y); + const int max_bottom_border = GetBottomBorderPixels( + /*do_cdef=*/true, /*do_restoration=*/true, + /*do_superres=*/true, sequence_header.color_config.subsampling_y); + if (!buffer_pool_.OnFrameBufferSizeChanged( + sequence_header.color_config.bitdepth, image_format, + sequence_header.max_frame_width, sequence_header.max_frame_height, + kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) { + LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed."); + return kStatusUnknownError; + } + } + if (!obu->frame_header().show_existing_frame) { + if (obu->tile_buffers().empty()) { + // This means that the last call to ParseOneFrame() did not actually + // have any tile groups. This could happen in rare cases (for example, + // if there is a Metadata OBU after the TileGroup OBU). We currently do + // not have a reason to handle those cases, so we simply continue. + continue; + } + status = DecodeTiles(obu->sequence_header(), obu->frame_header(), + obu->tile_buffers(), state_, + frame_scratch_buffer.get(), current_frame.get()); + if (status != kStatusOk) { + return status; + } + } + state_.UpdateReferenceFrames(current_frame, + obu->frame_header().refresh_frame_flags); + if (obu->frame_header().show_frame || + obu->frame_header().show_existing_frame) { + if (!output_frame_queue_.Empty() && !settings_.output_all_layers) { + // There is more than one displayable frame in the current operating + // point and |settings_.output_all_layers| is false. In this case, we + // simply return the last displayable frame as the output frame and + // ignore the rest. + assert(output_frame_queue_.Size() == 1); + output_frame_queue_.Pop(); + } + RefCountedBufferPtr film_grain_frame; + status = ApplyFilmGrain( + obu->sequence_header(), obu->frame_header(), current_frame, + &film_grain_frame, + frame_scratch_buffer->threading_strategy.film_grain_thread_pool()); + if (status != kStatusOk) return status; + output_frame_queue_.Push(std::move(film_grain_frame)); + } + } + if (output_frame_queue_.Empty()) { + // No displayable frame in the temporal unit. Not an error. 
+ *out_ptr = nullptr; + return kStatusOk; + } + status = CopyFrameToOutputBuffer(output_frame_queue_.Front()); + output_frame_queue_.Pop(); + if (status != kStatusOk) { + return status; + } + buffer_.user_private_data = temporal_unit.user_private_data; + *out_ptr = &buffer_; + return kStatusOk; +} + +StatusCode DecoderImpl::CopyFrameToOutputBuffer( + const RefCountedBufferPtr& frame) { + YuvBuffer* yuv_buffer = frame->buffer(); + + buffer_.chroma_sample_position = frame->chroma_sample_position(); + + if (yuv_buffer->is_monochrome()) { + buffer_.image_format = kImageFormatMonochrome400; + } else { + if (yuv_buffer->subsampling_x() == 0 && yuv_buffer->subsampling_y() == 0) { + buffer_.image_format = kImageFormatYuv444; + } else if (yuv_buffer->subsampling_x() == 1 && + yuv_buffer->subsampling_y() == 0) { + buffer_.image_format = kImageFormatYuv422; + } else if (yuv_buffer->subsampling_x() == 1 && + yuv_buffer->subsampling_y() == 1) { + buffer_.image_format = kImageFormatYuv420; + } else { + LIBGAV1_DLOG(ERROR, + "Invalid chroma subsampling values: cannot determine buffer " + "image format."); + return kStatusInvalidArgument; + } + } + buffer_.color_range = sequence_header_.color_config.color_range; + buffer_.color_primary = sequence_header_.color_config.color_primary; + buffer_.transfer_characteristics = + sequence_header_.color_config.transfer_characteristics; + buffer_.matrix_coefficients = + sequence_header_.color_config.matrix_coefficients; + + buffer_.bitdepth = yuv_buffer->bitdepth(); + const int num_planes = + yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes; + int plane = kPlaneY; + for (; plane < num_planes; ++plane) { + buffer_.stride[plane] = yuv_buffer->stride(plane); + buffer_.plane[plane] = yuv_buffer->data(plane); + buffer_.displayed_width[plane] = yuv_buffer->width(plane); + buffer_.displayed_height[plane] = yuv_buffer->height(plane); + } + for (; plane < kMaxPlanes; ++plane) { + buffer_.stride[plane] = 0; + buffer_.plane[plane] = nullptr; + buffer_.displayed_width[plane] = 0; + buffer_.displayed_height[plane] = 0; + } + buffer_.spatial_id = frame->spatial_id(); + buffer_.temporal_id = frame->temporal_id(); + buffer_.buffer_private_data = frame->buffer_private_data(); + output_frame_ = frame; + return kStatusOk; +} + +void DecoderImpl::ReleaseOutputFrame() { + for (auto& plane : buffer_.plane) { + plane = nullptr; + } + output_frame_ = nullptr; +} + +StatusCode DecoderImpl::DecodeTiles( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, const Vector& tile_buffers, + const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer, + RefCountedBuffer* const current_frame) { + frame_scratch_buffer->tile_scratch_buffer_pool.Reset( + sequence_header.color_config.bitdepth); + if (!frame_scratch_buffer->loop_restoration_info.Reset( + &frame_header.loop_restoration, frame_header.upscaled_width, + frame_header.height, sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_y, + sequence_header.color_config.is_monochrome)) { + LIBGAV1_DLOG(ERROR, + "Failed to allocate memory for loop restoration info units."); + return kStatusOutOfMemory; + } + ThreadingStrategy& threading_strategy = + frame_scratch_buffer->threading_strategy; + if (!is_frame_parallel_ && + !threading_strategy.Reset(frame_header, settings_.threads)) { + return kStatusOutOfMemory; + } + const bool do_cdef = + PostFilter::DoCdef(frame_header, settings_.post_filter_mask); + const int num_planes = 
sequence_header.color_config.is_monochrome + ? kMaxPlanesMonochrome + : kMaxPlanes; + const bool do_restoration = PostFilter::DoRestoration( + frame_header.loop_restoration, settings_.post_filter_mask, num_planes); + const bool do_superres = + PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask); + // Use kBorderPixels for the left, right, and top borders. Only the bottom + // border may need to be bigger. The Cdef border is needed only if we apply + // Cdef without multithreading. + const int bottom_border = GetBottomBorderPixels( + do_cdef && threading_strategy.post_filter_thread_pool() == nullptr, + do_restoration, do_superres, sequence_header.color_config.subsampling_y); + current_frame->set_chroma_sample_position( + sequence_header.color_config.chroma_sample_position); + if (!current_frame->Realloc(sequence_header.color_config.bitdepth, + sequence_header.color_config.is_monochrome, + frame_header.upscaled_width, frame_header.height, + sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_y, + /*left_border=*/kBorderPixels, + /*right_border=*/kBorderPixels, + /*top_border=*/kBorderPixels, bottom_border)) { + LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer."); + return kStatusOutOfMemory; + } + if (sequence_header.enable_cdef) { + if (!frame_scratch_buffer->cdef_index.Reset( + DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4), + DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4), + /*zero_initialize=*/false)) { + LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index."); + return kStatusOutOfMemory; + } + } + if (!frame_scratch_buffer->inter_transform_sizes.Reset( + frame_header.rows4x4 + kMaxBlockHeight4x4, + frame_header.columns4x4 + kMaxBlockWidth4x4, + /*zero_initialize=*/false)) { + LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes."); + return kStatusOutOfMemory; + } + if (frame_header.use_ref_frame_mvs) { + if (!frame_scratch_buffer->motion_field.mv.Reset( + DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4), + /*zero_initialize=*/false) || + !frame_scratch_buffer->motion_field.reference_offset.Reset( + DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4), + /*zero_initialize=*/false)) { + LIBGAV1_DLOG(ERROR, + "Failed to allocate memory for temporal motion vectors."); + return kStatusOutOfMemory; + } + + // For each motion vector, only mv[0] needs to be initialized to + // kInvalidMvValue; mv[1] does not need to be initialized and can be set + // to an arbitrary value. For simplicity, mv[1] is set to 0. + // The following initialization of contiguous memory is very fast. It is + // not recommended to make the initialization multi-threaded, unless the + // memory that needs to be initialized in each thread is still contiguous. + MotionVector invalid_mv; + invalid_mv.mv[0] = kInvalidMvValue; + invalid_mv.mv[1] = 0; + MotionVector* const motion_field_mv = + &frame_scratch_buffer->motion_field.mv[0][0]; + std::fill(motion_field_mv, + motion_field_mv + frame_scratch_buffer->motion_field.mv.size(), + invalid_mv); + } + + // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so + // that the block parameters cache can be filled in for the last row/column + // without having to check for boundary conditions.
+ if (!frame_scratch_buffer->block_parameters_holder.Reset( + frame_header.rows4x4 + kMaxBlockHeight4x4, + frame_header.columns4x4 + kMaxBlockWidth4x4, + sequence_header.use_128x128_superblock)) { + return kStatusOutOfMemory; + } + const dsp::Dsp* const dsp = + dsp::GetDspTable(sequence_header.color_config.bitdepth); + if (dsp == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.", + sequence_header.color_config.bitdepth); + return kStatusInternalError; + } + + const int tile_count = frame_header.tile_info.tile_count; + assert(tile_count >= 1); + Vector<std::unique_ptr<Tile>> tiles; + if (!tiles.reserve(tile_count)) { + LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count); + return kStatusOutOfMemory; + } + + if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) { + if (frame_scratch_buffer->residual_buffer_pool == nullptr) { + frame_scratch_buffer->residual_buffer_pool.reset( + new (std::nothrow) ResidualBufferPool( + sequence_header.use_128x128_superblock, + sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_y, + sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t) + : sizeof(int32_t))); + if (frame_scratch_buffer->residual_buffer_pool == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n"); + return kStatusOutOfMemory; + } + } else { + frame_scratch_buffer->residual_buffer_pool->Reset( + sequence_header.use_128x128_superblock, + sequence_header.color_config.subsampling_x, + sequence_header.color_config.subsampling_y, + sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t) + : sizeof(int32_t)); + } + } + + if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) { + // We need to store 4 rows per 64x64 unit. + const int num_units = + MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4)); + // subsampling_y is set to zero irrespective of the actual frame's + // subsampling since we need to store exactly |num_units| rows of the loop + // restoration border pixels. + if (!frame_scratch_buffer->cdef_border.Realloc( + sequence_header.color_config.bitdepth, + sequence_header.color_config.is_monochrome, + MultiplyBy4(frame_header.columns4x4), num_units, + sequence_header.color_config.subsampling_x, + /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels, + kBorderPixels, nullptr, nullptr, nullptr)) { + return kStatusOutOfMemory; + } + } + + if (do_restoration && + (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) { + // We need to store 4 rows per 64x64 unit. + const int num_units = + MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4)); + // subsampling_y is set to zero irrespective of the actual frame's + // subsampling since we need to store exactly |num_units| rows of the loop + // restoration border pixels. + if (!frame_scratch_buffer->loop_restoration_border.Realloc( + sequence_header.color_config.bitdepth, + sequence_header.color_config.is_monochrome, + frame_header.upscaled_width, num_units, + sequence_header.color_config.subsampling_x, + /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels, + kBorderPixels, nullptr, nullptr, nullptr)) { + return kStatusOutOfMemory; + } + } + + if (do_superres) { + const int pixel_size = sequence_header.color_config.bitdepth == 8 + ?
sizeof(uint8_t) + : sizeof(uint16_t); + if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize( + kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) * + pixel_size)) { + LIBGAV1_DLOG(ERROR, + "Failed to Resize superres_coefficients[kPlaneTypeY]."); + return kStatusOutOfMemory; + } + if (!sequence_header.color_config.is_monochrome && + sequence_header.color_config.subsampling_x != 0 && + !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize( + kSuperResFilterTaps * + Align(SubsampledValue(frame_header.upscaled_width, 1), 16) * + pixel_size)) { + LIBGAV1_DLOG(ERROR, + "Failed to Resize superres_coefficients[kPlaneTypeUV]."); + return kStatusOutOfMemory; + } + } + + if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) { + const int num_threads = + threading_strategy.post_filter_thread_pool()->num_threads() + 1; + // subsampling_y is set to zero irrespective of the actual frame's + // subsampling since we need to store exactly |num_threads| rows of the + // down-scaled pixels. + // Left and right borders are for line extension. They are doubled for the Y + // plane to make sure the U and V planes have enough space after possible + // subsampling. + if (!frame_scratch_buffer->superres_line_buffer.Realloc( + sequence_header.color_config.bitdepth, + sequence_header.color_config.is_monochrome, + MultiplyBy4(frame_header.columns4x4), num_threads, + sequence_header.color_config.subsampling_x, + /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder, + 2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0, + nullptr, nullptr, nullptr)) { + LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n"); + return kStatusOutOfMemory; + } + } + + PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer, + current_frame->buffer(), dsp, + settings_.post_filter_mask); + + if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) { + // We can parse the current frame if all the reference frames have been + // parsed. + for (const int index : frame_header.reference_frame_index) { + if (!state.reference_frame[index]->WaitUntilParsed()) { + return kStatusUnknownError; + } + } + } + + // If prev_segment_ids is a null pointer, it is treated as if it pointed to + // a segmentation map containing all 0s. + const SegmentationMap* prev_segment_ids = nullptr; + if (frame_header.primary_reference_frame == kPrimaryReferenceNone) { + frame_scratch_buffer->symbol_decoder_context.Initialize( + frame_header.quantizer.base_index); + } else { + const int index = + frame_header + .reference_frame_index[frame_header.primary_reference_frame]; + assert(index != -1); + const RefCountedBuffer* prev_frame = state.reference_frame[index].get(); + frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext(); + if (frame_header.segmentation.enabled && + prev_frame->columns4x4() == frame_header.columns4x4 && + prev_frame->rows4x4() == frame_header.rows4x4) { + prev_segment_ids = prev_frame->segmentation_map(); + } + } + + // The Tile class must make use of a separate buffer to store the unfiltered + // pixels for the intra prediction of the next superblock row. This is done + // only when one of the following conditions is true: + // * is_frame_parallel_ is true. + // * settings_.threads == 1. + // In the non-frame-parallel multi-threaded case, we do not run the post + // filters in the decode loop. So this buffer need not be used.
+ const bool use_intra_prediction_buffer = + is_frame_parallel_ || settings_.threads == 1; + if (use_intra_prediction_buffer) { + if (!frame_scratch_buffer->intra_prediction_buffers.Resize( + frame_header.tile_info.tile_rows)) { + LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers."); + return kStatusOutOfMemory; + } + IntraPredictionBuffer* const intra_prediction_buffers = + frame_scratch_buffer->intra_prediction_buffers.get(); + for (int plane = kPlaneY; plane < num_planes; ++plane) { + const int subsampling = + (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x; + const size_t intra_prediction_buffer_size = + ((MultiplyBy4(frame_header.columns4x4) >> subsampling) * + (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t) + : sizeof(uint16_t))); + for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows; + ++tile_row) { + if (!intra_prediction_buffers[tile_row][plane].Resize( + intra_prediction_buffer_size)) { + LIBGAV1_DLOG(ERROR, + "Failed to allocate intra prediction buffer for tile " + "row %d plane %d.\n", + tile_row, plane); + return kStatusOutOfMemory; + } + } + } + } + + SymbolDecoderContext saved_symbol_decoder_context; + BlockingCounterWithStatus pending_tiles(tile_count); + for (int tile_number = 0; tile_number < tile_count; ++tile_number) { + std::unique_ptr<Tile> tile = Tile::Create( + tile_number, tile_buffers[tile_number].data, + tile_buffers[tile_number].size, sequence_header, frame_header, + current_frame, state, frame_scratch_buffer, wedge_masks_, + quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids, + &post_filter, dsp, threading_strategy.row_thread_pool(tile_number), + &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer); + if (tile == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create tile."); + return kStatusOutOfMemory; + } + tiles.push_back_unchecked(std::move(tile)); + } + assert(tiles.size() == static_cast<size_t>(tile_count)); + if (is_frame_parallel_) { + if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) { + return DecodeTilesFrameParallel( + sequence_header, frame_header, tiles, saved_symbol_decoder_context, + prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame); + } + return DecodeTilesThreadedFrameParallel( + sequence_header, frame_header, tiles, saved_symbol_decoder_context, + prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame); + } + StatusCode status; + if (settings_.threads == 1) { + status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles, + frame_scratch_buffer, &post_filter); + } else { + status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer, + &post_filter, &pending_tiles); + } + if (status != kStatusOk) return status; + if (frame_header.enable_frame_end_update_cdf) { + frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context; + } + current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context); + SetSegmentationMap(frame_header, prev_segment_ids, current_frame); + return kStatusOk; +} + +StatusCode DecoderImpl::ApplyFilmGrain( + const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const RefCountedBufferPtr& displayable_frame, + RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) { + if (!sequence_header.film_grain_params_present || + !displayable_frame->film_grain_params().apply_grain || + (settings_.post_filter_mask & 0x10) == 0) { + *film_grain_frame = displayable_frame; + return kStatusOk; + } + if
(!frame_header.show_existing_frame && + frame_header.refresh_frame_flags == 0) { + // If show_existing_frame is true, then the current frame is a previously + // saved reference frame. If refresh_frame_flags is nonzero, then the + // state_.UpdateReferenceFrames() call above has saved the current frame as + // a reference frame. Therefore, if both of these conditions are false, then + // the current frame is not saved as a reference frame. displayable_frame + // should hold the only reference to the current frame. + assert(displayable_frame.use_count() == 1); + // Add film grain noise in place. + *film_grain_frame = displayable_frame; + } else { + *film_grain_frame = buffer_pool_.GetFreeBuffer(); + if (*film_grain_frame == nullptr) { + LIBGAV1_DLOG(ERROR, + "Could not get film_grain_frame from the buffer pool."); + return kStatusResourceExhausted; + } + if (!(*film_grain_frame) + ->Realloc(displayable_frame->buffer()->bitdepth(), + displayable_frame->buffer()->is_monochrome(), + displayable_frame->upscaled_width(), + displayable_frame->frame_height(), + displayable_frame->buffer()->subsampling_x(), + displayable_frame->buffer()->subsampling_y(), + kBorderPixelsFilmGrain, kBorderPixelsFilmGrain, + kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) { + LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed."); + return kStatusOutOfMemory; + } + (*film_grain_frame) + ->set_chroma_sample_position( + displayable_frame->chroma_sample_position()); + (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id()); + (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id()); + } + const bool color_matrix_is_identity = + sequence_header.color_config.matrix_coefficients == + kMatrixCoefficientsIdentity; + assert(displayable_frame->buffer()->stride(kPlaneU) == + displayable_frame->buffer()->stride(kPlaneV)); + const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU); + assert((*film_grain_frame)->buffer()->stride(kPlaneU) == + (*film_grain_frame)->buffer()->stride(kPlaneV)); + const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU); +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (displayable_frame->buffer()->bitdepth() > 8) { + FilmGrain<10> film_grain(displayable_frame->film_grain_params(), + displayable_frame->buffer()->is_monochrome(), + color_matrix_is_identity, + displayable_frame->buffer()->subsampling_x(), + displayable_frame->buffer()->subsampling_y(), + displayable_frame->upscaled_width(), + displayable_frame->frame_height(), thread_pool); + if (!film_grain.AddNoise( + displayable_frame->buffer()->data(kPlaneY), + displayable_frame->buffer()->stride(kPlaneY), + displayable_frame->buffer()->data(kPlaneU), + displayable_frame->buffer()->data(kPlaneV), input_stride_uv, + (*film_grain_frame)->buffer()->data(kPlaneY), + (*film_grain_frame)->buffer()->stride(kPlaneY), + (*film_grain_frame)->buffer()->data(kPlaneU), + (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) { + LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; + } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + FilmGrain<8> film_grain(displayable_frame->film_grain_params(), + displayable_frame->buffer()->is_monochrome(), + color_matrix_is_identity, + displayable_frame->buffer()->subsampling_x(), + displayable_frame->buffer()->subsampling_y(), + displayable_frame->upscaled_width(), + displayable_frame->frame_height(), thread_pool); + if (!film_grain.AddNoise( + displayable_frame->buffer()->data(kPlaneY), + 
displayable_frame->buffer()->stride(kPlaneY), + displayable_frame->buffer()->data(kPlaneU), + displayable_frame->buffer()->data(kPlaneV), input_stride_uv, + (*film_grain_frame)->buffer()->data(kPlaneY), + (*film_grain_frame)->buffer()->stride(kPlaneY), + (*film_grain_frame)->buffer()->data(kPlaneU), + (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) { + LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed."); + return kStatusOutOfMemory; + } + return kStatusOk; +} + +bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) { + if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(), + [](const ObuHeader& obu_header) { + return obu_header.type == kObuSequenceHeader; + }) == obu.obu_headers().end()) { + return false; + } + const ObuSequenceHeader sequence_header = obu.sequence_header(); + const bool sequence_header_changed = + !has_sequence_header_ || + sequence_header_.color_config.bitdepth != + sequence_header.color_config.bitdepth || + sequence_header_.color_config.is_monochrome != + sequence_header.color_config.is_monochrome || + sequence_header_.color_config.subsampling_x != + sequence_header.color_config.subsampling_x || + sequence_header_.color_config.subsampling_y != + sequence_header.color_config.subsampling_y || + sequence_header_.max_frame_width != sequence_header.max_frame_width || + sequence_header_.max_frame_height != sequence_header.max_frame_height; + sequence_header_ = sequence_header; + has_sequence_header_ = true; + return sequence_header_changed; +} + +bool DecoderImpl::MaybeInitializeQuantizerMatrix( + const ObuFrameHeader& frame_header) { + if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) { + return true; + } + if (!InitializeQuantizerMatrix(&quantizer_matrix_)) { + return false; + } + quantizer_matrix_initialized_ = true; + return true; +} + +} // namespace libgav1 diff --git a/src/decoder_impl.h b/src/decoder_impl.h new file mode 100644 index 0000000..721b666 --- /dev/null +++ b/src/decoder_impl.h @@ -0,0 +1,266 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DECODER_IMPL_H_ +#define LIBGAV1_SRC_DECODER_IMPL_H_ + +#include <array> +#include <condition_variable> // NOLINT (unapproved c++11 header) +#include <cstddef> +#include <cstdint> +#include <memory> +#include <mutex> // NOLINT (unapproved c++11 header) + +#include "src/buffer_pool.h" +#include "src/decoder_state.h" +#include "src/dsp/constants.h" +#include "src/frame_scratch_buffer.h" +#include "src/gav1/decoder_buffer.h" +#include "src/gav1/decoder_settings.h" +#include "src/gav1/status_code.h" +#include "src/obu_parser.h" +#include "src/quantizer.h" +#include "src/residual_buffer_pool.h" +#include "src/symbol_decoder_context.h" +#include "src/tile.h" +#include "src/utils/array_2d.h" +#include "src/utils/block_parameters_holder.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/queue.h" +#include "src/utils/segmentation_map.h" +#include "src/utils/types.h" + +namespace libgav1 { + +struct TemporalUnit; + +struct EncodedFrame { + EncodedFrame(ObuParser* const obu, const DecoderState& state, + const RefCountedBufferPtr& frame, int position_in_temporal_unit) + : sequence_header(obu->sequence_header()), + frame_header(obu->frame_header()), + state(state), + temporal_unit(nullptr), + frame(frame), + position_in_temporal_unit(position_in_temporal_unit) { + obu->MoveTileBuffers(&tile_buffers); + frame->MarkFrameAsStarted(); + } + + const ObuSequenceHeader sequence_header; + const ObuFrameHeader frame_header; + Vector<TileBuffer> tile_buffers; + DecoderState state; + TemporalUnit* temporal_unit; + RefCountedBufferPtr frame; + const int position_in_temporal_unit; +}; + +struct TemporalUnit : public Allocable { + // The default constructor is invoked by the Queue<TemporalUnit>::Init() + // method. Queue<> does not use the default-constructed elements, so it is + // safe for the default constructor to not initialize the members. + TemporalUnit() = default; + TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data, + void* buffer_private_data) + : data(data), + size(size), + user_private_data(user_private_data), + buffer_private_data(buffer_private_data), + decoded(false), + status(kStatusOk), + has_displayable_frame(false), + output_frame_position(-1), + decoded_count(0), + output_layer_count(0), + released_input_buffer(false) {} + + const uint8_t* data; + size_t size; + int64_t user_private_data; + void* buffer_private_data; + + // The following members are used only in frame parallel mode. + bool decoded; + StatusCode status; + bool has_displayable_frame; + int output_frame_position; + + Vector<EncodedFrame*> frames; + size_t decoded_count; + + // The struct (and the counter) is used to support output of multiple layers + // within a single temporal unit. The decoding process will store the output + // frames in |output_layers| in the order they are finished decoding. At the + // end of the decoding process, this array will be sorted in reverse order of + // |position_in_temporal_unit|. DequeueFrame() will then return the frames in + // reverse order (so that the entire process can run with a single counter + // variable). + struct OutputLayer { + // Used by std::sort to sort |output_layers| in reverse order of + // |position_in_temporal_unit|. + bool operator<(const OutputLayer& rhs) const { + return position_in_temporal_unit > rhs.position_in_temporal_unit; + } + + RefCountedBufferPtr frame; + int position_in_temporal_unit = 0; + } output_layers[kMaxLayers];
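A toy illustration of the reverse-order sort described in the comment above (an illustrative sketch, not part of the patch; Layer is a hypothetical stand-in for OutputLayer): because operator< is deliberately inverted, std::sort leaves the layer with the highest |position_in_temporal_unit| at index 0, so output can proceed with a single decrementing counter.

    #include <algorithm>

    struct Layer {
      int position_in_temporal_unit;
      // Inverted comparison, as in TemporalUnit::OutputLayer above.
      bool operator<(const Layer& rhs) const {
        return position_in_temporal_unit > rhs.position_in_temporal_unit;
      }
    };

    int main() {
      Layer layers[3] = {{0}, {2}, {1}};
      std::sort(layers, layers + 3);  // positions now ordered 2, 1, 0
      return layers[0].position_in_temporal_unit;  // returns 2
    }

+ // Number of entries in |output_layers|.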
+ int output_layer_count; + // Flag to ensure that we release the input buffer only once if there are + // multiple output layers. + bool released_input_buffer; +}; + +class DecoderImpl : public Allocable { + public: + // The constructor saves a const reference to |*settings|. Therefore + // |*settings| must outlive the DecoderImpl object. On success, |*output| + // contains a pointer to the newly-created DecoderImpl object. On failure, + // |*output| is not modified. + static StatusCode Create(const DecoderSettings* settings, + std::unique_ptr<DecoderImpl>* output); + ~DecoderImpl(); + StatusCode EnqueueFrame(const uint8_t* data, size_t size, + int64_t user_private_data, void* buffer_private_data); + StatusCode DequeueFrame(const DecoderBuffer** out_ptr); + static constexpr int GetMaxBitdepth() { + static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10, + "LIBGAV1_MAX_BITDEPTH must be 8 or 10."); + return LIBGAV1_MAX_BITDEPTH; + } + + private: + explicit DecoderImpl(const DecoderSettings* settings); + StatusCode Init(); + // Called when the first frame is enqueued. It does the OBU parsing for one + // temporal unit to retrieve the tile configuration and sets up the frame + // threading if frame parallel mode is allowed. It also initializes the + // |temporal_units_| queue based on the number of frame threads. + // + // The following are the limitations of the current implementation: + // * It assumes that all frames in the video have the same tile + // configuration. The frame parallel threading model will not be updated + // based on tile configuration changes mid-stream. + // * The above assumption holds true even when there is a new coded video + // sequence (i.e., a new sequence header). + StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data, + size_t size); + // Used only in frame parallel mode. Signals failure and waits until the + // worker threads are aborted if |status| is a failure status. If |status| is + // equal to kStatusOk or kStatusTryAgain, this function does not do anything. + // Always returns the input parameter |status| as the return value. + // + // This function is called only from the application thread (from + // EnqueueFrame() and DequeueFrame()). + StatusCode SignalFailure(StatusCode status); + + void ReleaseOutputFrame(); + + // Decodes all the frames contained in the given temporal unit. Used only in + // non frame parallel mode. + StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit, + const DecoderBuffer** out_ptr); + // Used only in frame parallel mode. Does the OBU parsing for |data| and + // schedules the individual frames for decoding in the |frame_thread_pool_|. + StatusCode ParseAndSchedule(const uint8_t* data, size_t size, + int64_t user_private_data, + void* buffer_private_data); + // Decodes the |encoded_frame| and updates the + // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a + // displayable frame. Used only in frame parallel mode. + StatusCode DecodeFrame(EncodedFrame* encoded_frame); + + // Populates |buffer_| with values from |frame|. Adds a reference to |frame| + // in |output_frame_|.
+ StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame); + StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const Vector<TileBuffer>& tile_buffers, + const DecoderState& state, + FrameScratchBuffer* frame_scratch_buffer, + RefCountedBuffer* current_frame); + // Applies film grain synthesis to the |displayable_frame| and stores the + // film-grain-applied frame into |film_grain_frame|. Returns kStatusOk on + // success. + StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header, + const ObuFrameHeader& frame_header, + const RefCountedBufferPtr& displayable_frame, + RefCountedBufferPtr* film_grain_frame, + ThreadPool* thread_pool); + + bool IsNewSequenceHeader(const ObuParser& obu); + + bool HasFailure() { + std::lock_guard<std::mutex> lock(mutex_); + return failure_status_ != kStatusOk; + } + + // Initializes the |quantizer_matrix_| if necessary and sets + // |quantizer_matrix_initialized_| to true. + bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header); + + // Elements in this queue cannot be moved with std::move since the + // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue. + Queue<TemporalUnit> temporal_units_; + DecoderState state_; + + DecoderBuffer buffer_ = {}; + // |output_frame_| holds a reference to the output frame on behalf of + // |buffer_|. + RefCountedBufferPtr output_frame_; + + // Queue of output frames that are to be returned in the DequeueFrame() calls. + // If |settings_.output_all_layers| is false, this queue will never contain + // more than 1 element. This queue is used only when |is_frame_parallel_| is + // false. + Queue<RefCountedBufferPtr> output_frame_queue_; + + BufferPool buffer_pool_; + WedgeMaskArray wedge_masks_; + QuantizerMatrix quantizer_matrix_; + bool quantizer_matrix_initialized_ = false; + FrameScratchBufferPool frame_scratch_buffer_pool_; + + // Used to synchronize the accesses into |temporal_units_| in order to update + // the "decoded" state of a temporal unit. + std::mutex mutex_; + std::condition_variable decoded_condvar_; + bool is_frame_parallel_; + std::unique_ptr<ThreadPool> frame_thread_pool_; + + // In frame parallel mode, there are two primary points of failure: + // 1) ParseAndSchedule() + // 2) DecodeTiles() + // Both of these functions have to respond to the other one failing by + // aborting whatever they are doing. This variable is used to accomplish that. + // If |failure_status_| is not kStatusOk, then the two functions will try to + // abort as early as they can. + StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_); + + ObuSequenceHeader sequence_header_ = {}; + // If true, sequence_header_ is valid. + bool has_sequence_header_ = false; + + const DecoderSettings& settings_; + bool seen_first_frame_ = false; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DECODER_IMPL_H_ diff --git a/src/decoder_settings.cc b/src/decoder_settings.cc new file mode 100644 index 0000000..9399073 --- /dev/null +++ b/src/decoder_settings.cc @@ -0,0 +1,33 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/gav1/decoder_settings.h" + +extern "C" { + +void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) { + settings->threads = 1; + settings->frame_parallel = 0; // false + settings->blocking_dequeue = 0; // false + settings->on_frame_buffer_size_changed = nullptr; + settings->get_frame_buffer = nullptr; + settings->release_frame_buffer = nullptr; + settings->release_input_buffer = nullptr; + settings->callback_private_data = nullptr; + settings->output_all_layers = 0; // false + settings->operating_point = 0; + settings->post_filter_mask = 0x1f; +} + +} // extern "C" diff --git a/src/decoder_state.h b/src/decoder_state.h new file mode 100644 index 0000000..897c99f --- /dev/null +++ b/src/decoder_state.h @@ -0,0 +1,89 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DECODER_STATE_H_ +#define LIBGAV1_SRC_DECODER_STATE_H_ + +#include <array> +#include <cstdint> + +#include "src/buffer_pool.h" +#include "src/utils/constants.h" + +namespace libgav1 { + +struct DecoderState { + // Section 7.20. Updates frames in the reference_frame array with + // |current_frame|, based on the |refresh_frame_flags| bitmask. + void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame, + int refresh_frame_flags) { + for (int ref_index = 0, mask = refresh_frame_flags; mask != 0; + ++ref_index, mask >>= 1) { + if ((mask & 1) != 0) { + reference_valid[ref_index] = true; + reference_frame_id[ref_index] = current_frame_id; + reference_frame[ref_index] = current_frame; + reference_order_hint[ref_index] = order_hint; + } + } + } + + // Clears all the reference frames. + void ClearReferenceFrames() { + reference_valid = {}; + reference_frame_id = {}; + reference_order_hint = {}; + for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) { + reference_frame[ref_index] = nullptr; + } + } + + // reference_valid and reference_frame_id are used only if + // sequence_header_.frame_id_numbers_present is true. + // The reference_valid array is indexed by a reference picture slot number. + // A value (boolean) in the array signifies whether the corresponding + // reference picture slot is valid for use as a reference picture. + std::array<bool, kNumReferenceFrameTypes> reference_valid = {}; + std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {}; + // A valid value of current_frame_id is an unsigned integer of at most 16 + // bits. -1 indicates current_frame_id is not initialized. + int current_frame_id = -1; + // The RefOrderHint array variable in the spec. + std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {}; + // The OrderHint variable in the spec. Its value comes from either the + // order_hint syntax element in the uncompressed header (if + // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ] + // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section + // 5.9.2 and Section 7.4.
+ // + // NOTE: When show_existing_frame is false, it is often more convenient to + // just use the order_hint field of the frame header as OrderHint. So this + // field is mainly used to update the reference_order_hint array in + // UpdateReferenceFrames(). + uint8_t order_hint = 0; + // reference_frame_sign_bias[i] (a boolean) specifies the intended direction + // of the motion vector in time for each reference frame. + // * |false| indicates that the reference frame is a forwards reference (i.e. + // the reference frame is expected to be output before the current frame); + // * |true| indicates that the reference frame is a backwards reference. + // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used. + std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {}; + std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DECODER_STATE_H_ diff --git a/src/dsp/arm/average_blend_neon.cc b/src/dsp/arm/average_blend_neon.cc new file mode 100644 index 0000000..834e8b4 --- /dev/null +++ b/src/dsp/arm/average_blend_neon.cc @@ -0,0 +1,146 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/average_blend.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include <arm_neon.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace { + +constexpr int kInterPostRoundBit = + kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; + +inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0, + const int16_t* prediction_1) { + const int16x8_t pred0 = vld1q_s16(prediction_0); + const int16x8_t pred1 = vld1q_s16(prediction_1); + const int16x8_t res = vaddq_s16(pred0, pred1); + return vqrshrun_n_s16(res, kInterPostRoundBit + 1); +} + +inline void AverageBlendLargeRow(const int16_t* prediction_0, + const int16_t* prediction_1, const int width, + uint8_t* dest) { + int x = width; + do { + const int16x8_t pred_00 = vld1q_s16(prediction_0); + const int16x8_t pred_01 = vld1q_s16(prediction_1); + prediction_0 += 8; + prediction_1 += 8; + const int16x8_t res0 = vaddq_s16(pred_00, pred_01); + const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1); + const int16x8_t pred_10 = vld1q_s16(prediction_0); + const int16x8_t pred_11 = vld1q_s16(prediction_1); + prediction_0 += 8; + prediction_1 += 8; + const int16x8_t res1 = vaddq_s16(pred_10, pred_11); + const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1); + vst1q_u8(dest, vcombine_u8(res_out0, res_out1)); + dest += 16; + x -= 16; + } while (x != 0); +} + +void AverageBlend_NEON(const void* prediction_0, const void* prediction_1, + const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast<uint8_t*>(dest); + const auto* pred_0 = static_cast<const int16_t*>(prediction_0); + const auto* pred_1 = static_cast<const int16_t*>(prediction_1); + int y = height; + + if (width ==
4) { + do { + const uint8x8_t result = AverageBlend8Row(pred_0, pred_1); + pred_0 += 8; + pred_1 += 8; + + StoreLo4(dst, result); + dst += dest_stride; + StoreHi4(dst, result); + dst += dest_stride; + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + vst1_u8(dst, AverageBlend8Row(pred_0, pred_1)); + dst += dest_stride; + pred_0 += 8; + pred_1 += 8; + + vst1_u8(dst, AverageBlend8Row(pred_0, pred_1)); + dst += dest_stride; + pred_0 += 8; + pred_1 += 8; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->average_blend = AverageBlend_NEON; +} + +} // namespace + +void AverageBlendInit_NEON() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void AverageBlendInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/average_blend_neon.h b/src/dsp/arm/average_blend_neon.h new file mode 100644 index 0000000..d13bcd6 --- /dev/null +++ b/src/dsp/arm/average_blend_neon.h @@ -0,0 +1,36 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::average_blend. This function is not thread-safe. +void AverageBlendInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_ diff --git a/src/dsp/arm/cdef_neon.cc b/src/dsp/arm/cdef_neon.cc new file mode 100644 index 0000000..4d0e76f --- /dev/null +++ b/src/dsp/arm/cdef_neon.cc @@ -0,0 +1,697 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
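Before the NEON specifics, the overall shape of the direction search this file implements may be useful, based on the CdefDirection_C() reference mentioned in the comments below (an illustrative sketch, not part of the patch; BestDirection and DirectionVariance are hypothetical names): the eight directional projections are squared, weighted, and the direction with the largest cost wins, while the variance contrasts that cost against the direction orthogonal to it.

    #include <cstdint>

    // Selects the direction whose weighted projection cost is largest.
    int BestDirection(const uint32_t cost[8], uint32_t* best_cost) {
      int direction = 0;
      *best_cost = 0;
      for (int i = 0; i < 8; ++i) {
        if (cost[i] > *best_cost) {
          *best_cost = cost[i];
          direction = i;
        }
      }
      return direction;
    }

    // The reported variance contrasts the best direction with the one at 90
    // degrees to it, matching CdefDirection_NEON() later in this file.
    uint32_t DirectionVariance(const uint32_t cost[8], int best_direction) {
      return (cost[best_direction] - cost[(best_direction + 4) & 7]) >> 10;
    }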
+ +#include "src/dsp/cdef.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include <arm_neon.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +#include "src/dsp/cdef.inc" + +// ---------------------------------------------------------------------------- +// Refer to CdefDirection_C(). +// +// int32_t partial[8][15] = {}; +// for (int i = 0; i < 8; ++i) { +// for (int j = 0; j < 8; ++j) { +// const int x = 1; +// partial[0][i + j] += x; +// partial[1][i + j / 2] += x; +// partial[2][i] += x; +// partial[3][3 + i - j / 2] += x; +// partial[4][7 + i - j] += x; +// partial[5][3 - i / 2 + j] += x; +// partial[6][j] += x; +// partial[7][i / 2 + j] += x; +// } +// } +// +// Using the code above, generate the position count for partial[8][15]. +// +// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1 +// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0 +// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0 +// +// The SIMD code shifts the input horizontally, then adds vertically to get the +// correct partial value for the given position. +// ---------------------------------------------------------------------------- + +// ---------------------------------------------------------------------------- +// partial[0][i + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00 +// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00 +// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00 +// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00 +// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00 +// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00 +// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77 +// +// partial[4] is the same except the source is reversed.
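In scalar form, the accumulation in the diagram above is simply the loop below (an illustrative sketch taken from the pseudo-code in the comment, not part of the patch; AddPartialD0Scalar is a hypothetical name): each bucket i + j collects one anti-diagonal of the 8x8 block, which the NEON version reproduces by shifting each source row before widening and adding.

    #include <cstdint>

    // partial[0][i + j] += src[i][j]: 15 anti-diagonal sums of an 8x8 block.
    // |partial0| must be zero-initialized by the caller.
    void AddPartialD0Scalar(const uint8_t src[8][8], uint16_t partial0[15]) {
      for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
          partial0[i + j] += src[i][j];
        }
      }
    }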
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src, + uint16x8_t* partial_lo, + uint16x8_t* partial_hi) { + const uint8x8_t v_zero = vdup_n_u8(0); + // 00 01 02 03 04 05 06 07 + // 00 10 11 12 13 14 15 16 + *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7)); + + // 00 00 20 21 22 23 24 25 + *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6)); + // 17 00 00 00 00 00 00 00 + // 26 27 00 00 00 00 00 00 + *partial_hi = + vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6)); + + // 00 00 00 30 31 32 33 34 + *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5)); + // 35 36 37 00 00 00 00 00 + *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5)); + + // 00 00 00 00 40 41 42 43 + *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4)); + // 44 45 46 47 00 00 00 00 + *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4)); + + // 00 00 00 00 00 50 51 52 + *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3)); + // 53 54 55 56 57 00 00 00 + *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3)); + + // 00 00 00 00 00 00 60 61 + *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2)); + // 62 63 64 65 66 67 00 00 + *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2)); + + // 00 00 00 00 00 00 00 70 + *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1)); + // 71 72 73 74 75 76 77 00 + *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1)); +} + +// ---------------------------------------------------------------------------- +// partial[1][i + j / 2] += x; +// +// A0 = src[0] + src[1], A1 = src[2] + src[3], ... +// +// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 +// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 +// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 +// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 +// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 +// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 +// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 +// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 +// +// partial[3] is the same except the source is reversed. 
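The scalar equivalent for this direction (again an illustrative sketch with a hypothetical name, not part of the patch) shows why summing horizontal pairs first is sufficient: columns j and j + 1 land in the same bucket, which is exactly what the vpadalq_u8 pairwise add-and-accumulate below exploits.

    #include <cstdint>

    // partial[1][i + j / 2] += src[i][j]: pairs of columns share one bucket,
    // so each row contributes four pair sums at offsets i .. i + 3.
    // |partial1| must be zero-initialized by the caller.
    void AddPartialD1Scalar(const uint8_t src[8][8], uint16_t partial1[15]) {
      for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
          partial1[i + j / 2] += src[i][j];
        }
      }
    }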
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src, + uint16x8_t* partial_lo, + uint16x8_t* partial_hi) { + uint8x16_t v_d1_temp[8]; + const uint8x8_t v_zero = vdup_n_u8(0); + const uint8x16_t v_zero_16 = vdupq_n_u8(0); + + for (int i = 0; i < 8; ++i) { + v_d1_temp[i] = vcombine_u8(v_src[i], v_zero); + } + + *partial_lo = *partial_hi = vdupq_n_u16(0); + // A0 A1 A2 A3 00 00 00 00 + *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]); + + // 00 B0 B1 B2 B3 00 00 00 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14)); + + // 00 00 C0 C1 C2 C3 00 00 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12)); + // 00 00 00 D0 D1 D2 D3 00 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10)); + // 00 00 00 00 E0 E1 E2 E3 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8)); + + // 00 00 00 00 00 F0 F1 F2 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6)); + // F3 00 00 00 00 00 00 00 + *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6)); + + // 00 00 00 00 00 00 G0 G1 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4)); + // G2 G3 00 00 00 00 00 00 + *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4)); + + // 00 00 00 00 00 00 00 H0 + *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2)); + // H1 H2 H3 00 00 00 00 00 + *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2)); +} + +// ---------------------------------------------------------------------------- +// partial[7][i / 2 + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 +// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 +// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 +// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 +// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 +// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 +// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 +// +// partial[5] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src, + uint16x8_t* partial_lo, + uint16x8_t* partial_hi) { + const uint16x8_t v_zero = vdupq_n_u16(0); + uint16x8_t v_pair_add[4]; + // Add vertical source pairs. 
+ v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]); + v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]); + v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]); + v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]); + + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + *partial_lo = v_pair_add[0]; + // 00 00 00 00 00 00 00 00 + // 00 00 00 00 00 00 00 00 + *partial_hi = vdupq_n_u16(0); + + // 00 20 21 22 23 24 25 26 + // 00 30 31 32 33 34 35 36 + *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7)); + // 27 00 00 00 00 00 00 00 + // 37 00 00 00 00 00 00 00 + *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7)); + + // 00 00 40 41 42 43 44 45 + // 00 00 50 51 52 53 54 55 + *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6)); + // 46 47 00 00 00 00 00 00 + // 56 57 00 00 00 00 00 00 + *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6)); + + // 00 00 00 60 61 62 63 64 + // 00 00 00 70 71 72 73 74 + *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5)); + // 65 66 67 00 00 00 00 00 + // 75 76 77 00 00 00 00 00 + *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5)); +} + +LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source, + ptrdiff_t stride, uint16x8_t* partial_lo, + uint16x8_t* partial_hi) { + const auto* src = static_cast<const uint8_t*>(source); + + // 8x8 input + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + uint8x8_t v_src[8]; + for (int i = 0; i < 8; ++i) { + v_src[i] = vld1_u8(src); + src += stride; + } + + // partial for direction 2 + // -------------------------------------------------------------------------- + // partial[2][i] += x; + // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00 + // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00 + // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00 + // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00 + // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00 + // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00 + // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00 + // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00 + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6); + partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7); + + // partial for direction 6 + // -------------------------------------------------------------------------- + // partial[6][j] += x; + // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00 + // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00 + // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00 + // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00 + // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00 + // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00 + // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00 + // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00 + const uint8x8_t v_zero = vdup_n_u8(0); + partial_lo[6] = vaddl_u8(v_zero, v_src[0]); + for (int i = 1; i < 8; ++i) { +
partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]); + } + + // partial for direction 0 + AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]); + + // partial for direction 1 + AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]); + + // partial for direction 7 + AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]); + + uint8x8_t v_src_reverse[8]; + for (int i = 0; i < 8; ++i) { + v_src_reverse[i] = vrev64_u8(v_src[i]); + } + + // partial for direction 4 + AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]); + + // partial for direction 3 + AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]); + + // partial for direction 5 + AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]); +} + +uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); } + +uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) { + return vmlal_u16(a, b, b); +} + +// |cost[0]| and |cost[4]| square the input and sum with the corresponding +// element from the other end of the vector: +// |kCdefDivisionTable[]| element: +// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; +// Because everything is being summed into a single value the distributive +// property allows us to mirror the division table and accumulate once. +uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b, + const uint32x4_t division_table[4]) { + uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]); + c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]); + c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]); + c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]); + return SumVector(c); +} + +// |cost[2]| and |cost[6]| square the input and accumulate: +// cost[2] += Square(partial[2][i]) +uint32_t SquareAccumulate(const uint16x8_t a) { + uint32x4_t c = Square(vget_low_u16(a)); + c = SquareAccumulate(c, vget_high_u16(a)); + c = vmulq_n_u32(c, kCdefDivisionTable[7]); + return SumVector(c); +} + +uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask, + const uint32x4_t division_table[2]) { + // Remove elements 0-2. 
+ uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a))); + c = vaddq_u32(c, Square(vget_high_u16(a))); + c = vmulq_n_u32(c, kCdefDivisionTable[7]); + + c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]); + c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]); + return SumVector(c); +} + +void CdefDirection_NEON(const void* const source, ptrdiff_t stride, + uint8_t* const direction, int* const variance) { + assert(direction != nullptr); + assert(variance != nullptr); + const auto* src = static_cast<const uint8_t*>(source); + uint32_t cost[8]; + uint16x8_t partial_lo[8], partial_hi[8]; + + AddPartial(src, stride, partial_lo, partial_hi); + + cost[2] = SquareAccumulate(partial_lo[2]); + cost[6] = SquareAccumulate(partial_lo[6]); + + const uint32x4_t division_table[4] = { + vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4), + vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)}; + + cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table); + cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table); + + const uint32x4_t division_table_odd[2] = { + vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)}; + + const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)}; + + cost[1] = + CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd); + cost[3] = + CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd); + cost[5] = + CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd); + cost[7] = + CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd); + + uint32_t best_cost = 0; + *direction = 0; + for (int i = 0; i < 8; ++i) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + *direction = i; + } + } + *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10; +} + +// ------------------------------------------------------------------------- +// CdefFilter + +// Load 4 vectors based on the given |direction|. +void LoadDirection(const uint16_t* const src, const ptrdiff_t stride, + uint16x8_t* output, const int direction) { + // Each |direction| describes a different set of source values. Expand this + // set by negating each set. For |direction| == 0 this gives a diagonal line + // from top right to bottom left. The first value is y, the second x. Negative + // y values move up. + // a b c d + // {-1, 1}, {1, -1}, {-2, 2}, {2, -2} + // c + // a + // 0 + // b + // d + const int y_0 = kCdefDirections[direction][0][0]; + const int x_0 = kCdefDirections[direction][0][1]; + const int y_1 = kCdefDirections[direction][1][0]; + const int x_1 = kCdefDirections[direction][1][1]; + output[0] = vld1q_u16(src + y_0 * stride + x_0); + output[1] = vld1q_u16(src - y_0 * stride - x_0); + output[2] = vld1q_u16(src + y_1 * stride + x_1); + output[3] = vld1q_u16(src - y_1 * stride - x_1); +} +
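In scalar terms, each LoadDirection() call gathers one tap at offset (dy, dx) along the filter direction together with its mirrored counterpart at (-dy, -dx), for both tap distances. A minimal sketch of that gather, not part of the patch (TapOffset and GatherDirectionTaps are hypothetical, illustration-only names standing in for a kCdefDirections[direction] row):

    #include <cstddef>
    #include <cstdint>

    struct TapOffset {
      int dy, dx;  // hypothetical mirror of kCdefDirections[direction][k]
    };

    // Reads the four pixels the NEON LoadDirection() loads per position.
    // Assumes |src| points into the padded CDEF source buffer so that the
    // negative offsets stay within bounds.
    void GatherDirectionTaps(const uint16_t* src, ptrdiff_t stride,
                             const TapOffset taps[2], uint16_t out[4]) {
      out[0] = src[taps[0].dy * stride + taps[0].dx];
      out[1] = src[-taps[0].dy * stride - taps[0].dx];
      out[2] = src[taps[1].dy * stride + taps[1].dx];
      out[3] = src[-taps[1].dy * stride - taps[1].dx];
    }

+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to +// do 2 rows at a time.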
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride, + uint16x8_t* output, const int direction) { + const int y_0 = kCdefDirections[direction][0][0]; + const int x_0 = kCdefDirections[direction][0][1]; + const int y_1 = kCdefDirections[direction][1][0]; + const int x_1 = kCdefDirections[direction][1][1]; + output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0), + vld1_u16(src + y_0 * stride + stride + x_0)); + output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0), + vld1_u16(src - y_0 * stride + stride - x_0)); + output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1), + vld1_u16(src + y_1 * stride + stride + x_1)); + output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1), + vld1_u16(src - y_1 * stride + stride - x_1)); +} + +int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference, + const uint16x8_t threshold, const int16x8_t damping) { + // If reference > pixel, the difference will be negative, so convert to 0 or + // -1. + const uint16x8_t sign = vcgtq_u16(reference, pixel); + const uint16x8_t abs_diff = vabdq_u16(pixel, reference); + const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping); + // For bitdepth == 8, the threshold range is [0, 15] and the damping range is + // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be + // larger than threshold. Subtracting with saturation will return 0 when + // pixel == kCdefLargeValue. + static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue"); + const uint16x8_t thresh_minus_shifted_diff = + vqsubq_u16(threshold, shifted_diff); + const uint16x8_t clamp_abs_diff = + vminq_u16(thresh_minus_shifted_diff, abs_diff); + // Restore the sign. + return vreinterpretq_s16_u16( + vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign)); +} + +template <int width, bool enable_primary = true, bool enable_secondary = true> +void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride, + const int height, const int primary_strength, + const int secondary_strength, const int damping, + const int direction, void* dest, + const ptrdiff_t dst_stride) { + static_assert(width == 8 || width == 4, ""); + static_assert(enable_primary || enable_secondary, ""); + constexpr bool clipping_required = enable_primary && enable_secondary; + auto* dst = static_cast<uint8_t*>(dest); + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue)); + const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength); + const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength); + + int16x8_t primary_damping_shift, secondary_damping_shift; + + // FloorLog2() requires input to be > 0. + // 8-bit damping range: Y: [3, 6], UV: [2, 5]. + if (enable_primary) { + // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary + // for UV filtering. + primary_damping_shift = + vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength))); + } + if (enable_secondary) { + // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is + // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0); + secondary_damping_shift = + vdupq_n_s16(-(damping - FloorLog2(secondary_strength))); + } + + const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0]; + const int primary_tap_1 = kCdefPrimaryTaps[primary_strength & 1][1]; + + int y = height; + do { + uint16x8_t pixel; + if (width == 8) { + pixel = vld1q_u16(src); + } else { + pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride)); + } + + uint16x8_t min = pixel; + uint16x8_t max = pixel; + int16x8_t sum; + + if (enable_primary) { + // Primary |direction|. + uint16x8_t primary_val[4]; + if (width == 8) { + LoadDirection(src, src_stride, primary_val, direction); + } else { + LoadDirection4(src, src_stride, primary_val, direction); + } + + if (clipping_required) { + min = vminq_u16(min, primary_val[0]); + min = vminq_u16(min, primary_val[1]); + min = vminq_u16(min, primary_val[2]); + min = vminq_u16(min, primary_val[3]); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + const uint8x16_t max_p01 = + vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]), + vreinterpretq_u8_u16(primary_val[1])); + const uint8x16_t max_p23 = + vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]), + vreinterpretq_u8_u16(primary_val[3])); + const uint16x8_t max_p = + vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23)); + max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask)); + } + + sum = Constrain(primary_val[0], pixel, primary_threshold, + primary_damping_shift); + sum = vmulq_n_s16(sum, primary_tap_0); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[1], pixel, primary_threshold, + primary_damping_shift), + primary_tap_0); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[2], pixel, primary_threshold, + primary_damping_shift), + primary_tap_1); + sum = vmlaq_n_s16(sum, + Constrain(primary_val[3], pixel, primary_threshold, + primary_damping_shift), + primary_tap_1); + } else { + sum = vdupq_n_s16(0); + } + + if (enable_secondary) { + // Secondary |direction| values (+/- 2). Clamp |direction|. 
+ uint16x8_t secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + if (clipping_required) { + min = vminq_u16(min, secondary_val[0]); + min = vminq_u16(min, secondary_val[1]); + min = vminq_u16(min, secondary_val[2]); + min = vminq_u16(min, secondary_val[3]); + min = vminq_u16(min, secondary_val[4]); + min = vminq_u16(min, secondary_val[5]); + min = vminq_u16(min, secondary_val[6]); + min = vminq_u16(min, secondary_val[7]); + + const uint8x16_t max_s01 = + vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]), + vreinterpretq_u8_u16(secondary_val[1])); + const uint8x16_t max_s23 = + vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]), + vreinterpretq_u8_u16(secondary_val[3])); + const uint8x16_t max_s45 = + vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]), + vreinterpretq_u8_u16(secondary_val[5])); + const uint8x16_t max_s67 = + vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]), + vreinterpretq_u8_u16(secondary_val[7])); + const uint16x8_t max_s = vreinterpretq_u16_u8( + vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67))); + max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask)); + } + + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[0], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[1], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[2], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[3], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[4], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[5], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap0); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[6], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + sum = vmlaq_n_s16(sum, + Constrain(secondary_val[7], pixel, secondary_threshold, + secondary_damping_shift), + kCdefSecondaryTap1); + } + // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) + const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15); + sum = vaddq_s16(sum, sum_lt_0); + int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4); + if (clipping_required) { + result = vminq_s16(result, vreinterpretq_s16_u16(max)); + result = vmaxq_s16(result, vreinterpretq_s16_u16(min)); + } + + const uint8x8_t dst_pixel = vqmovun_s16(result); + if (width == 8) { + src += src_stride; + vst1_u8(dst, dst_pixel); + dst += dst_stride; + --y; + } else { + src += src_stride << 1; + StoreLo4(dst, dst_pixel); + dst += dst_stride; + StoreHi4(dst, dst_pixel); + dst += dst_stride; + y -= 2; + } + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->cdef_direction = CdefDirection_NEON; + dsp->cdef_filters[0][0] = CdefFilter_NEON<4>; + dsp->cdef_filters[0][1] = + CdefFilter_NEON<4, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = CdefFilter_NEON<4, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = 
+      CdefFilter_NEON<8>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_NEON<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] = CdefFilter_NEON<8, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/cdef_neon.h b/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..53d5f86
--- /dev/null
+++ b/src/dsp/arm/cdef_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
new file mode 100644
index 0000000..dcb7567
--- /dev/null
+++ b/src/dsp/arm/common_neon.h
@@ -0,0 +1,777 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cstdint>
+#include <cstring>
+
+#if 0
+#include <cstdio>
+
+#include "absl/strings/str_cat.h"
+
+constexpr bool kEnablePrintRegs = true;
+
+union DebugRegister {
+  int8_t i8[8];
+  int16_t i16[4];
+  int32_t i32[2];
+  uint8_t u8[8];
+  uint16_t u16[4];
+  uint32_t u32[2];
+};
+
+union DebugRegisterQ {
+  int8_t i8[16];
+  int16_t i16[8];
+  int32_t i32[4];
+  uint8_t u8[16];
+  uint16_t u16[8];
+  uint32_t u32[4];
+};
+
+// Quite useful macro for debugging. Left here for convenience.
+inline void PrintVect(const DebugRegister r, const char* const name, int size) { + int n; + if (kEnablePrintRegs) { + fprintf(stderr, "%s\t: ", name); + if (size == 8) { + for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]); + } else if (size == 16) { + for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]); + } else if (size == 32) { + for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]); + } + fprintf(stderr, "\n"); + } +} + +// Debugging macro for 128-bit types. +inline void PrintVectQ(const DebugRegisterQ r, const char* const name, + int size) { + int n; + if (kEnablePrintRegs) { + fprintf(stderr, "%s\t: ", name); + if (size == 8) { + for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]); + } else if (size == 16) { + for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]); + } else if (size == 32) { + for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]); + } + fprintf(stderr, "\n"); + } +} + +inline void PrintReg(const int32x4x2_t val, const std::string& name) { + DebugRegisterQ r; + vst1q_u32(r.u32, val.val[0]); + const std::string name0 = absl::StrCat(name, ".val[0]").c_str(); + PrintVectQ(r, name0.c_str(), 32); + vst1q_u32(r.u32, val.val[1]); + const std::string name1 = absl::StrCat(name, ".val[1]").c_str(); + PrintVectQ(r, name1.c_str(), 32); +} + +inline void PrintReg(const uint32x4_t val, const char* name) { + DebugRegisterQ r; + vst1q_u32(r.u32, val); + PrintVectQ(r, name, 32); +} + +inline void PrintReg(const uint32x2_t val, const char* name) { + DebugRegister r; + vst1_u32(r.u32, val); + PrintVect(r, name, 32); +} + +inline void PrintReg(const uint16x8_t val, const char* name) { + DebugRegisterQ r; + vst1q_u16(r.u16, val); + PrintVectQ(r, name, 16); +} + +inline void PrintReg(const uint16x4_t val, const char* name) { + DebugRegister r; + vst1_u16(r.u16, val); + PrintVect(r, name, 16); +} + +inline void PrintReg(const uint8x16_t val, const char* name) { + DebugRegisterQ r; + vst1q_u8(r.u8, val); + PrintVectQ(r, name, 8); +} + +inline void PrintReg(const uint8x8_t val, const char* name) { + DebugRegister r; + vst1_u8(r.u8, val); + PrintVect(r, name, 8); +} + +inline void PrintReg(const int32x4_t val, const char* name) { + DebugRegisterQ r; + vst1q_s32(r.i32, val); + PrintVectQ(r, name, 32); +} + +inline void PrintReg(const int32x2_t val, const char* name) { + DebugRegister r; + vst1_s32(r.i32, val); + PrintVect(r, name, 32); +} + +inline void PrintReg(const int16x8_t val, const char* name) { + DebugRegisterQ r; + vst1q_s16(r.i16, val); + PrintVectQ(r, name, 16); +} + +inline void PrintReg(const int16x4_t val, const char* name) { + DebugRegister r; + vst1_s16(r.i16, val); + PrintVect(r, name, 16); +} + +inline void PrintReg(const int8x16_t val, const char* name) { + DebugRegisterQ r; + vst1q_s8(r.i8, val); + PrintVectQ(r, name, 8); +} + +inline void PrintReg(const int8x8_t val, const char* name) { + DebugRegister r; + vst1_s8(r.i8, val); + PrintVect(r, name, 8); +} + +// Print an individual (non-vector) value in decimal format. +inline void PrintReg(const int x, const char* name) { + if (kEnablePrintRegs) { + printf("%s: %d\n", name, x); + } +} + +// Print an individual (non-vector) value in hexadecimal format. 
+inline void PrintHex(const int x, const char* name) {
+  if (kEnablePrintRegs) {
+    printf("%s: %x\n", name, x);
+  }
+}
+
+#define PR(x) PrintReg(x, #x)
+#define PD(x) PrintReg(x, #x)
+#define PX(x) PrintHex(x, #x)
+
+#endif  // 0
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+  const uint16x4_t zero = vdup_n_u16(0);
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(
+      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+  const uint32x2_t zero = vdup_n_u32(0);
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+  memcpy(buf, &val, sizeof(val));
+}
+
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+  ValueToMem(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
+}
+
+// Store 4 uint8_t values from the low half of a uint8x8_t register.
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+  ValueToMem(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+}
+
+// Store 4 uint8_t values from the high half of a uint8x8_t register.
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+  ValueToMem(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+  ValueToMem(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint16x8_t val) {
+  ValueToMem(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
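+
+// Usage sketch for the lane variants above (illustrative only; |pixels|,
+// |stride| and |out| are hypothetical): gather two 2-byte pairs into one
+// register, then scatter them back out.
+#if 0
+uint8x8_t v = Load2(pixels);       // lanes 0-1, remaining lanes zeroed.
+v = Load2<1>(pixels + stride, v);  // lanes 2-3.
+Store2<0>(out, v);                 // writes lanes 0-1.
+Store2<1>(out + stride, v);        // writes lanes 2-3.
+#endif  // 0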
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(uint16_t* const buf, const uint16x4_t val) {
+  ValueToMem(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
+}
+
+//------------------------------------------------------------------------------
+// Bit manipulation.
+
+// vshXX_n_XX() requires an immediate.
+template <int shift>
+inline uint8x8_t LeftShift(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline uint8x8_t RightShift(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
+inline int8x8_t RightShift(const int8x8_t vector) {
+  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
+}
+
+// Shim vqtbl1_u8 for armv7.
+inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_u8(a, index);
+#else
+  const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
+  return vtbl2_u8(b, index);
+#endif
+}
+
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_s8(a, index);
+#else
+  const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+  return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Interleave.
+
+// vzipN is exclusive to A64.
+inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vzip1_u8(a, b);
+#else
+  // Discard |.val[1]|
+  return vzip_u8(a, b).val[0];
+#endif
+}
+
+inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_u8_u32(
+      vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+  // Discard |.val[1]|
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
+#endif
+}
+
+inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_s8_u32(
+      vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+  // Discard |.val[1]|
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
+#endif
+}
+
+inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_u8_u32(
+      vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
+#else
+  // Discard |.val[0]|
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
+#endif
+}
+
+inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
+#if defined(__aarch64__)
+  return vreinterpret_s8_u32(
+      vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
+#else
+  // Discard |.val[0]|
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t c = vpaddl_u8(a);
+  const uint32x2_t d = vpaddl_u16(c);
+  const uint64x1_t e = vpaddl_u32(d);
+  return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif  // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+  return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
+// Transpose.
+
+// Transpose 32 bit elements such that:
+// a: 00 01
+// b: 02 03
+// returns
+// val[0]: 00 02
+// val[1]: 01 03
+inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
+  const uint32x2_t a_32 = vreinterpret_u32_u8(a);
+  const uint32x2_t b_32 = vreinterpret_u32_u8(b);
+  const uint32x2x2_t c = vtrn_u32(a_32, b_32);
+  const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
+                         vreinterpret_u8_u32(c.val[1])};
+  return d;
+}
+
+// Swap high and low 32 bit elements.
+inline uint8x8_t Transpose32(const uint8x8_t a) {
+  const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
+  return vreinterpret_u8_u32(b);
+}
+
+// Implement vtrnq_s64().
+// Input:
+// a0: 00 01 02 03 04 05 06 07
+// a1: 16 17 18 19 20 21 22 23
+// Output:
+// b0.val[0]: 00 01 02 03 16 17 18 19
+// b0.val[1]: 04 05 06 07 20 21 22 23
+inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+  int16x8x2_t b0;
+  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+                           vreinterpret_s16_s32(vget_low_s32(a1)));
+  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+                           vreinterpret_s16_s32(vget_high_s32(a1)));
+  return b0;
+}
+
+inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+  uint16x8x2_t b0;
+  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+                           vreinterpret_u16_u32(vget_low_u32(a1)));
+  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+                           vreinterpret_u16_u32(vget_high_u32(a1)));
+  return b0;
+}
+
+// Input:
+// a: 00 01 02 03 10 11 12 13
+// b: 20 21 22 23 30 31 32 33
+// Output:
+// Note that columns [1] and [2] are transposed.
+// a: 00 10 20 30 02 12 22 32
+// b: 01 11 21 31 03 13 23 33
+inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
+  const uint16x4x2_t c =
+      vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
+  const uint8x8x2_t e =
+      vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
+  *a = e.val[0];
+  *b = e.val[1];
+}
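+
+// Worked example of Transpose4x4() above with hypothetical values, spelling
+// out the "columns [1] and [2] are transposed" note: the outputs hold columns
+// 0, 2 and 1, 3 of the 4x4 block.
+#if 0
+uint8x8_t a = vld1_u8(block);      // 00 01 02 03 10 11 12 13
+uint8x8_t b = vld1_u8(block + 8);  // 20 21 22 23 30 31 32 33
+Transpose4x4(&a, &b);
+// a: 00 10 20 30 02 12 22 32 (columns 0 and 2)
+// b: 01 11 21 31 03 13 23 33 (columns 1 and 3)
+#endif  // 0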
+
+// Reversible if the x4 values are packed next to each other.
+// x4 input / x8 output:
+// a0: 00 01 02 03 40 41 42 43
+// a1: 10 11 12 13 50 51 52 53
+// a2: 20 21 22 23 60 61 62 63
+// a3: 30 31 32 33 70 71 72 73
+// x8 input / x4 output:
+// a0: 00 10 20 30 40 50 60 70
+// a1: 01 11 21 31 41 51 61 71
+// a2: 02 12 22 32 42 52 62 72
+// a3: 03 13 23 33 43 53 63 73
+inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
+                         uint8x8_t* a3) {
+  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+  const uint16x4x2_t c0 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+  const uint16x4x2_t c1 =
+      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+  *a0 = vreinterpret_u8_u16(c0.val[0]);
+  *a1 = vreinterpret_u8_u16(c1.val[0]);
+  *a2 = vreinterpret_u8_u16(c0.val[1]);
+  *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int8x8_t a[8]) {
+  // Swap 8 bit elements. Goes from:
+  // a[0]: 00 01 02 03 04 05 06 07
+  // a[1]: 10 11 12 13 14 15 16 17
+  // a[2]: 20 21 22 23 24 25 26 27
+  // a[3]: 30 31 32 33 34 35 36 37
+  // a[4]: 40 41 42 43 44 45 46 47
+  // a[5]: 50 51 52 53 54 55 56 57
+  // a[6]: 60 61 62 63 64 65 66 67
+  // a[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+  const int8x16x2_t b0 =
+      vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
+  const int8x16x2_t b1 =
+      vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+  const int16x8x2_t c0 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[0]),
+                                   vreinterpretq_s16_s8(b1.val[0]));
+  const int16x8x2_t c1 = vtrnq_s16(vreinterpretq_s16_s8(b0.val[1]),
+                                   vreinterpretq_s16_s8(b1.val[1]));
+
+  // Unzip 32 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+  const int32x4x2_t d0 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vuzpq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+
+  a[0] = vreinterpret_s8_s32(vget_low_s32(d0.val[0]));
+  a[1] = vreinterpret_s8_s32(vget_high_s32(d0.val[0]));
+  a[2] = vreinterpret_s8_s32(vget_low_s32(d1.val[0]));
+  a[3] = vreinterpret_s8_s32(vget_high_s32(d1.val[0]));
+  a[4] = vreinterpret_s8_s32(vget_low_s32(d0.val[1]));
+  a[5] = vreinterpret_s8_s32(vget_high_s32(d0.val[1]));
+  a[6] = vreinterpret_s8_s32(vget_low_s32(d1.val[1]));
+  a[7] = vreinterpret_s8_s32(vget_high_s32(d1.val[1]));
+}
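+
+// Typical use of Transpose8x8() (a sketch; |block| and |stride| are
+// hypothetical): load 8 rows, transpose in registers, store 8 columns.
+#if 0
+int8x8_t rows[8];
+for (int i = 0; i < 8; ++i) {
+  rows[i] = vreinterpret_s8_u8(vld1_u8(block + i * stride));
+}
+Transpose8x8(rows);
+for (int i = 0; i < 8; ++i) {
+  vst1_u8(block + i * stride, vreinterpret_u8_s8(rows[i]));
+}
+#endif  // 0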
+
+// Unsigned.
+inline void Transpose8x8(uint8x8_t a[8]) {
+  const uint8x16x2_t b0 =
+      vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
+  const uint8x16x2_t b1 =
+      vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  a[0] = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+  a[1] = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+  a[2] = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  a[3] = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+  a[4] = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  a[5] = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  a[6] = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+  a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+  const uint8x16x2_t a0 =
+      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+  const uint8x16x2_t a1 =
+      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+  const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+                                    vreinterpretq_u16_u8(a1.val[0]));
+  const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+                                    vreinterpretq_u16_u8(a1.val[1]));
+
+  const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  out[0] = vreinterpretq_u8_u32(c0.val[0]);
+  out[1] = vreinterpretq_u8_u32(c1.val[0]);
+  out[2] = vreinterpretq_u8_u32(c0.val[1]);
+  out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
+inline void Transpose8x8(int16x8_t a[8]) {
+  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
+  const
int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]); + + a[0] = d0.val[0]; + a[1] = d1.val[0]; + a[2] = d2.val[0]; + a[3] = d3.val[0]; + a[4] = d0.val[1]; + a[5] = d1.val[1]; + a[6] = d2.val[1]; + a[7] = d3.val[1]; +} + +// Unsigned. +inline void Transpose8x8(uint16x8_t a[8]) { + const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]); + const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]); + const uint16x8x2_t b2 = vtrnq_u16(a[4], a[5]); + const uint16x8x2_t b3 = vtrnq_u16(a[6], a[7]); + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c2.val[0]); + const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c3.val[0]); + const uint16x8x2_t d2 = VtrnqU64(c0.val[1], c2.val[1]); + const uint16x8x2_t d3 = VtrnqU64(c1.val[1], c3.val[1]); + + a[0] = d0.val[0]; + a[1] = d1.val[0]; + a[2] = d2.val[0]; + a[3] = d3.val[0]; + a[4] = d0.val[1]; + a[5] = d1.val[1]; + a[6] = d2.val[1]; + a[7] = d3.val[1]; +} + +// Input: +// a[0]: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87 +// a[1]: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97 +// a[2]: 20 21 22 23 24 25 26 27 a0 a1 a2 a3 a4 a5 a6 a7 +// a[3]: 30 31 32 33 34 35 36 37 b0 b1 b2 b3 b4 b5 b6 b7 +// a[4]: 40 41 42 43 44 45 46 47 c0 c1 c2 c3 c4 c5 c6 c7 +// a[5]: 50 51 52 53 54 55 56 57 d0 d1 d2 d3 d4 d5 d6 d7 +// a[6]: 60 61 62 63 64 65 66 67 e0 e1 e2 e3 e4 e5 e6 e7 +// a[7]: 70 71 72 73 74 75 76 77 f0 f1 f2 f3 f4 f5 f6 f7 + +// Output: +// a[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0 +// a[1]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1 +// a[2]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2 +// a[3]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3 +// a[4]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4 +// a[5]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5 +// a[6]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6 +// a[7]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7 +inline void Transpose8x16(uint8x16_t a[8]) { + // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97 + // b1.val[0]: 20 30 22 32 24 34 26 36 a0 b0 a2 b2 a4 b4 a6 b6 + // b1.val[1]: 21 31 23 33 25 35 27 37 a1 b1 a3 b3 a5 b5 a7 b7 + // b2.val[0]: 40 50 42 52 44 54 46 56 c0 d0 c2 d2 c4 d4 c6 d6 + // b2.val[1]: 41 51 43 53 45 55 47 57 c1 d1 c3 d3 c5 d5 c7 d7 + // b3.val[0]: 60 70 62 72 64 74 66 76 e0 f0 e2 f2 e4 f4 e6 f6 + // b3.val[1]: 61 71 63 73 65 75 67 77 e1 f1 e3 f3 e5 f5 e7 f7 + const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]); + const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]); + const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]); + const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]); + + // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 a0 b0 84 94 a4 b4 + // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 a2 b2 86 96 a6 b6 + // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 a1 b1 85 95 a5 b5 + // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 a3 b3 87 97 a7 b7 + // c2.val[0]: 40 50 60 70 44 54 64 74 c0 d0 e0 f0 c4 d4 e4 f4 + // c2.val[1]: 42 52 62 72 46 56 66 76 c2 d2 e2 f2 c6 d6 e6 f6 + // c3.val[0]: 41 51 61 71 45 55 65 75 c1 d1 e1 f1 c5 d5 e5 f5 + // c3.val[1]: 43 53 63 73 47 57 67 77 c3 d3 e3 f3 c7 d7 e7 f7 + const uint16x8x2_t c0 = 
+      vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+
+  // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+  // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+  // d1.val[0]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+  // d1.val[1]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+  // d2.val[0]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+  // d2.val[1]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+  // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+  // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+
+  a[0] = vreinterpretq_u8_u32(d0.val[0]);
+  a[1] = vreinterpretq_u8_u32(d1.val[0]);
+  a[2] = vreinterpretq_u8_u32(d2.val[0]);
+  a[3] = vreinterpretq_u8_u32(d3.val[0]);
+  a[4] = vreinterpretq_u8_u32(d0.val[1]);
+  a[5] = vreinterpretq_u8_u32(d1.val[1]);
+  a[6] = vreinterpretq_u8_u32(d2.val[1]);
+  a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+  return vreinterpretq_s16_u16(vmovl_u8(in));
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_ENABLE_NEON
+#endif  // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
new file mode 100644
index 0000000..fd9b912
--- /dev/null
+++ b/src/dsp/arm/convolve_neon.cc
@@ -0,0 +1,3105 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+                         const uint8x8_t* const taps) {
+  uint16x8_t sum;
+  if (filter_index == 0) {
+    // 6 taps. + - + + - +
+    sum = vmull_u8(src[0], taps[0]);
+    // Unsigned overflow will result in a valid int16_t value.
+    sum = vmlsl_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlsl_u8(sum, src[4], taps[4]);
+    sum = vmlal_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 1 && negative_outside_taps) {
+    // 6 taps. - + + + + -
+    // Set a base we can subtract from.
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlsl_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 1) {
+    // 6 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlal_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 2) {
+    // 8 taps. - + - + + - + -
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlsl_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlsl_u8(sum, src[5], taps[5]);
+    sum = vmlal_u8(sum, src[6], taps[6]);
+    sum = vmlsl_u8(sum, src[7], taps[7]);
+  } else if (filter_index == 3) {
+    // 2 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+  } else if (filter_index == 4) {
+    // 4 taps. - + + -
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlsl_u8(sum, src[3], taps[3]);
+  } else if (filter_index == 5) {
+    // 4 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+  }
+  return vreinterpretq_s16_u16(sum);
+}
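+
+// Illustration of the headroom claim above, using the 2-tap bilinear filter
+// whose halved taps appear later in this file: its absolute taps sum to 64,
+// so even 255-valued samples accumulate to at most 255 * 64, well inside
+// int16_t. A sketch, not compiled in:
+#if 0
+static_assert(255 * 64 <= INT16_MAX, "halved 2-tap filter sums fit int16_t");
+#endif  // 0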
+
+template <int filter_index, bool negative_outside_taps>
+int16x8_t SumHorizontalTaps(const uint8_t* const src,
+                            const uint8x8_t* const v_tap) {
+  uint8x8_t v_src[8];
+  const uint8x16_t src_long = vld1q_u8(src);
+  int16x8_t sum;
+
+  if (filter_index < 2) {
+    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                              v_tap + 1);
+  } else if (filter_index == 2) {
+    v_src[0] = vget_low_u8(src_long);
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
+  } else if (filter_index == 3) {
+    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                              v_tap + 3);
+  } else if (filter_index > 3) {
+    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                              v_tap + 2);
+  }
+  return sum;
+}
+
+template <int filter_index, bool negative_outside_taps>
+uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
+                               const uint8x8_t* const v_tap) {
+  int16x8_t sum =
+      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining
+  // them requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit =
+      1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+  return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index, bool negative_outside_taps>
+uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
+                               const uint8x8_t* const v_tap) {
+  const int16x8_t sum =
+      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
+
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index>
+int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                               const uint8x8_t* const v_tap) {
+  uint16x8_t sum;
+  const uint8x8_t input0 = vld1_u8(src);
+  src += src_stride;
+  const uint8x8_t input1 = vld1_u8(src);
+  uint8x8x2_t input = vzip_u8(input0, input1);
+
+  if (filter_index == 3) {
+    // tap signs : + +
+    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+  } else if (filter_index == 4) {
+    // tap signs : - + + -
+    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+    sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
+    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+    sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
+  } else {
+    // tap signs : + + + +
+    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
+    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
+    sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
+  }
+
+  return vreinterpretq_s16_u16(sum);
+}
+
+template <int filter_index>
+uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
+                                  const ptrdiff_t src_stride,
+                                  const uint8x8_t* const v_tap) {
+  int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining
+  // them requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit =
+      1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+  return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index>
+uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
+                                   const ptrdiff_t src_stride,
+                                   const uint8x8_t* const v_tap) {
+  const int16x8_t sum =
+      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int num_taps, int step, int filter_index,
+          bool negative_outside_taps, bool is_2d, bool is_compound>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // 4 tap filters are never used when width > 4.
+  if (num_taps != 4 && width > 4) {
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          const uint16x8_t v_sum =
+              HorizontalTaps8To16<filter_index, negative_outside_taps>(
+                  &src[x], v_tap);
+          vst1q_u16(&dest16[x], v_sum);
+        } else {
+          const uint8x8_t result =
+              SimpleHorizontalTaps<filter_index, negative_outside_taps>(
+                  &src[x], v_tap);
+          vst1_u8(&dest8[x], result);
+        }
+        x += step;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (++y < height);
+    return;
+  }
+
+  // The horizontal pass only needs to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (width == 4) {
+      int y = 0;
+      do {
+        if (is_2d || is_compound) {
+          const uint16x8_t v_sum =
+              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
+                                                                       v_tap);
+          vst1_u16(dest16, vget_low_u16(v_sum));
+        } else {
+          const uint8x8_t result =
+              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
+                                                                        v_tap);
+          StoreLo4(&dest8[0], result);
+        }
+        src += src_stride;
+        dest8 += pred_stride;
+        dest16 += pred_stride;
+      } while (++y < height);
+      return;
+    }
+
+    if (!is_compound) {
+      int y = 0;
+      do {
+        if (is_2d) {
+          const uint16x8_t sum =
+              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+          dest16[0] = vgetq_lane_u16(sum, 0);
+          dest16[1] = vgetq_lane_u16(sum, 2);
+          dest16 += pred_stride;
+          dest16[0] = vgetq_lane_u16(sum, 1);
+          dest16[1] = vgetq_lane_u16(sum, 3);
+          dest16 += pred_stride;
+        } else {
+          const uint8x8_t sum =
+              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+          dest8[0] = vget_lane_u8(sum, 0);
+          dest8[1] = vget_lane_u8(sum, 2);
+          dest8 += pred_stride;
+
+          dest8[0] = vget_lane_u8(sum, 1);
+          dest8[1] = vget_lane_u8(sum, 3);
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y += 2;
+      } while (y < height - 1);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        uint16x8_t sum;
+        const uint8x8_t input = vld1_u8(src);
+        if (filter_index == 3) {  // |num_taps| == 2
+          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+        } else if (filter_index == 4) {
+          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
+          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+        } else {
+          assert(filter_index == 5);
+          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
+          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
+          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+        }
+        // |sum| contains an int16_t value.
+        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
+            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
+        Store2<0>(dest16, sum);
+      }
+    }
+  }
+}
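+
+// Worked form of the combined rounding used by SimpleHorizontalTaps() above,
+// assuming kInterRoundBitsHorizontal == 3 and kFilterBits == 7 (their values
+// elsewhere in this library): the two-pass rounding shifts are >> 2 and
+// >> 4, and folding them into one >> 6 drops the first pass's rounding
+// offset, which is why 1 << (kInterRoundBitsHorizontal - 2) is added back.
+// Scalar equivalent for one pixel (a sketch only):
+#if 0
+const int rounded = RightShiftWithRounding(sum + 2, kFilterBits - 1);
+const uint8_t pixel = static_cast<uint8_t>(Clip3(rounded, 0, 255));
+#endif  // 0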
+
+// Process 16 bit inputs and output 32 bits.
+template <int num_taps, bool is_compound = false>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+                                    const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum;
+  if (num_taps == 8) {
+    sum = vmull_lane_s16(src[0], taps_lo, 0);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum = vmull_lane_s16(src[0], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum = vmull_lane_s16(src[0], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum = vmull_lane_s16(src[0], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+  }
+
+  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound = false>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+                                  const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum_lo, sum_hi;
+  if (num_taps == 8) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[6]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[6]), taps_hi, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[4]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[4]), taps_hi, 1);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[2]), taps_hi, 0);
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
+
+    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
+    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vcombine_s16(
+        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+                      const ptrdiff_t dst_stride, const int width,
+                      const int height, const int16x8_t taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    int16x8_t srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+      src_x += src_stride;
+      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+        src_x += src_stride;
+        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+          src_x += src_stride;
+          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
+    do {
+      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+      src_x += src_stride;
+
+      const int16x8_t sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
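+
+// Scalar model of one non-compound output sample of Filter2DVertical() above
+// (a sketch; |taps_scalar|, |x| and |y| are hypothetical locals). The
+// intermediate rows produced by the horizontal pass are |width| apart.
+#if 0
+int32_t sum = 0;
+for (int k = 0; k < num_taps; ++k) {
+  sum += static_cast<int16_t>(src[(y + k) * width + x]) * taps_scalar[k];
+}
+dst8[y * dst_stride + x] = static_cast<uint8_t>(
+    Clip3(RightShiftWithRounding(sum, kInterRoundBitsVertical - 1), 0, 255));
+#endif  // 0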
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const int16x8_t taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
+    if (num_taps >= 6) {
+      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
+      if (num_taps == 8) {
+        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
+      }
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+                                      vget_low_s16(srcs[num_taps]));
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      const uint16x8_t results = vreinterpretq_u16_s16(sum);
+      vst1q_u16(dst16, results);
+      dst16 += 4 << 1;
+    } else {
+      const uint8x8_t results = vqmovun_s16(sum);
+
+      StoreLo4(dst8, results);
+      dst8 += dst_stride;
+      StoreHi4(dst8, results);
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y += 2;
+  } while (y < height);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const int16x8_t taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    if (num_taps == 8) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    } else if (num_taps == 4) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    } else if (num_taps == 6) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+    } else if (num_taps == 8) {
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+    }
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const uint8x8_t results = vqmovun_s16(sum);
+
+    Store2<0>(dst8, results);
+    dst8 += dst_stride;
+    Store2<1>(dst8, results);
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2<2>(dst8, results);
+    dst8 += dst_stride;
+    Store2<3>(dst8, results);
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y += 4;
+  } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  // Duplicate the absolute value for each tap. Negative taps are corrected
+  // by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
+  uint8x8_t v_tap[kSubPixelTaps];
+  assert(filter_id != 0);
+
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
+
+  if (filter_index == 2) {  // 8 tap.
+    FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    // Check if outside taps are positive.
+    if ((filter_id == 1) | (filter_id == 15)) {
+      FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
+          src, src_stride, dst, dst_stride, width, height, v_tap);
+    } else {
+      FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
+          src, src_stride, dst, dst_stride, width, height, v_tap);
+    }
+  } else if (filter_index == 0) {  // 6 tap.
+    FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+void Convolve2D_NEON(const void* const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
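+
+  // Sizing sketch: with kMaxSuperBlockSizeInPixels == 128 and an 8-tap
+  // vertical filter, the horizontal pass above fills at most 128 + 8 - 1
+  // intermediate rows of |width| uint16_t values, which is what
+  // |intermediate_result| is dimensioned for.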
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+
+  if (vertical_taps == 8) {
+    if (width == 2) {
+      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    if (width == 2) {
+      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    if (width == 2) {
+      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    if (width == 2) {
+      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  }
+}
+
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+  uint8x8x3_t ret;
+  const uint8x16_t src_val = vld1q_u8(src_x);
+  ret.val[0] = vget_low_u8(src_val);
+  ret.val[1] = vget_high_u8(src_val);
+  if (grade_x > 1) {
+    ret.val[2] = vld1_u8(src_x + 16);
+  }
+  return ret;
+}
+
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3].
+inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+
+  return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
+}
+
+template <int grade_x>
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+                                         const ptrdiff_t src_stride,
+                                         const int width, const int subpixel_x,
+                                         const int step_x,
+                                         const int intermediate_height,
+                                         int16_t* intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps.
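+  // e.g. within the 8-column filter layout the two nonzero taps sit at
+  // positions 3 and 4, so the filter window starts 3 pixels in.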
+  const int kernel_offset = 3;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
+  const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+
+  int p = subpixel_x;
+  if (width <= 4) {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of tapsK has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x16_t src_vals = vld1q_u8(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of srcK contains src_x[k].
+      const uint8x8_t src[2] = {
+          VQTbl1U8(src_vals, src_indices),
+          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate += kIntermediateStride;
+    } while (++y < intermediate_height);
+    return;
+  }
+
+  // |width| >= 8
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of tapsK has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of srcK contains src_x[k].
+      const uint8x8_t src[2] = {
+          vtbl3_u8(src_vals, src_indices),
+          vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
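+// Each returned column vector holds one tap position across all 16 filter
+// ids, so VQTbl1U8 can gather a per-lane tap from a per-lane filter id.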
+inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+
+  return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
+void ConvolveKernelHorizontalPositive4Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+    const int step_x, const int intermediate_height, int16_t* intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const uint8x16_t filter_taps0 = GetPositive4TapFilter(0);
+  const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
+  const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
+  const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const int p = subpixel_x;
+  // First filter is special, just a 128 tap on the center.
+  const uint8_t* src_x =
+      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint8x8_t filter_indices = vand_u8(
+      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
+  // Note that filter_id depends on x.
+  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices =
+      vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+  int y = 0;
+  do {
+    // Load a pool of samples to select from using stepped index vectors.
+    const uint8x16_t src_vals = vld1q_u8(src_x);
+
+    // For each x, srcK contains src_x[k].
+    // Whereas taps come from different arrays, src pixels are drawn from the
+    // same contiguous line.
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
+
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
+
+    src_x += src_stride;
+    intermediate += kIntermediateStride;
+  } while (++y < intermediate_height);
+}
+
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
+inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
+
+  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width <= 4.
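+// With width <= 4 every output pixel is drawn from a single 16-byte load, so
+// no |grade_x| specialization is required.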
+inline void ConvolveKernelHorizontalSigned4Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+    const int step_x, const int intermediate_height, int16_t* intermediate) {
+  const int kernel_offset = 2;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const uint8x16_t filter_taps0 = GetSigned4TapFilter(0);
+  const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
+  const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
+  const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
+  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+                                            static_cast<uint16_t>(step_x));
+
+  const int p = subpixel_x;
+  const uint8_t* src_x =
+      &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+  // Only add steps to the 10-bit truncated p to avoid overflow.
+  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+  const uint8x8_t filter_index_offsets = vshrn_n_u16(
+      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
+  const uint8x8_t filter_indices =
+      vand_u8(filter_index_offsets, filter_index_mask);
+  // Note that filter_id depends on x.
+  // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices_base =
+      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+  const uint8x8_t src_indices[4] = {src_indices_base,
+                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+  int y = 0;
+  do {
+    // Load a pool of samples to select from using stepped indices.
+    const uint8x16_t src_vals = vld1q_u8(src_x);
+
+    // For each x, srcK contains src_x[k].
+    // Whereas taps come from different arrays, src pixels are drawn from the
+    // same contiguous line.
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
+
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
+    src_x += src_stride;
+    intermediate += kIntermediateStride;
+  } while (++y < intermediate_height);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetSigned6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    // Avoid overreading outside the reference boundaries. This means
+    // |trailing_width| can be up to 24.
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    uint8x8_t taps[6];
+    for (int i = 0; i < 6; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[6] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+  assert(tap_index < 4);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+      {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+  return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[4];
+  int16x8_t mixed_taps[2];
+  uint8x16_t positive_filter_taps[4];
+  for (int i = 0; i < 4; ++i) {
+    positive_filter_taps[i] = GetPositive6TapFilter(i);
+  }
+  int8x16_t mixed_filter_taps[2];
+  mixed_filter_taps[0] = GetMixed6TapFilter(0);
+  mixed_filter_taps[1] = GetMixed6TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 4; ++i) {
+      taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+    }
+    mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+    mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      int16x8_t sum_mixed = vmulq_s16(
+          mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+      sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+                            ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+      uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+      sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+      sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+      sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+      sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+      vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+  assert(tap_index < 8);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+          {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+          {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+          {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+          {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* intermediate) {
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[8];
+  uint8x16_t filter_taps[8];
+  for (int i = 0; i < 8; ++i) {
+    filter_taps[i] = GetSigned8TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[8];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 8; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 8; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[8] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+          vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+                              const int filter_index, const int step_y,
+                              const int height, void* dest,
+                              const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
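+  // Both pointers are set up front; only the one matching |is_compound| is
+  // stored through, though both advance by |dest_stride| per row.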
+  uint16_t* dest16_y = static_cast<uint16_t*>(dest);
+  uint8_t* dest_y = static_cast<uint8_t*>(dest);
+  int16x4_t s[num_taps + grade_y];
+
+  int p = subpixel_y & 1023;
+  int prev_p = p;
+  int y = 0;
+  do {  // y < height
+    for (int i = 0; i < num_taps; ++i) {
+      s[i] = vld1_s16(src_y + i * src_stride);
+    }
+    int filter_id = (p >> 6) & kSubPixelMask;
+    int16x8_t filter =
+        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    const int p_diff =
+        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+    prev_p = p;
+    // Here we load extra source in case it is needed. If |p_diff| == 0, these
+    // values will be unused, but it's faster to load than to branch.
+    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+    if (grade_y > 1) {
+      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+    }
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+
+    filter_id = (p >> 6) & kSubPixelMask;
+    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    prev_p = p;
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+
+    y += 2;
+  } while (y < height);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+                                  const int subpixel_y, const int filter_index,
+                                  const int step_y, const int height,
+                                  void* dest, const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  // A possible improvement is to use arithmetic to decide how many times to
+  // apply filters to the same source before checking whether to load new srcs.
+  // However, this will only improve performance with very small step sizes.
+  int16x8_t s[num_taps + grade_y];
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  uint16_t* dest16_y;
+  uint8_t* dest_y;
+
+  int x = 0;
+  do {  // x < width
+    const int16_t* src_x = src + x;
+    const int16_t* src_y = src_x;
+    dest16_y = static_cast<uint16_t*>(dest) + x;
+    dest_y = static_cast<uint8_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int prev_p = p;
+    int y = 0;
+    do {  // y < height
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = vld1q_s16(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      int16x8_t filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      const int p_diff =
+          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+      // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+      // needed. Otherwise, we only need to load one vector because |p_diff|
+      // can't exceed 1.
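+      // e.g. with |step_y| == 1536 (1.5 source rows per output row in 1/1024
+      // units) |p_diff| alternates between 1 and 2, which is why the callers
+      // select |grade_y| == 2 whenever |step_y| > 1024.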
+      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+      if (grade_y > 1) {
+        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+      }
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+
+      filter_id = (p >> 6) & kSubPixelMask;
+      filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
+      prev_p = p;
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+
+      y += 2;
+    } while (y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index, const int subpixel_x,
+                          const int subpixel_y, const int step_x,
+                          const int step_y, const int width, const int height,
+                          void* prediction, const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte load. Final index is src_x + |num_taps| - 1 < 16.
+  // step_x*7 is the final base subpel index for the shuffle mask for filter
+  // inputs in each iteration on large blocks. When step_x is large, we need a
+  // larger structure and use a larger table lookup in order to gather all
+  // filter inputs.
+  // |num_taps| - 1 is the shuffle index of the final filter input.
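+  // e.g. for an 8 tap filter, |kernel_start_ceiling| is 8 and the threshold
+  // below becomes (8 << 10) / 7 = 1170, so any larger |step_x| takes the
+  // grade_x == 2 paths.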
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index); + const int kernel_start_ceiling = 16 - num_horiz_taps; + // This truncated quotient |grade_x_threshold| selects |step_x| such that: + // (step_x * 7) >> kScaleSubPixelBits < single load limit + const int grade_x_threshold = + (kernel_start_ceiling << kScaleSubPixelBits) / 7; + switch (filter_index) { + case 0: + if (step_x > grade_x_threshold) { + ConvolveKernelHorizontalSigned6Tap<2>( + src, src_stride, width, subpixel_x, step_x, intermediate_height, + intermediate); + } else { + ConvolveKernelHorizontalSigned6Tap<1>( + src, src_stride, width, subpixel_x, step_x, intermediate_height, + intermediate); + } + break; + case 1: + if (step_x > grade_x_threshold) { + ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + + } else { + ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 2: + if (step_x > grade_x_threshold) { + ConvolveKernelHorizontalSigned8Tap<2>( + src, src_stride, width, subpixel_x, step_x, intermediate_height, + intermediate); + } else { + ConvolveKernelHorizontalSigned8Tap<1>( + src, src_stride, width, subpixel_x, step_x, intermediate_height, + intermediate); + } + break; + case 3: + if (step_x > grade_x_threshold) { + ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } else { + ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x, + step_x, intermediate_height, + intermediate); + } + break; + case 4: + assert(width <= 4); + ConvolveKernelHorizontalSigned4Tap(src, src_stride, subpixel_x, step_x, + intermediate_height, intermediate); + break; + default: + assert(filter_index == 5); + ConvolveKernelHorizontalPositive4Tap(src, src_stride, subpixel_x, step_x, + intermediate_height, intermediate); + } + // Vertical filter. 
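+  // |step_y| <= 1024 advances the source window by at most one row per output
+  // row, so the grade_y == 1 variants suffice; larger steps use grade_y == 2.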
+ filter_index = GetFilterIndex(vertical_filter_index, height); + intermediate = intermediate_result; + + switch (filter_index) { + case 0: + case 1: + if (step_y <= 1024) { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<6, 1, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale4xH<6, 1, 4, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<6, 1, is_compound>( + intermediate, width, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } + } else { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<6, 2, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale4xH<6, 2, 4, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<6, 2, is_compound>( + intermediate, width, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } + } + break; + case 2: + if (step_y <= 1024) { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<8, 1, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale4xH<8, 1, 4, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<8, 1, is_compound>( + intermediate, width, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } + } else { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<8, 2, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale4xH<8, 2, 4, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<8, 2, is_compound>( + intermediate, width, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } + } + break; + case 3: + if (step_y <= 1024) { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<2, 1, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale4xH<2, 1, 4, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<2, 1, is_compound>( + intermediate, width, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } + } else { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<2, 2, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + ConvolveVerticalScale4xH<2, 2, 4, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else { + ConvolveVerticalScale<2, 2, is_compound>( + intermediate, width, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } + } + break; + case 4: + default: + assert(filter_index == 4 || filter_index == 5); + assert(height <= 4); + if (step_y <= 1024) { + if (!is_compound && width == 2) { + ConvolveVerticalScale4xH<4, 1, 2, is_compound>( + intermediate, subpixel_y, filter_index, step_y, height, + prediction, pred_stride); + } else if (width == 4) { + 
ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+            intermediate, subpixel_y, filter_index, step_y, height,
+            prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<4, 1, is_compound>(
+            intermediate, width, subpixel_y, filter_index, step_y, height,
+            prediction, pred_stride);
+      }
+    } else {
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+            intermediate, subpixel_y, filter_index, step_y, height,
+            prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+            intermediate, subpixel_y, filter_index, step_y, height,
+            prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<4, 2, is_compound>(
+            intermediate, width, subpixel_y, filter_index, step_y, height,
+            prediction, pred_stride);
+      }
+    }
+  }
+}
+
+void ConvolveHorizontal_NEON(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int horizontal_filter_index,
+                             const int /*vertical_filter_index*/,
+                             const int horizontal_filter_id,
+                             const int /*vertical_filter_id*/, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   horizontal_filter_id, filter_index);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+                    void* const dst, const ptrdiff_t dst_stride,
+                    const int width, const int height,
+                    const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    uint8x8_t srcs[8];
+    srcs[0] = vld1_u8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vld1_u8(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1_u8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vld1_u8(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1_u8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vld1_u8(src_x);
+          src_x += src_stride;
+          srcs[6] = vld1_u8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
+    do {
+      srcs[next_row] = vld1_u8(src_x);
+      src_x += src_stride;
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+        vst1q_u16(dst16 + x + y * dst_stride, results);
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+        vst1_u8(dst8 + x + y * dst_stride, results);
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load4<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load4<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+    int y = 0;
+    do {
+      srcs[2] = Load4<1>(src, srcs[2]);
+      src += src_stride;
+      srcs[4] = Load4<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    srcs[6] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+    int y = 0;
+    do {
+      srcs[4] = Load4<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[6] = Load4<0>(src, srcs[6]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+    srcs[4] = Load4<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[6] = Load4(src);
+    src += src_stride;
+    srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+    int y = 0;
+    do {
+      srcs[6] = Load4<1>(src, srcs[6]);
+      src += src_stride;
+      srcs[8] = Load4<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y += 2;
+    } while (y < height);
+  }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+      // This uses srcs[0]..srcs[1].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+      // This uses srcs[0]..srcs[3].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+      // This uses srcs[0]..srcs[5].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+      // This uses srcs[0]..srcs[7].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  }
+}
+
+// This function is a simplified version of Convolve2D_C.
+// It is called in single prediction mode, where only vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+void ConvolveVertical_NEON(const void* const reference,
+                           const ptrdiff_t reference_stride,
+                           const int /*horizontal_filter_index*/,
+                           const int vertical_filter_index,
+                           const int /*horizontal_filter_id*/,
+                           const int vertical_filter_id, const int width,
+                           const int height, void* prediction,
+                           const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+                                    (vertical_filter_id == 15))) {  // 5 tap.
+ if (width == 2) { + FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height, + taps + 1); + } else if (width == 4) { + FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height, + taps + 1); + } else { + FilterVertical<1>(src, src_stride, dest, dest_stride, width, height, + taps + 1); + } + } else if ((filter_index == 1) & + ((vertical_filter_id == 7) | (vertical_filter_id == 8) | + (vertical_filter_id == 9))) { // 6 tap with weird negative taps. + if (width == 2) { + FilterVertical2xH<1, + /*negative_outside_taps=*/true>( + src, src_stride, dest, dest_stride, height, taps + 1); + } else if (width == 4) { + FilterVertical4xH<1, /*is_compound=*/false, + /*negative_outside_taps=*/true>( + src, src_stride, dest, dest_stride, height, taps + 1); + } else { + FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>( + src, src_stride, dest, dest_stride, width, height, taps + 1); + } + } else if (filter_index == 2) { // 8 tap. + if (width == 2) { + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + if (width == 2) { + FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, + taps + 3); + } else if (width == 4) { + FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, + taps + 3); + } else { + FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + taps + 3); + } + } else if (filter_index == 4) { // 4 tap. + // Outside taps are negative. + if (width == 2) { + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, + taps + 2); + } else if (width == 4) { + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, + taps + 2); + } else { + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, + taps + 2); + } + } else { + // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed + // below map to 4 tap filters. + assert(filter_index == 5 || + (filter_index == 1 && + (vertical_filter_id == 2 || vertical_filter_id == 3 || + vertical_filter_id == 4 || vertical_filter_id == 5 || + vertical_filter_id == 6 || vertical_filter_id == 10 || + vertical_filter_id == 11 || vertical_filter_id == 12 || + vertical_filter_id == 13 || vertical_filter_id == 14))); + // According to GetNumTapsInFilter() this has 6 taps but here we are + // treating it as though it has 4. 
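+    // The outer taps are zero for these |vertical_filter_id|s, so advancing
+    // |src| by one row recenters the window on the four nonzero taps
+    // (taps + 2).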
+    if (filter_index == 1) src += src_stride;
+    if (width == 2) {
+      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundCopy_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const ptrdiff_t src_stride = reference_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  constexpr int final_shift =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+
+  if (width >= 16) {
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        const uint8x16_t v_src = vld1q_u8(&src[x]);
+        const uint16x8_t v_dest_lo =
+            vshll_n_u8(vget_low_u8(v_src), final_shift);
+        const uint16x8_t v_dest_hi =
+            vshll_n_u8(vget_high_u8(v_src), final_shift);
+        vst1q_u16(&dest[x], v_dest_lo);
+        x += 8;
+        vst1q_u16(&dest[x], v_dest_hi);
+        x += 8;
+      } while (x < width);
+      src += src_stride;
+      dest += width;
+    } while (++y < height);
+  } else if (width == 8) {
+    int y = 0;
+    do {
+      const uint8x8_t v_src = vld1_u8(&src[0]);
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      src += src_stride;
+      dest += width;
+    } while (++y < height);
+  } else {  /* width == 4 */
+    uint8x8_t v_src = vdup_n_u8(0);
+
+    int y = 0;
+    do {
+      v_src = Load4<0>(&src[0], v_src);
+      src += src_stride;
+      v_src = Load4<1>(&src[0], v_src);
+      src += src_stride;
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      dest += 4 << 1;
+      y += 2;
+    } while (y < height);
+  }
+}
+
+void ConvolveCompoundVertical_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int vertical_filter_index,
+    const int /*horizontal_filter_id*/, const int vertical_filter_id,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
+  }
+
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+                                    (vertical_filter_id == 15))) {  // 5 tap.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+              (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true,
+                        /*negative_outside_taps=*/true>(src, src_stride, dest,
+                                                        4, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, width, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 4) {
+      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 3);
+    } else {
+      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  } else {
+    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
+    // to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 4) {
+      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
+  }
+}
+
+void ConvolveCompoundHorizontal_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int horizontal_filter_index, const int /*vertical_filter_index*/,
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_NEON(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int horizontal_filter_index,
+                             const int vertical_filter_index,
+                             const int horizontal_filter_id,
+                             const int vertical_filter_id, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t /*pred_stride*/) {
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in 16 bits.
+  uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  const ptrdiff_t dest_stride = width;
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
+
+  if (vertical_taps == 8) {
+    if (width == 4) {
+      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    if (width == 4) {
+      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    if (width == 4) {
+      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    if (width == 4) {
+      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  }
+}
+
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+  const uint8x16_t left = vld1q_u8(src);
+  const uint8x16_t right = vld1q_u8(src + 1);
+  vst1q_u8(dst, vrhaddq_u8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+                                     const ptrdiff_t src_stride,
+                                     const int height, uint8_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+  int y = 0;
+  do {
+    HalfAddHorizontal(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+                                  pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 8) {
+    int y = 0;
+    do {
+      const uint8x8_t left = vld1_u8(src);
+      const uint8x8_t right = vld1_u8(src + 1);
+      vst1_u8(dest, vrhadd_u8(left, right));
+
+      src += reference_stride;
+      dest += pred_stride;
+    } while (++y < height);
+  } else if (width == 4) {
+    uint8x8_t left = vdup_n_u8(0);
+    uint8x8_t right = vdup_n_u8(0);
+    int y = 0;
+    do {
+      left = Load4<0>(src, left);
+      right = Load4<0>(src + 1, right);
+      src += reference_stride;
+      left = Load4<1>(src, left);
+      right = Load4<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint8x8_t result = vrhadd_u8(left, right);
+
+      StoreLo4(dest, result);
+      dest += pred_stride;
+      StoreHi4(dest, result);
+      dest += pred_stride;
+      y += 2;
+    } while (y < height);
+  } else {
+    assert(width == 2);
+    uint8x8_t left = vdup_n_u8(0);
+    uint8x8_t right = vdup_n_u8(0);
+    int y = 0;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<1>(src, left);
+      right = Load2<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint8x8_t result = vrhadd_u8(left, right);
+
+      Store2<0>(dest, result);
+      dest += pred_stride;
+      Store2<1>(dest, result);
+      dest += pred_stride;
+      y += 2;
+    } while (y < height);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+                                   const ptrdiff_t src_stride, const int height,
+                                   uint8_t* dst, const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+  uint8x16_t row[8], below[8];
+
+  row[0] = vld1q_u8(src);
+  if (width >= 32) {
+    src += 16;
+    row[1] = vld1q_u8(src);
+    if (width >= 64) {
+      src += 16;
+      row[2] = vld1q_u8(src);
+      src += 16;
+      row[3] = vld1q_u8(src);
+      if (width == 128) {
+        src += 16;
+        row[4] = vld1q_u8(src);
+        src += 16;
+        row[5] = vld1q_u8(src);
+        src += 16;
+        row[6] = vld1q_u8(src);
+        src += 16;
+        row[7] = vld1q_u8(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = 0;
+  do {
+    below[0] = vld1q_u8(src);
+    if (width >= 32) {
+      src += 16;
+      below[1] = vld1q_u8(src);
+      if (width >= 64) {
+        src += 16;
+        below[2] = vld1q_u8(src);
+        src += 16;
+        below[3] = vld1q_u8(src);
+        if (width == 128) {
+          src += 16;
+          below[4] = vld1q_u8(src);
+          src += 16;
+          below[5] = vld1q_u8(src);
+          src += 16;
+          below[6] = vld1q_u8(src);
+          src += 16;
+          below[7] = vld1q_u8(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 32) {
+      dst += 16;
+      vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 64) {
+        dst += 16;
+        vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+        row[2] = below[2];
+        dst += 16;
+        vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+        row[3] = below[3];
+        if (width >= 128) {
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+          row[4] = below[4];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+          row[5] = below[5];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+          row[6] = below[6];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+                                pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 8) {
+    uint8x8_t row, below;
+    row = vld1_u8(src);
+    src += reference_stride;
+
+    int y = 0;
+    do {
+      below = vld1_u8(src);
+      src += reference_stride;
+
+      vst1_u8(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (++y < height);
+  } else if (width == 4) {
+    uint8x8_t row = Load4(src);
+    uint8x8_t below = vdup_n_u8(0);
+    src += reference_stride;
+
+    int y = 0;
+    do {
+      below = Load4<0>(src, below);
+      src += reference_stride;
+
+      StoreLo4(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (++y < height);
+  } else {
+    assert(width == 2);
+    uint8x8_t row = Load2(src);
+    uint8x8_t below = vdup_n_u8(0);
+    src += reference_stride;
+
+    int y = 0;
+    do {
+      below = Load2<0>(src, below);
+      src += reference_stride;
+
+      Store2<0>(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (++y < height);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+                             const int height, uint8_t* dst,
+                             const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  uint16x8_t row[16];
+  row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+  if (width >= 16) {
+    src += 8;
+    row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+    if (width >= 32) {
+      src += 8;
+      row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      src += 8;
+      row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      if (width >= 64) {
+        src += 8;
+        row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        if (width == 128) {
+          src += 8;
+          row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        }
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = 0;
+  do {
+    const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+    vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+    row[0] = below_0;
+    if (width >= 16) {
+      src += 8;
+      dst += 8;
+
+      const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+      row[1] = below_1;
+      if (width >= 32) {
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+        row[2] = below_2;
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+        row[3] = below_3;
+        if (width >= 64) {
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+          row[4] = below_4;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+          row[5] = below_5;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+          row[6] = below_6;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+          row[7] = below_7;
+          if (width == 128) {
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+            row[8] = below_8;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+            row[9] = below_9;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_10 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_11 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_12 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_13 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_14 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_15 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (++y < height);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: allow
vertical access to height + 1. Because this function is only + // for u/v plane of intra block copy, such access is guaranteed to be within + // the prediction block. + + if (width == 128) { + IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride); + } else if (width == 64) { + IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride); + } else if (width == 32) { + IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride); + } else if (width == 16) { + IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride); + } else if (width == 8) { + IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride); + } else if (width == 4) { + uint8x8_t left = Load4(src); + uint8x8_t right = Load4(src + 1); + src += reference_stride; + + uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); + + int y = 0; + do { + left = Load4<0>(src, left); + right = Load4<0>(src + 1, right); + src += reference_stride; + left = Load4<1>(src, left); + right = Load4<1>(src + 1, right); + src += reference_stride; + + const uint16x8_t below = vaddl_u8(left, right); + + const uint8x8_t result = vrshrn_n_u16( + vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2); + StoreLo4(dest, result); + dest += pred_stride; + StoreHi4(dest, result); + dest += pred_stride; + + row = vget_high_u16(below); + y += 2; + } while (y < height); + } else { + uint8x8_t left = Load2(src); + uint8x8_t right = Load2(src + 1); + src += reference_stride; + + uint16x4_t row = vget_low_u16(vaddl_u8(left, right)); + + int y = 0; + do { + left = Load2<0>(src, left); + right = Load2<0>(src + 1, right); + src += reference_stride; + left = Load2<2>(src, left); + right = Load2<2>(src + 1, right); + src += reference_stride; + + const uint16x8_t below = vaddl_u8(left, right); + + const uint8x8_t result = vrshrn_n_u16( + vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2); + Store2<0>(dest, result); + dest += pred_stride; + Store2<2>(dest, result); + dest += pred_stride; + + row = vget_high_u16(below); + y += 2; + } while (y < height); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON; + dsp->convolve[0][0][1][0] = ConvolveVertical_NEON; + dsp->convolve[0][0][1][1] = Convolve2D_NEON; + + dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_NEON; + dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_NEON; + dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON; + dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON; + + dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON; + dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON; + dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON; + + dsp->convolve_scale[0] = ConvolveScale2D_NEON; + dsp->convolve_scale[1] = ConvolveScale2D_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void ConvolveInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void ConvolveInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/convolve_neon.h b/src/dsp/arm/convolve_neon.h new file mode 100644 index 0000000..948ef4d --- /dev/null +++ b/src/dsp/arm/convolve_neon.h @@ -0,0 +1,50 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve. This function is not thread-safe.
+void ConvolveInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
new file mode 100644
index 0000000..04952ab
--- /dev/null
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -0,0 +1,203 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
+inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+                                         const int16x8_t pred1,
+                                         const int16x4_t weights[2]) {
+  // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
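+  // The quantized distance weights come in pairs that sum to 16, so each
+  // 32-bit product below stays well in range; vqrshrn_n_s32 then folds the
+  // weight normalization (>> 4) into the inter-post rounding shift.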
+  const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
+  const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
+  const int32x4_t blended_lo =
+      vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
+  const int32x4_t blended_hi =
+      vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
+
+  return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
+                      vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
+}
+
+template <int width, int height>
+inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
+                                            const int16_t* prediction_1,
+                                            const int16x4_t weights[2],
+                                            void* const dest,
+                                            const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  constexpr int step = 16 / width;
+
+  for (int y = 0; y < height; y += step) {
+    const int16x8_t src_00 = vld1q_s16(prediction_0);
+    const int16x8_t src_10 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
+    const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+
+    const int16x8_t src_01 = vld1q_s16(prediction_0);
+    const int16x8_t src_11 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
+    const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+
+    const uint8x8_t result0 = vqmovun_s16(res0);
+    const uint8x8_t result1 = vqmovun_s16(res1);
+    if (width == 4) {
+      StoreLo4(dst, result0);
+      dst += dest_stride;
+      StoreHi4(dst, result0);
+      dst += dest_stride;
+      StoreLo4(dst, result1);
+      dst += dest_stride;
+      StoreHi4(dst, result1);
+      dst += dest_stride;
+    } else {
+      assert(width == 8);
+      vst1_u8(dst, result0);
+      dst += dest_stride;
+      vst1_u8(dst, result1);
+      dst += dest_stride;
+    }
+  }
+}
+
+inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
+                                            const int16_t* prediction_1,
+                                            const int16x4_t weights[2],
+                                            const int width, const int height,
+                                            void* const dest,
+                                            const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+      const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
+      const int16x8_t res_lo =
+          ComputeWeightedAverage8(src0_lo, src1_lo, weights);
+
+      const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+      const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
+      const int16x8_t res_hi =
+          ComputeWeightedAverage8(src0_hi, src1_hi, weights);
+
+      const uint8x16_t result =
+          vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi));
+      vst1q_u8(dst + x, result);
+      x += 16;
+    } while (x < width);
+    dst += dest_stride;
+    prediction_0 += width;
+    prediction_1 += width;
+  } while (--y != 0);
+}
+
+inline void DistanceWeightedBlend_NEON(const void* prediction_0,
+                                       const void* prediction_1,
+                                       const uint8_t weight_0,
+                                       const uint8_t weight_1, const int width,
+                                       const int height, void* const dest,
+                                       const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
+  // TODO(johannkoenig): Investigate the branching. May be fine to call with a
+  // variable height.
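+  // The small-block helper is templated on width and height so that |step|
+  // and the loop trip count are compile-time constants.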
+ if (width == 4) { + if (height == 4) { + DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest, + dest_stride); + } else if (height == 8) { + DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest, + dest_stride); + } else { + assert(height == 16); + DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest, + dest_stride); + } + return; + } + + if (width == 8) { + switch (height) { + case 4: + DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest, + dest_stride); + return; + case 8: + DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest, + dest_stride); + return; + case 16: + DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest, + dest_stride); + return; + default: + assert(height == 32); + DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest, + dest_stride); + + return; + } + } + + DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest, + dest_stride); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->distance_weighted_blend = DistanceWeightedBlend_NEON; +} + +} // namespace + +void DistanceWeightedBlendInit_NEON() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void DistanceWeightedBlendInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/distance_weighted_blend_neon.h b/src/dsp/arm/distance_weighted_blend_neon.h new file mode 100644 index 0000000..4d8824c --- /dev/null +++ b/src/dsp/arm/distance_weighted_blend_neon.h @@ -0,0 +1,39 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::distance_weighted_blend. This function is not thread-safe. +void DistanceWeightedBlendInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +// If NEON is enabled signal the NEON implementation should be used instead of +// normal C. +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_ diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc new file mode 100644 index 0000000..2612466 --- /dev/null +++ b/src/dsp/arm/film_grain_neon.cc @@ -0,0 +1,1188 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/arm/film_grain_neon.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible sizes in order to simplify
+// loading and storing to and from intermediate value types from within a
+// template function.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+  return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+  return ZeroExtend(vld1_u8(src));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+  vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+  return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+  vst1q_u16(dest, data);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| represents one destination value's running
+// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+                                           const int16x8_t grain_hi,
+                                           int16_t coeff, int32x4x2_t sum) {
+  const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+  return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
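+// The vector accumulators can therefore only gather the contributions of the
+// rows above; the left-neighbor terms from the current row are folded in lane
+// by lane, in scalar code, as each output sample becomes available.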
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
+                                     const int8_t* coeffs, int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
+                                     const int8_t* coeffs, int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
+                                           int8_t* v_grain_cursor,
+                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
+                                           const int8_t* coeffs_u,
+                                           const int8_t* coeffs_v, int pos,
+                                           int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
+                                           int16_t* v_grain_cursor,
+                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
+                                           const int8_t* coeffs_u,
+                                           const int8_t* coeffs_v, int pos,
+                                           int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+  v->val[0] = vdupq_n_s32(0);
+  v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    const int8x16_t src0 = vld1q_s8(luma);
+    const int8x16_t src1 = vld1q_s8(luma + stride);
+    const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+                                        vpaddl_s8(vget_high_s8(src0)));
+    const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+                                        vpaddl_s8(vget_high_s8(src1)));
+    return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int8x16_t src = vld1q_s8(luma);
+    return vrshrq_n_s16(
+        vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+        1);
+  }
+  return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint8x16_t src = vld1q_u8(luma);
+    return vrshrq_n_u16(vpaddlq_u8(src), 1);
+  }
+  return vmovl_u8(vld1_u8(luma));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
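+// For 4:2:0 a 2x2 block of luma is averaged per chroma sample (pairwise adds
+// across x, then across the two rows, rounded with a shift by 2); for 4:2:2
+// only horizontal pairs are averaged.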
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    int16x8_t src0_lo = vld1q_s16(luma);
+    int16x8_t src0_hi = vld1q_s16(luma + 8);
+    const int16x8_t src1_lo = vld1q_s16(luma + stride);
+    const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+    const int16x8_t src0 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+                     vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+    const int16x8_t src1 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+                     vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+    return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int16x8_t src_lo = vld1q_s16(luma);
+    const int16x8_t src_hi = vld1q_s16(luma + 8);
+    const int16x8_t ret =
+        vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+                     vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+    return vrshrq_n_s16(ret, 1);
+  }
+  return vld1q_s16(luma);
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
+                                 int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint16x8x2_t src = vld2q_u16(luma);
+    return vrhaddq_u16(src.val[0], src.val[1]);
+  }
+  return vld1q_u16(luma);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+          bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params,
+                                                  const void* luma_grain_buffer,
+                                                  int subsampling_x,
+                                                  int subsampling_y,
+                                                  void* u_grain_buffer,
+                                                  void* v_grain_buffer) {
+  static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
+  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+  const int auto_regression_shift = params.auto_regression_shift;
+  const int chroma_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int chroma_height =
+      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+  // When |chroma_width| == 44, we write 8 at a time from x in [3, 34],
+  // leaving [35, 40] to write at the end.
+  const int chroma_width_remainder =
+      (chroma_width - 2 * kAutoRegressionBorder) & 7;
+
+  int y = kAutoRegressionBorder;
+  luma_grain += kLumaWidth * y;
+  u_grain += chroma_width * y;
+  v_grain += chroma_width * y;
+  do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, 4 values remain to write. They are given a special
+    // reduced iteration at the end.
+    int x = kAutoRegressionBorder;
+    int luma_x = kAutoRegressionBorder;
+    do {
+      int pos = 0;
+      int32x4x2_t sum_u;
+      int32x4x2_t sum_v;
+      SetZero(&sum_u);
+      SetZero(&sum_v);
+
+      if (auto_regression_coeff_lag > 0) {
+        for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+             ++delta_row) {
+          // These loads may overflow to the next row, but they are never called
+          // on the final row of a grain block. Therefore, they will never
+          // exceed the block boundaries.
+          // Note: this could be slightly optimized to a single load in 8bpp,
+          // but requires making a special first iteration and accumulate
+          // function that takes an int8x16_t.
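+          // 16 grain samples are loaded starting |auto_regression_coeff_lag|
+          // columns to the left of x so that AccumulateWeightedGrain can
+          // slide its window across every tap position with vextq_s16 instead
+          // of reloading.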
+          const int16x8_t u_grain_lo =
+              GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag);
+          const int16x8_t u_grain_hi =
+              GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag + 8);
+          const int16x8_t v_grain_lo =
+              GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag);
+          const int16x8_t v_grain_hi =
+              GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                               auto_regression_coeff_lag + 8);
+#define ACCUMULATE_WEIGHTED_GRAIN(offset)                                  \
+  sum_u = AccumulateWeightedGrain<offset>(                                 \
+      u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \
+  sum_v = AccumulateWeightedGrain<offset>(                                 \
+      v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v)
+
+          ACCUMULATE_WEIGHTED_GRAIN(0);
+          ACCUMULATE_WEIGHTED_GRAIN(1);
+          ACCUMULATE_WEIGHTED_GRAIN(2);
+          // The horizontal |auto_regression_coeff_lag| loop is replaced with
+          // if-statements to give vextq_s16 an immediate param.
+          if (auto_regression_coeff_lag > 1) {
+            ACCUMULATE_WEIGHTED_GRAIN(3);
+            ACCUMULATE_WEIGHTED_GRAIN(4);
+          }
+          if (auto_regression_coeff_lag > 2) {
+            assert(auto_regression_coeff_lag == 3);
+            ACCUMULATE_WEIGHTED_GRAIN(5);
+            ACCUMULATE_WEIGHTED_GRAIN(6);
+          }
+        }
+      }
+
+      if (use_luma) {
+        const int16x8_t luma = GetSubsampledLuma(
+            luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+        // Luma samples get the final coefficient in the formula, but are best
+        // computed all at once before the final row.
+        const int coeff_u =
+            params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+        const int coeff_v =
+            params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+        sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+        sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+        sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+        sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+      }
+      // At this point in the filter, the source addresses and destination
+      // addresses overlap. Because this is an auto-regressive filter, the
+      // higher lanes cannot be computed without the results of the lower lanes.
+      // Each call to WriteFinalAutoRegression incorporates preceding values
+      // on the final row, and writes a single sample. This allows the next
+      // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane)                                    \
+  WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>(  \
+      u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \
+      params.auto_regression_coeff_v, pos, auto_regression_shift)
+
+      WRITE_AUTO_REGRESSION_RESULT(0);
+      WRITE_AUTO_REGRESSION_RESULT(1);
+      WRITE_AUTO_REGRESSION_RESULT(2);
+      WRITE_AUTO_REGRESSION_RESULT(3);
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+      WRITE_AUTO_REGRESSION_RESULT(6);
+      WRITE_AUTO_REGRESSION_RESULT(7);
+
+      x += 8;
+      luma_x += 8 << subsampling_x;
+    } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder);
+
+    // This is the "final iteration" of the above loop over width. We fill in
+    // the remainder of the width, which is less than 8.
+    int pos = 0;
+    int32x4x2_t sum_u;
+    int32x4x2_t sum_v;
+    SetZero(&sum_u);
+    SetZero(&sum_v);
+
+    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+         ++delta_row) {
+      // These loads may overflow to the next row, but they are never called on
+      // the final row of a grain block. Therefore, they will never exceed the
+      // block boundaries.
+      const int16x8_t u_grain_lo = GetSignedSource8(
+          u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+      const int16x8_t u_grain_hi =
+          GetSignedSource8(u_grain + x + delta_row * chroma_width -
+                           auto_regression_coeff_lag + 8);
+      const int16x8_t v_grain_lo = GetSignedSource8(
+          v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag);
+      const int16x8_t v_grain_hi =
+          GetSignedSource8(v_grain + x + delta_row * chroma_width -
+                           auto_regression_coeff_lag + 8);
+
+      ACCUMULATE_WEIGHTED_GRAIN(0);
+      ACCUMULATE_WEIGHTED_GRAIN(1);
+      ACCUMULATE_WEIGHTED_GRAIN(2);
+      // The horizontal |auto_regression_coeff_lag| loop is replaced with
+      // if-statements to give vextq_s16 an immediate param.
+      if (auto_regression_coeff_lag > 1) {
+        ACCUMULATE_WEIGHTED_GRAIN(3);
+        ACCUMULATE_WEIGHTED_GRAIN(4);
+      }
+      if (auto_regression_coeff_lag > 2) {
+        assert(auto_regression_coeff_lag == 3);
+        ACCUMULATE_WEIGHTED_GRAIN(5);
+        ACCUMULATE_WEIGHTED_GRAIN(6);
+      }
+    }
+
+    if (use_luma) {
+      const int16x8_t luma = GetSubsampledLuma(
+          luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth);
+
+      // Luma samples get the final coefficient in the formula, but are best
+      // computed all at once before the final row.
+      const int coeff_u =
+          params.auto_regression_coeff_u[pos + auto_regression_coeff_lag];
+      const int coeff_v =
+          params.auto_regression_coeff_v[pos + auto_regression_coeff_lag];
+
+      sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u);
+      sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u);
+      sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v);
+      sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v);
+    }
+
+    WRITE_AUTO_REGRESSION_RESULT(0);
+    WRITE_AUTO_REGRESSION_RESULT(1);
+    WRITE_AUTO_REGRESSION_RESULT(2);
+    WRITE_AUTO_REGRESSION_RESULT(3);
+    if (chroma_width_remainder == 6) {
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+    }
+
+    luma_grain += kLumaWidth << subsampling_y;
+    u_grain += chroma_width;
+    v_grain += chroma_width;
+  } while (++y < chroma_height);
+#undef ACCUMULATE_WEIGHTED_GRAIN
+#undef WRITE_AUTO_REGRESSION_RESULT
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag>
+void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
+                                               void* luma_grain_buffer) {
+  static_assert(auto_regression_coeff_lag > 0, "");
+  const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y;
+  const uint8_t auto_regression_shift = params.auto_regression_shift;
+
+  int y = kAutoRegressionBorder;
+  auto* luma_grain =
+      static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y;
+  do {
+    // Each row is computed 8 values at a time in the following loop. At the
+    // end of the loop, 4 values remain to write. They are given a special
+    // reduced iteration at the end.
+    int x = kAutoRegressionBorder;
+    do {
+      int pos = 0;
+      int32x4x2_t sum;
+      SetZero(&sum);
+      for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+           ++delta_row) {
+        // These loads may overflow to the next row, but they are never called
+        // on the final row of a grain block. Therefore, they will never exceed
+        // the block boundaries.
+        const int16x8_t src_grain_lo =
+            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                             auto_regression_coeff_lag);
+        const int16x8_t src_grain_hi =
+            GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                             auto_regression_coeff_lag + 8);
+
+        // A pictorial representation of the auto-regressive filter for
+        // various values of params.auto_regression_coeff_lag. The letter 'O'
+        // represents the current sample. (The filter always operates on the
+        // current sample with filter coefficient 1.) The letters 'X'
+        // represent the neighboring samples that the filter operates on, below
+        // their corresponding "offset" number.
+        //
+        // params.auto_regression_coeff_lag == 3:
+        //   0 1 2 3 4 5 6
+        //   X X X X X X X
+        //   X X X X X X X
+        //   X X X X X X X
+        //   X X X O
+        // params.auto_regression_coeff_lag == 2:
+        //     0 1 2 3 4
+        //     X X X X X
+        //     X X X X X
+        //     X X O
+        // params.auto_regression_coeff_lag == 1:
+        //       0 1 2
+        //       X X X
+        //       X O
+        // params.auto_regression_coeff_lag == 0:
+        //         O
+        // The function relies on the caller to skip the call in the 0 lag
+        // case.
+
+#define ACCUMULATE_WEIGHTED_GRAIN(offset)                           \
+  sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \
+                                        auto_regression_coeff_y[pos++], sum)
+        ACCUMULATE_WEIGHTED_GRAIN(0);
+        ACCUMULATE_WEIGHTED_GRAIN(1);
+        ACCUMULATE_WEIGHTED_GRAIN(2);
+        // The horizontal |auto_regression_coeff_lag| loop is replaced with
+        // if-statements to give vextq_s16 an immediate param.
+        if (auto_regression_coeff_lag > 1) {
+          ACCUMULATE_WEIGHTED_GRAIN(3);
+          ACCUMULATE_WEIGHTED_GRAIN(4);
+        }
+        if (auto_regression_coeff_lag > 2) {
+          assert(auto_regression_coeff_lag == 3);
+          ACCUMULATE_WEIGHTED_GRAIN(5);
+          ACCUMULATE_WEIGHTED_GRAIN(6);
+        }
+      }
+      // At this point in the filter, the source addresses and destination
+      // addresses overlap. Because this is an auto-regressive filter, the
+      // higher lanes cannot be computed without the results of the lower lanes.
+      // Each call to WriteFinalAutoRegression incorporates preceding values
+      // on the final row, and writes a single sample. This allows the next
+      // pixel's value to be computed in the next call.
+#define WRITE_AUTO_REGRESSION_RESULT(lane)                             \
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \
+      luma_grain + x, sum, auto_regression_coeff_y, pos,               \
+      auto_regression_shift)
+
+      WRITE_AUTO_REGRESSION_RESULT(0);
+      WRITE_AUTO_REGRESSION_RESULT(1);
+      WRITE_AUTO_REGRESSION_RESULT(2);
+      WRITE_AUTO_REGRESSION_RESULT(3);
+      WRITE_AUTO_REGRESSION_RESULT(4);
+      WRITE_AUTO_REGRESSION_RESULT(5);
+      WRITE_AUTO_REGRESSION_RESULT(6);
+      WRITE_AUTO_REGRESSION_RESULT(7);
+      x += 8;
+      // Leave the final four pixels for the special iteration below.
+    } while (x < kLumaWidth - kAutoRegressionBorder - 4);
+
+    // Final 4 pixels in the row.
+    int pos = 0;
+    int32x4x2_t sum;
+    SetZero(&sum);
+    for (int delta_row = -auto_regression_coeff_lag; delta_row < 0;
+         ++delta_row) {
+      const int16x8_t src_grain_lo = GetSignedSource8(
+          luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag);
+      const int16x8_t src_grain_hi =
+          GetSignedSource8(luma_grain + x + delta_row * kLumaWidth -
+                           auto_regression_coeff_lag + 8);
+
+      ACCUMULATE_WEIGHTED_GRAIN(0);
+      ACCUMULATE_WEIGHTED_GRAIN(1);
+      ACCUMULATE_WEIGHTED_GRAIN(2);
+      // The horizontal |auto_regression_coeff_lag| loop is replaced with
+      // if-statements to give vextq_s16 an immediate param.
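+      // (vext encodes the lane offset as an immediate in the instruction, so
+      // it cannot take a runtime variable.)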
+ if (auto_regression_coeff_lag > 1) { + ACCUMULATE_WEIGHTED_GRAIN(3); + ACCUMULATE_WEIGHTED_GRAIN(4); + } + if (auto_regression_coeff_lag > 2) { + assert(auto_regression_coeff_lag == 3); + ACCUMULATE_WEIGHTED_GRAIN(5); + ACCUMULATE_WEIGHTED_GRAIN(6); + } + } + // delta_row == 0 + WRITE_AUTO_REGRESSION_RESULT(0); + WRITE_AUTO_REGRESSION_RESULT(1); + WRITE_AUTO_REGRESSION_RESULT(2); + WRITE_AUTO_REGRESSION_RESULT(3); + luma_grain += kLumaWidth; + } while (++y < kLumaHeight); + +#undef WRITE_AUTO_REGRESSION_RESULT +#undef ACCUMULATE_WEIGHTED_GRAIN +} + +void InitializeScalingLookupTable_NEON( + int num_points, const uint8_t point_value[], const uint8_t point_scaling[], + uint8_t scaling_lut[kScalingLookupTableSize]) { + if (num_points == 0) { + memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize); + return; + } + static_assert(sizeof(scaling_lut[0]) == 1, ""); + memset(scaling_lut, point_scaling[0], point_value[0]); + const uint32x4_t steps = vmovl_u16(vcreate_u16(0x0003000200010000)); + const uint32x4_t offset = vdupq_n_u32(32768); + for (int i = 0; i < num_points - 1; ++i) { + const int delta_y = point_scaling[i + 1] - point_scaling[i]; + const int delta_x = point_value[i + 1] - point_value[i]; + const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x); + const int delta4 = delta << 2; + const uint8x8_t base_point = vdup_n_u8(point_scaling[i]); + uint32x4_t upscaled_points0 = vmlaq_n_u32(offset, steps, delta); + const uint32x4_t line_increment4 = vdupq_n_u32(delta4); + // Get the second set of 4 points by adding 4 steps to the first set. + uint32x4_t upscaled_points1 = vaddq_u32(upscaled_points0, line_increment4); + // We obtain the next set of 8 points by adding 8 steps to each of the + // current 8 points. + const uint32x4_t line_increment8 = vshlq_n_u32(line_increment4, 1); + int x = 0; + do { + const uint16x4_t interp_points0 = vshrn_n_u32(upscaled_points0, 16); + const uint16x4_t interp_points1 = vshrn_n_u32(upscaled_points1, 16); + const uint8x8_t interp_points = + vmovn_u16(vcombine_u16(interp_points0, interp_points1)); + // The spec guarantees that the max value of |point_value[i]| + x is 255. + // Writing 8 bytes starting at the final table byte, leaves 7 bytes of + // required padding. 
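+      // |interp_points| holds the interpolated deltas relative to
+      // |point_scaling[i]|, so the base point is added back before storing.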
+      vst1_u8(&scaling_lut[point_value[i] + x],
+              vadd_u8(interp_points, base_point));
+      upscaled_points0 = vaddq_u32(upscaled_points0, line_increment8);
+      upscaled_points1 = vaddq_u32(upscaled_points1, line_increment8);
+      x += 8;
+    } while (x < delta_x);
+  }
+  const uint8_t last_point_value = point_value[num_points - 1];
+  memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
+         kScalingLookupTableSize - last_point_value);
+}
+
+inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
+                       const int16x8_t high) {
+  const int16x8_t clipped_to_ceiling = vminq_s16(high, value);
+  return vmaxq_s16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(
+    const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+  int16_t start_vals[8];
+  if (bitdepth == 8) {
+    start_vals[0] = scaling_lut[source[0]];
+    start_vals[1] = scaling_lut[source[1]];
+    start_vals[2] = scaling_lut[source[2]];
+    start_vals[3] = scaling_lut[source[3]];
+    start_vals[4] = scaling_lut[source[4]];
+    start_vals[5] = scaling_lut[source[5]];
+    start_vals[6] = scaling_lut[source[6]];
+    start_vals[7] = scaling_lut[source[7]];
+    return vld1q_s16(start_vals);
+  }
+  int16_t end_vals[8];
+  // TODO(petersonab): Precompute this into a larger table for direct lookups.
+  int index = source[0] >> 2;
+  start_vals[0] = scaling_lut[index];
+  end_vals[0] = scaling_lut[index + 1];
+  index = source[1] >> 2;
+  start_vals[1] = scaling_lut[index];
+  end_vals[1] = scaling_lut[index + 1];
+  index = source[2] >> 2;
+  start_vals[2] = scaling_lut[index];
+  end_vals[2] = scaling_lut[index + 1];
+  index = source[3] >> 2;
+  start_vals[3] = scaling_lut[index];
+  end_vals[3] = scaling_lut[index + 1];
+  index = source[4] >> 2;
+  start_vals[4] = scaling_lut[index];
+  end_vals[4] = scaling_lut[index + 1];
+  index = source[5] >> 2;
+  start_vals[5] = scaling_lut[index];
+  end_vals[5] = scaling_lut[index + 1];
+  index = source[6] >> 2;
+  start_vals[6] = scaling_lut[index];
+  end_vals[6] = scaling_lut[index + 1];
+  index = source[7] >> 2;
+  start_vals[7] = scaling_lut[index];
+  end_vals[7] = scaling_lut[index + 1];
+  const int16x8_t start = vld1q_s16(start_vals);
+  const int16x8_t end = vld1q_s16(end_vals);
+  int16x8_t remainder = GetSignedSource8(source);
+  remainder = vandq_s16(remainder, vdupq_n_s16(3));
+  const int16x8_t delta = vmulq_s16(vsubq_s16(end, start), remainder);
+  return vaddq_s16(start, vrshrq_n_s16(delta, 2));
+}
+
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+                            const int16x8_t scaling_shift_vect) {
+  const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+  return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
+                            const int32x4_t scaling_shift_vect) {
+  // TODO(petersonab): Try refactoring scaling lookup table to int16_t and
+  // upscaling by 7 bits to permit high half multiply. This would eliminate
+  // the intermediate 32x4 registers. Also write the averaged values directly
+  // into the table so it doesn't have to be done for every pixel in
+  // the frame.
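+  // With 10bpp grain the product no longer fits in int16_t (e.g. a grain
+  // value of 511 times a scaling factor of 255), so multiply into 32 bits and
+  // narrow again after the rounded shift.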
+  const int32x4_t upscaled_noise_lo =
+      vmull_s16(vget_low_s16(noise), vget_low_s16(scaling));
+  const int32x4_t upscaled_noise_hi =
+      vmull_s16(vget_high_s16(noise), vget_high_s16(scaling));
+  const int16x4_t noise_lo =
+      vmovn_s32(vrshlq_s32(upscaled_noise_lo, scaling_shift_vect));
+  const int16x4_t noise_hi =
+      vmovn_s32(vrshlq_s32(upscaled_noise_hi, scaling_shift_vect));
+  return vcombine_s16(noise_lo, noise_hi);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_NEON(
+    const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+    int width, int height, int start_height,
+    const uint8_t scaling_lut_y[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+    ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_luma);
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 to protect the sign bit.
+  const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // This operation on the unsigned input is safe in 8bpp because the vector
+      // is widened before it is reinterpreted.
+      const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+      const int16x8_t scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      int16x8_t noise =
+          GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+      if (bitdepth == 8) {
+        noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
+      } else {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+        noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+      }
+      const int16x8_t combined = vaddq_s16(orig, noise);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case, though the gain would be very small.
+      StoreUnsigned8(&out_y_row[x],
+                     vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+      x += 8;
+    } while (x < width);
+    in_y_row += source_stride_y;
+    out_y_row += dest_stride_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline int16x8_t BlendChromaValsWithCfl(
+    const Pixel* average_luma_buffer,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+    const int16x8_t scaling_shift_vect16,
+    const int32x4_t scaling_shift_vect32) {
+  const int16x8_t scaling =
+      GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+  const int16x8_t orig = GetSignedSource8(chroma_cursor);
+  int16x8_t noise = GetSignedSource8(noise_image_cursor);
+  if (bitdepth == 8) {
+    noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
+  } else {
+    noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
+  }
+  return vaddq_s16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
+    const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift,
+    const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
+    ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
+    ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
+    ptrdiff_t dest_stride) {
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+  Pixel luma_buffer[16];
+  memset(luma_buffer, 0, sizeof(luma_buffer));
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 to protect the sign bit.
+  const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
+  const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int safe_chroma_width = chroma_width & ~7;
+
+  // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+  // in GetScalingFactors.
+  Pixel average_luma_buffer[8];
+  assert(start_height % 2 == 0);
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      // TODO(petersonab): Consider specializing by subsampling_x. In the 444
+      // case &in_y_row[x] can be passed to GetScalingFactors directly.
+      const uint16x8_t average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      StoreUnsigned8(average_luma_buffer, average_luma);
+
+      const int16x8_t blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), scaling_shift_vect16,
+              scaling_shift_vect32);
+
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+      // clipping with vqmovun_s16, but it's not likely to be worth copying the
+      // function for just that case.
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+      x += 8;
+    } while (x < safe_chroma_width);
+
+    if (x < chroma_width) {
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      memcpy(luma_buffer, &in_y_row[luma_x],
+             valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const uint16x8_t average_luma =
+          GetAverageLuma(luma_buffer, subsampling_x);
+      StoreUnsigned8(average_luma_buffer, average_luma);
+
+      const int16x8_t blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), scaling_shift_vect16,
+              scaling_shift_vect32);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can
+      // replace clipping with vqmovun_s16, but it's not likely to be worth
+      // copying the function for just that case.
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_NEON(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+  // Looping over one plane at a time is faster in higher resolutions, despite
+  // re-computing luma.
+  BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+      source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+inline int16x8_t BlendChromaValsNoCfl(
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const uint8_t* chroma_cursor, const int8_t* noise_image_cursor,
+    const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+    const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
+  uint8_t merged_buffer[8];
+  const int16x8_t orig = GetSignedSource8(chroma_cursor);
+  const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
+  const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
+  // Maximum value of |combined_u| is 127*255 = 0x7E81.
+  const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma);
+  // Maximum value of u_offset is (255 << 5) = 0x1FE0.
+  // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
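+  // Combining the stages: |combined| holds (weighted_luma + weighted_chroma)
+  // >> 1, the halving add with |offset| (chroma_offset << 5) then holds
+  // (chroma_offset << 4) + ((weighted_luma + weighted_chroma) >> 2), and the
+  // saturating shift right by 4 produces
+  //   merged = chroma_offset + ((weighted_luma + weighted_chroma) >> 6)
+  // clamped to uint8_t, without overflowing int16_t at any step.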
+  const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
+  vst1_u8(merged_buffer, merged);
+  const int16x8_t scaling =
+      GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+  int16x8_t noise = GetSignedSource8(noise_image_cursor);
+  noise = ScaleNoise(noise, scaling, scaling_shift_vect);
+  return vaddq_s16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
+    const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, int chroma_offset,
+    int chroma_multiplier, int luma_multiplier,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const uint8_t* in_y_row, ptrdiff_t source_stride_y,
+    const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+    uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
+  const int16x8_t floor = vdupq_n_s16(min_value);
+  const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+  // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
+  // for 16 bit signed integers. In higher bitdepths, however, we have to
+  // expand to 32 to protect the sign bit.
+  const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int safe_chroma_width = chroma_width & ~7;
+  uint8_t luma_buffer[16];
+  const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
+
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int16x8_t average_luma = vreinterpretq_s16_u16(
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+      const int16x8_t blended = BlendChromaValsNoCfl(
+          scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+          average_luma, scaling_shift_vect, offset, luma_multiplier,
+          chroma_multiplier);
+      // In 8bpp, when params_.clip_to_restricted_range == false, we can
+      // replace clipping with vqmovun_s16, but the gain would be small.
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+
+      x += 8;
+    } while (x < safe_chroma_width);
+
+    if (x < chroma_width) {
+      // Begin right edge iteration. Same as the normal iterations, but the
+      // |average_luma| computation requires a duplicated luma value at the
+      // end.
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      memcpy(luma_buffer, &in_y_row[luma_x],
+             valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+
+      const int16x8_t average_luma =
+          vreinterpretq_s16_u16(GetAverageLuma(luma_buffer, subsampling_x));
+      const int16x8_t blended = BlendChromaValsNoCfl(
+          scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+          average_luma, scaling_shift_vect, offset, luma_multiplier,
+          chroma_multiplier);
+      StoreUnsigned8(&out_chroma_row[x],
+                     vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+      // End of right edge iteration.
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_NEON(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  assert(plane == kPlaneU || plane == kPlaneV);
+  const auto* noise_image =
+      static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+  const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+  auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+  BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width,
+                            height, start_height, subsampling_x, subsampling_y,
+                            params.chroma_scaling, offset, multiplier,
+                            luma_multiplier, scaling_lut, in_y,
+                            source_stride_y, in_uv, source_stride_uv, out_uv,
+                            dest_stride_uv);
+}
+
+inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
+                                      const int8_t* noise_stripe_row_prev,
+                                      int plane_width,
+                                      const int8x8_t grain_coeff,
+                                      const int8x8_t old_coeff,
+                                      int8_t* noise_image_row) {
+  int x = 0;
+  do {
+    // Note that these reads may exceed noise_stripe_row's width by up to 7
+    // bytes.
+    const int8x8_t source_grain = vld1_s8(noise_stripe_row + x);
+    const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x);
+    const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain);
+    const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old);
+    // Note that this write may exceed noise_image_row's width by up to 7
+    // bytes.
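+    // The overlap coefficient pairs sum to more than 32 (17 + 27 = 44,
+    // 22 + 23 = 45), so even after the rounded shift by 5 the result can
+    // exceed the int8_t range (e.g. 44 * 127 = 5588, (5588 + 16) >> 5 = 175);
+    // the saturating narrow below clamps such extremes.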
+    vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5));
+    x += 8;
+  } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap8bpp_NEON(const void* noise_stripes_buffer,
+                                         int width, int height,
+                                         int subsampling_x, int subsampling_y,
+                                         void* noise_image_buffer) {
+  const auto* noise_stripes =
+      static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
+  auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
+  int y = stripe_height;
+  int luma_num = 1;
+  if (subsampling_y == 0) {
+    const int8x8_t first_row_grain_coeff = vdup_n_s8(17);
+    const int8x8_t first_row_old_coeff = vdup_n_s8(27);
+    const int8x8_t second_row_grain_coeff = first_row_old_coeff;
+    const int8x8_t second_row_old_coeff = first_row_grain_coeff;
+    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+      const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+      const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine8bpp_NEON(
+          noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+      WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+                                &noise_stripe_prev[(32 + 1) * plane_width],
+                                plane_width, second_row_grain_coeff,
+                                second_row_old_coeff, (*noise_image)[y + 1]);
+    }
+    // Either one partial stripe remains (remaining_height > 0),
+    // OR image is less than one stripe high (remaining_height < 0),
+    // OR all stripes are completed (remaining_height == 0).
+    const int remaining_height = plane_height - y;
+    if (remaining_height <= 0) {
+      return;
+    }
+    const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+    const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+    WriteOverlapLine8bpp_NEON(
+        noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+        first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+    if (remaining_height > 1) {
+      WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width],
+                                &noise_stripe_prev[(32 + 1) * plane_width],
+                                plane_width, second_row_grain_coeff,
+                                second_row_old_coeff, (*noise_image)[y + 1]);
+    }
+  } else {  // subsampling_y == 1
+    const int8x8_t first_row_grain_coeff = vdup_n_s8(22);
+    const int8x8_t first_row_old_coeff = vdup_n_s8(23);
+    for (; y < plane_height; ++luma_num, y += stripe_height) {
+      const int8_t* noise_stripe = (*noise_stripes)[luma_num];
+      const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine8bpp_NEON(
+          noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+          first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 1>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 2>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 3>;
+
+  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, true>;
+
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap8bpp_NEON;
+
+  dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_NEON<8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_NEON<8, int8_t, uint8_t>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 1>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 2>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 3>;
+
+  // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling]
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, true>;
+
+  dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_NEON<10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_NEON<10, int16_t, uint16_t>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace film_grain
+
+void FilmGrainInit_NEON() {
+  film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
new file mode 100644
index 0000000..44b3d1d
--- /dev/null
+++ b/src/dsp/arm/film_grain_neon.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_FILM_GRAIN_NEON_H_
diff --git a/src/dsp/arm/intra_edge_neon.cc b/src/dsp/arm/intra_edge_neon.cc
new file mode 100644
index 0000000..00b186a
--- /dev/null
+++ b/src/dsp/arm/intra_edge_neon.cc
@@ -0,0 +1,301 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"  // RightShiftWithRounding()
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2
+// are required.
+constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+  assert(strength == 1 || strength == 2 || strength == 3);
+  const int kernel_index = strength - 1;
+  auto* const dst_buffer = static_cast<uint8_t*>(buffer);
+
+  // The first element is not written out (but it is input) so the number of
+  // elements written is |size| - 1.
+  if (size == 1) return;
+
+  // |strength| 1 and 2 use a 3 tap filter.
+  if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in
+    // neon.
+    const uint8_t last_val = RightShiftWithRounding(
+        kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+            kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+            kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+        4);
+
+    const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]);
+
+    // The first value we need gets overwritten by the output from the
+    // previous iteration.
+    uint8x16_t src_0 = vld1q_u8(dst_buffer);
+    int i = 1;
+
+    // Process blocks until fewer than 16 values remain.
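+    // Each output is a rounded 3-tap convolution of the input:
+    //   out[i] = (k0 * in[i - 1] + k1 * in[i] + k0 * in[i + 1] + 8) >> 4,
+    // with {k0, k1} = {4, 8} for |strength| 1 and {5, 6} for |strength| 2,
+    // so the taps sum to 16. |last_val| above uses the same kernel.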
+    for (; i < size - 15; i += 16) {
+      // Loading these at the end of the block with |src_0| will read past the
+      // end of |top_row_data[160]|, the source of |buffer|.
+      const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+      const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+      uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+      sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+      sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+      uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+      sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+      sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+      const uint8x16_t result =
+          vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+      // Load the next row before overwriting. This loads an extra 15 values
+      // past |size| on the trailing iteration.
+      src_0 = vld1q_u8(dst_buffer + i + 15);
+
+      vst1q_u8(dst_buffer + i, result);
+    }
+
+    // The last output value |last_val| was already calculated so if
+    // |remainder| == 1 then we don't have to do anything.
+    const int remainder = (size - 1) & 0xf;
+    if (remainder > 1) {
+      uint8_t temp[16];
+      const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
+      const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
+
+      uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
+      sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
+      sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
+      uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
+      sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
+      sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);
+
+      const uint8x16_t result =
+          vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+      vst1q_u8(temp, result);
+      memcpy(dst_buffer + i, temp, remainder);
+    }
+
+    dst_buffer[size - 1] = last_val;
+    return;
+  }
+
+  assert(strength == 3);
+  // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+  // last two elements require duplicating |buffer[size - 1]|.
+  uint8_t special_vals[3];
+  special_vals[0] = RightShiftWithRounding(
+      (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+          (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+      4);
+  // Clamp index for very small |size| values.
+  const int first_index_min = std::max(size - 4, 0);
+  const int second_index_min = std::max(size - 3, 0);
+  const int third_index_min = std::max(size - 2, 0);
+  special_vals[1] = RightShiftWithRounding(
+      (dst_buffer[first_index_min] << 1) +
+          (dst_buffer[second_index_min] << 2) +
+          (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+          (dst_buffer[size - 1] << 1),
+      4);
+  special_vals[2] = RightShiftWithRounding(
+      (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+          // x << 2 + x << 2 == x << 3
+          (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+      4);
+
+  // The first two values we need get overwritten by the output from the
+  // previous iteration.
+  uint8x16_t src_0 = vld1q_u8(dst_buffer - 1);
+  uint8x16_t src_1 = vld1q_u8(dst_buffer);
+  int i = 1;
+
+  for (; i < size - 15; i += 16) {
+    // Loading these at the end of the block with |src_[01]| will read past
+    // the end of |top_row_data[160]|, the source of |buffer|.
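+    // Each output here is the rounded 5-tap filter
+    //   out[i] = (2 * (in[i - 2] + in[i + 2]) +
+    //             4 * (in[i - 1] + in[i] + in[i + 1]) + 8) >> 4,
+    // i.e. kernel {2, 4, 4, 4, 2}, the same weights used for |special_vals|.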
+    const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+    const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+    const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+    uint16x8_t sum_lo =
+        vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+    const uint16x8_t sum_123_lo = vaddw_u8(
+        vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+    sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+    uint16x8_t sum_hi =
+        vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+    const uint16x8_t sum_123_hi =
+        vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+                 vget_high_u8(src_3));
+    sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+    const uint8x16_t result =
+        vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+    src_0 = vld1q_u8(dst_buffer + i + 14);
+    src_1 = vld1q_u8(dst_buffer + i + 15);
+
+    vst1q_u8(dst_buffer + i, result);
+  }
+
+  const int remainder = (size - 1) & 0xf;
+  // Like the 3 tap but if there are two remaining values we have already
+  // calculated them.
+  if (remainder > 2) {
+    uint8_t temp[16];
+    const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
+    const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
+    const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
+
+    uint16x8_t sum_lo =
+        vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
+    const uint16x8_t sum_123_lo = vaddw_u8(
+        vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
+    sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));
+
+    uint16x8_t sum_hi =
+        vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
+    const uint16x8_t sum_123_hi =
+        vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
+                 vget_high_u8(src_3));
+    sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));
+
+    const uint8x16_t result =
+        vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
+
+    vst1q_u8(temp, result);
+    memcpy(dst_buffer + i, temp, remainder);
+  }
+
+  dst_buffer[1] = special_vals[0];
+  // Avoid overwriting |dst_buffer[0]|.
+  if (size > 2) dst_buffer[size - 2] = special_vals[1];
+  dst_buffer[size - 1] = special_vals[2];
+}
+
+// (-|src0| + |src1| * 9 + |src2| * 9 - |src3|) >> 4
+uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1,
+                   const uint8x8_t src2, const uint8x8_t src3) {
+  const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9);
+  const uint16x8_t ends = vaddl_u8(src0, src3);
+  const int16x8_t sum =
+      vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends));
+  return vqrshrun_n_s16(sum, 4);
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+  assert(size % 4 == 0 && size <= 16);
+  auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
+  // This is OK because we don't read this value for |size| 4 or 8 but if we
+  // write |pixel_buffer[size]| and then vld() it, that seems to introduce
+  // some latency.
+  pixel_buffer[-2] = pixel_buffer[-1];
+  if (size == 4) {
+    // This uses one load and two vtbl() which is better than 4x
+    // Load{Lo,Hi}4().
+    const uint8x8_t src = vld1_u8(pixel_buffer - 1);
+    // The outside values are negated so put those in the same vector.
+    const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000));
+    // Reverse |src1| and |src2| so we can use |src2| for the interleave at
+    // the end.
+    const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201));
+
+    const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9));
+    const int16x8_t half_sum = vsubq_s16(
+        vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03)));
+    const int16x4_t sum =
+        vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum));
+    const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4);
+
+    vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21));
+    return;
+  } else if (size == 8) {
+    // Likewise, one load + multiple vtbls seems preferred to multiple loads.
+    const uint8x16_t src = vld1q_u8(pixel_buffer - 1);
+    const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000));
+    const uint8x8_t src1 = vget_low_u8(src);
+    const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201));
+    const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302));
+
+    const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2};
+    vst2_u8(pixel_buffer - 1, output);
+    return;
+  }
+  assert(size == 12 || size == 16);
+  // Extend the input borders to avoid branching later.
+  pixel_buffer[size] = pixel_buffer[size - 1];
+  const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2);
+  const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1);
+  const uint8x16_t src2 = vld1q_u8(pixel_buffer);
+  const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1);
+
+  const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1),
+                                       vget_low_u8(src2), vget_low_u8(src3));
+
+  const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)};
+  vst2_u8(pixel_buffer - 1, output_lo);
+
+  const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1),
+                                       vget_high_u8(src2), vget_high_u8(src3));
+
+  if (size == 12) {
+    vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2)));
+  } else /* size == 16 */ {
+    const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)};
+    vst2_u8(pixel_buffer + 15, output_hi);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+}  // namespace
+
+void IntraEdgeInit_NEON() { Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraEdgeInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intra_edge_neon.h b/src/dsp/arm/intra_edge_neon.h
new file mode 100644
index 0000000..d3bb243
--- /dev/null
+++ b/src/dsp/arm/intra_edge_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler.
+// This function is not thread-safe.
+void IntraEdgeInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/src/dsp/arm/intrapred_cfl_neon.cc b/src/dsp/arm/intrapred_cfl_neon.cc
new file mode 100644
index 0000000..45fe33b
--- /dev/null
+++ b/src/dsp/arm/intrapred_cfl_neon.cc
@@ -0,0 +1,479 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+uint8x16_t Set2ValuesQ(const uint8_t* a) {
+  uint16_t combined_values = a[0] | a[1] << 8;
+  return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
+}
+
+uint32_t SumVector(uint32x2_t a) {
+#if defined(__aarch64__)
+  return vaddv_u32(a);
+#else
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif  // defined(__aarch64__)
+}
+
+uint32_t SumVector(uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+  return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif  // defined(__aarch64__)
+}
+
+// Divide by the number of elements.
+uint32_t Average(const uint32_t sum, const int width, const int height) {
+  return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
+}
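+
+// Note that Average() relies on the block dimensions being powers of two:
+// the division reduces to a rounded right shift, e.g. a 16x8 block sums 128
+// values and shifts by FloorLog2(16) + FloorLog2(8) = 7.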
+
+// Subtract |val| from every element in |a|.
+void BlockSubtract(const uint32_t val,
+                   int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+                   const int width, const int height) {
+  assert(val <= INT16_MAX);
+  const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
+
+  for (int y = 0; y < height; ++y) {
+    if (width == 4) {
+      const int16x4_t b = vld1_s16(a[y]);
+      vst1_s16(a[y], vsub_s16(b, vget_low_s16(val_v)));
+    } else if (width == 8) {
+      const int16x8_t b = vld1q_s16(a[y]);
+      vst1q_s16(a[y], vsubq_s16(b, val_v));
+    } else if (width == 16) {
+      const int16x8_t b = vld1q_s16(a[y]);
+      const int16x8_t c = vld1q_s16(a[y] + 8);
+      vst1q_s16(a[y], vsubq_s16(b, val_v));
+      vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+    } else /* block_width == 32 */ {
+      const int16x8_t b = vld1q_s16(a[y]);
+      const int16x8_t c = vld1q_s16(a[y] + 8);
+      const int16x8_t d = vld1q_s16(a[y] + 16);
+      const int16x8_t e = vld1q_s16(a[y] + 24);
+      vst1q_s16(a[y], vsubq_s16(b, val_v));
+      vst1q_s16(a[y] + 8, vsubq_s16(c, val_v));
+      vst1q_s16(a[y] + 16, vsubq_s16(d, val_v));
+      vst1q_s16(a[y] + 24, vsubq_s16(e, val_v));
+    }
+  }
+}
+
+template <int block_width, int block_height>
+void CflSubsampler420_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, const ptrdiff_t stride) {
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t sum;
+  if (block_width == 4) {
+    assert(max_luma_width >= 8);
+    uint32x2_t running_sum = vdup_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      const uint8x8_t row0 = vld1_u8(src);
+      const uint8x8_t row1 = vld1_u8(src + stride);
+
+      uint16x4_t sum_row = vpadal_u8(vpaddl_u8(row0), row1);
+      sum_row = vshl_n_u16(sum_row, 1);
+      running_sum = vpadal_u16(running_sum, sum_row);
+      vst1_s16(luma[y], vreinterpret_s16_u16(sum_row));
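+      // Summing each 2x2 cluster and doubling gives the cluster average
+      // scaled by 8, matching the << 3 scale the 444 subsampler applies to
+      // each sample.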
+      if (y << 1 < max_luma_height - 2) {
+        // Once this threshold is reached the loop could be simplified.
+        src += stride << 1;
+      }
+    }
+
+    sum = SumVector(running_sum);
+  } else if (block_width == 8) {
+    const uint8x16_t x_index = {0, 0, 2,  2,  4,  4,  6,  6,
+                                8, 8, 10, 10, 12, 12, 14, 14};
+    const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+    const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
+      const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+
+      uint8x16_t row0 = vld1q_u8(src);
+      row0 = vbslq_u8(x_mask, row0, x_max0);
+      uint8x16_t row1 = vld1q_u8(src + stride);
+      row1 = vbslq_u8(x_mask, row1, x_max1);
+
+      uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+      sum_row = vshlq_n_u16(sum_row, 1);
+      running_sum = vpadalq_u16(running_sum, sum_row);
+      vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+
+      if (y << 1 < max_luma_height - 2) {
+        src += stride << 1;
+      }
+    }
+
+    sum = SumVector(running_sum);
+  } else /* block_width >= 16 */ {
+    const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      uint8x16_t x_index = {0,  2,  4,  6,  8,  10, 12, 14,
+                            16, 18, 20, 22, 24, 26, 28, 30};
+      const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
+      const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
+      const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
+      const uint8x16_t x_max11 =
+          vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
+      for (int x = 0; x < block_width; x += 16) {
+        const ptrdiff_t src_x_offset = x << 1;
+        const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+        const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
+        const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
+        const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
+        const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
+        const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
+        const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
+
+        uint16x8_t sum_row_lo =
+            vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
+        sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
+        sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
+        sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
+        running_sum = vpadalq_u16(running_sum, sum_row_lo);
+        vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
+
+        uint16x8_t sum_row_hi =
+            vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
+        sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
+        sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
+        sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
+        running_sum = vpadalq_u16(running_sum, sum_row_hi);
+        vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
+
+        x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+      }
+      if (y << 1 < max_luma_height - 2) {
+        src += stride << 1;
+      }
+    }
+    sum = SumVector(running_sum);
+  }
+
+  const uint32_t average = Average(sum, block_width, block_height);
+  BlockSubtract(average, luma, block_width, block_height);
+}
+
+template <int block_width, int block_height>
+void CflSubsampler444_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, const ptrdiff_t stride) {
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t sum;
+  if (block_width == 4) {
+    assert(max_luma_width >= 4);
+    uint32x4_t running_sum = vdupq_n_u32(0);
+    uint8x8_t row = vdup_n_u8(0);
+
+    for (int y = 0; y < block_height; y += 2) {
+      row = Load4<0>(src, row);
+      row = Load4<1>(src + stride, row);
+      if (y < (max_luma_height - 1)) {
+        src += stride << 1;
+      }
+
+      const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+      running_sum = vpadalq_u16(running_sum, row_shifted);
+      vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+      vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+    }
+
+    sum = SumVector(running_sum);
+  } else if (block_width == 8) {
+    const uint8x8_t x_index = {0, 1, 2, 3, 4, 5, 6, 7};
+    const uint8x8_t x_max_index = vdup_n_u8(max_luma_width - 1);
+    const uint8x8_t x_mask = vclt_u8(x_index, x_max_index);
+
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      const uint8x8_t x_max = vdup_n_u8(src[max_luma_width - 1]);
+      const uint8x8_t row = vbsl_u8(x_mask, vld1_u8(src), x_max);
+
+      const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+      running_sum = vpadalq_u16(running_sum, row_shifted);
+      vst1q_s16(luma[y], vreinterpretq_s16_u16(row_shifted));
+
+      if (y < max_luma_height - 1) {
+        src += stride;
+      }
+    }
+
+    sum = SumVector(running_sum);
+  } else /* block_width >= 16 */ {
+    const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 1);
+    uint32x4_t running_sum = vdupq_n_u32(0);
+
+    for (int y = 0; y < block_height; ++y) {
+      uint8x16_t x_index = {0, 1, 2,  3,  4,  5,  6,  7,
+                            8, 9, 10, 11, 12, 13, 14, 15};
+      const uint8x16_t x_max = vdupq_n_u8(src[max_luma_width - 1]);
+      for (int x = 0; x < block_width; x += 16) {
+        const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+        const uint8x16_t row = vbslq_u8(x_mask, vld1q_u8(src + x), x_max);
+
+        const uint16x8_t row_shifted_low = vshll_n_u8(vget_low_u8(row), 3);
+        const uint16x8_t row_shifted_high = vshll_n_u8(vget_high_u8(row), 3);
+        running_sum = vpadalq_u16(running_sum, row_shifted_low);
+        running_sum = vpadalq_u16(running_sum, row_shifted_high);
+        vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(row_shifted_low));
+        vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(row_shifted_high));
+
+        x_index = vaddq_u8(x_index, vdupq_n_u8(16));
+      }
+      if (y < max_luma_height - 1) {
+        src += stride;
+      }
+    }
+    sum = SumVector(running_sum);
+  }
+
+  const uint32_t average = Average(sum, block_width, block_height);
+  BlockSubtract(average, luma, block_width, block_height);
+}
+
+// Saturate |dc + ((alpha * luma) >> 6)| to uint8_t.
+inline uint8x8_t Combine8(const int16x8_t luma, const int alpha,
+                          const int16x8_t dc) {
+  const int16x8_t la = vmulq_n_s16(luma, alpha);
+  // Subtract the sign bit to round towards zero.
+  const int16x8_t sub_sign = vsraq_n_s16(la, la, 15);
+  // Shift and accumulate.
+  const int16x8_t result = vrsraq_n_s16(dc, sub_sign, 6);
+  return vqmovun_s16(result);
+}
+
+// The range of luma/alpha is not really important because it gets saturated
+// to uint8_t. Saturated int16_t >> 6 outranges uint8_t.
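+// Worked example of the sign trick in Combine8: for alpha * luma = -32,
+// sub_sign = -32 + (-32 >> 15) = -33 and vrsraq_n_s16 adds
+// (-33 + 32) >> 6 = -1 to |dc|, the signed rounded result, where a plain
+// rounding shift of -32 by 6 would give 0.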
+template <int block_height>
+inline void CflIntraPredictor4xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; y += 2) {
+    const int16x4_t luma_row0 = vld1_s16(luma[y]);
+    const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+    const uint8x8_t sum =
+        Combine8(vcombine_s16(luma_row0, luma_row1), alpha, dc);
+    StoreLo4(dst, sum);
+    dst += stride;
+    StoreHi4(dst, sum);
+    dst += stride;
+  }
+}
+
+template <int block_height>
+inline void CflIntraPredictor8xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row = vld1q_s16(luma[y]);
+    const uint8x8_t sum = Combine8(luma_row, alpha, dc);
+    vst1_u8(dst, sum);
+    dst += stride;
+  }
+}
+
+template <int block_height>
+inline void CflIntraPredictor16xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+    const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+    vst1_u8(dst, sum_0);
+    vst1_u8(dst + 8, sum_1);
+    dst += stride;
+  }
+}
+
+template <int block_height>
+inline void CflIntraPredictor32xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+    const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+    const uint8x8_t sum_0 = Combine8(luma_row_0, alpha, dc);
+    const uint8x8_t sum_1 = Combine8(luma_row_1, alpha, dc);
+    const uint8x8_t sum_2 = Combine8(luma_row_2, alpha, dc);
+    const uint8x8_t sum_3 = Combine8(luma_row_3, alpha, dc);
+    vst1_u8(dst, sum_0);
+    vst1_u8(dst + 8, sum_1);
+    vst1_u8(dst + 16, sum_2);
+    vst1_u8(dst + 24, sum_3);
+    dst += stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<4, 8>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<4, 16>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 4>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 8>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 16>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_NEON<8, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 8>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 16>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_NEON<16, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_NEON<32, 8>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_NEON<32, 16>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_NEON<32, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<4, 8>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<4, 16>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 4>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 8>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 16>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_NEON<8, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 8>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 16>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_NEON<16, 32>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_NEON<32, 8>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_NEON<32, 16>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_NEON<32, 32>;
+
+  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor4xN_NEON<16>;
+
+  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor8xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor8xN_NEON<32>;
+
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor16xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor16xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor16xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor16xN_NEON<32>;
+
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor32xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor32xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor32xN_NEON<32>;
+  // Max Cfl predictor size is 32x32.
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredCflInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
new file mode 100644
index 0000000..805ba81
--- /dev/null
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -0,0 +1,926 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>  // std::min
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memset
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+                               const uint8x8_t a_weight,
+                               const uint8x8_t b_weight) {
+  const uint16x8_t a_product = vmull_u8(a, a_weight);
+  const uint16x8_t b_product = vmull_u8(b, b_weight);
+
+  return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+}
+
+// For vertical operations the weights are one constant value.
+inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
+                               const uint8_t weight) {
+  return WeightedBlend(a, b, vdup_n_u8(32 - weight), vdup_n_u8(weight));
+}
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
+                         const uint8x8_t right_step, uint8x8_t* left,
+                         uint8x8_t* right) {
+  const uint8x16_t mixed = vld1q_u8(source);
+  *left = VQTbl1U8(mixed, left_step);
+  *right = VQTbl1U8(mixed, right_step);
+}
+
+// Handle signed step arguments by ignoring the sign. Negative values are
+// considered out of range and overwritten later.
+inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step,
+                         const int8x8_t right_step, uint8x8_t* left,
+                         uint8x8_t* right) {
+  LoadStepwise(source, vreinterpret_u8_s8(left_step),
+               vreinterpret_u8_s8(right_step), left, right);
+}
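+
+// The directional predictors below interpolate between two neighboring
+// samples with 1/32 pel precision:
+//   value = (left * (32 - shift) + right * shift + 16) >> 5,
+// which is what WeightedBlend()'s vrshrn_n_u16(..., 5) computes.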
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
+                                 const int height, const uint8_t* const top,
+                                 const int xstep, const bool upsampled) {
+  assert(width == 4 || width == 8);
+
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (width + height - 1) << upsample_shift;
+  const int8x8_t max_base = vdup_n_s8(max_base_x);
+  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+  const int8x8_t all = vcreate_s8(0x0706050403020100);
+  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+  const int8x8_t base_step = upsampled ? even : all;
+  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+
+  int top_x = xstep;
+  int y = 0;
+  do {
+    const int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+    // Zone2 uses negative values for xstep. Use signed values to compare
+    // |top_base_x| to |max_base_x|.
+    const int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+    const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+    // 4 wide subsamples the output. 8 wide subsamples the input.
+    if (width == 4) {
+      const uint8x8_t left_values = vld1_u8(top + top_base_x);
+      const uint8x8_t right_values = RightShift<8>(left_values);
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+
+      // If |upsampled| is true then extract every other value for output.
+      const uint8x8_t value_stepped =
+          vtbl1_u8(value, vreinterpret_u8_s8(base_step));
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value_stepped, top_max_base);
+
+      StoreLo4(dst, masked_value);
+    } else /* width == 8 */ {
+      uint8x8_t left_values, right_values;
+      // WeightedBlend() steps up to Q registers. Downsample the input to
+      // avoid doing extra calculations.
+      LoadStepwise(top + top_base_x, base_step, right_step, &left_values,
+                   &right_values);
+
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value, top_max_base);
+
+      vst1_u8(dst, masked_value);
+    }
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
+                                 const int width, const int height,
+                                 const uint8_t* const top, const int xstep,
+                                 const bool upsampled) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (width + height - 1) << upsample_shift;
+  const int8x8_t max_base = vdup_n_s8(max_base_x);
+  const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+
+  const int8x8_t all = vcreate_s8(0x0706050403020100);
+  const int8x8_t even = vcreate_s8(0x0e0c0a0806040200);
+  const int8x8_t base_step = upsampled ? even : all;
+  const int8x8_t right_step = vadd_s8(base_step, vdup_n_s8(1));
+  const int8x8_t block_step = vdup_n_s8(8 << upsample_shift);
+
+  int top_x = xstep;
+  int y = 0;
+  do {
+    const int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+
+    // Zone2 uses negative values for xstep. Use signed values to compare
+    // |top_base_x| to |max_base_x|.
+    int8x8_t base_v = vadd_s8(vdup_n_s8(top_base_x), base_step);
+
+    int x = 0;
+    do {
+      const uint8x8_t max_base_mask = vclt_s8(base_v, max_base);
+
+      // Extract the input values based on |upsampled| here to avoid doing
+      // twice as many calculations.
+      uint8x8_t left_values, right_values;
+      LoadStepwise(top + top_base_x + x, base_step, right_step, &left_values,
+                   &right_values);
+
+      const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+      const uint8x8_t masked_value =
+          vbsl_u8(max_base_mask, value, top_max_base);
+
+      vst1_u8(dst + x, masked_value);
+
+      base_v = vadd_s8(base_v, block_step);
+      x += 8;
+    } while (x < width);
+    top_x += xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest,
+                                         const ptrdiff_t stride,
+                                         const void* const top_row,
+                                         const int width, const int height,
+                                         const int xstep,
+                                         const bool upsampled_top) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  assert(xstep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_top);
+
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+
+  if (xstep == 64) {
+    assert(!upsampled_top);
+    const uint8_t* top_ptr = top + 1;
+    int y = 0;
+    do {
+      memcpy(dst, top_ptr, width);
+      memcpy(dst + stride, top_ptr + 1, width);
+      memcpy(dst + 2 * stride, top_ptr + 2, width);
+      memcpy(dst + 3 * stride, top_ptr + 3, width);
+      dst += 4 * stride;
+      top_ptr += 4;
+      y += 4;
+    } while (y < height);
+  } else if (width == 4) {
+    DirectionalZone1_WxH<4>(dst, stride, height, top, xstep, upsampled_top);
+  } else if (xstep > 51) {
+    // 7.11.2.10. Intra edge upsample selection process
+    //   if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is from vertical so
+    // |prediction_angle - 90|. In |kDirectionalIntraPredictorDerivative[]|
+    // angles less than 51 will meet this criteria. The |xstep| value for
+    // angle 51 happens to be 51 as well. Shallower angles have greater xstep
+    // values.
+    assert(!upsampled_top);
+    const int max_base_x = ((width + height) - 1);
+    const uint8x8_t max_base = vdup_n_u8(max_base_x);
+    const uint8x8_t top_max_base = vdup_n_u8(top[max_base_x]);
+    const uint8x8_t block_step = vdup_n_u8(8);
+
+    int top_x = xstep;
+    int y = 0;
+    do {
+      const int top_base_x = top_x >> 6;
+      const uint8_t shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+      uint8x8_t base_v = vadd_u8(vdup_n_u8(top_base_x), all);
+      int x = 0;
+      // Only calculate a block of 8 when at least one of the output values
+      // is within range. Otherwise it can read off the end of |top|.
+      const int must_calculate_width =
+          std::min(width, max_base_x - top_base_x + 7) & ~7;
+      for (; x < must_calculate_width; x += 8) {
+        const uint8x8_t max_base_mask = vclt_u8(base_v, max_base);
+
+        // Since these |xstep| values can not be upsampled the load is
+        // simplified.
+        const uint8x8_t left_values = vld1_u8(top + top_base_x + x);
+        const uint8x8_t right_values = vld1_u8(top + top_base_x + x + 1);
+        const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
+        const uint8x8_t masked_value =
+            vbsl_u8(max_base_mask, value, top_max_base);
+
+        vst1_u8(dst + x, masked_value);
+        base_v = vadd_u8(base_v, block_step);
+      }
+      memset(dst + x, top[max_base_x], width - x);
+      dst += stride;
+      top_x += xstep;
+    } while (++y < height);
+  } else {
+    DirectionalZone1_WxH(dst, stride, width, height, top, xstep, upsampled_top);
+  }
+}
+
+// Process 4 or 8 |width| by 4 or 8 |height|.
+template <int width>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+                                 const int height,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep,
+                                 const int upsample_shift) {
+  assert(width == 4 || width == 8);
+  assert(height == 4 || height == 8);
+  const int scale_bits = 6 - upsample_shift;
+
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             (/* base_step */ 1 << upsample_shift) *
+                 (height - 1));  // left_base_y
+
+  // Limited improvement for 8x8. ~20% faster for 64x64.
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+  const uint8x8_t base_step = upsample_shift ? even : all;
+  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+  uint8_t* dst = dest;
+  uint8x8_t left_v[8], right_v[8], value_v[8];
+  const uint8_t* const left = left_column;
+
+  const int index_0 = base_left_y;
+  LoadStepwise(left + (index_0 >> scale_bits), base_step, right_step,
+               &left_v[0], &right_v[0]);
+  value_v[0] = WeightedBlend(left_v[0], right_v[0],
+                             ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_1 = base_left_y + ystep;
+  LoadStepwise(left + (index_1 >> scale_bits), base_step, right_step,
+               &left_v[1], &right_v[1]);
+  value_v[1] = WeightedBlend(left_v[1], right_v[1],
+                             ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_2 = base_left_y + ystep * 2;
+  LoadStepwise(left + (index_2 >> scale_bits), base_step, right_step,
+               &left_v[2], &right_v[2]);
+  value_v[2] = WeightedBlend(left_v[2], right_v[2],
+                             ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_3 = base_left_y + ystep * 3;
+  LoadStepwise(left + (index_3 >> scale_bits), base_step, right_step,
+               &left_v[3], &right_v[3]);
+  value_v[3] = WeightedBlend(left_v[3], right_v[3],
+                             ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_4 = base_left_y + ystep * 4;
+  LoadStepwise(left + (index_4 >> scale_bits), base_step, right_step,
+               &left_v[4], &right_v[4]);
+  value_v[4] = WeightedBlend(left_v[4], right_v[4],
+                             ((index_4 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_5 = base_left_y + ystep * 5;
+  LoadStepwise(left + (index_5 >> scale_bits), base_step, right_step,
+               &left_v[5], &right_v[5]);
+  value_v[5] = WeightedBlend(left_v[5], right_v[5],
+                             ((index_5 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_6 = base_left_y + ystep * 6;
+  LoadStepwise(left + (index_6 >> scale_bits), base_step, right_step,
+               &left_v[6], &right_v[6]);
+  value_v[6] = WeightedBlend(left_v[6], right_v[6],
+                             ((index_6 << upsample_shift) & 0x3F) >> 1);
+
+  const int index_7 = base_left_y + ystep * 7;
+  LoadStepwise(left + (index_7 >> scale_bits), base_step, right_step,
+               &left_v[7], &right_v[7]);
+  value_v[7] = WeightedBlend(left_v[7], right_v[7],
+                             ((index_7 << upsample_shift) & 0x3F) >> 1);
+
+  // 8x8 transpose.
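+  // The next three stages are a standard NEON 8x8 byte transpose: vtrnq_u8
+  // interleaves adjacent bytes, vtrnq_u16 interleaves adjacent 16-bit pairs,
+  // and vuzpq_u32 separates the 32-bit halves, leaving one output row in each
+  // 64-bit half of d0/d1.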
+  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(value_v[0], value_v[4]),
+                                   vcombine_u8(value_v[1], value_v[5]));
+  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(value_v[2], value_v[6]),
+                                   vcombine_u8(value_v[3], value_v[7]));
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c1.val[0]));
+  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c1.val[1]));
+
+  if (width == 4) {
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+    if (height == 4) return;
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+    dst += stride;
+    StoreLo4(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+  } else {
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[0])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[0])));
+    if (height == 4) return;
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d0.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d0.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_low_u32(d1.val[1])));
+    dst += stride;
+    vst1_u8(dst, vreinterpret_u8_u32(vget_high_u32(d1.val[1])));
+  }
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
+                                            const ptrdiff_t stride,
+                                            const int height,
+                                            const uint8_t* const left_column,
+                                            const int16x8_t left_y,
+                                            const int upsample_shift) {
+  assert(width == 4 || width == 8);
+
+  // The shift argument must be a constant.
+  int16x8_t offset_y, shift_upsampled = left_y;
+  if (upsample_shift) {
+    offset_y = vshrq_n_s16(left_y, 5);
+    shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+  } else {
+    offset_y = vshrq_n_s16(left_y, 6);
+  }
+
+  // Select values to the left of the starting point.
+  // The 15th element (and 16th) will be all the way at the end, to the right.
+  // With a negative ystep everything else will be "left" of them.
+  // This supports cumulative steps up to 15. We could support up to 16 by
+  // doing separate loads for |left_values| and |right_values|. vtbl supports
+  // 2 Q registers as input which would allow for cumulative offsets of 32.
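+  // Sketch of the index math below, per lane i (no upsampling case):
+  //   offset_y[i]    = left_y[i] >> 6                    // whole-pixel step
+  //   left_values[i] = Clip3(offset_y[i] + 15, 0, 255)   // via vqmovun_s16
+  // which turns every (possibly negative) relative index into a non-negative
+  // lookup index measured from |left_column - kPositiveIndexOffset|.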
+  const int16x8_t sampler =
+      vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffset));
+  const uint8x8_t left_values = vqmovun_s16(sampler);
+  const uint8x8_t right_values = vadd_u8(left_values, vdup_n_u8(1));
+
+  const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+  const uint8x8_t shift_mul = vreinterpret_u8_s8(vshrn_n_s16(shift_masked, 1));
+  const uint8x8_t inv_shift_mul = vsub_u8(vdup_n_u8(32), shift_mul);
+
+  int y = 0;
+  do {
+    uint8x8_t src_left, src_right;
+    LoadStepwise(left_column - kPositiveIndexOffset + (y << upsample_shift),
+                 left_values, right_values, &src_left, &src_right);
+    const uint8x8_t val =
+        WeightedBlend(src_left, src_right, inv_shift_mul, shift_mul);
+
+    if (width == 4) {
+      StoreLo4(dst, val);
+    } else {
+      vst1_u8(dst, val);
+    }
+    dst += stride;
+  } while (++y < height);
+}
+
+// Process 4 or 8 |width| by any |height|.
+template <int width>
+inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride,
+                                      const int height,
+                                      const uint8_t* const top_row,
+                                      int zone_bounds, int top_x,
+                                      const int xstep,
+                                      const int upsample_shift) {
+  assert(width == 4 || width == 8);
+
+  const int scale_bits_x = 6 - upsample_shift;
+
+  const uint8x8_t all = vcreate_u8(0x0706050403020100);
+  const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+  const uint8x8_t base_step = upsample_shift ? even : all;
+  const uint8x8_t right_step = vadd_u8(base_step, vdup_n_u8(1));
+
+  int y = 0;
+  do {
+    const uint8_t* const src = top_row + (top_x >> scale_bits_x);
+    uint8x8_t left, right;
+    LoadStepwise(src, base_step, right_step, &left, &right);
+
+    const uint8_t shift = ((top_x << upsample_shift) & 0x3f) >> 1;
+    const uint8x8_t val = WeightedBlend(left, right, shift);
+
+    uint8x8_t dst_blend = vld1_u8(dest);
+    // |zone_bounds| values can be negative.
+    uint8x8_t blend =
+        vcge_s8(vreinterpret_s8_u8(all), vdup_n_s8((zone_bounds >> 6)));
+    uint8x8_t output = vbsl_u8(blend, val, dst_blend);
+
+    if (width == 4) {
+      StoreLo4(dest, output);
+    } else {
+      vst1_u8(dest, output);
+    }
+    dest += stride;
+    zone_bounds += xstep;
+    top_x -= xstep;
+  } while (++y < height);
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
+                                 const uint8_t* const top_row,
+                                 const uint8_t* const left_column,
+                                 const int height, const int xstep,
+                                 const int ystep, const bool upsampled_top,
+                                 const bool upsampled_left) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (4xN). Vertical still steps by 8.
+  // If it's only 4, it will be finished in the first iteration.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute and can therefore call the Zone1 functions. This assumes
+  // |xstep| is at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
+
+  // For steep angles, the source pixels from |left_column| may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  // TODO(johannkoenig): Revisit this for |width| == 4.
+  const int max_shuffle_height =
+      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. The following values need the full ystep as a relative offset.
+  int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
+  left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+
+  // This loop treats each set of 4 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top or
+  // left. The final stage covers blocks that are only computed from the left.
+  if (min_top_only_x > 0) {
+    // Round down to the nearest multiple of 8.
+    // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
+    const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+    DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
+                            upsampled_top);
+
+    if (max_top_only_y == height) return;
+
+    int y = max_top_only_y;
+    dst += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min((4 << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip
+    // over the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
+
+    // +8 increment is OK because if height is 4 this only goes once.
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          left_y, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+                                   xstep_bounds, top_x, xstep,
+                                   upsample_top_shift);
+    }
+
+    // Pick up from the last y-value, using the slower but secure method for
+    // left prediction.
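+    // (DirectionalZone3_WxH recomputes a scalar base index per row instead of
+    // shuffling within one 16-byte load, so it stays correct past
+    // |max_shuffle_height| at the cost of extra loads.)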
+    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+    for (; y < min_left_only_y;
+         y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone3_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
+                                   xstep_bounds, top_x, xstep,
+                                   upsample_top_shift);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst += stride8) {
+      DirectionalZone3_WxH<4>(
+          dst, stride, min_height,
+          left_column + ((y - left_base_increment) << upsample_left_shift),
+          base_left_y, -ystep, upsample_left_shift);
+    }
+  } else {
+    DirectionalZone1_WxH<4>(dst, stride, height, top_row, -xstep,
+                            upsampled_top);
+  }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
+                               const uint8_t* const top_row,
+                               const uint8_t* const left_column,
+                               const int width, const int height,
+                               const int xstep, const int ystep,
+                               const bool upsampled_top,
+                               const bool upsampled_left) {
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+  // Helper vector.
+  const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  // Loop incrementers for moving by block (8x8). This function handles blocks
+  // with height 4 as well. They are calculated in one pass so these variables
+  // do not get used.
+  const ptrdiff_t stride8 = stride << 3;
+  const int xstep8 = xstep << 3;
+  const int ystep8 = ystep << 3;
+
+  // Process Wx4 blocks.
+  const int min_height = (height == 4) ? 4 : 8;
+
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute and can therefore call the Zone1 functions. This assumes
+  // |xstep| is at least 3.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  // For steep angles, the source pixels from |left_column| may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  const int max_shuffle_height =
+      std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
+  left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+
+  // This loop treats each set of 8 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top or
+  // left. The final stage covers blocks that are only computed from the left.
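+  // For example, with xstep == 128 and height == 32, the column group at
+  // x == 8 gets
+  //   max_top_only_y  = min(((8 + 1) << 6) / 128, 32) & ~7 = 4 & ~7 = 0
+  //   min_left_only_y = min(((8 + 8) << 6) / 128, 32) = 8
+  // so rows 0..7 are blended from both edges and rows 8..31 use only
+  // |left_column|.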
+  int x = 0;
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+       xstep_bounds_base -= (8 << 6),
+       left_y = vsubq_s16(left_y, increment_left8),
+       left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+                            top_row + (x << upsample_top_shift), -xstep,
+                            upsampled_top);
+
+    if (max_top_only_y == height) continue;
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip
+    // over the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    int xstep_bounds = xstep_bounds_base + xstep_y;
+    int top_x = -xstep - xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y,
+          upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<8>(
+          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+          xstep_bounds, top_x, xstep, upsample_top_shift);
+    }
+
+    // Pick up from the last y-value, using the slower but secure method for
+    // left prediction.
+    const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+      DirectionalZone3_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep, upsample_left_shift);
+
+      DirectionalZone1Blend_WxH<8>(
+          dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+          xstep_bounds, top_x, xstep, upsample_top_shift);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_WxH<8>(
+          dst_x, stride, min_height,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep, upsample_left_shift);
+    }
+  }
+  // TODO(johannkoenig): May be able to remove this branch.
+  if (x < width) {
+    DirectionalZone1_WxH(dst + x, stride, width - x, height,
+                         top_row + (x << upsample_top_shift), -xstep,
+                         upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_NEON(
+    void* const dest, const ptrdiff_t stride, const void* const top_row,
+    const void* const left_column, const int width, const int height,
+    const int xstep, const int ystep, const bool upsampled_top,
+    const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
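+  // Scratch layout: bytes [128, 288) of each buffer are filled from 16 pixels
+  // before the source origin, and the working pointers are placed at offset
+  // 144. Indices as low as -144 therefore stay inside the buffer, while
+  // [-16, 144) additionally hold valid pixels.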
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16,
+         160);
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  if (width == 4) {
+    DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
+                         upsampled_top, upsampled_left);
+  } else {
+    DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
+                       ystep, upsampled_top, upsampled_left);
+  }
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+                                         const ptrdiff_t stride,
+                                         const void* const left_column,
+                                         const int width, const int height,
+                                         const int ystep,
+                                         const bool upsampled_left) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+
+  if (width == 4 || height == 4) {
+    // This block can handle all sizes but the specializations for other sizes
+    // are faster.
+    const uint8x8_t all = vcreate_u8(0x0706050403020100);
+    const uint8x8_t even = vcreate_u8(0x0e0c0a0806040200);
+    const uint8x8_t base_step_v = upsampled_left ? even : all;
+    const uint8x8_t right_step = vadd_u8(base_step_v, vdup_n_u8(1));
+
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        uint8_t* dst = static_cast<uint8_t*>(dest);
+        dst += y * stride + x;
+        uint8x8_t left_v[4], right_v[4], value_v[4];
+        const int ystep_base = ystep * x;
+        const int offset = y * base_step;
+
+        const int index_0 = ystep_base + ystep * 1;
+        LoadStepwise(left + offset + (index_0 >> scale_bits), base_step_v,
+                     right_step, &left_v[0], &right_v[0]);
+        value_v[0] = WeightedBlend(left_v[0], right_v[0],
+                                   ((index_0 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_1 = ystep_base + ystep * 2;
+        LoadStepwise(left + offset + (index_1 >> scale_bits), base_step_v,
+                     right_step, &left_v[1], &right_v[1]);
+        value_v[1] = WeightedBlend(left_v[1], right_v[1],
+                                   ((index_1 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_2 = ystep_base + ystep * 3;
+        LoadStepwise(left + offset + (index_2 >> scale_bits), base_step_v,
+                     right_step, &left_v[2], &right_v[2]);
+        value_v[2] = WeightedBlend(left_v[2], right_v[2],
+                                   ((index_2 << upsample_shift) & 0x3F) >> 1);
+
+        const int index_3 = ystep_base + ystep * 4;
+        LoadStepwise(left + offset + (index_3 >> scale_bits), base_step_v,
+                     right_step, &left_v[3], &right_v[3]);
+        value_v[3] = WeightedBlend(left_v[3], right_v[3],
+                                   ((index_3 << upsample_shift) & 0x3F) >> 1);
+
+        // 8x4 transpose.
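+        // The vtrn_u8/vtrn_u16 steps below transpose the four 8-wide value
+        // vectors; afterwards each 64-bit register holds two 4-wide output
+        // rows: the low half for rows y..y+3 and the high half for rows
+        // y+4..y+7.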
+        const uint8x8x2_t b0 = vtrn_u8(value_v[0], value_v[1]);
+        const uint8x8x2_t b1 = vtrn_u8(value_v[2], value_v[3]);
+
+        const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]),
+                                         vreinterpret_u16_u8(b1.val[0]));
+        const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]),
+                                         vreinterpret_u16_u8(b1.val[1]));
+
+        StoreLo4(dst, vreinterpret_u8_u16(c0.val[0]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c1.val[0]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c0.val[1]));
+        dst += stride;
+        StoreLo4(dst, vreinterpret_u8_u16(c1.val[1]));
+
+        if (height > 4) {
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c0.val[0]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c1.val[0]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c0.val[1]));
+          dst += stride;
+          StoreHi4(dst, vreinterpret_u8_u16(c1.val[1]));
+        }
+        x += 4;
+      } while (x < width);
+      y += 8;
+    } while (y < height);
+  } else {  // 8x8 at a time.
+    // Limited improvement for 8x8. ~20% faster for 64x64.
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        uint8_t* dst = static_cast<uint8_t*>(dest);
+        dst += y * stride + x;
+        const int ystep_base = ystep * (x + 1);
+
+        DirectionalZone3_WxH<8>(dst, stride, 8, left + (y << upsample_shift),
+                                ystep_base, ystep, upsample_shift);
+        x += 8;
+      } while (x < width);
+      y += 8;
+    } while (y < height);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
+  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_filter_intra_neon.cc b/src/dsp/arm/intrapred_filter_intra_neon.cc
new file mode 100644
index 0000000..411708e
--- /dev/null
+++ b/src/dsp/arm/intrapred_filter_intra_neon.cc
@@ -0,0 +1,176 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Transpose kFilterIntraTaps and convert the first row to unsigned values.
+//
+// With the previous orientation we were able to multiply all the input values
+// by a single tap. This required that all the input values be in one vector
+// which requires expensive set up operations (shifts, vext, vtbl). All the
+// elements of the result needed to be summed (easy on A64 - vaddvq_s16) but
+// then the shifting, rounding, and clamping was done in GP registers.
+//
+// Switching to unsigned values allows multiplying the 8 bit inputs directly.
+// When one value was negative we needed to vmovl_u8 first so that the results
+// maintained the proper sign.
+//
+// We take this into account when summing the values by subtracting the
+// product of the first row.
+alignas(8) constexpr uint8_t kTransposedTaps[kNumFilterIntraPredictors][7][8] =
+    {{{6, 5, 3, 3, 4, 3, 3, 3},  // Original values are negative.
+      {10, 2, 1, 1, 6, 2, 2, 1},
+      {0, 10, 1, 1, 0, 6, 2, 2},
+      {0, 0, 10, 2, 0, 0, 6, 2},
+      {0, 0, 0, 10, 0, 0, 0, 6},
+      {12, 9, 7, 5, 2, 2, 2, 3},
+      {0, 0, 0, 0, 12, 9, 7, 5}},
+     {{10, 6, 4, 2, 10, 6, 4, 2},  // Original values are negative.
+      {16, 0, 0, 0, 16, 0, 0, 0},
+      {0, 16, 0, 0, 0, 16, 0, 0},
+      {0, 0, 16, 0, 0, 0, 16, 0},
+      {0, 0, 0, 16, 0, 0, 0, 16},
+      {10, 6, 4, 2, 0, 0, 0, 0},
+      {0, 0, 0, 0, 10, 6, 4, 2}},
+     {{8, 8, 8, 8, 4, 4, 4, 4},  // Original values are negative.
+      {8, 0, 0, 0, 4, 0, 0, 0},
+      {0, 8, 0, 0, 0, 4, 0, 0},
+      {0, 0, 8, 0, 0, 0, 4, 0},
+      {0, 0, 0, 8, 0, 0, 0, 4},
+      {16, 16, 16, 16, 0, 0, 0, 0},
+      {0, 0, 0, 0, 16, 16, 16, 16}},
+     {{2, 1, 1, 0, 1, 1, 1, 1},  // Original values are negative.
+      {8, 3, 2, 1, 4, 3, 2, 2},
+      {0, 8, 3, 2, 0, 4, 3, 2},
+      {0, 0, 8, 3, 0, 0, 4, 3},
+      {0, 0, 0, 8, 0, 0, 0, 4},
+      {10, 6, 4, 2, 3, 4, 4, 3},
+      {0, 0, 0, 0, 10, 6, 4, 3}},
+     {{12, 10, 9, 8, 10, 9, 8, 7},  // Original values are negative.
+      {14, 0, 0, 0, 12, 1, 0, 0},
+      {0, 14, 0, 0, 0, 12, 0, 0},
+      {0, 0, 14, 0, 0, 0, 12, 1},
+      {0, 0, 0, 14, 0, 0, 0, 12},
+      {14, 12, 11, 10, 0, 0, 1, 1},
+      {0, 0, 0, 0, 14, 12, 11, 9}}};
+
+void FilterIntraPredictor_NEON(void* const dest, ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column,
+                               FilterIntraPredictor pred, int width,
+                               int height) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+
+  assert(width <= 32 && height <= 32);
+
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  uint8x8_t transposed_taps[7];
+  for (int i = 0; i < 7; ++i) {
+    transposed_taps[i] = vld1_u8(kTransposedTaps[pred][i]);
+  }
+
+  uint8_t relative_top_left = top[-1];
+  const uint8_t* relative_top = top;
+  uint8_t relative_left[2] = {left[0], left[1]};
+
+  int y = 0;
+  do {
+    uint8_t* row_dst = dst;
+    int x = 0;
+    do {
+      uint16x8_t sum = vdupq_n_u16(0);
+      const uint16x8_t subtrahend =
+          vmull_u8(transposed_taps[0], vdup_n_u8(relative_top_left));
+      for (int i = 1; i < 5; ++i) {
+        sum = vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_top[i - 1]));
+      }
+      for (int i = 5; i < 7; ++i) {
+        sum =
+            vmlal_u8(sum, transposed_taps[i], vdup_n_u8(relative_left[i - 5]));
+      }
+
+      const int16x8_t sum_signed =
+          vreinterpretq_s16_u16(vsubq_u16(sum, subtrahend));
+      const int16x8_t sum_shifted = vrshrq_n_s16(sum_signed, 4);
+
+      uint8x8_t sum_saturated = vqmovun_s16(sum_shifted);
+
+      StoreLo4(row_dst, sum_saturated);
+      StoreHi4(row_dst + stride, sum_saturated);
+
+      // Progress across.
+      relative_top_left = relative_top[3];
+      relative_top += 4;
+      relative_left[0] = row_dst[3];
+      relative_left[1] = row_dst[3 + stride];
+      row_dst += 4;
+      x += 4;
+    } while (x < width);
+
+    // Progress down.
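+    // Roughly, in scalar terms, each 4x2 sub-block above computes
+    //   pred(x, y) = RightShiftWithRounding(sum_i(taps[pred][i] * p[i]), 4)
+    // from seven neighbors p0..p6 (top-left, four above, two on the left),
+    // and the rows just written become the neighbors of the block below,
+    // which is why the pointers are re-seated here every two rows.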
+    relative_top_left = left[y + 1];
+    relative_top = dst + stride;
+    relative_left[0] = left[y + 2];
+    relative_left[1] = left[y + 3];
+
+    dst += 2 * stride;
+    y += 2;
+  } while (y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->filter_intra_predictor = FilterIntraPredictor_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterIntraInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
new file mode 100644
index 0000000..c967d82
--- /dev/null
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -0,0 +1,1144 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_NEON
+
+using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
+                                 const bool use_ref_1, const void* ref_1,
+                                 const int ref_1_size_log2);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
+
+// DC intra-predictors for square blocks.
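+//
+// In scalar terms, for a w x h block:
+//   DcTop:  dc = (sum(top[0..w)) + (w >> 1)) >> log2(w)
+//   DcLeft: dc = (sum(left[0..h)) + (h >> 1)) >> log2(h)
+//   Dc:     dc = (sum(top) + sum(left) + ((w + h) >> 1)) / (w + h)
+// where the division reduces to a shift for square blocks.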
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+struct DcPredFuncs_NEON {
+  DcPredFuncs_NEON() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+};
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
+                      storefn>::DcTop(void* const dest, ptrdiff_t stride,
+                                      const void* const top_row,
+                                      const void* /*left_column*/) {
+  const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
+  const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn,
+                      storefn>::DcLeft(void* const dest, ptrdiff_t stride,
+                                       const void* /*top_row*/,
+                                       const void* const left_column) {
+  const uint32x2_t sum =
+      sumfn(left_column, block_height_log2, false, nullptr, 0);
+  const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
+          DcStoreFunc storefn>
+void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const uint32x2_t sum =
+      sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
+  if (block_width_log2 == block_height_log2) {
+    const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
+    storefn(dest, stride, dc);
+  } else {
+    // TODO(johannkoenig): Compare this to mul/shift in vectors.
+    const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
+    uint32_t dc = vget_lane_u32(sum, 0);
+    dc += divisor >> 1;
+    dc /= divisor;
+    storefn(dest, stride, vdup_n_u32(dc));
+  }
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x4_t val) {
+  const uint32x2_t sum = vpaddl_u16(val);
+  return vpadd_u32(sum, sum);
+}
+
+// Sum all the elements in the vector into the low 32 bits.
+inline uint32x2_t Sum(const uint16x8_t val) {
+  const uint32x4_t sum_0 = vpaddlq_u16(val);
+  const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
+  return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
+                  vget_high_u32(vreinterpretq_u32_u64(sum_1)));
+}
+
+}  // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
+// entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
+  const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+  const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
+// the entire vector.
+inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
+                      const uint8x16_t val_2, const uint8x16_t val_3) {
+  const uint16x8_t sum_0 = Add(val_0, val_1);
+  const uint16x8_t sum_1 = Add(val_2, val_3);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 32 uint8_t values.
+inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
+  const uint8x16_t val_0 = vld1q_u8(buf);
+  const uint8x16_t val_1 = vld1q_u8(buf + 16);
+  return Add(val_0, val_1);
+}
+
+// Load and combine 64 uint8_t values.
+inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
+  const uint8x16_t val_0 = vld1q_u8(buf);
+  const uint8x16_t val_1 = vld1q_u8(buf + 16);
+  const uint8x16_t val_2 = vld1q_u8(buf + 32);
+  const uint8x16_t val_3 = vld1q_u8(buf + 48);
+  return Add(val_0, val_1, val_2, val_3);
+}
+
+// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint8_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
+// For |ref[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to
+// uint32_t.
+inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
+                             const bool use_ref_1, const void* ref_1,
+                             const int ref_1_size_log2) {
+  const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
+  const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
+  if (ref_0_size_log2 == 2) {
+    uint8x8_t val = Load4(ref_0_u8);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 2) {  // 4x4
+        val = Load4<1>(ref_1_u8, val);
+        return Sum(vpaddl_u8(val));
+      } else if (ref_1_size_log2 == 3) {  // 4x8
+        const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+        const uint16x4_t sum_0 = vpaddl_u8(val);
+        const uint16x4_t sum_1 = vpaddl_u8(val_1);
+        return Sum(vadd_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 4) {  // 4x16
+        const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+        return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
+      }
+    }
+    // 4x1
+    const uint16x4_t sum = vpaddl_u8(val);
+    return vpaddl_u16(sum);
+  } else if (ref_0_size_log2 == 3) {
+    const uint8x8_t val_0 = vld1_u8(ref_0_u8);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 2) {  // 8x4
+        const uint8x8_t val_1 = Load4(ref_1_u8);
+        const uint16x4_t sum_0 = vpaddl_u8(val_0);
+        const uint16x4_t sum_1 = vpaddl_u8(val_1);
+        return Sum(vadd_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 3) {  // 8x8
+        const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+        const uint16x4_t sum_0 = vpaddl_u8(val_0);
+        const uint16x4_t sum_1 = vpaddl_u8(val_1);
+        return Sum(vadd_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 4) {  // 8x16
+        const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+        return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
+      } else if (ref_1_size_log2 == 5) {  // 8x32
+        return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
+      }
+    }
+    // 8x1
+    return Sum(vpaddl_u8(val_0));
+  } else if (ref_0_size_log2 == 4) {
+    const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 2) {  // 16x4
+        const uint8x8_t val_1 = Load4(ref_1_u8);
+        return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+      } else if (ref_1_size_log2 == 3) {  // 16x8
+        const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+        return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
+      } else if (ref_1_size_log2 == 4) {  // 16x16
+        const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+        return Sum(Add(val_0, val_1));
+      } else if (ref_1_size_log2 == 5) {  // 16x32
+        const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 6) {  // 16x64
+        const uint16x8_t sum_0 = vpaddlq_u8(val_0);
+        const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+    // 16x1
+    return Sum(vpaddlq_u8(val_0));
+  } else if (ref_0_size_log2 == 5) {
+    const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 3) {  // 32x8
+        const uint8x8_t val_1 = vld1_u8(ref_1_u8);
+        return Sum(vaddw_u8(sum_0, val_1));
+      } else if (ref_1_size_log2 == 4) {  // 32x16
+        const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+        const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 5) {  // 32x32
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 6) {  // 32x64
+        const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+    // 32x1
+    return Sum(sum_0);
+  }
+
+  assert(ref_0_size_log2 == 6);
+  const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
+  if (use_ref_1) {
+    if (ref_1_size_log2 == 4) {  // 64x16
+      const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
+      const uint16x8_t sum_1 = vpaddlq_u8(val_1);
+      return Sum(vaddq_u16(sum_0, sum_1));
+    } else if (ref_1_size_log2 == 5) {  // 64x32
+      const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
+      return Sum(vaddq_u16(sum_0, sum_1));
+    } else if (ref_1_size_log2 == 6) {  // 64x64
+      const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
+      return Sum(vaddq_u16(sum_0, sum_1));
+    }
+  }
+  // 64x1
+  return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+                         const uint32x2_t dc) {
+  const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (width == 4) {
+    int i = height - 1;
+    do {
+      StoreLo4(dst, vget_low_u8(dc_dup));
+      dst += stride;
+    } while (--i != 0);
+    StoreLo4(dst, vget_low_u8(dc_dup));
+  } else if (width == 8) {
+    int i = height - 1;
+    do {
+      vst1_u8(dst, vget_low_u8(dc_dup));
+      dst += stride;
+    } while (--i != 0);
+    vst1_u8(dst, vget_low_u8(dc_dup));
+  } else if (width == 16) {
+    int i = height - 1;
+    do {
+      vst1q_u8(dst, dc_dup);
+      dst += stride;
+    } while (--i != 0);
+    vst1q_u8(dst, dc_dup);
+  } else if (width == 32) {
+    int i = height - 1;
+    do {
+      vst1q_u8(dst, dc_dup);
+      vst1q_u8(dst + 16, dc_dup);
+      dst += stride;
+    } while (--i != 0);
+    vst1q_u8(dst, dc_dup);
+    vst1q_u8(dst + 16, dc_dup);
+  } else {
+    assert(width == 64);
+    int i = height - 1;
+    do {
+      vst1q_u8(dst, dc_dup);
+      vst1q_u8(dst + 16, dc_dup);
+      vst1q_u8(dst + 32, dc_dup);
+      vst1q_u8(dst + 48, dc_dup);
+      dst += stride;
+    } while (--i != 0);
+    vst1q_u8(dst, dc_dup);
+    vst1q_u8(dst + 16, dc_dup);
+    vst1q_u8(dst + 32, dc_dup);
+    vst1q_u8(dst + 48, dc_dup);
+  }
+}
+
+template <int width, int height>
+inline void Paeth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+                             const void* const top_row,
+                             const void* const left_column) {
+  auto* dest_u8 = static_cast<uint8_t*>(dest);
+  const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+  const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+  const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+  uint8x8_t top;
+  if (width == 4) {
+    top = Load4(top_row_u8);
+  } else {  // width == 8
+    top = vld1_u8(top_row_u8);
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
+
+    const uint8x8_t left_dist = vabd_u8(top, top_left);
+    const uint8x8_t top_dist = vabd_u8(left, top_left);
+    const uint16x8_t top_left_dist =
+        vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+    const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+    const uint8x8_t left_le_top_left =
+        vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+    const uint8x8_t top_le_top_left =
+        vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+    // if (left_dist <= top_dist && left_dist <= top_left_dist)
+    const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+    // dest[x] = left_column[y];
+    // Fill all the unused spaces with 'top'. They will be overwritten when
+    // the positions for top_left are known.
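+    // (Scalar form of the mask selection below: with base = left + top -
+    // top_left, pick whichever of {left, top, top_left} is closest to base,
+    // preferring left, then top, on ties.)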
+    uint8x8_t result = vbsl_u8(left_mask, left, top);
+    // else if (top_dist <= top_left_dist)
+    //   dest[x] = top_row[x];
+    // Add these values to the mask. They were already set.
+    const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+    // else
+    //   dest[x] = top_left;
+    result = vbsl_u8(left_or_top_mask, result, top_left);
+
+    if (width == 4) {
+      StoreLo4(dest_u8, result);
+    } else {  // width == 8
+      vst1_u8(dest_u8, result);
+    }
+    dest_u8 += stride;
+  }
+}
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// uint8x16_t.
+inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
+                             const uint16x8_t top_left_dist_low,
+                             const uint16x8_t top_left_dist_high) {
+  // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
+  // using movl(x_dist).
+  const uint8x8_t x_le_top_left_low =
+      vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
+  const uint8x8_t x_le_top_left_high =
+      vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high));
+  return vcombine_u8(x_le_top_left_low, x_le_top_left_high);
+}
+
+// Select the closest values and collect them.
+inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
+                              const uint8x16_t top_left,
+                              const uint8x16_t left_le_top,
+                              const uint8x16_t left_le_top_left,
+                              const uint8x16_t top_le_top_left) {
+  // if (left_dist <= top_dist && left_dist <= top_left_dist)
+  const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+  // dest[x] = left_column[y];
+  // Fill all the unused spaces with 'top'. They will be overwritten when
+  // the positions for top_left are known.
+  uint8x16_t result = vbslq_u8(left_mask, left, top);
+  // else if (top_dist <= top_left_dist)
+  //   dest[x] = top_row[x];
+  // Add these values to the mask. They were already set.
+  const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+  // else
+  //   dest[x] = top_left;
+  return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num)                                              \
+  const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
+      vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+  const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
+      vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num)                                  \
+  const uint8x16_t left_le_top_left_##num =                    \
+      XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
+                 top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
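+// No numbered variants of |top_dist| are needed: |left| is a single splatted
+// value per row, so its distance from |top_left| is the same for every x.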
+#define TOP_LE_TOP_LEFT(num)                                       \
+  const uint8x16_t top_le_top_left_##num = XLeTopLeft(             \
+      top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+template <int width, int height>
+inline void Paeth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  auto* dest_u8 = static_cast<uint8_t*>(dest);
+  const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
+  const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
+
+  const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
+  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
+  uint8x16_t top[4];
+  top[0] = vld1q_u8(top_row_u8);
+  if (width > 16) {
+    top[1] = vld1q_u8(top_row_u8 + 16);
+    if (width == 64) {
+      top[2] = vld1q_u8(top_row_u8 + 32);
+      top[3] = vld1q_u8(top_row_u8 + 48);
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
+
+    const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+    const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+    TOP_LEFT_DIST(0);
+    const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+    LEFT_LE_TOP_LEFT(0);
+    TOP_LE_TOP_LEFT(0);
+
+    const uint8x16_t result_0 =
+        SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+                    top_le_top_left_0);
+    vst1q_u8(dest_u8, result_0);
+
+    if (width > 16) {
+      const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+      TOP_LEFT_DIST(1);
+      const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+      LEFT_LE_TOP_LEFT(1);
+      TOP_LE_TOP_LEFT(1);
+
+      const uint8x16_t result_1 =
+          SelectPaeth(top[1], left, top_left, left_1_le_top,
+                      left_le_top_left_1, top_le_top_left_1);
+      vst1q_u8(dest_u8 + 16, result_1);
+
+      if (width == 64) {
+        const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+        TOP_LEFT_DIST(2);
+        const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+        LEFT_LE_TOP_LEFT(2);
+        TOP_LE_TOP_LEFT(2);
+
+        const uint8x16_t result_2 =
+            SelectPaeth(top[2], left, top_left, left_2_le_top,
+                        left_le_top_left_2, top_le_top_left_2);
+        vst1q_u8(dest_u8 + 32, result_2);
+
+        const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+        TOP_LEFT_DIST(3);
+        const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+        LEFT_LE_TOP_LEFT(3);
+        TOP_LE_TOP_LEFT(3);
+
+        const uint8x16_t result_3 =
+            SelectPaeth(top[3], left, top_left, left_3_le_top,
+                        left_le_top_left_3, top_le_top_left_3);
+        vst1q_u8(dest_u8 + 48, result_3);
+      }
+    }
+
+    dest_u8 += stride;
+  }
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+  using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+  using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+  using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+  using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+  using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+  using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+  using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+  using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+  using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+  using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+  using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+  using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+  using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+  using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+  using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+  using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+  using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+  using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  // 4x4
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<4, 4>;
+
+  // 4x8
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      DcDefs::_4x8::DcTop;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      DcDefs::_4x8::DcLeft;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
+      DcDefs::_4x8::Dc;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<4, 8>;
+
+  // 4x16
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
+      DcDefs::_4x16::DcTop;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
+      DcDefs::_4x16::DcLeft;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
+      DcDefs::_4x16::Dc;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<4, 16>;
+
+  // 8x4
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
+      DcDefs::_8x4::DcTop;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
+      DcDefs::_8x4::DcLeft;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
+      DcDefs::_8x4::Dc;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 4>;
+
+  // 8x8
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
+      DcDefs::_8x8::DcTop;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
+      DcDefs::_8x8::DcLeft;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
+      DcDefs::_8x8::Dc;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 8>;
+
+  // 8x16
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
+      DcDefs::_8x16::DcTop;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
+      DcDefs::_8x16::DcLeft;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
+      DcDefs::_8x16::Dc;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 16>;
+
+  // 8x32
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
+      DcDefs::_8x32::DcTop;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
+      DcDefs::_8x32::DcLeft;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
+      DcDefs::_8x32::Dc;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
+      Paeth4Or8xN_NEON<8, 32>;
+
+  // 16x4
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
+      DcDefs::_16x4::DcTop;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
+      DcDefs::_16x4::DcLeft;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
+      DcDefs::_16x4::Dc;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 4>;
+
+  // 16x8
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
+      DcDefs::_16x8::DcTop;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
+      DcDefs::_16x8::DcLeft;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
+      DcDefs::_16x8::Dc;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 8>;
+
+  // 16x16
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
+      DcDefs::_16x16::DcTop;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
+      DcDefs::_16x16::DcLeft;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
+      DcDefs::_16x16::Dc;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 16>;
+
+  // 16x32
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
+      DcDefs::_16x32::DcTop;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
+      DcDefs::_16x32::DcLeft;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
+      DcDefs::_16x32::Dc;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 32>;
+
+  // 16x64
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
+      DcDefs::_16x64::DcTop;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
+      DcDefs::_16x64::DcLeft;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
+      DcDefs::_16x64::Dc;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<16, 64>;
+
+  // 32x8
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
+      DcDefs::_32x8::DcTop;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
+      DcDefs::_32x8::DcLeft;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
+      DcDefs::_32x8::Dc;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 8>;
+
+  // 32x16
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
+      DcDefs::_32x16::DcTop;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
+      DcDefs::_32x16::DcLeft;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
+      DcDefs::_32x16::Dc;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 16>;
+
+  // 32x32
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
+      DcDefs::_32x32::DcTop;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
+      DcDefs::_32x32::DcLeft;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
+      DcDefs::_32x32::Dc;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 32>;
+
+  // 32x64
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
+      DcDefs::_32x64::DcTop;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
+      DcDefs::_32x64::DcLeft;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
+      DcDefs::_32x64::Dc;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<32, 64>;
+
+  // 64x16
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
+      DcDefs::_64x16::DcTop;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
+      DcDefs::_64x16::DcLeft;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
+      DcDefs::_64x16::Dc;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<64, 16>;
+
+  // 64x32
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
+      DcDefs::_64x32::DcTop;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
+      DcDefs::_64x32::DcLeft;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
+      DcDefs::_64x32::Dc;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<64, 32>;
+
+  // 64x64
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
+      DcDefs::_64x64::DcTop;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
+      DcDefs::_64x64::DcLeft;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
+      DcDefs::_64x64::Dc;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
+      Paeth16PlusxN_NEON<64, 64>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Add the elements in the given vectors together but do not sum the entire
+// vector.
+inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
+                      const uint16x8_t val_2, const uint16x8_t val_3) {
+  const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
+  const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// Load and combine 16 uint16_t values.
+inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
+  const uint16x8_t val_0 = vld1q_u16(buf);
+  const uint16x8_t val_1 = vld1q_u16(buf + 8);
+  return vaddq_u16(val_0, val_1);
+}
+
+// Load and combine 32 uint16_t values.
+inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
+  const uint16x8_t val_0 = vld1q_u16(buf);
+  const uint16x8_t val_1 = vld1q_u16(buf + 8);
+  const uint16x8_t val_2 = vld1q_u16(buf + 16);
+  const uint16x8_t val_3 = vld1q_u16(buf + 24);
+  return Add(val_0, val_1, val_2, val_3);
+}
+
+// Load and combine 64 uint16_t values.
+inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
+  const uint16x8_t val_0 = vld1q_u16(buf);
+  const uint16x8_t val_1 = vld1q_u16(buf + 8);
+  const uint16x8_t val_2 = vld1q_u16(buf + 16);
+  const uint16x8_t val_3 = vld1q_u16(buf + 24);
+  const uint16x8_t val_4 = vld1q_u16(buf + 32);
+  const uint16x8_t val_5 = vld1q_u16(buf + 40);
+  const uint16x8_t val_6 = vld1q_u16(buf + 48);
+  const uint16x8_t val_7 = vld1q_u16(buf + 56);
+  const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
+  const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
+  return vaddq_u16(sum_0, sum_1);
+}
+
+// |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
+// If |use_ref_1| is false then only sum |ref_0|.
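+// All partial sums stay in uint16 lanes: even the 64x64 case folds at most
+// 16 pixels into a lane before the final widening in Sum(), and
+// 16 * 1023 < 65536 for 10-bit input.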
+inline uint32x2_t DcSum_NEON(const void* ref_0, const int ref_0_size_log2,
+                             const bool use_ref_1, const void* ref_1,
+                             const int ref_1_size_log2) {
+  const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
+  const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
+  if (ref_0_size_log2 == 2) {
+    const uint16x4_t val_0 = vld1_u16(ref_0_u16);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 2) {  // 4x4
+        const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+        return Sum(vadd_u16(val_0, val_1));
+      } else if (ref_1_size_log2 == 3) {  // 4x8
+        const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+        const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+        return Sum(vaddq_u16(sum_0, val_1));
+      } else if (ref_1_size_log2 == 4) {  // 4x16
+        const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
+        const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+    // 4x1
+    return Sum(val_0);
+  } else if (ref_0_size_log2 == 3) {
+    const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 2) {  // 8x4
+        const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+        const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+        return Sum(vaddq_u16(val_0, sum_1));
+      } else if (ref_1_size_log2 == 3) {  // 8x8
+        const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+        return Sum(vaddq_u16(val_0, val_1));
+      } else if (ref_1_size_log2 == 4) {  // 8x16
+        const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+        return Sum(vaddq_u16(val_0, sum_1));
+      } else if (ref_1_size_log2 == 5) {  // 8x32
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+        return Sum(vaddq_u16(val_0, sum_1));
+      }
+    }
+    // 8x1
+    return Sum(val_0);
+  } else if (ref_0_size_log2 == 4) {
+    const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 2) {  // 16x4
+        const uint16x4_t val_1 = vld1_u16(ref_1_u16);
+        const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 3) {  // 16x8
+        const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, val_1));
+      } else if (ref_1_size_log2 == 4) {  // 16x16
+        const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 5) {  // 16x32
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 6) {  // 16x64
+        const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+    // 16x1
+    return Sum(sum_0);
+  } else if (ref_0_size_log2 == 5) {
+    const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
+    if (use_ref_1) {
+      if (ref_1_size_log2 == 3) {  // 32x8
+        const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, val_1));
+      } else if (ref_1_size_log2 == 4) {  // 32x16
+        const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 5) {  // 32x32
+        const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      } else if (ref_1_size_log2 == 6) {  // 32x64
+        const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+        return Sum(vaddq_u16(sum_0, sum_1));
+      }
+    }
+    // 32x1
+    return Sum(sum_0);
+  }
+
+  assert(ref_0_size_log2 == 6);
+  const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
+  if (use_ref_1) {
+    if (ref_1_size_log2 == 4) {  // 64x16
+      const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
+      return Sum(vaddq_u16(sum_0, sum_1));
+    } else if (ref_1_size_log2 == 5) {  // 64x32
+      const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
+      return Sum(vaddq_u16(sum_0, sum_1));
+    } else if (ref_1_size_log2 == 6) {  // 64x64
+      const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
+      return Sum(vaddq_u16(sum_0, sum_1));
+    }
+  }
+  // 64x1
+  return Sum(sum_0);
+}
+
+template <int width, int height>
+inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
+                         const uint32x2_t dc) {
+  auto* dest_u16 = static_cast<uint16_t*>(dest);
+  ptrdiff_t stride_u16 = stride >> 1;
+  const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
+  if (width == 4) {
+    int i = height - 1;
+    do {
+      vst1_u16(dest_u16, vget_low_u16(dc_dup));
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1_u16(dest_u16, vget_low_u16(dc_dup));
+  } else if (width == 8) {
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+  } else if (width == 16) {
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      vst1q_u16(dest_u16 + 8, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+    vst1q_u16(dest_u16 + 8, dc_dup);
+  } else if (width == 32) {
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      vst1q_u16(dest_u16 + 8, dc_dup);
+      vst1q_u16(dest_u16 + 16, dc_dup);
+      vst1q_u16(dest_u16 + 24, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+    vst1q_u16(dest_u16 + 8, dc_dup);
+    vst1q_u16(dest_u16 + 16, dc_dup);
+    vst1q_u16(dest_u16 + 24, dc_dup);
+  } else {
+    assert(width == 64);
+    int i = height - 1;
+    do {
+      vst1q_u16(dest_u16, dc_dup);
+      vst1q_u16(dest_u16 + 8, dc_dup);
+      vst1q_u16(dest_u16 + 16, dc_dup);
+      vst1q_u16(dest_u16 + 24, dc_dup);
+      vst1q_u16(dest_u16 + 32, dc_dup);
+      vst1q_u16(dest_u16 + 40, dc_dup);
+      vst1q_u16(dest_u16 + 48, dc_dup);
+      vst1q_u16(dest_u16 + 56, dc_dup);
+      dest_u16 += stride_u16;
+    } while (--i != 0);
+    vst1q_u16(dest_u16, dc_dup);
+    vst1q_u16(dest_u16 + 8, dc_dup);
+    vst1q_u16(dest_u16 + 16, dc_dup);
+    vst1q_u16(dest_u16 + 24, dc_dup);
+    vst1q_u16(dest_u16 + 32, dc_dup);
+    vst1q_u16(dest_u16 + 40, dc_dup);
+    vst1q_u16(dest_u16 + 48, dc_dup);
+    vst1q_u16(dest_u16 + 56, dc_dup);
+  }
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
+  using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
+  using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
+  using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
+  using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
+  using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
+  using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
+  using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
+  using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
+  using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
+  using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
+  using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
+  using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
+  using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
+  using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
+  using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
+  using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
+  using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
+  using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
+};
+
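For orientation: the DcSum_NEON/DcStore_NEON pair splits the DC predictor into a border-sum step and a broadcast step, with the divide by (width + height) handled in the shared DcPredFuncs_NEON wrapper. The following scalar sketch shows the equivalent computation; the helper name and signature are illustrative assumptions, not code from this patch.

#include <cstddef>
#include <cstdint>

// Scalar reference for the 10bpp DC predictor: average the top and left
// borders with rounding, then fill the block with that value.
void DcPredictorScalar(uint16_t* dst, ptrdiff_t stride_in_pixels, int width,
                       int height, const uint16_t* top, const uint16_t* left) {
  uint32_t sum = 0;
  for (int x = 0; x < width; ++x) sum += top[x];    // DcSum_NEON: top border
  for (int y = 0; y < height; ++y) sum += left[y];  // DcSum_NEON: left border
  const uint32_t count = static_cast<uint32_t>(width + height);
  const uint16_t dc = static_cast<uint16_t>((sum + (count >> 1)) / count);
  for (int y = 0; y < height; ++y) {                // DcStore_NEON: broadcast
    for (int x = 0; x < width; ++x) dst[x] = dc;
    dst += stride_in_pixels;
  }
}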
+void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DcDefs::_4x4::DcTop; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DcDefs::_4x4::DcLeft; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DcDefs::_4x4::Dc; + + // 4x8 + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DcDefs::_4x8::DcTop; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DcDefs::_4x8::DcLeft; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DcDefs::_4x8::Dc; + + // 4x16 + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DcDefs::_4x16::DcTop; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DcDefs::_4x16::DcLeft; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DcDefs::_4x16::Dc; + + // 8x4 + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DcDefs::_8x4::DcTop; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DcDefs::_8x4::DcLeft; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DcDefs::_8x4::Dc; + + // 8x8 + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DcDefs::_8x8::DcTop; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DcDefs::_8x8::DcLeft; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DcDefs::_8x8::Dc; + + // 8x16 + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DcDefs::_8x16::DcTop; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DcDefs::_8x16::DcLeft; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DcDefs::_8x16::Dc; + + // 8x32 + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DcDefs::_8x32::DcTop; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DcDefs::_8x32::DcLeft; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DcDefs::_8x32::Dc; + + // 16x4 + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DcDefs::_16x4::DcTop; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DcDefs::_16x4::DcLeft; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DcDefs::_16x4::Dc; + + // 16x8 + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DcDefs::_16x8::DcTop; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DcDefs::_16x8::DcLeft; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DcDefs::_16x8::Dc; + + // 16x16 + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DcDefs::_16x16::DcTop; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DcDefs::_16x16::DcLeft; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DcDefs::_16x16::Dc; + + // 16x32 + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DcDefs::_16x32::DcTop; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DcDefs::_16x32::DcLeft; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DcDefs::_16x32::Dc; + + // 16x64 + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DcDefs::_16x64::DcTop; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DcDefs::_16x64::DcLeft; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DcDefs::_16x64::Dc; + + // 32x8 + 
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DcDefs::_32x8::DcTop; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DcDefs::_32x8::DcLeft; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DcDefs::_32x8::Dc; + + // 32x16 + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DcDefs::_32x16::DcTop; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DcDefs::_32x16::DcLeft; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DcDefs::_32x16::Dc; + + // 32x32 + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DcDefs::_32x32::DcTop; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DcDefs::_32x32::DcLeft; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DcDefs::_32x32::Dc; + + // 32x64 + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DcDefs::_32x64::DcTop; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DcDefs::_32x64::DcLeft; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DcDefs::_32x64::Dc; + + // 64x16 + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DcDefs::_64x16::DcTop; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DcDefs::_64x16::DcLeft; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DcDefs::_64x16::Dc; + + // 64x32 + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DcDefs::_64x32::DcTop; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DcDefs::_64x32::DcLeft; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DcDefs::_64x32::Dc; + + // 64x64 + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DcDefs::_64x64::DcTop; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DcDefs::_64x64::DcLeft; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DcDefs::_64x64::Dc; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraPredInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/intrapred_neon.h b/src/dsp/arm/intrapred_neon.h new file mode 100644 index 0000000..16f858c --- /dev/null +++ b/src/dsp/arm/intrapred_neon.h @@ -0,0 +1,418 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, +// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and +// Dsp::filter_intra_predictor, see the defines below for specifics. These +// functions are not thread-safe. +void IntraPredCflInit_NEON(); +void IntraPredDirectionalInit_NEON(); +void IntraPredFilterIntraInit_NEON(); +void IntraPredInit_NEON(); +void IntraPredSmoothInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +// 8 bit +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON + +// 4x4 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x8 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 16x64 +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON + +// 32x64 +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +// 64x16 +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define 
LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +// 64x32 +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +// 64x64 +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_NEON + +// 10 bit +// 4x4 +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON + +// 4x8 +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON + +// 4x16 +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON + +// 8x4 +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON + +// 8x8 +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON + +// 8x16 +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON + +// 8x32 +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON + +// 16x4 +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON + +// 16x8 +#define 
LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON + +// 16x16 +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON + +// 16x32 +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON + +// 16x64 +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON + +// 32x8 +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON + +// 32x16 +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON + +// 32x32 +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON + +// 32x64 +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON + +// 64x16 +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON + +// 64x32 +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON + +// 64x64 +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_ diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc new file mode 100644 index 0000000..abc93e8 --- /dev/null +++ b/src/dsp/arm/intrapred_smooth_neon.cc @@ -0,0 +1,616 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and in the
+// creation of the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83,
+    74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18,
+    16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+// TODO(b/150459137): Keeping the intermediate values in uint16_t would allow
+// processing more values at once. At the high end, it could do 4x4 or 8x2 at a
+// time.
+inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
+                                const uint16x4_t weighted_left,
+                                const uint16x4_t weighted_bl,
+                                const uint16x4_t weighted_tr) {
+  const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
+  const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
+  const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
+  return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+}
+
+template <int width, int height>
+inline void Smooth4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+                              const void* const top_row,
+                              const void* const left_column) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  uint8x8_t top_v;
+  if (width == 4) {
+    top_v = Load4(top);
+  } else {  // width == 8
+    top_v = vld1_u8(top);
+  }
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+  // Over-reads for 4xN but still within the array.
+  const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
+  // 256 - weights = vneg_s8(weights)
+  const uint8x8_t scaled_weights_x =
+      vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+    const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+    const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
+    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+    const uint16x4_t dest_0 =
+        CalculatePred(vget_low_u16(weighted_top), vget_low_u16(weighted_left),
+                      vget_low_u16(weighted_tr), vget_low_u16(weighted_bl));
+
+    if (width == 4) {
+      StoreLo4(dst, vmovn_u16(vcombine_u16(dest_0, dest_0)));
+    } else {  // width == 8
+      const uint16x4_t dest_1 = CalculatePred(
+          vget_high_u16(weighted_top), vget_high_u16(weighted_left),
+          vget_high_u16(weighted_tr), vget_high_u16(weighted_bl));
+      vst1_u8(dst, vmovn_u16(vcombine_u16(dest_0, dest_1)));
+    }
+    dst += stride;
+  }
+}
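Per pixel, the code above computes the AV1 smooth blend: a vertical interpolation toward bottom_left plus a horizontal interpolation toward top_right, rounded with a single shift of kSmoothWeightScale + 1 (the vrshrn_n_u32(..., 9) in CalculatePred). A hedged scalar sketch of the same arithmetic, assuming kSmoothWeightScale == 8; the function and parameter names are illustrative, not part of the patch.

#include <cstddef>
#include <cstdint>

// weights_x/weights_y point at the kSmoothWeights entries for the block
// dimension, i.e. kSmoothWeights + width - 4 and kSmoothWeights + height - 4.
void SmoothScalar(uint8_t* dst, ptrdiff_t stride, int width, int height,
                  const uint8_t* top, const uint8_t* left,
                  const uint8_t* weights_x, const uint8_t* weights_y) {
  const uint8_t top_right = top[width - 1];
  const uint8_t bottom_left = left[height - 1];
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // Vertical blend toward bottom_left plus horizontal blend toward
      // top_right; both share one rounded shift by 9.
      const uint32_t pred = weights_y[y] * top[x] +
                            (256 - weights_y[y]) * bottom_left +
                            weights_x[x] * left[y] +
                            (256 - weights_x[x]) * top_right;
      dst[x] = static_cast<uint8_t>((pred + 256) >> 9);
    }
    dst += stride;
  }
}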
+
+inline uint8x16_t CalculateWeightsAndPred(
+    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+    const uint8x8_t weights_y, const uint8x16_t weights_x,
+    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+  const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_tr_low =
+      vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+  const uint16x4_t dest_0 = CalculatePred(
+      vget_low_u16(weighted_top_low), vget_low_u16(weighted_left_low),
+      vget_low_u16(weighted_tr_low), vget_low_u16(weighted_bl));
+  const uint16x4_t dest_1 = CalculatePred(
+      vget_high_u16(weighted_top_low), vget_high_u16(weighted_left_low),
+      vget_high_u16(weighted_tr_low), vget_high_u16(weighted_bl));
+  const uint8x8_t dest_0_u8 = vmovn_u16(vcombine_u16(dest_0, dest_1));
+
+  const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+  const uint16x8_t weighted_left_high =
+      vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_tr_high =
+      vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+  const uint16x4_t dest_2 = CalculatePred(
+      vget_low_u16(weighted_top_high), vget_low_u16(weighted_left_high),
+      vget_low_u16(weighted_tr_high), vget_low_u16(weighted_bl));
+  const uint16x4_t dest_3 = CalculatePred(
+      vget_high_u16(weighted_top_high), vget_high_u16(weighted_left_high),
+      vget_high_u16(weighted_tr_high), vget_high_u16(weighted_bl));
+  const uint8x8_t dest_1_u8 = vmovn_u16(vcombine_u16(dest_2, dest_3));
+
+  return vcombine_u8(dest_0_u8, dest_1_u8);
+}
+
+template <int width, int height>
+inline void Smooth16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  uint8x16_t top_v[4];
+  top_v[0] = vld1q_u8(top);
+  if (width > 16) {
+    top_v[1] = vld1q_u8(top + 16);
+    if (width == 64) {
+      top_v[2] = vld1q_u8(top + 32);
+      top_v[3] = vld1q_u8(top + 48);
+    }
+  }
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+  // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
+  // This currently has a performance slope similar to Paeth so it does not
+  // appear to be register bound for arm64.
+  uint8x16_t weights_x_v[4];
+  weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
+  if (width > 16) {
+    weights_x_v[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+    if (width == 64) {
+      weights_x_v[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+      weights_x_v[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+    }
+  }
+
+  uint8x16_t scaled_weights_x[4];
+  scaled_weights_x[0] =
+      vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+  if (width > 16) {
+    scaled_weights_x[1] =
+        vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+    if (width == 64) {
+      scaled_weights_x[2] =
+          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
+      scaled_weights_x[3] =
+          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+    vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
+                                          weights_y_v, weights_x_v[0],
+                                          scaled_weights_x[0], weighted_bl));
+
+    if (width > 16) {
+      vst1q_u8(dst + 16, CalculateWeightsAndPred(
+                             top_v[1], left_v, top_right_v, weights_y_v,
+                             weights_x_v[1], scaled_weights_x[1], weighted_bl));
+      if (width == 64) {
+        vst1q_u8(dst + 32,
+                 CalculateWeightsAndPred(top_v[2], left_v, top_right_v,
+                                         weights_y_v, weights_x_v[2],
+                                         scaled_weights_x[2], weighted_bl));
+        vst1q_u8(dst + 48,
+                 CalculateWeightsAndPred(top_v[3], left_v, top_right_v,
+                                         weights_y_v, weights_x_v[3],
+                                         scaled_weights_x[3], weighted_bl));
+      }
+    }
+
+    dst += stride;
+  }
+}
+
+template <int width, int height>
+inline void SmoothVertical4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+                                      const void* const top_row,
+                                      const void* const left_column) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  uint8x8_t top_v;
+  if (width == 4) {
+    top_v = Load4(top);
+  } else {  // width == 8
+    top_v = vld1_u8(top);
+  }
+
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+
+    const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+    const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
+    const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+    if (width == 4) {
+      StoreLo4(dst, pred_scaled);
+    } else {  // width == 8
+      vst1_u8(dst, pred_scaled);
+    }
+    dst += stride;
+  }
+}
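The vertical-only predictor above is the same blend with the horizontal terms dropped, so a 16-bit accumulator and a rounded narrowing shift by kSmoothWeightScale suffice. A scalar model of one pixel, with a hypothetical helper name and assuming kSmoothWeightScale == 8:

#include <cstdint>

inline uint8_t SmoothVerticalPixel(uint8_t top, uint8_t bottom_left,
                                   uint8_t weight_y) {
  // Mirrors vmull_u8 + vaddq_u16 + vrshrn_n_u16(..., 8) for a single lane.
  const uint32_t pred = weight_y * top + (256 - weight_y) * bottom_left;
  return static_cast<uint8_t>((pred + 128) >> 8);
}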
+
+inline uint8x16_t CalculateVerticalWeightsAndPred(
+    const uint8x16_t top, const uint8x8_t weights_y,
+    const uint16x8_t weighted_bl) {
+  const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+  const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+  const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
+  const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+  const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(pred_high, kSmoothWeightScale);
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothVertical16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+                                        const void* const top_row,
+                                        const void* const left_column) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t bottom_left = left[height - 1];
+  const uint8_t* const weights_y = kSmoothWeights + height - 4;
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  uint8x16_t top_v[4];
+  top_v[0] = vld1q_u8(top);
+  if (width > 16) {
+    top_v[1] = vld1q_u8(top + 16);
+    if (width == 64) {
+      top_v[2] = vld1q_u8(top + 32);
+      top_v[3] = vld1q_u8(top + 48);
+    }
+  }
+
+  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+    const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+
+    const uint8x16_t pred_0 =
+        CalculateVerticalWeightsAndPred(top_v[0], weights_y_v, weighted_bl);
+    vst1q_u8(dst, pred_0);
+
+    if (width > 16) {
+      const uint8x16_t pred_1 =
+          CalculateVerticalWeightsAndPred(top_v[1], weights_y_v, weighted_bl);
+      vst1q_u8(dst + 16, pred_1);
+
+      if (width == 64) {
+        const uint8x16_t pred_2 =
+            CalculateVerticalWeightsAndPred(top_v[2], weights_y_v, weighted_bl);
+        vst1q_u8(dst + 32, pred_2);
+
+        const uint8x16_t pred_3 =
+            CalculateVerticalWeightsAndPred(top_v[3], weights_y_v, weighted_bl);
+        vst1q_u8(dst + 48, pred_3);
+      }
+    }
+
+    dst += stride;
+  }
+}
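The horizontal path below reuses the "256 - weights = vneg_s8(weights)" trick from the blended predictors: for any weight w in [1, 255], negation modulo 256 equals 256 - w, and vmull_u8 widens to 16 bits before multiplying, so no product is lost (kSmoothWeights never contains 0). A quick illustrative check of the identity, not taken from the patch:

#include <cassert>
#include <cstdint>

inline void CheckScaledWeightIdentity() {
  for (int w = 1; w < 256; ++w) {
    // Per-lane effect of vneg_s8 reinterpreted back to unsigned.
    const uint8_t negated = static_cast<uint8_t>(-w);
    assert(negated == 256 - w);
  }
}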
+
+template <int width, int height>
+inline void SmoothHorizontal4Or8xN_NEON(void* const dest, ptrdiff_t stride,
+                                        const void* const top_row,
+                                        const void* const left_column) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+  // Over-reads for 4xN but still within the array.
+  const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
+  // 256 - weights = vneg_s8(weights)
+  const uint8x8_t scaled_weights_x =
+      vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+    const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
+    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+    const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
+    const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+
+    if (width == 4) {
+      StoreLo4(dst, pred_scaled);
+    } else {  // width == 8
+      vst1_u8(dst, pred_scaled);
+    }
+    dst += stride;
+  }
+}
+
+inline uint8x16_t CalculateHorizontalWeightsAndPred(
+    const uint8x8_t left, const uint8x8_t top_right,
+    const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) {
+  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+  const uint16x8_t weighted_tr_low =
+      vmull_u8(vget_low_u8(scaled_weights_x), top_right);
+  const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
+  const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+
+  const uint16x8_t weighted_left_high =
+      vmull_u8(vget_high_u8(weights_x), left);
+  const uint16x8_t weighted_tr_high =
+      vmull_u8(vget_high_u8(scaled_weights_x), top_right);
+  const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+  const uint8x8_t pred_scaled_high =
+      vrshrn_n_u16(pred_high, kSmoothWeightScale);
+
+  return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+template <int width, int height>
+inline void SmoothHorizontal16PlusxN_NEON(void* const dest, ptrdiff_t stride,
+                                          const void* const top_row,
+                                          const void* const left_column) {
+  const uint8_t* const top = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const left = static_cast<const uint8_t*>(left_column);
+  const uint8_t top_right = top[width - 1];
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t top_right_v = vdup_n_u8(top_right);
+
+  uint8x16_t weights_x[4];
+  weights_x[0] = vld1q_u8(kSmoothWeights + width - 4);
+  if (width > 16) {
+    weights_x[1] = vld1q_u8(kSmoothWeights + width + 16 - 4);
+    if (width == 64) {
+      weights_x[2] = vld1q_u8(kSmoothWeights + width + 32 - 4);
+      weights_x[3] = vld1q_u8(kSmoothWeights + width + 48 - 4);
+    }
+  }
+
+  uint8x16_t scaled_weights_x[4];
+  scaled_weights_x[0] =
+      vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+  if (width > 16) {
+    scaled_weights_x[1] =
+        vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+    if (width == 64) {
+      scaled_weights_x[2] =
+          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
+      scaled_weights_x[3] =
+          vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+    }
+  }
+
+  for (int y = 0; y < height; ++y) {
+    const uint8x8_t left_v = vdup_n_u8(left[y]);
+
+    const uint8x16_t pred_0 = CalculateHorizontalWeightsAndPred(
+        left_v, top_right_v, weights_x[0], scaled_weights_x[0]);
+    vst1q_u8(dst, pred_0);
+
+    if (width > 16) {
+      const uint8x16_t pred_1 = CalculateHorizontalWeightsAndPred(
+          left_v, top_right_v, weights_x[1], scaled_weights_x[1]);
+      vst1q_u8(dst + 16, pred_1);
+
+      if (width == 64) {
+        const uint8x16_t pred_2 = CalculateHorizontalWeightsAndPred(
+            left_v, top_right_v, weights_x[2], scaled_weights_x[2]);
+        vst1q_u8(dst + 32, pred_2);
+
+        const uint8x16_t pred_3 = CalculateHorizontalWeightsAndPred(
+            left_v, top_right_v, weights_x[3], scaled_weights_x[3]);
+        vst1q_u8(dst + 48, pred_3);
+      }
+    }
+    dst += stride;
+  }
+}
+
+void
Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + // 4x4 + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<4, 4>; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<4, 4>; + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<4, 4>; + + // 4x8 + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<4, 8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<4, 8>; + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<4, 8>; + + // 4x16 + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<4, 16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<4, 16>; + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<4, 16>; + + // 8x4 + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<8, 4>; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<8, 4>; + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<8, 4>; + + // 8x8 + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<8, 8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<8, 8>; + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<8, 8>; + + // 8x16 + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<8, 16>; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<8, 16>; + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<8, 16>; + + // 8x32 + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Smooth4Or8xN_NEON<8, 32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + SmoothVertical4Or8xN_NEON<8, 32>; + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4Or8xN_NEON<8, 32>; + + // 16x4 + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<16, 4>; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<16, 4>; + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<16, 4>; + + // 16x8 + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<16, 8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<16, 8>; + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<16, 8>; + + // 16x16 + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<16, 16>; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<16, 16>; + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<16, 16>; + + // 16x32 + 
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<16, 32>; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<16, 32>; + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<16, 32>; + + // 16x64 + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<16, 64>; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<16, 64>; + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<16, 64>; + + // 32x8 + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<32, 8>; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<32, 8>; + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<32, 8>; + + // 32x16 + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<32, 16>; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<32, 16>; + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<32, 16>; + + // 32x32 + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<32, 32>; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<32, 32>; + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<32, 32>; + + // 32x64 + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<32, 64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<32, 64>; + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<32, 64>; + + // 64x16 + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<64, 16>; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<64, 16>; + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<64, 16>; + + // 64x32 + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<64, 32>; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<64, 32>; + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<64, 32>; + + // 64x64 + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + Smooth16PlusxN_NEON<64, 64>; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + SmoothVertical16PlusxN_NEON<64, 64>; + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16PlusxN_NEON<64, 64>; +} + +} // namespace +} // namespace low_bitdepth + +void IntraPredSmoothInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void IntraPredSmoothInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/inverse_transform_neon.cc 
b/src/dsp/arm/inverse_transform_neon.cc
new file mode 100644
index 0000000..072991a
--- /dev/null
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -0,0 +1,3128 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+// TODO(slavarnway): Move transpose functions to transpose_neon.h or
+// common_neon.h.
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
+                                        int16x8_t out[4]) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+  const int16x4_t a0 = vget_low_s16(in[0]);
+  const int16x4_t a1 = vget_low_s16(in[1]);
+  const int16x4_t a2 = vget_low_s16(in[2]);
+  const int16x4_t a3 = vget_low_s16(in[3]);
+
+  const int16x4x2_t b0 = vtrn_s16(a0, a1);
+  const int16x4x2_t b1 = vtrn_s16(a2, a3);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+                                  vreinterpret_s32_s16(b1.val[0]));
+  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+                                  vreinterpret_s32_s16(b1.val[1]));
+
+  const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]);
+  const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]);
+  const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]);
+  const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]);
+
+  out[0] = vcombine_s16(d0, d0);
+  out[1] = vcombine_s16(d1, d1);
+  out[2] = vcombine_s16(d2, d2);
+  out[3] = vcombine_s16(d3, d3);
+}
+
+// Note this is only used in the final stage of Dct32/64 and Adst16 as the in
+// place version causes additional stack usage with clang.
+LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
+                                        int16x8_t out[8]) {
+  // Swap 16 bit elements.
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]); + const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]); + const int16x8x2_t b2 = vtrnq_s16(in[4], in[5]); + const int16x8x2_t b3 = vtrnq_s16(in[6], in[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const uint16x8_t in[8], + uint16x8_t out[4]) { + // Swap 16 bit elements. 
Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // a4: 40 41 42 43
+  // a5: 50 51 52 53
+  // a6: 60 61 62 63
+  // a7: 70 71 72 73
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+  // b2.val[0]: 40 50 42 52
+  // b2.val[1]: 41 51 43 53
+  // b3.val[0]: 60 70 62 72
+  // b3.val[1]: 61 71 63 73
+
+  uint16x4x2_t b0 = vtrn_u16(vget_low_u16(in[0]), vget_low_u16(in[1]));
+  uint16x4x2_t b1 = vtrn_u16(vget_low_u16(in[2]), vget_low_u16(in[3]));
+  uint16x4x2_t b2 = vtrn_u16(vget_low_u16(in[4]), vget_low_u16(in[5]));
+  uint16x4x2_t b3 = vtrn_u16(vget_low_u16(in[6]), vget_low_u16(in[7]));
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+  // c2.val[0]: 40 50 60 70
+  // c2.val[1]: 42 52 62 72
+  // c3.val[0]: 41 51 61 71
+  // c3.val[1]: 43 53 63 73
+
+  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+                             vreinterpret_u32_u16(b1.val[0]));
+  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+                             vreinterpret_u32_u16(b1.val[1]));
+  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+                             vreinterpret_u32_u16(b3.val[0]));
+  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+                             vreinterpret_u32_u16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // o0: 00 10 20 30 40 50 60 70
+  // o1: 01 11 21 31 41 51 61 71
+  // o2: 02 12 22 32 42 52 62 72
+  // o3: 03 13 23 33 43 53 63 73
+
+  out[0] = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+                        vreinterpret_u16_u32(c2.val[0]));
+  out[1] = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+                        vreinterpret_u16_u32(c3.val[0]));
+  out[2] = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+                        vreinterpret_u16_u32(c2.val[1]));
+  out[3] = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+                        vreinterpret_u16_u32(c3.val[1]));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4(const int16x8_t in[8],
+                                             int16x8_t out[4]) {
+  Transpose4x8To8x4(reinterpret_cast<const uint16x8_t*>(in),
+                    reinterpret_cast<uint16x8_t*>(out));
+}
+
+LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8(const int16x8_t in[4],
+                                             int16x8_t out[8]) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+
+  // The upper 8 bytes are don't cares.
+  // out[0]: 00 10 20 30 04 14 24 34
+  // out[1]: 01 11 21 31 05 15 25 35
+  // out[2]: 02 12 22 32 06 16 26 36
+  // out[3]: 03 13 23 33 07 17 27 37
+  // out[4]: 04 14 24 34 04 14 24 34
+  // out[5]: 05 15 25 35 05 15 25 35
+  // out[6]: 06 16 26 36 06 16 26 36
+  // out[7]: 07 17 27 37 07 17 27 37
+  out[0] = vreinterpretq_s16_s32(c0.val[0]);
+  out[1] = vreinterpretq_s16_s32(c1.val[0]);
+  out[2] = vreinterpretq_s16_s32(c0.val[1]);
+  out[3] = vreinterpretq_s16_s32(c1.val[1]);
+  out[4] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[0])));
+  out[5] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c1.val[0]), vget_high_s32(c1.val[0])));
+  out[6] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c0.val[1]), vget_high_s32(c0.val[1])));
+  out[7] = vreinterpretq_s16_s32(
+      vcombine_s32(vget_high_s32(c1.val[1]), vget_high_s32(c1.val[1])));
+}
+
+//------------------------------------------------------------------------------
+template <int store_width, int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx,
+                                    const int16x8_t* const s) {
+  assert(store_count % 4 == 0);
+  assert(store_width == 8 || store_width == 16);
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (store_width == 16) {
+    for (int i = 0; i < store_count; i += 4) {
+      vst1q_s16(&dst[i * stride + idx], (s[i]));
+      vst1q_s16(&dst[(i + 1) * stride + idx], (s[i + 1]));
+      vst1q_s16(&dst[(i + 2) * stride + idx], (s[i + 2]));
+      vst1q_s16(&dst[(i + 3) * stride + idx], (s[i + 3]));
+    }
+  } else {
+    // store_width == 8
+    for (int i = 0; i < store_count; i += 4) {
+      vst1_s16(&dst[i * stride + idx], vget_low_s16(s[i]));
+      vst1_s16(&dst[(i + 1) * stride + idx], vget_low_s16(s[i + 1]));
+      vst1_s16(&dst[(i + 2) * stride + idx], vget_low_s16(s[i + 2]));
+      vst1_s16(&dst[(i + 3) * stride + idx], vget_low_s16(s[i + 3]));
+    }
+  }
+}
+
+template <int load_width, int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride,
+                                   int32_t idx, int16x8_t* x) {
+  assert(load_count % 4 == 0);
+  assert(load_width == 8 || load_width == 16);
+  // NOTE: It is expected that the compiler will unroll these loops.
+  if (load_width == 16) {
+    for (int i = 0; i < load_count; i += 4) {
+      x[i] = vld1q_s16(&src[i * stride + idx]);
+      x[i + 1] = vld1q_s16(&src[(i + 1) * stride + idx]);
+      x[i + 2] = vld1q_s16(&src[(i + 2) * stride + idx]);
+      x[i + 3] = vld1q_s16(&src[(i + 3) * stride + idx]);
+    }
+  } else {
+    // load_width == 8
+    const int64x2_t zero = vdupq_n_s64(0);
+    for (int i = 0; i < load_count; i += 4) {
+      // The src buffer is aligned to 32 bytes. Each load will always be 8
+      // byte aligned.
+      x[i] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[i * stride + idx]), zero, 0));
+      x[i + 1] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 1) * stride + idx]), zero,
+          0));
+      x[i + 2] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 2) * stride + idx]), zero,
+          0));
+      x[i + 3] = vreinterpretq_s16_s64(vld1q_lane_s64(
+          reinterpret_cast<const int64_t*>(&src[(i + 3) * stride + idx]), zero,
+          0));
+    }
+  }
+}
+
+// Butterfly rotate 4 values.
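+// A butterfly rotation replaces the input pair (a, b) with the rotated pair
+// (x, y), using cosines/sines stored in 1.12 fixed point. In scalar terms the
+// NEON code below computes, roughly:
+//
+//   x = (a * cos128 - b * sin128 + 2048) >> 12;
+//   y = (a * sin128 + b * cos128 + 2048) >> 12;
+//
+// with saturation on the narrowing step; |flip| swaps the two outputs.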
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int16x8_t* a, int16x8_t* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128); + const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128); + const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128); + const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128); + const int16x4_t x1 = vqrshrn_n_s32(x0, 12); + const int16x4_t y1 = vqrshrn_n_s32(y0, 12); + const int16x8_t x = vcombine_s16(x1, x1); + const int16x8_t y = vcombine_s16(y1, y1); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +// Butterfly rotate 8 values. +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(int16x8_t* a, int16x8_t* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const int32x4_t acc_x = vmull_n_s16(vget_low_s16(*a), cos128); + const int32x4_t acc_y = vmull_n_s16(vget_low_s16(*a), sin128); + const int32x4_t x0 = vmlsl_n_s16(acc_x, vget_low_s16(*b), sin128); + const int32x4_t y0 = vmlal_n_s16(acc_y, vget_low_s16(*b), cos128); + const int16x4_t x1 = vqrshrn_n_s32(x0, 12); + const int16x4_t y1 = vqrshrn_n_s32(y0, 12); + + const int32x4_t acc_x_hi = vmull_n_s16(vget_high_s16(*a), cos128); + const int32x4_t acc_y_hi = vmull_n_s16(vget_high_s16(*a), sin128); + const int32x4_t x0_hi = vmlsl_n_s16(acc_x_hi, vget_high_s16(*b), sin128); + const int32x4_t y0_hi = vmlal_n_s16(acc_y_hi, vget_high_s16(*b), cos128); + const int16x4_t x1_hi = vqrshrn_n_s32(x0_hi, 12); + const int16x4_t y1_hi = vqrshrn_n_s32(y0_hi, 12); + + const int16x8_t x = vcombine_s16(x1, x1_hi); + const int16x8_t y = vcombine_s16(y1, y1_hi); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a, + int16x8_t* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + // For this function, the max value returned by Sin128() is 4091, which fits + // inside 12 bits. This leaves room for the sign bit and the 3 left shifted + // bits. + assert(sin128 <= 0xfff); + const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3); + const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int16x8_t* a, + int16x8_t* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3); + const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void HadamardRotation(int16x8_t* a, int16x8_t* b, + bool flip) { + int16x8_t x, y; + if (flip) { + y = vqaddq_s16(*b, *a); + x = vqsubq_s16(*b, *a); + } else { + x = vqaddq_s16(*a, *b); + y = vqsubq_s16(*a, *b); + } + *a = x; + *b = y; +} + +using ButterflyRotationFunc = void (*)(int16x8_t* a, int16x8_t* b, int angle, + bool flip); + +//------------------------------------------------------------------------------ +// Discrete Cosine Transforms (DCT). 
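+// DC-only fast path: when |adjusted_tx_height| is 1 only dst[0] is nonzero,
+// so the whole DCT collapses, in effect, to one fixed-point scale per pass,
+// roughly:
+//
+//   dst[0] = (dst[0] * Cos128(32) + 2048) >> 12;  // Cos128(32) == cos(pi/4)
+//
+// The DcOnly helpers below implement this and return false when more than one
+// row of coefficients is present, so callers fall back to the full transform.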
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
+  const int16_t cos128 = Cos128(32);
+  const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
+  // vqrshlq_s16 will shift right if shift value is negative.
+  const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
+
+  if (width == 4) {
+    vst1_s16(dst, vget_low_s16(xy_shifted));
+  } else {
+    for (int i = 0; i < width; i += 8) {
+      vst1q_s16(dst, xy_shifted);
+      dst += 8;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const int16_t cos128 = Cos128(32);
+
+  // Calculate dc values for first row.
+  if (width == 4) {
+    const int16x4_t v_src = vld1_s16(dst);
+    const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
+    vst1_s16(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const int16x8_t v_src = vld1q_s16(&dst[i]);
+      const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
+      vst1q_s16(&dst[i], xy);
+      i += 8;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  HadamardRotation(&s[0], &s[3], false);
+  HadamardRotation(&s[1], &s[2], false);
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step,
+                                     bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[4], x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[8];
+      LoadSrc<8, 8>(dst, step, 0, input);
+      Transpose4x8To8x4(input, x);
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    LoadSrc<8, 4>(dst, step, 0, x);
+    if (transpose) {
+      Transpose4x4(x, x);
+    }
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[8];
+      Transpose8x4To4x8(s, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, s);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4(s, s);
+    }
+    StoreDst<8, 4>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false);
+  HadamardRotation(&s[6], &s[7], true);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  HadamardRotation(&s[0], &s[7], false);
+  HadamardRotation(&s[1], &s[6], false);
+  HadamardRotation(&s[2], &s[5], false);
+  HadamardRotation(&s[3], &s[4], false);
+}
+
+// Process dct8 rows or columns, depending on the transpose flag.
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step,
+                                     bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else if (transpose) {
+    LoadSrc<16, 8>(dst, step, 0, x);
+    dsp::Transpose8x8(x);
+  } else {
+    LoadSrc<16, 8>(dst, step, 0, x);
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, s);
+    }
+  } else if (transpose) {
+    dsp::Transpose8x8(s);
+    StoreDst<16, 8>(dst, step, 0, s);
+  } else {
+    StoreDst<16, 8>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false);
+  HadamardRotation(&s[10], &s[11], true);
+  HadamardRotation(&s[12], &s[13], false);
+  HadamardRotation(&s[14], &s[15], true);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false);
+  HadamardRotation(&s[9], &s[10], false);
+  HadamardRotation(&s[12], &s[15], true);
+  HadamardRotation(&s[13], &s[14], true);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  HadamardRotation(&s[0], &s[15], false);
+  HadamardRotation(&s[1], &s[14], false);
+  HadamardRotation(&s[2], &s[13], false);
+  HadamardRotation(&s[3], &s[12], false);
+  HadamardRotation(&s[4], &s[11], false);
+  HadamardRotation(&s[5], &s[10], false);
+  HadamardRotation(&s[6], &s[9], false);
+  HadamardRotation(&s[7], &s[8], false);
+}
+
+// Process dct16 rows or columns, depending on the transpose flag.
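+// Rows are handled by transposing 8x8 tiles on load and store so the same
+// column-oriented butterfly code serves both passes. For row passes,
+// |row_shift| applies the row rounding, roughly
+//   x = (x + (1 << (row_shift - 1))) >> row_shift,
+// done here with the saturating vqrshlq_s16 and a negative shift count.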
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    LoadSrc<16, 16>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
+
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (int i = 0; i < 16; ++i) {
+      s[i] = vqrshlq_s16(s[i], v_row_shift);
+    }
+  }
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(s, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4(&s[8], output);
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, s);
+    }
+  } else if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      dsp::Transpose8x8(&s[idx]);
+      StoreDst<16, 8>(dst, step, idx, &s[idx]);
+    }
+  } else {
+    StoreDst<16, 16>(dst, step, 0, s);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
+  }
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false);
+  HadamardRotation(&s[18], &s[19], true);
+  HadamardRotation(&s[20], &s[21], false);
+  HadamardRotation(&s[22], &s[23], true);
+  HadamardRotation(&s[24], &s[25], false);
+  HadamardRotation(&s[26], &s[27], true);
+  HadamardRotation(&s[28], &s[29], false);
+  HadamardRotation(&s[30], &s[31], true);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+  HadamardRotation(&s[16], &s[19], false);
+  HadamardRotation(&s[17], &s[18], false);
+  HadamardRotation(&s[20], &s[23], true);
+  HadamardRotation(&s[21], &s[22], true);
+  HadamardRotation(&s[24], &s[27], false);
+  HadamardRotation(&s[25], &s[26], false);
+  HadamardRotation(&s[28], &s[31], true);
+  HadamardRotation(&s[29], &s[30], true);
+
+  // stage 20.
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+  // stage 24.
+  HadamardRotation(&s[16], &s[23], false);
+  HadamardRotation(&s[17], &s[22], false);
+  HadamardRotation(&s[18], &s[21], false);
+  HadamardRotation(&s[19], &s[20], false);
+  HadamardRotation(&s[24], &s[31], true);
+  HadamardRotation(&s[25], &s[30], true);
+  HadamardRotation(&s[26], &s[29], true);
+  HadamardRotation(&s[27], &s[28], true);
+
+  // stage 27.
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
+
+  // stage 29.
+  HadamardRotation(&s[0], &s[31], false);
+  HadamardRotation(&s[1], &s[30], false);
+  HadamardRotation(&s[2], &s[29], false);
+  HadamardRotation(&s[3], &s[28], false);
+  HadamardRotation(&s[4], &s[27], false);
+  HadamardRotation(&s[5], &s[26], false);
+  HadamardRotation(&s[6], &s[25], false);
+  HadamardRotation(&s[7], &s[24], false);
+  HadamardRotation(&s[8], &s[23], false);
+  HadamardRotation(&s[9], &s[22], false);
+  HadamardRotation(&s[10], &s[21], false);
+  HadamardRotation(&s[11], &s[20], false);
+  HadamardRotation(&s[12], &s[19], false);
+  HadamardRotation(&s[13], &s[18], false);
+  HadamardRotation(&s[14], &s[17], false);
+  HadamardRotation(&s[15], &s[16], false);
+}
+
+// Process dct32 rows or columns, depending on the transpose flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+                                      const bool is_row, int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[32], x[32];
+
+  if (is_row) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+  s[0] = x[0];
+  s[1] = x[16];
+  s[2] = x[8];
+  s[3] = x[24];
+  s[4] = x[4];
+  s[5] = x[20];
+  s[6] = x[12];
+  s[7] = x[28];
+  s[8] = x[2];
+  s[9] = x[18];
+  s[10] = x[10];
+  s[11] = x[26];
+  s[12] = x[6];
+  s[13] = x[22];
+  s[14] = x[14];
+  s[15] = x[30];
+
+  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+  s[16] = x[1];
+  s[17] = x[17];
+  s[18] = x[9];
+  s[19] = x[25];
+  s[20] = x[5];
+  s[21] = x[21];
+  s[22] = x[13];
+  s[23] = x[29];
+  s[24] = x[3];
+  s[25] = x[19];
+  s[26] = x[11];
+  s[27] = x[27];
+  s[28] = x[7];
+  s[29] = x[23];
+  s[30] = x[15];
+  s[31] = x[31];
+
+  Dct4Stages<ButterflyRotation_8>(s);
+  Dct8Stages<ButterflyRotation_8>(s);
+  Dct16Stages<ButterflyRotation_8>(s);
+  Dct32Stages<ButterflyRotation_8>(s);
+
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (int idx = 0; idx < 32; idx += 8) {
+      int16x8_t output[8];
+      Transpose8x8(&s[idx], output);
+      for (int i = 0; i < 8; ++i) {
+        output[i] = vqrshlq_s16(output[i], v_row_shift);
+      }
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 32>(dst, step, 0, s);
+  }
+}
+
+// Allow the compiler to call this function instead of force inlining. Tests
+// show the performance is slightly faster.
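+// Note Dct64 loads only 32 coefficients per row/column: AV1 zeroes all
+// coefficients past index 31 for 64-point transforms, which is also why the
+// early stages can use the cheaper *IsZero butterfly variants.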
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[64], x[32];
+
+  if (is_row) {
+    // The last 32 values of every row are always zero if the |tx_width| is
+    // 64.
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
+    }
+  } else {
+    // The last 32 values of every column are always zero if the |tx_height| is
+    // 64.
+    LoadSrc<16, 32>(dst, step, 0, x);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+  s[0] = x[0];
+  s[2] = x[16];
+  s[4] = x[8];
+  s[6] = x[24];
+  s[8] = x[4];
+  s[10] = x[20];
+  s[12] = x[12];
+  s[14] = x[28];
+
+  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+  s[16] = x[2];
+  s[18] = x[18];
+  s[20] = x[10];
+  s[22] = x[26];
+  s[24] = x[6];
+  s[26] = x[22];
+  s[28] = x[14];
+  s[30] = x[30];
+
+  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+  s[32] = x[1];
+  s[34] = x[17];
+  s[36] = x[9];
+  s[38] = x[25];
+  s[40] = x[5];
+  s[42] = x[21];
+  s[44] = x[13];
+  s[46] = x[29];
+
+  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+  s[48] = x[3];
+  s[50] = x[19];
+  s[52] = x[11];
+  s[54] = x[27];
+  s[56] = x[7];
+  s[58] = x[23];
+  s[60] = x[15];
+  s[62] = x[31];
+
+  Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+  Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s);
+
+  //-- start dct 64 stages
+  // stage 2.
+  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+  // stage 4.
+  HadamardRotation(&s[32], &s[33], false);
+  HadamardRotation(&s[34], &s[35], true);
+  HadamardRotation(&s[36], &s[37], false);
+  HadamardRotation(&s[38], &s[39], true);
+  HadamardRotation(&s[40], &s[41], false);
+  HadamardRotation(&s[42], &s[43], true);
+  HadamardRotation(&s[44], &s[45], false);
+  HadamardRotation(&s[46], &s[47], true);
+  HadamardRotation(&s[48], &s[49], false);
+  HadamardRotation(&s[50], &s[51], true);
+  HadamardRotation(&s[52], &s[53], false);
+  HadamardRotation(&s[54], &s[55], true);
+  HadamardRotation(&s[56], &s[57], false);
+  HadamardRotation(&s[58], &s[59], true);
+  HadamardRotation(&s[60], &s[61], false);
+  HadamardRotation(&s[62], &s[63], true);
+
+  // stage 7.
+ ButterflyRotation_8(&s[62], &s[33], 60 - 0, true); + ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 60 - 32, true); + ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true); + ButterflyRotation_8(&s[54], &s[41], 60 - 16, true); + ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 60 - 48, true); + ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true); + + // stage 11. + HadamardRotation(&s[32], &s[35], false); + HadamardRotation(&s[33], &s[34], false); + HadamardRotation(&s[36], &s[39], true); + HadamardRotation(&s[37], &s[38], true); + HadamardRotation(&s[40], &s[43], false); + HadamardRotation(&s[41], &s[42], false); + HadamardRotation(&s[44], &s[47], true); + HadamardRotation(&s[45], &s[46], true); + HadamardRotation(&s[48], &s[51], false); + HadamardRotation(&s[49], &s[50], false); + HadamardRotation(&s[52], &s[55], true); + HadamardRotation(&s[53], &s[54], true); + HadamardRotation(&s[56], &s[59], false); + HadamardRotation(&s[57], &s[58], false); + HadamardRotation(&s[60], &s[63], true); + HadamardRotation(&s[61], &s[62], true); + + // stage 16. + ButterflyRotation_8(&s[61], &s[34], 56, true); + ButterflyRotation_8(&s[60], &s[35], 56, true); + ButterflyRotation_8(&s[59], &s[36], 56 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 56 + 64, true); + ButterflyRotation_8(&s[53], &s[42], 56 - 32, true); + ButterflyRotation_8(&s[52], &s[43], 56 - 32, true); + ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true); + + // stage 21. + HadamardRotation(&s[32], &s[39], false); + HadamardRotation(&s[33], &s[38], false); + HadamardRotation(&s[34], &s[37], false); + HadamardRotation(&s[35], &s[36], false); + HadamardRotation(&s[40], &s[47], true); + HadamardRotation(&s[41], &s[46], true); + HadamardRotation(&s[42], &s[45], true); + HadamardRotation(&s[43], &s[44], true); + HadamardRotation(&s[48], &s[55], false); + HadamardRotation(&s[49], &s[54], false); + HadamardRotation(&s[50], &s[53], false); + HadamardRotation(&s[51], &s[52], false); + HadamardRotation(&s[56], &s[63], true); + HadamardRotation(&s[57], &s[62], true); + HadamardRotation(&s[58], &s[61], true); + HadamardRotation(&s[59], &s[60], true); + + // stage 25. + ButterflyRotation_8(&s[59], &s[36], 48, true); + ButterflyRotation_8(&s[58], &s[37], 48, true); + ButterflyRotation_8(&s[57], &s[38], 48, true); + ButterflyRotation_8(&s[56], &s[39], 48, true); + ButterflyRotation_8(&s[55], &s[40], 112, true); + ButterflyRotation_8(&s[54], &s[41], 112, true); + ButterflyRotation_8(&s[53], &s[42], 112, true); + ButterflyRotation_8(&s[52], &s[43], 112, true); + + // stage 28. + HadamardRotation(&s[32], &s[47], false); + HadamardRotation(&s[33], &s[46], false); + HadamardRotation(&s[34], &s[45], false); + HadamardRotation(&s[35], &s[44], false); + HadamardRotation(&s[36], &s[43], false); + HadamardRotation(&s[37], &s[42], false); + HadamardRotation(&s[38], &s[41], false); + HadamardRotation(&s[39], &s[40], false); + HadamardRotation(&s[48], &s[63], true); + HadamardRotation(&s[49], &s[62], true); + HadamardRotation(&s[50], &s[61], true); + HadamardRotation(&s[51], &s[60], true); + HadamardRotation(&s[52], &s[59], true); + HadamardRotation(&s[53], &s[58], true); + HadamardRotation(&s[54], &s[57], true); + HadamardRotation(&s[55], &s[56], true); + + // stage 30. 
+  ButterflyRotation_8(&s[55], &s[40], 32, true);
+  ButterflyRotation_8(&s[54], &s[41], 32, true);
+  ButterflyRotation_8(&s[53], &s[42], 32, true);
+  ButterflyRotation_8(&s[52], &s[43], 32, true);
+  ButterflyRotation_8(&s[51], &s[44], 32, true);
+  ButterflyRotation_8(&s[50], &s[45], 32, true);
+  ButterflyRotation_8(&s[49], &s[46], 32, true);
+  ButterflyRotation_8(&s[48], &s[47], 32, true);
+
+  // stage 31.
+  for (int i = 0; i < 32; i += 4) {
+    HadamardRotation(&s[i], &s[63 - i], false);
+    HadamardRotation(&s[i + 1], &s[63 - i - 1], false);
+    HadamardRotation(&s[i + 2], &s[63 - i - 2], false);
+    HadamardRotation(&s[i + 3], &s[63 - i - 3], false);
+  }
+  //-- end dct 64 stages
+
+  if (is_row) {
+    const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+    for (int idx = 0; idx < 64; idx += 8) {
+      int16x8_t output[8];
+      Transpose8x8(&s[idx], output);
+      for (int i = 0; i < 8; ++i) {
+        output[i] = vqrshlq_s16(output[i], v_row_shift);
+      }
+      StoreDst<16, 8>(dst, step, idx, output);
+    }
+  } else {
+    StoreDst<16, 64>(dst, step, 0, s);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+template <bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+                                      bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int32x4_t s[8];
+  int16x8_t x[4];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[8];
+      LoadSrc<8, 8>(dst, step, 0, input);
+      Transpose4x8To8x4(input, x);
+    } else {
+      LoadSrc<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    LoadSrc<8, 4>(dst, step, 0, x);
+    if (transpose) {
+      Transpose4x4(x, x);
+    }
+  }
+
+  // stage 1.
+  s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]);
+  s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]);
+
+  // stage 2.
+  const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2]));
+  const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3]));
+
+  // stage 3.
+  s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]);
+  s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]);
+  // s[0] = s[0] + s[3]
+  s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]);
+  // s[1] = s[1] - s[4]
+  s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]);
+
+  s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]);
+  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+  // stage 4.
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[6]);
+
+  // stages 5 and 6.
+  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+  const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+  const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+  const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
+  const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+  x[0] = vcombine_s16(dst_0, dst_0);
+  x[1] = vcombine_s16(dst_1, dst_1);
+  x[2] = vcombine_s16(dst_2, dst_2);
+  x[3] = vcombine_s16(dst_3, dst_3);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[8];
+      Transpose8x4To4x8(x, output);
+      StoreDst<8, 8>(dst, step, 0, output);
+    } else {
+      StoreDst<16, 4>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      Transpose4x4(x, x);
+    }
+    StoreDst<8, 4>(dst, step, 0, x);
+  }
+}
+
+alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+                                                          2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int32x4_t s[2];
+
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
+  const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
+  const int16x4_t v_src_round =
+      vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
+  const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
+  const int16x4_t kAdst4DcOnlyMultipliers = vld1_s16(kAdst4DcOnlyMultiplier);
+  s[1] = vdupq_n_s32(0);
+
+  // s0*k0 s0*k1 s0*k2 s0*k1
+  s[0] = vmull_s16(kAdst4DcOnlyMultipliers, v_src);
+  // 0 0 0 s0*k0
+  s[1] = vextq_s32(s[1], s[0], 1);
+
+  const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+  const int16x4_t dst_0 = vqrshrn_n_s32(x3, 12);
+
+  // vqrshlq_s16 will shift right if shift value is negative.
+  vst1_s16(dst, vqrshl_s16(dst_0, vdup_n_s16(-row_shift)));
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int32x4_t s[4];
+
+  int i = 0;
+  do {
+    const int16x4_t v_src = vld1_s16(&dst[i]);
+
+    s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
+    s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
+    s[2] = vmull_n_s16(v_src, kAdst4Multiplier[2]);
+
+    const int32x4_t x0 = s[0];
+    const int32x4_t x1 = s[1];
+    const int32x4_t x2 = s[2];
+    const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+    const int16x4_t dst_0 = vqrshrn_n_s32(x0, 12);
+    const int16x4_t dst_1 = vqrshrn_n_s32(x1, 12);
+    const int16x4_t dst_2 = vqrshrn_n_s32(x2, 12);
+    const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
+
+    vst1_s16(&dst[i], dst_0);
+    vst1_s16(&dst[i + width * 1], dst_1);
+    vst1_s16(&dst[i + width * 2], dst_2);
+    vst1_s16(&dst[i + width * 3], dst_3);
+
+    i += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+                                      bool transpose) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8], x[8];
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+    } else {
+      LoadSrc<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      LoadSrc<16, 8>(dst, step, 0, x);
+      dsp::Transpose8x8(x);
+    } else {
+      LoadSrc<16, 8>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[7];
+  s[1] = x[0];
+  s[2] = x[5];
+  s[3] = x[2];
+  s[4] = x[3];
+  s[5] = x[4];
+  s[6] = x[1];
+  s[7] = x[6];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[4], false);
+  HadamardRotation(&s[1], &s[5], false);
+  HadamardRotation(&s[2], &s[6], false);
+  HadamardRotation(&s[3], &s[7], false);
+
+  // stage 4.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[2], false);
+  HadamardRotation(&s[4], &s[6], false);
+  HadamardRotation(&s[1], &s[3], false);
+  HadamardRotation(&s[5], &s[7], false);
+
+  // stage 6.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s16(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s16(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s16(s[1]);
+
+  if (stage_is_rectangular) {
+    if (transpose) {
+      int16x8_t output[4];
+      Transpose4x8To8x4(x, output);
+      StoreDst<16, 4>(dst, step, 0, output);
+    } else {
+      StoreDst<8, 8>(dst, step, 0, x);
+    }
+  } else {
+    if (transpose) {
+      dsp::Transpose8x8(x);
+      StoreDst<16, 8>(dst, step, 0, x);
+    } else {
+      StoreDst<16, 8>(dst, step, 0, x);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8];
+
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  // stage 1.
+  s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+  // stage 3.
+  s[4] = s[0];
+  s[5] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+  // stage 5.
+  s[2] = s[0];
+  s[3] = s[1];
+  s[6] = s[4];
+  s[7] = s[5];
+
+  // stage 6.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  int16x8_t x[8];
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s16(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s16(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s16(s[1]);
+
+  for (int i = 0; i < 8; ++i) {
+    // vqrshlq_s16 will shift right if shift value is negative.
+    x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+    vst1q_lane_s16(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int16x8_t s[8];
+
+  int i = 0;
+  do {
+    const int16x8_t v_src = vld1q_s16(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    // stage 2.
+    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+    // stage 3.
+    s[4] = s[0];
+    s[5] = s[1];
+
+    // stage 4.
+    ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+    // stage 5.
+    s[2] = s[0];
+    s[3] = s[1];
+    s[6] = s[4];
+    s[7] = s[5];
+
+    // stage 6.
+    ButterflyRotation_4(&s[2], &s[3], 32, true);
+    ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+    // stage 7.
+    int16x8_t x[8];
+    x[0] = s[0];
+    x[1] = vqnegq_s16(s[4]);
+    x[2] = s[6];
+    x[3] = vqnegq_s16(s[2]);
+    x[4] = s[3];
+    x[5] = vqnegq_s16(s[7]);
+    x[6] = s[5];
+    x[7] = vqnegq_s16(s[1]);
+
+    for (int j = 0; j < 8; ++j) {
+      vst1_s16(&dst[j * width], vget_low_s16(x[j]));
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+                                       int row_shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16], x[16];
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      int16x8_t input[4];
+      LoadSrc<16, 4>(dst, step, 0, input);
+      Transpose8x4To4x8(input, x);
+      LoadSrc<16, 4>(dst, step, 8, input);
+      Transpose8x4To4x8(input, &x[8]);
+    } else {
+      LoadSrc<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (is_row) {
+      for (int idx = 0; idx < 16; idx += 8) {
+        LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+        dsp::Transpose8x8(&x[idx]);
+      }
+    } else {
+      LoadSrc<16, 16>(dst, step, 0, x);
+    }
+  }
+
+  // stage 1.
+  s[0] = x[15];
+  s[1] = x[0];
+  s[2] = x[13];
+  s[3] = x[2];
+  s[4] = x[11];
+  s[5] = x[4];
+  s[6] = x[9];
+  s[7] = x[6];
+  s[8] = x[7];
+  s[9] = x[8];
+  s[10] = x[5];
+  s[11] = x[10];
+  s[12] = x[3];
+  s[13] = x[12];
+  s[14] = x[1];
+  s[15] = x[14];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[8], false);
+  HadamardRotation(&s[1], &s[9], false);
+  HadamardRotation(&s[2], &s[10], false);
+  HadamardRotation(&s[3], &s[11], false);
+  HadamardRotation(&s[4], &s[12], false);
+  HadamardRotation(&s[5], &s[13], false);
+  HadamardRotation(&s[6], &s[14], false);
+  HadamardRotation(&s[7], &s[15], false);
+
+  // stage 4.
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[4], false);
+  HadamardRotation(&s[8], &s[12], false);
+  HadamardRotation(&s[1], &s[5], false);
+  HadamardRotation(&s[9], &s[13], false);
+  HadamardRotation(&s[2], &s[6], false);
+  HadamardRotation(&s[10], &s[14], false);
+  HadamardRotation(&s[3], &s[7], false);
+  HadamardRotation(&s[11], &s[15], false);
+
+  // stage 6.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+  // stage 7.
+  HadamardRotation(&s[0], &s[2], false);
+  HadamardRotation(&s[4], &s[6], false);
+  HadamardRotation(&s[8], &s[10], false);
+  HadamardRotation(&s[12], &s[14], false);
+  HadamardRotation(&s[1], &s[3], false);
+  HadamardRotation(&s[5], &s[7], false);
+  HadamardRotation(&s[9], &s[11], false);
+  HadamardRotation(&s[13], &s[15], false);
+
+  // stage 8.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s16(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s16(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s16(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s16(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s16(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s16(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s16(s[1]);
+
+  if (stage_is_rectangular) {
+    if (is_row) {
+      const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+      int16x8_t output[4];
+      Transpose4x8To8x4(x, output);
+      for (int i = 0; i < 4; ++i) {
+        output[i] = vqrshlq_s16(output[i], v_row_shift);
+      }
+      StoreDst<16, 4>(dst, step, 0, output);
+      Transpose4x8To8x4(&x[8], output);
+      for (int i = 0; i < 4; ++i) {
+        output[i] = vqrshlq_s16(output[i], v_row_shift);
+      }
+      StoreDst<16, 4>(dst, step, 8, output);
+    } else {
+      StoreDst<8, 16>(dst, step, 0, x);
+    }
+  } else {
+    if (is_row) {
+      const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
+      for (int idx = 0; idx < 16; idx += 8) {
+        int16x8_t output[8];
+        Transpose8x8(&x[idx], output);
+        for (int i = 0; i < 8; ++i) {
+          output[i] = vqrshlq_s16(output[i], v_row_shift);
+        }
+        StoreDst<16, 8>(dst, step, idx, output);
+      }
+    } else {
+      StoreDst<16, 16>(dst, step, 0, x);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int16x8_t* s, int16x8_t* x) {
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+  // stage 3.
+  s[8] = s[0];
+  s[9] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+  // stage 5.
+  s[4] = s[0];
+  s[12] = s[8];
+  s[5] = s[1];
+  s[13] = s[9];
+
+  // stage 6.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+  ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+  // stage 7.
+  s[2] = s[0];
+  s[6] = s[4];
+  s[10] = s[8];
+  s[14] = s[12];
+  s[3] = s[1];
+  s[7] = s[5];
+  s[11] = s[9];
+  s[15] = s[13];
+
+  // stage 8.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+  ButterflyRotation_4(&s[10], &s[11], 32, true);
+  ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s16(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s16(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s16(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s16(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s16(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s16(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s16(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s16(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+                                        bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int16x8_t s[16];
+  int16x8_t x[16];
+
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
+  const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
+  const int16x8_t v_src_round =
+      vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
+  // stage 1.
+  s[1] = vbslq_s16(v_mask, v_src_round, v_src);
+
+  Adst16DcOnlyInternal(s, x);
+
+  for (int i = 0; i < 16; ++i) {
+    // vqrshlq_s16 will shift right if shift value is negative.
+    x[i] = vqrshlq_s16(x[i], vdupq_n_s16(-row_shift));
+    vst1q_lane_s16(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
+                                              int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  int i = 0;
+  do {
+    int16x8_t s[16];
+    int16x8_t x[16];
+    const int16x8_t v_src = vld1q_s16(dst);
+    // stage 1.
+ s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + vst1_s16(&dst[j * width], vget_low_s16(x[j])); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms. + +template +LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + if (is_row_shift) { + const int shift = 1; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + for (int i = 0; i < 4; i += 2) { + const int16x8_t v_src = vld1q_s16(&dst[i * step]); + const int32x4_t v_src_mult_lo = + vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier); + const int32x4_t v_src_mult_hi = + vmlal_s16(v_dual_round, vget_high_s16(v_src), v_multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift); + vst1q_s16(&dst[i * step], + vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi))); + } + } else { + for (int i = 0; i < 4; i += 2) { + const int16x8_t v_src = vld1q_s16(&dst[i * step]); + const int16x8_t a = + vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3); + const int16x8_t b = vqaddq_s16(v_src, a); + vst1q_s16(&dst[i * step], b); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0); + const int16x4_t v_src_round = + vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0); + const int shift = tx_height < 16 ? 
0 : 1; + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1_lane_s16(dst, vqmovn_s32(dst_0), 0); + return true; +} + +template +LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int16_t* source) { + const int stride = frame.columns(); + uint8_t* dst = frame[start_y] + start_x; + + if (identity_size < 32) { + if (tx_width == 4) { + uint8x8_t frame_data = vdup_n_u8(0); + int i = 0; + do { + const int16x4_t v_src = vld1_s16(&source[i * tx_width]); + + int16x4_t v_dst_i; + if (identity_size == 4) { + const int16x4_t v_src_fraction = + vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3); + v_dst_i = vqadd_s16(v_src, v_src_fraction); + } else if (identity_size == 8) { + v_dst_i = vqadd_s16(v_src, v_src); + } else { // identity_size == 16 + const int16x4_t v_src_mult = + vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 4); + const int16x4_t v_srcx2 = vqadd_s16(v_src, v_src); + v_dst_i = vqadd_s16(v_srcx2, v_src_mult); + } + + frame_data = Load4<0>(dst, frame_data); + const int16x4_t a = vrshr_n_s16(v_dst_i, 4); + const uint16x8_t b = + vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + StoreLo4(dst, d); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const int16x8_t v_src = vld1q_s16(&source[row + j]); + + int16x8_t v_dst_i; + if (identity_size == 4) { + const int16x8_t v_src_fraction = + vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3); + v_dst_i = vqaddq_s16(v_src, v_src_fraction); + } else if (identity_size == 8) { + v_dst_i = vqaddq_s16(v_src, v_src); + } else { // identity_size == 16 + const int16x8_t v_src_mult = + vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 4); + const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src); + v_dst_i = vqaddq_s16(v_src_mult, v_srcx2); + } + + const uint8x8_t frame_data = vld1_u8(dst + j); + const int16x8_t a = vrshrq_n_s16(v_dst_i, 4); + const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + vst1_u8(dst + j, d); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const int16x8_t v_dst_i = vld1q_s16(&source[row + j]); + const uint8x8_t frame_data = vld1_u8(dst + j); + const int16x8_t a = vrshrq_n_s16(v_dst_i, 2); + const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + vst1_u8(dst + j, d); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int16_t* source) { + const int stride = frame.columns(); + uint8_t* dst = frame[start_y] + start_x; + + if (tx_width == 4) { + uint8x8_t frame_data = vdup_n_u8(0); + int i = 0; + do { + const int16x4_t v_src = vld1_s16(&source[i * tx_width]); + const int16x4_t v_src_mult = + 
vqrdmulh_n_s16(v_src, kIdentity4MultiplierFraction << 3); + const int16x4_t v_dst_row = vqadd_s16(v_src, v_src_mult); + const int16x4_t v_src_mult2 = + vqrdmulh_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3); + const int16x4_t v_dst_col = vqadd_s16(v_dst_row, v_src_mult2); + frame_data = Load4<0>(dst, frame_data); + const int16x4_t a = vrshr_n_s16(v_dst_col, 4); + const uint16x8_t b = + vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + StoreLo4(dst, d); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const int16x8_t v_src = vld1q_s16(&source[row + j]); + const int16x8_t v_src_round = + vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3); + const int16x8_t v_dst_row = vqaddq_s16(v_src_round, v_src_round); + const int16x8_t v_src_mult2 = + vqrdmulhq_n_s16(v_dst_row, kIdentity4MultiplierFraction << 3); + const int16x8_t v_dst_col = vqaddq_s16(v_dst_row, v_src_mult2); + const uint8x8_t frame_data = vld1_u8(dst + j); + const int16x8_t a = vrshrq_n_s16(v_dst_col, 4); + const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + vst1_u8(dst + j, d); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height equal to 32 can be simplified from + // ((A * 2) + 2) >> 2) to ((A + 1) >> 1). + for (int i = 0; i < 4; ++i) { + const int16x8_t v_src = vld1q_s16(&dst[i * step]); + const int16x8_t a = vrshrq_n_s16(v_src, 1); + vst1q_s16(&dst[i * step], a); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) { + auto* const dst = static_cast(dest); + + for (int i = 0; i < 4; ++i) { + const int16x8_t v_src = vld1q_s16(&dst[i * step]); + // For bitdepth == 8, the identity row clamps to a signed 16bit value, so + // saturating add here is ok. + const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src); + vst1q_s16(&dst[i * step], v_srcx2); + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const uint16x4_t v_mask = vdup_n_u16(should_round ? 
0xffff : 0); + const int16x4_t v_src_round = + vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0); + const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src); + const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift)); + vst1_lane_s16(dst, vqmovn_s32(dst_0), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step, + int shift) { + auto* const dst = static_cast(dest); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]); + const int32x4_t v_src_mult_lo = + vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier); + const int32x4_t v_src_mult_hi = vmlal_n_s16( + v_dual_round, vget_high_s16(v_src), kIdentity16Multiplier); + const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift); + const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift); + vst1q_s16(&dst[i * step + j * 8], + vcombine_s16(vqmovn_s32(shift_lo), vqmovn_s32(shift_hi))); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0); + const int16x4_t v_src_round = + vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0); + const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11); + const int16x4_t v_multiplier = vdup_n_s16(kIdentity16Multiplier); + const int32x4_t v_shift = vdupq_n_s32(-(12 + shift)); + const int32x4_t v_src_mult_lo = + vmlal_s16(v_dual_round, (v_src), v_multiplier); + const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift); + vst1_lane_s16(dst, vqmovn_s32(dst_0), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest, + const int32_t step) { + auto* const dst = static_cast(dest); + + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 32; j += 8) { + const int16x8_t v_src = vld1q_s16(&dst[i * step + j]); + // For bitdepth == 8, the identity row clamps to a signed 16bit value, so + // saturating add here is ok. + const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src); + vst1q_s16(&dst[i * step + j], v_dst_i); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, + int adjusted_tx_height) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast(dest); + const int16x4_t v_src0 = vdup_n_s16(dst[0]); + const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3); + // When combining the identity32 multiplier with the row shift, the + // calculation for tx_height equal to 16 can be simplified from + // ((A * 4) + 1) >> 1) to (A * 2). + const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src); + vst1_lane_s16(dst, v_dst_0, 0); + return true; +} + +//------------------------------------------------------------------------------ +// Walsh Hadamard Transform. + +// Transposes a 4x4 matrix and then permutes the rows of the transposed matrix +// for the WHT. The input matrix is in two "wide" int16x8_t variables. 
The +// output matrix is in four int16x4_t variables. +// +// Input: +// in[0]: 00 01 02 03 10 11 12 13 +// in[1]: 20 21 22 23 30 31 32 33 +// Output: +// out[0]: 00 10 20 30 +// out[1]: 03 13 23 33 +// out[2]: 01 11 21 31 +// out[3]: 02 12 22 32 +LIBGAV1_ALWAYS_INLINE void TransposeAndPermute4x4WideInput( + const int16x8_t in[2], int16x4_t out[4]) { + // Swap 32 bit elements. Goes from: + // in[0]: 00 01 02 03 10 11 12 13 + // in[1]: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1])); + + // Swap 16 bit elements. Goes from: + // vget_low_s32(b0.val[0]): 00 01 20 21 + // vget_high_s32(b0.val[0]): 10 11 30 31 + // vget_low_s32(b0.val[1]): 02 03 22 23 + // vget_high_s32(b0.val[1]): 12 13 32 33 + // to: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 01 11 21 32 + // c1.val[0]: 02 12 22 32 + // c1.val[1]: 03 13 23 33 + + const int16x4x2_t c0 = + vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])), + vreinterpret_s16_s32(vget_high_s32(b0.val[0]))); + const int16x4x2_t c1 = + vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])), + vreinterpret_s16_s32(vget_high_s32(b0.val[1]))); + + out[0] = c0.val[0]; + out[1] = c1.val[1]; + out[2] = c0.val[1]; + out[3] = c1.val[0]; +} + +// Process 4 wht4 rows and columns. +LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* dst, const int dst_stride, + const void* source, + const int adjusted_tx_height) { + const auto* const src = static_cast(source); + int16x4_t s[4]; + + if (adjusted_tx_height == 1) { + // Special case: only src[0] is nonzero. + // src[0] 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // 0 0 0 0 + // + // After the row and column transforms are applied, we have: + // f h h h + // g i i i + // g i i i + // g i i i + // where f, g, h, i are computed as follows. + int16_t f = (src[0] >> 2) - (src[0] >> 3); + const int16_t g = f >> 1; + f = f - (f >> 1); + const int16_t h = (src[0] >> 3) - (src[0] >> 4); + const int16_t i = (src[0] >> 4); + s[0] = vdup_n_s16(h); + s[0] = vset_lane_s16(f, s[0], 0); + s[1] = vdup_n_s16(i); + s[1] = vset_lane_s16(g, s[1], 0); + s[2] = s[3] = s[1]; + } else { + // Load the 4x4 source in transposed form. + int16x4x4_t columns = vld4_s16(src); + // Shift right and permute the columns for the WHT. + s[0] = vshr_n_s16(columns.val[0], 2); + s[2] = vshr_n_s16(columns.val[1], 2); + s[3] = vshr_n_s16(columns.val[2], 2); + s[1] = vshr_n_s16(columns.val[3], 2); + + // Row transforms. + s[0] = vadd_s16(s[0], s[2]); + s[3] = vsub_s16(s[3], s[1]); + int16x4_t e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1 + s[1] = vsub_s16(e, s[1]); + s[2] = vsub_s16(e, s[2]); + s[0] = vsub_s16(s[0], s[1]); + s[3] = vadd_s16(s[3], s[2]); + + int16x8_t x[2]; + x[0] = vcombine_s16(s[0], s[1]); + x[1] = vcombine_s16(s[2], s[3]); + TransposeAndPermute4x4WideInput(x, s); + + // Column transforms. + s[0] = vadd_s16(s[0], s[2]); + s[3] = vsub_s16(s[3], s[1]); + e = vhsub_s16(s[0], s[3]); // e = (s[0] - s[3]) >> 1 + s[1] = vsub_s16(e, s[1]); + s[2] = vsub_s16(e, s[2]); + s[0] = vsub_s16(s[0], s[1]); + s[3] = vadd_s16(s[3], s[2]); + } + + // Store to frame. 
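+  // Illustrative aside (not from the upstream sources): the f/g/h/i closed
+  // form in the DC-only special case above falls out of running the scalar
+  // WHT twice. With a = src[0] >> 2 and e = a >> 1, the row pass leaves row 0
+  // as {a - e, e, e, e}; the column pass then maps a column whose top value
+  // is v to {v - (v >> 1), v >> 1, v >> 1, v >> 1}, giving
+  //   f = (a - e) - ((a - e) >> 1);  g = (a - e) >> 1;
+  //   h = e - (e >> 1);              i = e >> 1;
+  // which matches the code since (src[0] >> 2) - (src[0] >> 3) == a - e.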
+ uint8x8_t frame_data = vdup_n_u8(0); + for (int row = 0; row < 4; row += 2) { + frame_data = Load4<0>(dst, frame_data); + frame_data = Load4<1>(dst + dst_stride, frame_data); + const int16x8_t residual = vcombine_s16(s[row], s[row + 1]); + const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(residual), frame_data); + frame_data = vqmovun_s16(vreinterpretq_s16_u16(b)); + StoreLo4(dst, frame_data); + dst += dst_stride; + StoreHi4(dst, frame_data); + dst += dst_stride; + } +} + +//------------------------------------------------------------------------------ +// row/column transform loops + +template +LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) { + if (tx_width >= 16) { + int i = 0; + do { + const int16x8_t a = vld1q_s16(&source[i]); + const int16x8_t b = vld1q_s16(&source[i + 8]); + const int16x8_t c = vrev64q_s16(a); + const int16x8_t d = vrev64q_s16(b); + vst1q_s16(&source[i], vcombine_s16(vget_high_s16(d), vget_low_s16(d))); + vst1q_s16(&source[i + 8], + vcombine_s16(vget_high_s16(c), vget_low_s16(c))); + i += 16; + } while (i < tx_width * tx_height); + } else if (tx_width == 8) { + for (int i = 0; i < 8 * tx_height; i += 8) { + const int16x8_t a = vld1q_s16(&source[i]); + const int16x8_t b = vrev64q_s16(a); + vst1q_s16(&source[i], vcombine_s16(vget_high_s16(b), vget_low_s16(b))); + } + } else { + // Process two rows per iteration. + for (int i = 0; i < 4 * tx_height; i += 8) { + const int16x8_t a = vld1q_s16(&source[i]); + vst1q_s16(&source[i], vrev64q_s16(a)); + } + } +} + +template +LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) { + if (tx_width == 4) { + // Process two rows per iteration. + int i = 0; + do { + const int16x8_t a = vld1q_s16(&source[i]); + const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3); + vst1q_s16(&source[i], b); + i += 8; + } while (i < tx_width * num_rows); + } else { + int i = 0; + do { + // The last 32 values of every row are always zero if the |tx_width| is + // 64. + const int non_zero_width = (tx_width < 64) ? tx_width : 32; + int j = 0; + do { + const int16x8_t a = vld1q_s16(&source[i * tx_width + j]); + const int16x8_t b = vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3); + vst1q_s16(&source[i * tx_width + j], b); + j += 8; + } while (j < non_zero_width); + } while (++i < num_rows); + } +} + +template +LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows, + int row_shift) { + // vqrshlq_s16 will shift right if shift value is negative. + row_shift = -row_shift; + + if (tx_width == 4) { + // Process two rows per iteration. + int i = 0; + do { + const int16x8_t residual = vld1q_s16(&source[i]); + vst1q_s16(&source[i], vqrshlq_s16(residual, vdupq_n_s16(row_shift))); + i += 8; + } while (i < tx_width * num_rows); + } else { + int i = 0; + do { + for (int j = 0; j < tx_width; j += 8) { + const int16x8_t residual = vld1q_s16(&source[i * tx_width + j]); + const int16x8_t residual_shifted = + vqrshlq_s16(residual, vdupq_n_s16(row_shift)); + vst1q_s16(&source[i * tx_width + j], residual_shifted); + } + } while (++i < num_rows); + } +} + +template +LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int16_t* source, TransformType tx_type) { + const bool flip_rows = + enable_flip_rows ? 
kTransformFlipRowsMask.Contains(tx_type) : false; + const int stride = frame.columns(); + uint8_t* dst = frame[start_y] + start_x; + + // Enable for 4x4, 4x8, 4x16 + if (tx_height < 32 && tx_width == 4) { + uint8x8_t frame_data = vdup_n_u8(0); + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4; + const int16x4_t residual = vld1_s16(&source[row]); + frame_data = Load4<0>(dst, frame_data); + const int16x4_t a = vrshr_n_s16(residual, 4); + const uint16x8_t b = + vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(a, a)), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + StoreLo4(dst, d); + dst += stride; + } + // Enable for 8x4, 8x8, 8x16, 8x32 + } else if (tx_height < 64 && tx_width == 8) { + for (int i = 0; i < tx_height; ++i) { + const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8; + const int16x8_t residual = vld1q_s16(&source[row]); + const uint8x8_t frame_data = vld1_u8(dst); + const int16x8_t a = vrshrq_n_s16(residual, 4); + const uint16x8_t b = vaddw_u8(vreinterpretq_u16_s16(a), frame_data); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(b)); + vst1_u8(dst, d); + dst += stride; + } + // Remaining widths >= 16. + } else { + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width; + int j = 0; + do { + const int x = start_x + j; + const int16x8_t residual = vld1q_s16(&source[row + j]); + const int16x8_t residual_hi = vld1q_s16(&source[row + j + 8]); + const uint8x16_t frame_data = vld1q_u8(frame[y] + x); + const int16x8_t a = vrshrq_n_s16(residual, 4); + const int16x8_t a_hi = vrshrq_n_s16(residual_hi, 4); + const uint16x8_t b = + vaddw_u8(vreinterpretq_u16_s16(a), vget_low_u8(frame_data)); + const uint16x8_t b_hi = + vaddw_u8(vreinterpretq_u16_s16(a_hi), vget_high_u8(frame_data)); + vst1q_u8(frame[y] + x, + vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(b)), + vqmovun_s16(vreinterpretq_s16_u16(b_hi)))); + j += 16; + } while (j < tx_width); + } + } +} + +void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + const int row_shift = (tx_height == 16); + + if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d dct4 rows in parallel. + Dct4_NEON(src, /*step=*/4, /*transpose=*/true); + } else { + // Process 8 1d dct4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Dct4_NEON(data, /*step=*/4, + /*transpose=*/true); + data += 32; + i -= 8; + } while (i != 0); + } + if (tx_height == 16) { + RowShift<4>(src, adjusted_tx_height, 1); + } +} + +void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct4 columns in parallel. 
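+      // Illustrative aside (not from the upstream sources): every column
+      // pass in this section finishes with the equivalent of the scalar
+      //   frame[y][x] =
+      //       Clip3(frame[y][x] + RightShiftWithRounding(residual, 4), 0, 255);
+      // where vrshr(q)_n_s16 supplies the rounding shift and the saturating
+      // narrow vqmovun_s16 supplies the clamp.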
+ Dct4_NEON(src, tx_width, /*transpose=*/false); + } else { + // Process 8 1d dct4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct4_NEON(data, tx_width, + /*transpose=*/false); + data += 8; + i -= 8; + } while (i != 0); + } + } + + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d dct8 rows in parallel. + Dct8_NEON(src, /*step=*/8, /*transpose=*/true); + } else { + // Process 8 1d dct8 rows in parallel per iteration. + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + Dct8_NEON(data, /*step=*/8, + /*transpose=*/true); + data += 64; + i -= 8; + } while (i != 0); + } + if (row_shift > 0) { + RowShift<8>(src, adjusted_tx_height, row_shift); + } +} + +void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct8 columns in parallel. + Dct8_NEON(src, 4, /*transpose=*/false); + } else { + // Process 8 1d dct8 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct8_NEON(data, tx_width, + /*transpose=*/false); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d dct16 rows in parallel. + Dct16_NEON(src, 16, /*is_row=*/true, row_shift); + } else { + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + do { + // Process 8 1d dct16 rows in parallel per iteration. 
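+    // Illustrative aside (not from the upstream sources): ApplyRounding()
+    // rescales each input by ~1/sqrt(2) with a rounding doubling multiply:
+    //   vqrdmulhq_n_s16(a, kTransformRowMultiplier << 3)
+    //       == RightShiftWithRounding(a * (kTransformRowMultiplier << 3), 15)
+    // which, assuming kTransformRowMultiplier == 2896 (~4096 / sqrt(2)), is
+    // approximately a * 2896 / 4096.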
+ Dct16_NEON(src, 16, /*is_row=*/true, + row_shift); + src += 128; + i -= 8; + } while (i != 0); + } +} + +void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d dct16 columns in parallel. + Dct16_NEON(src, 4, /*is_row=*/false, + /*row_shift=*/0); + } else { + int i = tx_width; + auto* data = src; + do { + // Process 8 1d dct16 columns in parallel per iteration. + Dct16_NEON(data, tx_width, /*is_row=*/false, + /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<32>(src, adjusted_tx_height); + } + // Process 8 1d dct32 rows in parallel per iteration. + int i = 0; + do { + Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift); + i += 8; + } while (i < adjusted_tx_height); +} + +void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) { + // Process 8 1d dct32 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<64>(src, adjusted_tx_height); + } + // Process 8 1d dct64 rows in parallel per iteration. + int i = 0; + do { + Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift); + i += 8; + } while (i < adjusted_tx_height); +} + +void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) { + // Process 8 1d dct64 columns in parallel per iteration. 
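+    // Illustrative aside (not from the upstream sources): RowShift() uses
+    // vqrshlq_s16 with a negated count, i.e. a saturating rounding right
+    // shift, so each coefficient becomes the scalar
+    //   x = (x + (1 << (row_shift - 1))) >> row_shift;
+    // saturated to the int16_t range.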
+ int i = tx_width; + auto* data = src; + do { + Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type); +} + +void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const int row_shift = static_cast(tx_height == 16); + const bool should_round = (tx_height == 8); + + if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + + // Process 4 1d adst4 rows in parallel per iteration. + int i = adjusted_tx_height; + auto* data = src; + do { + Adst4_NEON(data, /*step=*/4, /*transpose=*/true); + data += 16; + i -= 4; + } while (i != 0); + + if (tx_height == 16) { + RowShift<4>(src, adjusted_tx_height, 1); + } +} + +void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + // Process 4 1d adst4 columns in parallel per iteration. + int i = tx_width; + auto* data = src; + do { + Adst4_NEON(data, tx_width, /*transpose=*/false); + data += 4; + i -= 4; + } while (i != 0); + } + + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d adst8 rows in parallel. + Adst8_NEON(src, /*step=*/8, /*transpose=*/true); + } else { + // Process 8 1d adst8 rows in parallel per iteration. + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + auto* data = src; + do { + Adst8_NEON(data, /*step=*/8, + /*transpose=*/true); + data += 64; + i -= 8; + } while (i != 0); + } + if (row_shift > 0) { + RowShift<8>(src, adjusted_tx_height, row_shift); + } +} + +void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + + if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d adst8 columns in parallel. + Adst8_NEON(src, 4, /*transpose=*/false); + } else { + // Process 8 1d adst8 columns in parallel per iteration. 
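+      // Illustrative aside (not from the upstream sources): the
+      // FlipColumns<8>() call above reverses the 8 lanes of each row in two
+      // steps, since NEON lacks a single 128-bit lane reverse:
+      //   {0 1 2 3 4 5 6 7} -> vrev64q_s16 -> {3 2 1 0 7 6 5 4}
+      //                     -> swap halves -> {7 6 5 4 3 2 1 0}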
+ int i = tx_width; + auto* data = src; + do { + Adst8_NEON(data, tx_width, + /*transpose=*/false); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, int adjusted_tx_height, + void* src_buffer, int /*start_x*/, + int /*start_y*/, void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + + if (adjusted_tx_height == 4) { + // Process 4 1d adst16 rows in parallel. + Adst16_NEON(src, 16, /*is_row=*/true, row_shift); + } else { + assert(adjusted_tx_height % 8 == 0); + int i = adjusted_tx_height; + do { + // Process 8 1d adst16 rows in parallel per iteration. + Adst16_NEON(src, 16, /*is_row=*/true, + row_shift); + src += 128; + i -= 8; + } while (i != 0); + } +} + +void Adst16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + + if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) { + if (tx_width == 4) { + // Process 4 1d adst16 columns in parallel. + Adst16_NEON(src, 4, /*is_row=*/false, + /*row_shift=*/0); + } else { + int i = tx_width; + auto* data = src; + do { + // Process 8 1d adst16 columns in parallel per iteration. + Adst16_NEON( + data, tx_width, /*is_row=*/false, /*row_shift=*/0); + data += 8; + i -= 8; + } while (i != 0); + } + } + auto& frame = *static_cast*>(dst_frame); + StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y, + tx_width, src, tx_type); +} + +void Identity4TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize4x4) { + return; + } + + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = (tx_height == 8); + + if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) { + return; + } + + if (should_round) { + ApplyRounding<4>(src, adjusted_tx_height); + } + if (tx_height < 16) { + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4); + src += 16; + i -= 4; + } while (i != 0); + } else { + int i = adjusted_tx_height; + do { + Identity4_NEON(src, /*step=*/4); + src += 16; + i -= 4; + } while (i != 0); + } +} + +void Identity4TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto& frame = *static_cast*>(dst_frame); + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + // Special case: Process row calculations during column transform call. 
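+  // Illustrative aside (not from the upstream sources): the identity
+  // transforms only rescale: identity4 by sqrt(2), identity8 by 2,
+  // identity16 by 2 * sqrt(2), identity32 by 4. The fractional part comes
+  // from kIdentity4MultiplierFraction (assumed 1697 ~= (sqrt(2) - 1) * 4096):
+  //   x * sqrt(2) ~= x + RightShiftWithRounding(x * (1697 << 3), 15)
+  //               == x + (x * 1697 + 2048) / 4096  (for x >= 0).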
+ if (tx_type == kTransformTypeIdentityIdentity && + (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) { + Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); + return; + } + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<4>(src, tx_width); + } + + IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity8TransformLoopRow_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + // Special case: Process row calculations during column transform call. + // Improves performance. + if (tx_type == kTransformTypeIdentityIdentity && + tx_size == kTransformSize8x4) { + return; + } + + auto* src = static_cast(src_buffer); + const int tx_height = kTransformHeight[tx_size]; + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<8>(src, adjusted_tx_height); + } + + // When combining the identity8 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 16 can be simplified + // from ((A * 2) + 1) >> 1) to A. + if ((tx_height & 0x18) != 0) { + return; + } + if (tx_height == 32) { + int i = adjusted_tx_height; + do { + Identity8Row32_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); + return; + } + + assert(tx_size == kTransformSize8x4); + int i = adjusted_tx_height; + do { + Identity8Row4_NEON(src, /*step=*/8); + src += 32; + i -= 4; + } while (i != 0); +} + +void Identity8TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, + void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<8>(src, tx_width); + } + auto& frame = *static_cast*>(dst_frame); + IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + auto* src = static_cast(src_buffer); + const bool should_round = kShouldRound[tx_size]; + const uint8_t row_shift = kTransformRowShift[tx_size]; + + if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) { + return; + } + + if (should_round) { + ApplyRounding<16>(src, adjusted_tx_height); + } + int i = adjusted_tx_height; + do { + Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]); + src += 64; + i -= 4; + } while (i != 0); +} + +void Identity16TransformLoopColumn_NEON(TransformType tx_type, + TransformSize tx_size, + int adjusted_tx_height, + void* src_buffer, int start_x, + int start_y, void* dst_frame) { + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + if (kTransformFlipColumnsMask.Contains(tx_type)) { + FlipColumns<16>(src, tx_width); + } + auto& frame = *static_cast*>(dst_frame); + IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int 
/*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + const int tx_height = kTransformHeight[tx_size]; + + // When combining the identity32 multiplier with the row shift, the + // calculations for tx_height == 8 and tx_height == 32 can be simplified + // from ((A * 4) + 2) >> 2) to A. + if ((tx_height & 0x28) != 0) { + return; + } + + // Process kTransformSize32x16. The src is always rounded before the + // identity transform and shifted by 1 afterwards. + auto* src = static_cast(src_buffer); + if (Identity32DcOnly(src, adjusted_tx_height)) { + return; + } + + assert(tx_size == kTransformSize32x16); + ApplyRounding<32>(src, adjusted_tx_height); + int i = adjusted_tx_height; + do { + Identity32Row16_NEON(src, /*step=*/32); + src += 128; + i -= 4; + } while (i != 0); +} + +void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/, + TransformSize tx_size, + int adjusted_tx_height, + void* src_buffer, int start_x, + int start_y, void* dst_frame) { + auto& frame = *static_cast*>(dst_frame); + auto* src = static_cast(src_buffer); + const int tx_width = kTransformWidth[tx_size]; + + IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width, + adjusted_tx_height, src); +} + +void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size, + int /*adjusted_tx_height*/, void* /*src_buffer*/, + int /*start_x*/, int /*start_y*/, + void* /*dst_frame*/) { + assert(tx_type == kTransformTypeDctDct); + assert(tx_size == kTransformSize4x4); + static_cast(tx_type); + static_cast(tx_size); + // Do both row and column transforms in the column-transform pass. +} + +void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, + int start_x, int start_y, void* dst_frame) { + assert(tx_type == kTransformTypeDctDct); + assert(tx_size == kTransformSize4x4); + static_cast(tx_type); + static_cast(tx_size); + + // Process 4 1d wht4 rows and columns in parallel. + const auto* src = static_cast(src_buffer); + auto& frame = *static_cast*>(dst_frame); + uint8_t* dst = frame[start_y] + start_x; + const int dst_stride = frame.columns(); + Wht4_NEON(dst, dst_stride, src, adjusted_tx_height); +} + +//------------------------------------------------------------------------------ + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = + Dct4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + Dct4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = + Dct8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + Dct8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = + Dct16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + Dct16TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = + Dct32TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + Dct32TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = + Dct64TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + Dct64TransformLoopColumn_NEON; + + // Maximum transform size for Adst is 16. 
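+  // Illustrative aside (not from the upstream sources; the function pointer
+  // type name is assumed): each entry below is keyed by 1D transform type,
+  // 1D size and row/column pass, so a caller would do something like
+  //   const InverseTransformAddFunc fn =
+  //       dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow];
+  //   fn(tx_type, tx_size, adjusted_tx_height, src_buffer, start_x, start_y,
+  //      dst_frame);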
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + Adst4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + Adst4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + Adst8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + Adst8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + Adst16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + Adst16TransformLoopColumn_NEON; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + Identity4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + Identity4TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + Identity8TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + Identity8TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + Identity16TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + Identity16TransformLoopColumn_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = + Identity32TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = + Identity32TransformLoopColumn_NEON; + + // Maximum transform size for Wht is 4. + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = + Wht4TransformLoopRow_NEON; + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = + Wht4TransformLoopColumn_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void InverseTransformInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void InverseTransformInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/inverse_transform_neon.h b/src/dsp/arm/inverse_transform_neon.h new file mode 100644 index 0000000..af647e8 --- /dev/null +++ b/src/dsp/arm/inverse_transform_neon.h @@ -0,0 +1,52 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::inverse_transforms, see the defines below for specifics. +// This function is not thread-safe. 
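+// A typical call site (illustrative sketch, not from the upstream sources)
+// runs it once while populating the dsp tables:
+//   #if LIBGAV1_ENABLE_NEON
+//   libgav1::dsp::InverseTransformInit_NEON();
+//   #endif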
+void InverseTransformInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_ diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc new file mode 100644 index 0000000..146c983 --- /dev/null +++ b/src/dsp/arm/loop_filter_neon.cc @@ -0,0 +1,1190 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
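+
+// Illustrative note (not from the upstream sources): throughout this file a
+// uint8x8_t named pNqN packs four p-side pixels in its low 32 bits and the
+// four matching q-side pixels in its high 32 bits, so a single 64-bit
+// register carries both sides of the filter boundary.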
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
+  const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
+  return vorr_u8(a, RightShift<32>(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint8x8_t OuterThreshold(const uint8x8_t p0q0, const uint8x8_t p1q1,
+                                const uint8_t outer_thresh) {
+  const uint8x8x2_t a = Interleave32(p0q0, p1q1);
+  const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
+  const uint8x8_t p0q0_double = vqadd_u8(b, b);
+  const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+  const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
+  return vcle_u8(c, vdup_n_u8(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter4(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter4Masks(const uint8x8_t p0q0, const uint8x8_t p1q1,
+                         const uint8_t hev_thresh, const uint8_t outer_thresh,
+                         const uint8_t inner_thresh, uint8x8_t* const hev_mask,
+                         uint8x8_t* const needs_filter4_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  // This includes cases where NeedsFilter4() is not true and so Filter2() will
+  // not be applied.
+  const uint8x8_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+  *needs_filter4_mask =
+      NeedsFilter4(p0p1_q0q1, p0q0, p1q1, inner_thresh, outer_thresh);
+
+  // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+  *hev_mask = vand_u8(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint8x8_t q0p1, const uint8x8_t p0q1,
+                    const uint8x8_t hev_mask, uint8x8_t* const p1q1_result,
+                    uint8x8_t* const p0q0_result) {
+  const int16x4_t zero = vdup_n_s16(0);
+
+  // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubl_u8(q0p1, p0q1));
+  const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+  // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+  const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+  const int8x8_t p1mq1_saturated = vqmovn_s16(vcombine_s16(p1mq1, zero));
+  const int8x8_t hev_option =
+      vand_s8(vreinterpret_s8_u8(hev_mask), p1mq1_saturated);
+
+  const int16x4_t a =
+      vget_low_s16(vaddw_s8(vcombine_s16(q0mp0_3, zero), hev_option));
+
+  // We can not shift with rounding because the clamp comes *before* the
+  // shifting.
+  // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int16x4_t plus_four = vadd_s16(a, vdup_n_s16(4));
+  const int16x4_t plus_three = vadd_s16(a, vdup_n_s16(3));
+  const int8x8_t a2_a1 =
+      vshr_n_s8(vqmovn_s16(vcombine_s16(plus_three, plus_four)), 3);
+
+  // a3 is in the high 4 values.
+ // a3 = (a1 + 1) >> 1; + const int8x8_t a3 = vrshr_n_s8(a2_a1, 1); + + const int16x8_t p0q1_l = vreinterpretq_s16_u16(vmovl_u8(p0q1)); + const int16x8_t q0p1_l = vreinterpretq_s16_u16(vmovl_u8(q0p1)); + + const int16x8_t p1q1_l = + vcombine_s16(vget_high_s16(q0p1_l), vget_high_s16(p0q1_l)); + + const int8x8_t a3_ma3 = InterleaveHigh32(a3, vneg_s8(a3)); + const int16x8_t p1q1_a3 = vaddw_s8(p1q1_l, a3_ma3); + + const int16x8_t p0q0_l = + vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l)); + // Need to shift the second term or we end up with a2_ma2. + const int8x8_t a2_ma1 = + InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1))); + const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1); + + *p1q1_result = vqmovun_s16(p1q1_a3); + *p0q0_result = vqmovun_s16(p0q0_a); +} + +void Horizontal4_NEON(void* const dest, const ptrdiff_t stride, + const int outer_thresh, const int inner_thresh, + const int hev_thresh) { + uint8_t* dst = static_cast(dest); + + const uint8x8_t p1_v = Load4(dst - 2 * stride); + const uint8x8_t p0_v = Load4(dst - stride); + const uint8x8_t p0q0 = Load4<1>(dst, p0_v); + const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v); + + uint8x8_t hev_mask; + uint8x8_t needs_filter4_mask; + Filter4Masks(p0q0, p1q1, hev_thresh, outer_thresh, inner_thresh, &hev_mask, + &needs_filter4_mask); + + // Copy the masks to the high bits for packed comparisons later. + hev_mask = InterleaveLow32(hev_mask, hev_mask); + needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); + +#if defined(__aarch64__) + // This provides a good speedup for the unit test. Not sure how applicable it + // is to valid streams though. + // Consider doing this on armv7 if there is a quick way to check if a vector + // is zero. + if (vaddv_u8(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + uint8x8_t f_p1q1; + uint8x8_t f_p0q0; + const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1); + Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint8x8_t p0q0_output = vbsl_u8(needs_filter4_mask, f_p0q0, p0q0); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask); + const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1); + + StoreLo4(dst - 2 * stride, p1q1_output); + StoreLo4(dst - stride, p0q0_output); + StoreHi4(dst, p0q0_output); + StoreHi4(dst + stride, p1q1_output); +} + +void Vertical4_NEON(void* const dest, const ptrdiff_t stride, + const int outer_thresh, const int inner_thresh, + const int hev_thresh) { + uint8_t* dst = static_cast(dest); + + // Move |dst| to the left side of the filter window. + dst -= 2; + + // |p1q0| and |p0q1| are named for the values they will contain after the + // transpose. + const uint8x8_t row0 = Load4(dst); + uint8x8_t p1q0 = Load4<1>(dst + stride, row0); + const uint8x8_t row2 = Load4(dst + 2 * stride); + uint8x8_t p0q1 = Load4<1>(dst + 3 * stride, row2); + + Transpose4x4(&p1q0, &p0q1); + // Rearrange. 
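+  // Illustrative aside (not from the upstream sources): Transpose32() swaps
+  // the two 32-bit halves of a uint8x8_t, e.g. {p0 p0 p0 p0 | q0 q0 q0 q0}
+  // becomes {q0 q0 q0 q0 | p0 p0 p0 p0}, and Interleave32() zips the 32-bit
+  // halves of two vectors; this "Rearrange" step is built entirely from
+  // those two primitives.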
+ const uint8x8x2_t p1q1xq0p0 = Interleave32(p1q0, Transpose32(p0q1)); + const uint8x8x2_t p1q1xp0q0 = {p1q1xq0p0.val[0], + Transpose32(p1q1xq0p0.val[1])}; + + uint8x8_t hev_mask; + uint8x8_t needs_filter4_mask; + Filter4Masks(p1q1xp0q0.val[1], p1q1xp0q0.val[0], hev_thresh, outer_thresh, + inner_thresh, &hev_mask, &needs_filter4_mask); + + // Copy the masks to the high bits for packed comparisons later. + hev_mask = InterleaveLow32(hev_mask, hev_mask); + needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask); + +#if defined(__aarch64__) + // This provides a good speedup for the unit test. Not sure how applicable it + // is to valid streams though. + // Consider doing this on armv7 if there is a quick way to check if a vector + // is zero. + if (vaddv_u8(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + uint8x8_t f_p1q1; + uint8x8_t f_p0q0; + Filter4(Transpose32(p1q0), p0q1, hev_mask, &f_p1q1, &f_p0q0); + + // Already integrated the Hev mask when calculating the filtered values. + const uint8x8_t p0q0_output = + vbsl_u8(needs_filter4_mask, f_p0q0, p1q1xp0q0.val[1]); + + // p1/q1 are unmodified if only Hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. + const uint8x8_t p1q1_mask = veor_u8(hev_mask, needs_filter4_mask); + const uint8x8_t p1q1_output = vbsl_u8(p1q1_mask, f_p1q1, p1q1xp0q0.val[0]); + + // Put things back in order to reverse the transpose. + const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output); + uint8x8_t output_0 = p1p0xq1q0.val[0], + output_1 = Transpose32(p1p0xq1q0.val[1]); + + Transpose4x4(&output_0, &output_1); + + StoreLo4(dst, output_0); + StoreLo4(dst + stride, output_1); + StoreHi4(dst + 2 * stride, output_0); + StoreHi4(dst + 3 * stride, output_1); +} + +// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && +// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh +// |flat_thresh| == 1 for 8 bit decode. 
+inline uint8x8_t IsFlat3(const uint8x8_t abd_p0p1_q0q1,
+                         const uint8x8_t abd_p0p2_q0q2) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
+  const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
+  return vand_u8(b, RightShift<32>(b));
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter6(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t abd_p1p2_q1q2,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter6Masks(const uint8x8_t p2q2, const uint8x8_t p1q1,
+                         const uint8x8_t p0q0, const uint8_t hev_thresh,
+                         const uint8_t outer_thresh, const uint8_t inner_thresh,
+                         uint8x8_t* const needs_filter6_mask,
+                         uint8x8_t* const is_flat3_mask,
+                         uint8x8_t* const hev_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+  *is_flat3_mask = IsFlat3(p0p1_q0q1, vabd_u8(p0q0, p2q2));
+  *needs_filter6_mask = NeedsFilter6(p0p1_q0q1, vabd_u8(p1q1, p2q2), p0q0,
+                                     p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter6(const uint8x8_t p2q2, const uint8x8_t p1q1,
+                    const uint8x8_t p0q0, uint8x8_t* const p1q1_output,
+                    uint8x8_t* const p0q0_output) {
+  // Sum p1 and q1 output from opposite directions.
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //      ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                                 ^^^^^^^^
+  const uint16x8_t p2q2_double = vaddl_u8(p2q2, p2q2);
+  uint16x8_t sum = vaddw_u8(p2q2_double, p2q2);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                 ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //                      ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p1q1, p1q1), sum);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                            ^^^^^^^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //           ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p0q0, p0q0), sum);
+
+  // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+  //                                       ^^
+  // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+  //      ^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p1q1_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - (2 * p2) + q0 + q1
+  // q0 = q1 - (2 * q2) + p0 + p1
+  sum = vsubq_u16(sum, p2q2_double);
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(vaddl_u8(q0p0, q1p1), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+
+  uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask;
+  Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter6_mask, &is_flat3_mask, &hev_mask);
+
+  needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask);
+  is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask);
+  hev_mask =
InterleaveLow32(hev_mask, hev_mask); + +#if defined(__aarch64__) + // This provides a good speedup for the unit test. Not sure how applicable it + // is to valid streams though. + // Consider doing this on armv7 if there is a quick way to check if a vector + // is zero. + if (vaddv_u8(needs_filter6_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + uint8x8_t f_p1q1; + uint8x8_t f_p0q0; + const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1); + Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0); + // Reset the outer values if only a Hev() mask was required. + f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1); + + uint8x8_t f6_p1q1, f6_p0q0; +#if defined(__aarch64__) + if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) { + // Filter6() does not apply. + const uint8x8_t zero = vdup_n_u8(0); + f6_p1q1 = zero; + f6_p0q0 = zero; + } else { +#endif // defined(__aarch64__) + Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); +#if defined(__aarch64__) + } +#endif // defined(__aarch64__) + + uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1); + p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1); + StoreLo4(dst - 2 * stride, p1q1_output); + StoreHi4(dst + stride, p1q1_output); + + uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0); + p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0); + StoreLo4(dst - stride, p0q0_output); + StoreHi4(dst, p0q0_output); +} + +void Vertical6_NEON(void* const dest, const ptrdiff_t stride, + const int outer_thresh, const int inner_thresh, + const int hev_thresh) { + auto* dst = static_cast(dest); + + // Move |dst| to the left side of the filter window. + dst -= 3; + + // |p2q1|, |p1q2|, |p0xx| and |q0xx| are named for the values they will + // contain after the transpose. + // These over-read by 2 bytes. We only need 6. + uint8x8_t p2q1 = vld1_u8(dst); + uint8x8_t p1q2 = vld1_u8(dst + stride); + uint8x8_t p0xx = vld1_u8(dst + 2 * stride); + uint8x8_t q0xx = vld1_u8(dst + 3 * stride); + + Transpose8x4(&p2q1, &p1q2, &p0xx, &q0xx); + + const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2)); + const uint8x8_t p2q2 = p2q2xq1p1.val[0]; + const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]); + const uint8x8_t p0q0 = InterleaveLow32(p0xx, q0xx); + + uint8x8_t needs_filter6_mask, is_flat3_mask, hev_mask; + Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh, + &needs_filter6_mask, &is_flat3_mask, &hev_mask); + + needs_filter6_mask = InterleaveLow32(needs_filter6_mask, needs_filter6_mask); + is_flat3_mask = InterleaveLow32(is_flat3_mask, is_flat3_mask); + hev_mask = InterleaveLow32(hev_mask, hev_mask); + +#if defined(__aarch64__) + // This provides a good speedup for the unit test. Not sure how applicable it + // is to valid streams though. + // Consider doing this on armv7 if there is a quick way to check if a vector + // is zero. + if (vaddv_u8(needs_filter6_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // defined(__aarch64__) + + uint8x8_t f_p1q1; + uint8x8_t f_p0q0; + const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1); + Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0); + // Reset the outer values if only a Hev() mask was required. + f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1); + + uint8x8_t f6_p1q1, f6_p0q0; +#if defined(__aarch64__) + if (vaddv_u8(vand_u8(is_flat3_mask, needs_filter6_mask)) == 0) { + // Filter6() does not apply. 
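+    // Illustrative aside (not from the upstream sources): every mask lane is
+    // 0x00 or 0xff, and vaddv_u8() sums lanes modulo 256, so the sum is zero
+    // if and only if no lane is set. The early exits here therefore skip all
+    // filtering work for fully unfiltered 4-row groups.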
+    const uint8x8_t zero = vdup_n_u8(0);
+    f6_p1q1 = zero;
+    f6_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat3_mask, f6_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter6_mask, p1q1_output, p1q1);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat3_mask, f6_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter6_mask, p0q0_output, p0q0);
+
+  // The six tap filter reads six pixels, but the output is limited to p1-q1,
+  // so shift |dst| in by one pixel.
+  dst += 1;
+  // Put things back in order to reverse the transpose.
+  const uint8x8x2_t p1p0xq1q0 = Interleave32(p1q1_output, p0q0_output);
+  uint8x8_t output_0 = p1p0xq1q0.val[0];
+  uint8x8_t output_1 = Transpose32(p1p0xq1q0.val[1]);
+
+  Transpose4x4(&output_0, &output_1);
+
+  StoreLo4(dst, output_0);
+  StoreLo4(dst + stride, output_1);
+  StoreHi4(dst + 2 * stride, output_0);
+  StoreHi4(dst + 3 * stride, output_1);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 1 for 8 bit decode.
+inline uint8x8_t IsFlat4(const uint8x8_t abd_p0n0_q0n0,
+                         const uint8x8_t abd_p0n1_q0n1,
+                         const uint8x8_t abd_p0n2_q0n2) {
+  const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
+  const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
+  const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
+  return vand_u8(c, RightShift<32>(c));
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint8x8_t NeedsFilter8(const uint8x8_t abd_p0p1_q0q1,
+                              const uint8x8_t abd_p1p2_q1q2,
+                              const uint8x8_t abd_p2p3_q2q3,
+                              const uint8x8_t p0q0, const uint8x8_t p1q1,
+                              const uint8_t inner_thresh,
+                              const uint8_t outer_thresh) {
+  const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
+  const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
+  const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
+  const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+  const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
+  return vand_u8(inner_mask, outer_mask);
+}
+
+inline void Filter8Masks(const uint8x8_t p3q3, const uint8x8_t p2q2,
+                         const uint8x8_t p1q1, const uint8x8_t p0q0,
+                         const uint8_t hev_thresh, const uint8_t outer_thresh,
+                         const uint8_t inner_thresh,
+                         uint8x8_t* const needs_filter8_mask,
+                         uint8x8_t* const is_flat4_mask,
+                         uint8x8_t* const hev_mask) {
+  const uint8x8_t p0p1_q0q1 = vabd_u8(p0q0, p1q1);
+  *hev_mask = Hev(p0p1_q0q1, hev_thresh);
+  *is_flat4_mask =
+      IsFlat4(p0p1_q0q1, vabd_u8(p0q0, p2q2), vabd_u8(p0q0, p3q3));
+  *needs_filter8_mask =
+      NeedsFilter8(p0p1_q0q1, vabd_u8(p1q1, p2q2), vabd_u8(p2q2, p3q3), p0q0,
+                   p1q1, inner_thresh, outer_thresh);
+}
+
+inline void Filter8(const uint8x8_t p3q3, const uint8x8_t p2q2,
+                    const uint8x8_t p1q1, const uint8x8_t p0q0,
+                    uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output,
+                    uint8x8_t* const p0q0_output) {
+  // Sum p2 and q2 output from opposite directions.
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //      ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                                ^^^^^^^^
+  uint16x8_t sum = vaddw_u8(vaddl_u8(p3q3, p3q3), p3q3);
+
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //                 ^^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //                     ^^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p2q2, p2q2), sum);
+
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //                            ^^^^^^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //           ^^^^^^^
+  sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum);
+
+  // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+  //                                      ^^
+  // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+  //      ^^
+  const uint8x8_t q0p0 = Transpose32(p0q0);
+  sum = vaddw_u8(sum, q0p0);
+
+  *p2q2_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+  sum = vsubq_u16(sum, vaddl_u8(p3q3, p2q2));
+  const uint8x8_t q1p1 = Transpose32(p1q1);
+  sum = vaddq_u16(vaddl_u8(p1q1, q1p1), sum);
+
+  *p1q1_output = vrshrn_n_u16(sum, 3);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p3 - p1 + p0 + q2
+  // q0 = q1 - q3 - q1 + q0 + p2
+  sum = vsubq_u16(sum, vaddl_u8(p3q3, p1q1));
+  const uint8x8_t q2p2 = Transpose32(p2q2);
+  sum = vaddq_u16(vaddl_u8(p0q0, q2p2), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+                      const int outer_thresh, const int inner_thresh,
+                      const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p3_v = Load4(dst - 4 * stride);
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+  const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test, though it is unclear how
+  // applicable it is to valid streams.
+  // Consider doing this on armv7 if there is a quick way to check whether a
+  // vector is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() does not apply.
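+    // A zero |is_flat4_mask| means no lane takes the Filter8() path, so the
+    // zero fill below only initializes values that the masked selects will
+    // never choose.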
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p2q2 = zero;
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+    const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2);
+    StoreLo4(dst - 3 * stride, p2q2_output);
+    StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
+                    const int outer_thresh, const int inner_thresh,
+                    const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  // Move |dst| to the left side of the filter window.
+  dst -= 4;
+
+  // |p3q0|, |p2q1|, |p1q2| and |p0q3| are named for the values they will
+  // contain after the transpose.
+  uint8x8_t p3q0 = vld1_u8(dst);
+  uint8x8_t p2q1 = vld1_u8(dst + stride);
+  uint8x8_t p1q2 = vld1_u8(dst + 2 * stride);
+  uint8x8_t p0q3 = vld1_u8(dst + 3 * stride);
+
+  Transpose8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+  const uint8x8x2_t p3q3xq0p0 = Interleave32(p3q0, Transpose32(p0q3));
+  const uint8x8_t p3q3 = p3q3xq0p0.val[0];
+  const uint8x8_t p0q0 = Transpose32(p3q3xq0p0.val[1]);
+  const uint8x8x2_t p2q2xq1p1 = Interleave32(p2q1, Transpose32(p1q2));
+  const uint8x8_t p2q2 = p2q2xq1p1.val[0];
+  const uint8x8_t p1q1 = Transpose32(p2q2xq1p1.val[1]);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test, though it is unclear how
+  // applicable it is to valid streams.
+  // Consider doing this on armv7 if there is a quick way to check whether a
+  // vector is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() does not apply.
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p2q2 = zero;
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  // Always prepare and store p2/q2 because we need to transpose it anyway.
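+  // vbsl_u8() keeps the Filter8() result in lanes where the mask is set and
+  // restores the original pixels elsewhere.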
+ const uint8x8_t p2q2_output = vbsl_u8(is_flat4_mask, f8_p2q2, p2q2); + + uint8x8_t p1q1_output = vbsl_u8(is_flat4_mask, f8_p1q1, f_p1q1); + p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1); + + uint8x8_t p0q0_output = vbsl_u8(is_flat4_mask, f8_p0q0, f_p0q0); + p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0); + + // Write out p3/q3 as well. There isn't a good way to write out 6 bytes. + // Variable names reflect the values before transposition. + const uint8x8x2_t p3q0xq3p0_output = + Interleave32(p3q3, Transpose32(p0q0_output)); + uint8x8_t p3q0_output = p3q0xq3p0_output.val[0]; + uint8x8_t p0q3_output = Transpose32(p3q0xq3p0_output.val[1]); + const uint8x8x2_t p2q1xq2p1_output = + Interleave32(p2q2_output, Transpose32(p1q1_output)); + uint8x8_t p2q1_output = p2q1xq2p1_output.val[0]; + uint8x8_t p1q2_output = Transpose32(p2q1xq2p1_output.val[1]); + + Transpose8x4(&p3q0_output, &p2q1_output, &p1q2_output, &p0q3_output); + + vst1_u8(dst, p3q0_output); + vst1_u8(dst + stride, p2q1_output); + vst1_u8(dst + 2 * stride, p1q2_output); + vst1_u8(dst + 3 * stride, p0q3_output); +} + +inline void Filter14(const uint8x8_t p6q6, const uint8x8_t p5q5, + const uint8x8_t p4q4, const uint8x8_t p3q3, + const uint8x8_t p2q2, const uint8x8_t p1q1, + const uint8x8_t p0q0, uint8x8_t* const p5q5_output, + uint8x8_t* const p4q4_output, uint8x8_t* const p3q3_output, + uint8x8_t* const p2q2_output, uint8x8_t* const p1q1_output, + uint8x8_t* const p0q0_output) { + // Sum p5 and q5 output from opposite directions + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + uint16x8_t sum = vsubw_u8(vshll_n_u8(p6q6, 3), p6q6); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + sum = vaddq_u16(vaddl_u8(p5q5, p5q5), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + sum = vaddq_u16(vaddl_u8(p4q4, p4q4), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddl_u8(p3q3, p2q2), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddl_u8(p1q1, p0q0), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^ + const uint8x8_t q0p0 = Transpose32(p0q0); + sum = vaddw_u8(sum, q0p0); + + *p5q5_output = vrshrn_n_u16(sum, 4); + + // Convert to p4 and q4 output: + // p4 = p5 - (2 * p6) + p3 + q1 + // q4 = q5 - (2 * q6) + q3 + p1 + sum = vsubq_u16(sum, vaddl_u8(p6q6, p6q6)); + const uint8x8_t q1p1 = Transpose32(p1q1); + sum = vaddq_u16(vaddl_u8(p3q3, q1p1), sum); + + *p4q4_output = vrshrn_n_u16(sum, 4); + + // Convert to p3 and q3 output: + // p3 = p4 - p6 - p5 + p2 + q2 + // q3 = q4 - q6 - q5 + q2 + p2 + sum = vsubq_u16(sum, vaddl_u8(p6q6, p5q5)); + const uint8x8_t q2p2 = Transpose32(p2q2); + sum = vaddq_u16(vaddl_u8(p2q2, q2p2), sum); + + *p3q3_output = vrshrn_n_u16(sum, 4); + + // Convert to p2 and q2 output: + // p2 = p3 - p6 - p4 + p1 + q3 + // q2 = q3 - q6 - q4 + q1 + p3 + sum = vsubq_u16(sum, vaddl_u8(p6q6, p4q4)); 
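+  // Transpose32() swaps the p and q halves so each half of |sum| picks up
+  // the tap it needs from the opposite side.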
+  const uint8x8_t q3p3 = Transpose32(p3q3);
+  sum = vaddq_u16(vaddl_u8(p1q1, q3p3), sum);
+
+  *p2q2_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p1 and q1 output:
+  // p1 = p2 - p6 - p3 + p0 + q4
+  // q1 = q2 - q6 - q3 + q0 + p4
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p3q3));
+  const uint8x8_t q4p4 = Transpose32(p4q4);
+  sum = vaddq_u16(vaddl_u8(p0q0, q4p4), sum);
+
+  *p1q1_output = vrshrn_n_u16(sum, 4);
+
+  // Convert to p0 and q0 output:
+  // p0 = p1 - p6 - p2 + q0 + q5
+  // q0 = q1 - q6 - q2 + p0 + p5
+  sum = vsubq_u16(sum, vaddl_u8(p6q6, p2q2));
+  const uint8x8_t q5p5 = Transpose32(p5q5);
+  sum = vaddq_u16(vaddl_u8(q0p0, q5p5), sum);
+
+  *p0q0_output = vrshrn_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+                       const int outer_thresh, const int inner_thresh,
+                       const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+
+  const uint8x8_t p6_v = Load4(dst - 7 * stride);
+  const uint8x8_t p5_v = Load4(dst - 6 * stride);
+  const uint8x8_t p4_v = Load4(dst - 5 * stride);
+  const uint8x8_t p3_v = Load4(dst - 4 * stride);
+  const uint8x8_t p2_v = Load4(dst - 3 * stride);
+  const uint8x8_t p1_v = Load4(dst - 2 * stride);
+  const uint8x8_t p0_v = Load4(dst - stride);
+  const uint8x8_t p0q0 = Load4<1>(dst, p0_v);
+  const uint8x8_t p1q1 = Load4<1>(dst + stride, p1_v);
+  const uint8x8_t p2q2 = Load4<1>(dst + 2 * stride, p2_v);
+  const uint8x8_t p3q3 = Load4<1>(dst + 3 * stride, p3_v);
+  const uint8x8_t p4q4 = Load4<1>(dst + 4 * stride, p4_v);
+  const uint8x8_t p5q5 = Load4<1>(dst + 5 * stride, p5_v);
+  const uint8x8_t p6q6 = Load4<1>(dst + 6 * stride, p6_v);
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test, though it is unclear how
+  // applicable it is to valid streams.
+  // Consider doing this on armv7 if there is a quick way to check whether a
+  // vector is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Decide between Filter8() and Filter14().
+  uint8x8_t is_flat_outer4_mask =
+      IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+  is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+  is_flat_outer4_mask =
+      InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+  uint8x8_t f_p1q1;
+  uint8x8_t f_p0q0;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t f8_p1q1, f8_p0q0;
+  uint8x8_t f14_p2q2, f14_p1q1, f14_p0q0;
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() and Filter14() do not apply.
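+    // As above, the zero fill only initializes values that the masked
+    // selects below will never choose.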
+    const uint8x8_t zero = vdup_n_u8(0);
+    f8_p1q1 = zero;
+    f8_p0q0 = zero;
+    f14_p1q1 = zero;
+    f14_p0q0 = zero;
+  } else {
+#endif  // defined(__aarch64__)
+    uint8x8_t f8_p2q2;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+    if (vaddv_u8(is_flat_outer4_mask) == 0) {
+      // Filter14() does not apply.
+      const uint8x8_t zero = vdup_n_u8(0);
+      f14_p2q2 = zero;
+      f14_p1q1 = zero;
+      f14_p0q0 = zero;
+    } else {
+#endif  // defined(__aarch64__)
+      uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3;
+      Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+               &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+
+      const uint8x8_t p5q5_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5);
+      StoreLo4(dst - 6 * stride, p5q5_output);
+      StoreHi4(dst + 5 * stride, p5q5_output);
+
+      const uint8x8_t p4q4_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4);
+      StoreLo4(dst - 5 * stride, p4q4_output);
+      StoreHi4(dst + 4 * stride, p4q4_output);
+
+      const uint8x8_t p3q3_output =
+          vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3);
+      StoreLo4(dst - 4 * stride, p3q3_output);
+      StoreHi4(dst + 3 * stride, p3q3_output);
+#if defined(__aarch64__)
+    }
+#endif  // defined(__aarch64__)
+
+    uint8x8_t p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2);
+    p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2);
+    StoreLo4(dst - 3 * stride, p2q2_output);
+    StoreHi4(dst + 2 * stride, p2q2_output);
+#if defined(__aarch64__)
+  }
+#endif  // defined(__aarch64__)
+
+  uint8x8_t p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1);
+  p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1);
+  p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1);
+  StoreLo4(dst - 2 * stride, p1q1_output);
+  StoreHi4(dst + stride, p1q1_output);
+
+  uint8x8_t p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0);
+  p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0);
+  p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0);
+  StoreLo4(dst - stride, p0q0_output);
+  StoreHi4(dst, p0q0_output);
+}
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
+                     const int outer_thresh, const int inner_thresh,
+                     const int hev_thresh) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  dst -= 8;
+  // input
+  // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+  const uint8x16_t x0 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x1 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x2 = vld1q_u8(dst);
+  dst += stride;
+  const uint8x16_t x3 = vld1q_u8(dst);
+  dst -= (stride * 3);
+
+  // re-order input
+#if defined(__aarch64__)
+  const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+  const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+  const uint8x16_t index_qp7toqp0 =
+      vcombine_u8(index_qp3toqp0, index_qp7toqp4);
+
+  uint8x16_t input_0 = vqtbl1q_u8(x0, index_qp7toqp0);
+  uint8x16_t input_1 = vqtbl1q_u8(x1, index_qp7toqp0);
+  uint8x16_t input_2 = vqtbl1q_u8(x2, index_qp7toqp0);
+  uint8x16_t input_3 = vqtbl1q_u8(x3, index_qp7toqp0);
+#else
+  const uint8x8_t index_qp3toqp0 = vcreate_u8(0x0b0a090804050607);
+  const uint8x8_t index_qp7toqp4 = vcreate_u8(0x0f0e0d0c00010203);
+
+  const uint8x8_t x0_qp3qp0 = VQTbl1U8(x0, index_qp3toqp0);
+  const uint8x8_t x1_qp3qp0 = VQTbl1U8(x1, index_qp3toqp0);
+  const uint8x8_t x2_qp3qp0 = VQTbl1U8(x2, index_qp3toqp0);
+  const uint8x8_t x3_qp3qp0 = VQTbl1U8(x3, index_qp3toqp0);
+
+  const uint8x8_t x0_qp7qp4 = VQTbl1U8(x0, index_qp7toqp4);
+  const uint8x8_t x1_qp7qp4 = VQTbl1U8(x1, index_qp7toqp4);
+  const uint8x8_t x2_qp7qp4 = VQTbl1U8(x2, index_qp7toqp4);
+  const uint8x8_t x3_qp7qp4 = VQTbl1U8(x3, index_qp7toqp4);
+
+  const uint8x16_t input_0 = vcombine_u8(x0_qp3qp0, x0_qp7qp4);
+  const uint8x16_t input_1 = vcombine_u8(x1_qp3qp0, x1_qp7qp4);
+  const uint8x16_t input_2 = vcombine_u8(x2_qp3qp0, x2_qp7qp4);
+  const uint8x16_t input_3 = vcombine_u8(x3_qp3qp0, x3_qp7qp4);
+#endif
+  // input after re-order
+  // p0 p1 p2 p3 q0 q1 q2 q3 p4 p5 p6 p7 q4 q5 q6 q7
+
+  const uint8x16x2_t in01 = vtrnq_u8(input_0, input_1);
+  const uint8x16x2_t in23 = vtrnq_u8(input_2, input_3);
+  const uint16x8x2_t in02 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[0]),
+                                      vreinterpretq_u16_u8(in23.val[0]));
+  const uint16x8x2_t in13 = vtrnq_u16(vreinterpretq_u16_u8(in01.val[1]),
+                                      vreinterpretq_u16_u8(in23.val[1]));
+
+  const uint8x8_t p0q0 = vget_low_u8(vreinterpretq_u8_u16(in02.val[0]));
+  const uint8x8_t p1q1 = vget_low_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+  const uint8x8_t p2q2 = vget_low_u8(vreinterpretq_u8_u16(in02.val[1]));
+  const uint8x8_t p3q3 = vget_low_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+  const uint8x8_t p4q4 = vget_high_u8(vreinterpretq_u8_u16(in02.val[0]));
+  const uint8x8_t p5q5 = vget_high_u8(vreinterpretq_u8_u16(in13.val[0]));
+
+  const uint8x8_t p6q6 = vget_high_u8(vreinterpretq_u8_u16(in02.val[1]));
+  const uint8x8_t p7q7 = vget_high_u8(vreinterpretq_u8_u16(in13.val[1]));
+
+  uint8x8_t needs_filter8_mask, is_flat4_mask, hev_mask;
+  Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_thresh, inner_thresh,
+               &needs_filter8_mask, &is_flat4_mask, &hev_mask);
+
+  needs_filter8_mask = InterleaveLow32(needs_filter8_mask, needs_filter8_mask);
+  is_flat4_mask = vand_u8(is_flat4_mask, needs_filter8_mask);
+  is_flat4_mask = InterleaveLow32(is_flat4_mask, is_flat4_mask);
+  hev_mask = InterleaveLow32(hev_mask, hev_mask);
+
+#if defined(__aarch64__)
+  // This provides a good speedup for the unit test, though it is unclear how
+  // applicable it is to valid streams.
+  // Consider doing this on armv7 if there is a quick way to check whether a
+  // vector is zero.
+  if (vaddv_u8(needs_filter8_mask) == 0) {
+    // None of the values will be filtered.
+    return;
+  }
+#endif  // defined(__aarch64__)
+
+  // Decide between Filter8() and Filter14().
+  uint8x8_t is_flat_outer4_mask =
+      IsFlat4(vabd_u8(p0q0, p4q4), vabd_u8(p0q0, p5q5), vabd_u8(p0q0, p6q6));
+  is_flat_outer4_mask = vand_u8(is_flat4_mask, is_flat_outer4_mask);
+  is_flat_outer4_mask =
+      InterleaveLow32(is_flat_outer4_mask, is_flat_outer4_mask);
+
+  uint8x8_t f_p0q0, f_p1q1;
+  const uint8x8x2_t q0p1xp0q1 = Interleave32(Transpose32(p0q0), p1q1);
+  Filter4(q0p1xp0q1.val[0], q0p1xp0q1.val[1], hev_mask, &f_p1q1, &f_p0q0);
+  // Reset the outer values if only a Hev() mask was required.
+  f_p1q1 = vbsl_u8(hev_mask, p1q1, f_p1q1);
+
+  uint8x8_t p1q1_output, p0q0_output;
+  uint8x8_t p5q5_output, p4q4_output, p3q3_output, p2q2_output;
+
+#if defined(__aarch64__)
+  if (vaddv_u8(is_flat4_mask) == 0) {
+    // Filter8() and Filter14() do not apply.
+    p1q1_output = p1q1;
+    p0q0_output = p0q0;
+
+    p5q5_output = p5q5;
+    p4q4_output = p4q4;
+    p3q3_output = p3q3;
+    p2q2_output = p2q2;
+  } else {
+#endif  // defined(__aarch64__)
+    uint8x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+    Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+
+#if defined(__aarch64__)
+    if (vaddv_u8(is_flat_outer4_mask) == 0) {
+      // Filter14() does not apply.
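+      // Keep the original outer pixels and fall back to the Filter8()
+      // results for p2-q2.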
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = f8_p2q2; + p1q1_output = f8_p1q1; + p0q0_output = f8_p0q0; + } else { +#endif // defined(__aarch64__) + uint8x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + + p5q5_output = vbsl_u8(is_flat_outer4_mask, f14_p5q5, p5q5); + p4q4_output = vbsl_u8(is_flat_outer4_mask, f14_p4q4, p4q4); + p3q3_output = vbsl_u8(is_flat_outer4_mask, f14_p3q3, p3q3); + p2q2_output = vbsl_u8(is_flat_outer4_mask, f14_p2q2, f8_p2q2); + p1q1_output = vbsl_u8(is_flat_outer4_mask, f14_p1q1, f8_p1q1); + p0q0_output = vbsl_u8(is_flat_outer4_mask, f14_p0q0, f8_p0q0); +#if defined(__aarch64__) + } +#endif // defined(__aarch64__) + p2q2_output = vbsl_u8(is_flat4_mask, p2q2_output, p2q2); +#if defined(__aarch64__) + } +#endif // defined(__aarch64__) + + p1q1_output = vbsl_u8(is_flat4_mask, p1q1_output, f_p1q1); + p1q1_output = vbsl_u8(needs_filter8_mask, p1q1_output, p1q1); + p0q0_output = vbsl_u8(is_flat4_mask, p0q0_output, f_p0q0); + p0q0_output = vbsl_u8(needs_filter8_mask, p0q0_output, p0q0); + + const uint8x16_t p0q0_p4q4 = vcombine_u8(p0q0_output, p4q4_output); + const uint8x16_t p2q2_p6q6 = vcombine_u8(p2q2_output, p6q6); + const uint8x16_t p1q1_p5q5 = vcombine_u8(p1q1_output, p5q5_output); + const uint8x16_t p3q3_p7q7 = vcombine_u8(p3q3_output, p7q7); + + const uint16x8x2_t out02 = vtrnq_u16(vreinterpretq_u16_u8(p0q0_p4q4), + vreinterpretq_u16_u8(p2q2_p6q6)); + const uint16x8x2_t out13 = vtrnq_u16(vreinterpretq_u16_u8(p1q1_p5q5), + vreinterpretq_u16_u8(p3q3_p7q7)); + const uint8x16x2_t out01 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[0]), + vreinterpretq_u8_u16(out13.val[0])); + const uint8x16x2_t out23 = vtrnq_u8(vreinterpretq_u8_u16(out02.val[1]), + vreinterpretq_u8_u16(out13.val[1])); + +#if defined(__aarch64__) + const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b); + const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504); + const uint8x16_t index_p7toq7 = vcombine_u8(index_p7top0, index_q7toq0); + + const uint8x16_t output_0 = vqtbl1q_u8(out01.val[0], index_p7toq7); + const uint8x16_t output_1 = vqtbl1q_u8(out01.val[1], index_p7toq7); + const uint8x16_t output_2 = vqtbl1q_u8(out23.val[0], index_p7toq7); + const uint8x16_t output_3 = vqtbl1q_u8(out23.val[1], index_p7toq7); +#else + const uint8x8_t index_p7top0 = vcreate_u8(0x0001020308090a0b); + const uint8x8_t index_q7toq0 = vcreate_u8(0x0f0e0d0c07060504); + + const uint8x8_t x0_p7p0 = VQTbl1U8(out01.val[0], index_p7top0); + const uint8x8_t x1_p7p0 = VQTbl1U8(out01.val[1], index_p7top0); + const uint8x8_t x2_p7p0 = VQTbl1U8(out23.val[0], index_p7top0); + const uint8x8_t x3_p7p0 = VQTbl1U8(out23.val[1], index_p7top0); + + const uint8x8_t x0_q7q0 = VQTbl1U8(out01.val[0], index_q7toq0); + const uint8x8_t x1_q7q0 = VQTbl1U8(out01.val[1], index_q7toq0); + const uint8x8_t x2_q7q0 = VQTbl1U8(out23.val[0], index_q7toq0); + const uint8x8_t x3_q7q0 = VQTbl1U8(out23.val[1], index_q7toq0); + + const uint8x16_t output_0 = vcombine_u8(x0_p7p0, x0_q7q0); + const uint8x16_t output_1 = vcombine_u8(x1_p7p0, x1_q7q0); + const uint8x16_t output_2 = vcombine_u8(x2_p7p0, x2_q7q0); + const uint8x16_t output_3 = vcombine_u8(x3_p7p0, x3_q7q0); +#endif + + vst1q_u8(dst, output_0); + dst += stride; + vst1q_u8(dst, output_1); + dst += stride; + vst1q_u8(dst, output_2); + dst += stride; + vst1q_u8(dst, output_3); +} + +void Init8bpp() { + Dsp* const dsp = 
dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Horizontal4_NEON; + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON; + + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Horizontal6_NEON; + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON; + + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Horizontal8_NEON; + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON; + + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Horizontal14_NEON; + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Vertical14_NEON; +} +} // namespace +} // namespace low_bitdepth + +void LoopFilterInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void LoopFilterInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h new file mode 100644 index 0000000..5f79200 --- /dev/null +++ b/src/dsp/arm/loop_filter_neon.h @@ -0,0 +1,53 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_filters, see the defines below for specifics. This +// function is not thread-safe. +void LoopFilterInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON + +#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical LIBGAV1_CPU_NEON + +#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_LOOP_FILTER_NEON_H_ diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc new file mode 100644 index 0000000..337c9b4 --- /dev/null +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -0,0 +1,1901 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8x2_t src) {
+  return vext_u8(src.val[0], src.val[1], bytes);
+}
+
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8x2_t src) {
+  return vextq_u16(src.val[0], src.val[1], bytes / 2);
+}
+
+// Wiener
+
+// We must make a local copy of the coefficients so the compiler knows they
+// cannot alias other buffers; the 'const' keyword alone is not enough. In
+// practice the compiler does not emit an actual copy, since there are enough
+// registers in this case.
+inline void PopulateWienerCoefficients(
+    const RestorationUnitInfo& restoration_info, const int direction,
+    int16_t filter[4]) {
+  // In order to keep the horizontal pass intermediate values within 16 bits we
+  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+  for (int i = 0; i < 4; ++i) {
+    filter[i] = restoration_info.wiener_info.filter[direction][i];
+  }
+  if (direction == WienerInfo::kHorizontal) {
+    filter[3] -= 128;
+  }
+}
+
+inline int16x8_t WienerHorizontal2(const uint8x8_t s0, const uint8x8_t s1,
+                                   const int16_t filter, const int16x8_t sum) {
+  const int16x8_t ss = vreinterpretq_s16_u16(vaddl_u8(s0, s1));
+  return vmlaq_n_s16(sum, ss, filter);
+}
+
+inline int16x8x2_t WienerHorizontal2(const uint8x16_t s0, const uint8x16_t s1,
+                                     const int16_t filter,
+                                     const int16x8x2_t sum) {
+  int16x8x2_t d;
+  d.val[0] =
+      WienerHorizontal2(vget_low_u8(s0), vget_low_u8(s1), filter, sum.val[0]);
+  d.val[1] =
+      WienerHorizontal2(vget_high_u8(s0), vget_high_u8(s1), filter, sum.val[1]);
+  return d;
+}
+
+inline void WienerHorizontalSum(const uint8x8_t s[3], const int16_t filter[4],
+                                int16x8_t sum, int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const int16x8_t s_0_2 = vreinterpretq_s16_u16(vaddl_u8(s[0], s[2]));
+  const int16x8_t s_1 = ZeroExtend(s[1]);
+  sum = vmlaq_n_s16(sum, s_0_2, filter[2]);
+  sum = vmlaq_n_s16(sum, s_1, filter[3]);
+  // Calculate the scaled-down offset correction and add it to |sum| here to
+  // prevent signed 16-bit overflow.
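+  // s_1 << (7 - kInterRoundBitsHorizontal) is equal to
+  // (128 * s_1) >> kInterRoundBitsHorizontal, so the rounding
+  // shift-right-accumulate below also restores the 128 that
+  // PopulateWienerCoefficients() subtracted from filter[3].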
+ sum = vrsraq_n_s16(vshlq_n_s16(s_1, 7 - kInterRoundBitsHorizontal), sum, + kInterRoundBitsHorizontal); + sum = vmaxq_s16(sum, vdupq_n_s16(-offset)); + sum = vminq_s16(sum, vdupq_n_s16(limit - offset)); + vst1q_s16(wiener_buffer, sum); +} + +inline void WienerHorizontalSum(const uint8x16_t src[3], + const int16_t filter[4], int16x8x2_t sum, + int16_t* const wiener_buffer) { + uint8x8_t s[3]; + s[0] = vget_low_u8(src[0]); + s[1] = vget_low_u8(src[1]); + s[2] = vget_low_u8(src[2]); + WienerHorizontalSum(s, filter, sum.val[0], wiener_buffer); + s[0] = vget_high_u8(src[0]); + s[1] = vget_high_u8(src[1]); + s[2] = vget_high_u8(src[2]); + WienerHorizontalSum(s, filter, sum.val[1], wiener_buffer + 8); +} + +inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + uint8x16_t s[8]; + s[0] = vld1q_u8(src_ptr); + ptrdiff_t x = width; + do { + src_ptr += 16; + s[7] = vld1q_u8(src_ptr); + s[1] = vextq_u8(s[0], s[7], 1); + s[2] = vextq_u8(s[0], s[7], 2); + s[3] = vextq_u8(s[0], s[7], 3); + s[4] = vextq_u8(s[0], s[7], 4); + s[5] = vextq_u8(s[0], s[7], 5); + s[6] = vextq_u8(s[0], s[7], 6); + int16x8x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s16(0); + sum = WienerHorizontal2(s[0], s[6], filter[0], sum); + sum = WienerHorizontal2(s[1], s[5], filter[1], sum); + WienerHorizontalSum(s + 2, filter, sum, *wiener_buffer); + s[0] = s[7]; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + uint8x16_t s[6]; + s[0] = vld1q_u8(src_ptr); + ptrdiff_t x = width; + do { + src_ptr += 16; + s[5] = vld1q_u8(src_ptr); + s[1] = vextq_u8(s[0], s[5], 1); + s[2] = vextq_u8(s[0], s[5], 2); + s[3] = vextq_u8(s[0], s[5], 3); + s[4] = vextq_u8(s[0], s[5], 4); + int16x8x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s16(0); + sum = WienerHorizontal2(s[0], s[4], filter[1], sum); + WienerHorizontalSum(s + 1, filter, sum, *wiener_buffer); + s[0] = s[5]; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int16_t filter[4], + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + uint8x16_t s[4]; + s[0] = vld1q_u8(src_ptr); + ptrdiff_t x = width; + do { + src_ptr += 16; + s[3] = vld1q_u8(src_ptr); + s[1] = vextq_u8(s[0], s[3], 1); + s[2] = vextq_u8(s[0], s[3], 2); + int16x8x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s16(0); + WienerHorizontalSum(s, filter, sum, *wiener_buffer); + s[0] = s[3]; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + const uint8_t* src_ptr = src; + ptrdiff_t x = width; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x8_t s0 = vget_low_u8(s); + const uint8x8_t s1 = vget_high_u8(s); + const int16x8_t d0 = vreinterpretq_s16_u16(vshll_n_u8(s0, 4)); + const int16x8_t d1 = 
vreinterpretq_s16_u16(vshll_n_u8(s1, 4)); + vst1q_s16(*wiener_buffer + 0, d0); + vst1q_s16(*wiener_buffer + 8, d1); + src_ptr += 16; + *wiener_buffer += 16; + x -= 16; + } while (x != 0); + src += src_stride; + } +} + +inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1, + const int16_t filter, + const int32x4x2_t sum) { + const int16x8_t a = vaddq_s16(a0, a1); + int32x4x2_t d; + d.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(a), filter); + d.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(a), filter); + return d; +} + +inline uint8x8_t WienerVertical(const int16x8_t a[3], const int16_t filter[4], + const int32x4x2_t sum) { + int32x4x2_t d = WienerVertical2(a[0], a[2], filter[2], sum); + d.val[0] = vmlal_n_s16(d.val[0], vget_low_s16(a[1]), filter[3]); + d.val[1] = vmlal_n_s16(d.val[1], vget_high_s16(a[1]), filter[3]); + const uint16x4_t sum_lo_16 = vqrshrun_n_s32(d.val[0], 11); + const uint16x4_t sum_hi_16 = vqrshrun_n_s32(d.val[1], 11); + return vqmovn_u16(vcombine_u16(sum_lo_16, sum_hi_16)); +} + +inline uint8x8_t WienerVerticalTap7Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[7]) { + int32x4x2_t sum; + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride); + a[6] = vld1q_s16(wiener_buffer + 6 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[0], a[6], filter[0], sum); + sum = WienerVertical2(a[1], a[5], filter[1], sum); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride); + return WienerVertical(a + 2, filter, sum); +} + +inline uint8x8x2_t WienerVerticalTap7Kernel2(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[8]; + int32x4x2_t sum; + uint8x8x2_t d; + d.val[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = vld1q_s16(wiener_buffer + 7 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[1], a[7], filter[0], sum); + sum = WienerVertical2(a[2], a[6], filter[1], sum); + d.val[1] = WienerVertical(a + 3, filter, sum); + return d; +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint8x8x2_t d[2]; + d[0] = WienerVerticalTap7Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap7Kernel2(wiener_buffer + 8, width, filter); + vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0])); + vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1])); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[7]; + const uint8x8_t d0 = + WienerVerticalTap7Kernel(wiener_buffer + 0, width, filter, a); + const uint8x8_t d1 = + WienerVerticalTap7Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u8(dst, vcombine_u8(d0, d1)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline uint8x8_t WienerVerticalTap5Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + 
int16x8_t a[5]) { + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + a[4] = vld1q_s16(wiener_buffer + 4 * wiener_stride); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[0], a[4], filter[1], sum); + return WienerVertical(a + 1, filter, sum); +} + +inline uint8x8x2_t WienerVerticalTap5Kernel2(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[6]; + int32x4x2_t sum; + uint8x8x2_t d; + d.val[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = vld1q_s16(wiener_buffer + 5 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + sum = WienerVertical2(a[1], a[5], filter[1], sum); + d.val[1] = WienerVertical(a + 2, filter, sum); + return d; +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint8x8x2_t d[2]; + d[0] = WienerVerticalTap5Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap5Kernel2(wiener_buffer + 8, width, filter); + vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0])); + vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1])); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += width; + dst += 2 * dst_stride; + } + + if ((height & 1) != 0) { + ptrdiff_t x = width; + do { + int16x8_t a[5]; + const uint8x8_t d0 = + WienerVerticalTap5Kernel(wiener_buffer + 0, width, filter, a); + const uint8x8_t d1 = + WienerVerticalTap5Kernel(wiener_buffer + 8, width, filter, a); + vst1q_u8(dst, vcombine_u8(d0, d1)); + wiener_buffer += 16; + dst += 16; + x -= 16; + } while (x != 0); + } +} + +inline uint8x8_t WienerVerticalTap3Kernel(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4], + int16x8_t a[3]) { + a[0] = vld1q_s16(wiener_buffer + 0 * wiener_stride); + a[1] = vld1q_s16(wiener_buffer + 1 * wiener_stride); + a[2] = vld1q_s16(wiener_buffer + 2 * wiener_stride); + int32x4x2_t sum; + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + return WienerVertical(a, filter, sum); +} + +inline uint8x8x2_t WienerVerticalTap3Kernel2(const int16_t* const wiener_buffer, + const ptrdiff_t wiener_stride, + const int16_t filter[4]) { + int16x8_t a[4]; + int32x4x2_t sum; + uint8x8x2_t d; + d.val[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = vld1q_s16(wiener_buffer + 3 * wiener_stride); + sum.val[0] = sum.val[1] = vdupq_n_s32(0); + d.val[1] = WienerVertical(a + 1, filter, sum); + return d; +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t filter[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + for (int y = height >> 1; y != 0; --y) { + uint8_t* dst_ptr = dst; + ptrdiff_t x = width; + do { + uint8x8x2_t d[2]; + d[0] = WienerVerticalTap3Kernel2(wiener_buffer + 0, width, filter); + d[1] = WienerVerticalTap3Kernel2(wiener_buffer + 8, width, filter); + vst1q_u8(dst_ptr, vcombine_u8(d[0].val[0], d[1].val[0])); + vst1q_u8(dst_ptr + dst_stride, vcombine_u8(d[0].val[1], d[1].val[1])); + wiener_buffer += 16; + dst_ptr += 16; + x -= 16; + } while (x != 0); + wiener_buffer += 
width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      int16x8_t a[3];
+      const uint8x8_t d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + 0, width, filter, a);
+      const uint8x8_t d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + 8, width, filter, a);
+      vst1q_u8(dst, vcombine_u8(d0, d1));
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint8_t* const dst) {
+  const int16x8_t a0 = vld1q_s16(wiener_buffer + 0);
+  const int16x8_t a1 = vld1q_s16(wiener_buffer + 8);
+  const uint8x8_t d0 = vqrshrun_n_s16(a0, 4);
+  const uint8x8_t d1 = vqrshrun_n_s16(a1, 4);
+  vst1q_u8(dst, vcombine_u8(d0, d1));
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint8_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y != 0; --y) {
+    uint8_t* dst_ptr = dst;
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst_ptr);
+      WienerVerticalTap1Kernel(wiener_buffer + width, dst_ptr + dst_stride);
+      wiener_buffer += 16;
+      dst_ptr += 16;
+      x -= 16;
+    } while (x != 0);
+    wiener_buffer += width;
+    dst += 2 * dst_stride;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = width;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer, dst);
+      wiener_buffer += 16;
+      dst += 16;
+      x -= 16;
+    } while (x != 0);
+  }
+}
+
+// For width 16 and up, store the horizontal results, and then do the vertical
+// filter row by row. This is faster than doing it column by column when
+// considering cache issues.
+void WienerFilter_NEON(const RestorationUnitInfo& restoration_info,
+                       const void* const source, const void* const top_border,
+                       const void* const bottom_border, const ptrdiff_t stride,
+                       const int width, const int height,
+                       RestorationBuffer* const restoration_buffer,
+                       void* const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(
+          number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+  int16_t filter_horizontal[(kWienerFilterTaps + 1) / 2];
+  int16_t filter_vertical[(kWienerFilterTaps + 1) / 2];
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kHorizontal,
+                             filter_horizontal);
+  PopulateWienerCoefficients(restoration_info, WienerInfo::kVertical,
+                             filter_vertical);
+
+  // Horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
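+  // Each horizontal loop consumes 16 pixels per iteration, so rows are
+  // processed out to |wiener_stride| (|width| aligned up to 16). This is the
+  // source of the over-reads noted here and of the over-writes in the
+  // vertical pass below.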
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride,
+                         wiener_stride, height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride,
+                         wiener_stride, height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride,
+                         wiener_stride, height_extra, filter_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * stride, stride,
+                         wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+  }
+
+  // Vertical filtering.
+  // Over-writes up to 15 values.
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
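+    // The first memcpy() below duplicates the last horizontal output row one
+    // row forward; the second duplicates the second row back into the first.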
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +//------------------------------------------------------------------------------ +// SGR + +inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) { + dst[0] = VshrU128<0>(src); + dst[1] = VshrU128<1>(src); + dst[2] = VshrU128<2>(src); +} + +inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3], + uint16x4_t high[3]) { + uint16x8_t s[3]; + s[0] = VshrU128<0>(src); + s[1] = VshrU128<2>(src); + s[2] = VshrU128<4>(src); + low[0] = vget_low_u16(s[0]); + low[1] = vget_low_u16(s[1]); + low[2] = vget_low_u16(s[2]); + high[0] = vget_high_u16(s[0]); + high[1] = vget_high_u16(s[1]); + high[2] = vget_high_u16(s[2]); +} + +inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) { + dst[0] = VshrU128<0>(src); + dst[1] = VshrU128<1>(src); + dst[2] = VshrU128<2>(src); + dst[3] = VshrU128<3>(src); + dst[4] = VshrU128<4>(src); +} + +inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5], + uint16x4_t high[5]) { + Prepare3_16(src, low, high); + const uint16x8_t s3 = VshrU128<6>(src); + const uint16x8_t s4 = VshrU128<8>(src); + low[3] = vget_low_u16(s3); + low[4] = vget_low_u16(s4); + high[3] = vget_high_u16(s3); + high[4] = vget_high_u16(s4); +} + +inline uint16x8_t Sum3_16(const uint16x8_t src0, const uint16x8_t src1, + const uint16x8_t src2) { + const uint16x8_t sum = vaddq_u16(src0, src1); + return vaddq_u16(sum, src2); +} + +inline uint16x8_t Sum3_16(const uint16x8_t src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline uint32x4_t Sum3_32(const uint32x4_t src0, const uint32x4_t src1, + const uint32x4_t src2) { + const uint32x4_t sum = vaddq_u32(src0, src1); + return vaddq_u32(sum, src2); +} + +inline uint32x4x2_t Sum3_32(const uint32x4x2_t src[3]) { + uint32x4x2_t d; + d.val[0] = Sum3_32(src[0].val[0], src[1].val[0], src[2].val[0]); + d.val[1] = Sum3_32(src[0].val[1], src[1].val[1], src[2].val[1]); + return d; +} + +inline uint16x8_t Sum3W_16(const uint8x8_t src[3]) { + const uint16x8_t sum = vaddl_u8(src[0], src[1]); + return vaddw_u8(sum, src[2]); +} + +inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) { + const uint32x4_t sum = vaddl_u16(src[0], src[1]); + return vaddw_u16(sum, src[2]); +} + +inline uint16x8_t Sum5_16(const uint16x8_t src[5]) { + const uint16x8_t sum01 = vaddq_u16(src[0], src[1]); + const uint16x8_t sum23 = vaddq_u16(src[2], src[3]); + const uint16x8_t sum = vaddq_u16(sum01, sum23); + return vaddq_u16(sum, src[4]); +} + +inline uint32x4_t Sum5_32(const uint32x4_t src0, const uint32x4_t src1, + const uint32x4_t src2, const uint32x4_t src3, + const uint32x4_t src4) { + 
const uint32x4_t sum01 = vaddq_u32(src0, src1); + const uint32x4_t sum23 = vaddq_u32(src2, src3); + const uint32x4_t sum = vaddq_u32(sum01, sum23); + return vaddq_u32(sum, src4); +} + +inline uint32x4x2_t Sum5_32(const uint32x4x2_t src[5]) { + uint32x4x2_t d; + d.val[0] = Sum5_32(src[0].val[0], src[1].val[0], src[2].val[0], src[3].val[0], + src[4].val[0]); + d.val[1] = Sum5_32(src[0].val[1], src[1].val[1], src[2].val[1], src[3].val[1], + src[4].val[1]); + return d; +} + +inline uint32x4_t Sum5W_32(const uint16x4_t src[5]) { + const uint32x4_t sum01 = vaddl_u16(src[0], src[1]); + const uint32x4_t sum23 = vaddl_u16(src[2], src[3]); + const uint32x4_t sum0123 = vaddq_u32(sum01, sum23); + return vaddw_u16(sum0123, src[4]); +} + +inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) { + uint8x8_t s[3]; + Prepare3_8(src, s); + return Sum3W_16(s); +} + +inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) { + uint16x4_t low[3], high[3]; + uint32x4x2_t sum; + Prepare3_16(src, low, high); + sum.val[0] = Sum3W_32(low); + sum.val[1] = Sum3W_32(high); + return sum; +} + +inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) { + uint8x8_t s[5]; + Prepare5_8(src, s); + const uint16x8_t sum01 = vaddl_u8(s[0], s[1]); + const uint16x8_t sum23 = vaddl_u8(s[2], s[3]); + const uint16x8_t sum0123 = vaddq_u16(sum01, sum23); + return vaddw_u8(sum0123, s[4]); +} + +inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) { + uint16x4_t low[5], high[5]; + Prepare5_16(src, low, high); + uint32x4x2_t sum; + sum.val[0] = Sum5W_32(low); + sum.val[1] = Sum5W_32(high); + return sum; +} + +void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3, + uint32x4_t* const row_sq5) { + const uint32x4_t sum04 = vaddl_u16(src[0], src[4]); + const uint32x4_t sum12 = vaddl_u16(src[1], src[2]); + *row_sq3 = vaddw_u16(sum12, src[3]); + *row_sq5 = vaddq_u32(sum04, *row_sq3); +} + +void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq, + uint16x8_t* const row3, uint16x8_t* const row5, + uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) { + uint8x8_t s[5]; + Prepare5_8(src, s); + const uint16x8_t sum04 = vaddl_u8(s[0], s[4]); + const uint16x8_t sum12 = vaddl_u8(s[1], s[2]); + *row3 = vaddw_u8(sum12, s[3]); + *row5 = vaddq_u16(sum04, *row3); + uint16x4_t low[5], high[5]; + Prepare5_16(sq, low, high); + SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]); + SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]); +} + +inline uint16x8_t Sum343(const uint8x8x2_t src) { + uint8x8_t s[3]; + Prepare3_8(src, s); + const uint16x8_t sum = Sum3W_16(s); + const uint16x8_t sum3 = Sum3_16(sum, sum, sum); + return vaddw_u8(sum3, s[1]); +} + +inline uint32x4_t Sum343W(const uint16x4_t src[3]) { + const uint32x4_t sum = Sum3W_32(src); + const uint32x4_t sum3 = Sum3_32(sum, sum, sum); + return vaddw_u16(sum3, src[1]); +} + +inline uint32x4x2_t Sum343W(const uint16x8x2_t src) { + uint16x4_t low[3], high[3]; + uint32x4x2_t d; + Prepare3_16(src, low, high); + d.val[0] = Sum343W(low); + d.val[1] = Sum343W(high); + return d; +} + +inline uint16x8_t Sum565(const uint8x8x2_t src) { + uint8x8_t s[3]; + Prepare3_8(src, s); + const uint16x8_t sum = Sum3W_16(s); + const uint16x8_t sum4 = vshlq_n_u16(sum, 2); + const uint16x8_t sum5 = vaddq_u16(sum4, sum); + return vaddw_u8(sum5, s[1]); +} + +inline uint32x4_t Sum565W(const uint16x4_t src[3]) { + const uint32x4_t sum = Sum3W_32(src); + const uint32x4_t sum4 = vshlq_n_u32(sum, 2); + const uint32x4_t sum5 = vaddq_u32(sum4, sum); + return vaddw_u16(sum5, src[1]); +} + 
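+// Sum565() and the Sum565W() overloads compute the [5, 6, 5] weighting used
+// by the SGR box filter: 4 * (a + b + c) + (a + b + c) + b
+// == 5 * a + 6 * b + 5 * c. Sum343() above is the analogous [3, 4, 3] case.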
+inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+  uint16x4_t low[3], high[3];
+  uint32x4x2_t d;
+  Prepare3_16(src, low, high);
+  d.val[0] = Sum565W(low);
+  d.val[1] = Sum565W(high);
+  return d;
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const int height, const ptrdiff_t sum_stride, uint16_t* sum3,
+                   uint16_t* sum5, uint32_t* square_sum3,
+                   uint32_t* square_sum5) {
+  int y = height;
+  do {
+    uint8x8x2_t s;
+    uint16x8x2_t sq;
+    s.val[0] = vld1_u8(src);
+    sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+    ptrdiff_t x = 0;
+    do {
+      uint16x8_t row3, row5;
+      uint32x4x2_t row_sq3, row_sq5;
+      s.val[1] = vld1_u8(src + x + 8);
+      sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+      SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+      vst1q_u16(sum3, row3);
+      vst1q_u16(sum5, row5);
+      vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
+      vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
+      vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
+      vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
+      s.val[0] = s.val[1];
+      sq.val[0] = sq.val[1];
+      sum3 += 8;
+      sum5 += 8;
+      square_sum3 += 8;
+      square_sum5 += 8;
+      x += 8;
+    } while (x < sum_stride);
+    src += src_stride;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const int height, const ptrdiff_t sum_stride, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int y = height;
+  do {
+    uint8x8x2_t s;
+    uint16x8x2_t sq;
+    s.val[0] = vld1_u8(src);
+    sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+    ptrdiff_t x = 0;
+    do {
+      uint16x8_t row;
+      uint32x4x2_t row_sq;
+      s.val[1] = vld1_u8(src + x + 8);
+      sq.val[1] = vmull_u8(s.val[1], s.val[1]);
+      if (size == 3) {
+        row = Sum3Horizontal(s);
+        row_sq = Sum3WHorizontal(sq);
+      } else {
+        row = Sum5Horizontal(s);
+        row_sq = Sum5WHorizontal(sq);
+      }
+      vst1q_u16(sums, row);
+      vst1q_u32(square_sums + 0, row_sq.val[0]);
+      vst1q_u32(square_sums + 4, row_sq.val[1]);
+      s.val[0] = s.val[1];
+      sq.val[0] = sq.val[1];
+      sums += 8;
+      square_sums += 8;
+      x += 8;
+    } while (x < sum_stride);
+    src += src_stride;
+  } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+                              const uint32_t scale) {
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const uint32x4_t dxd = vmull_u16(sum, sum);
+  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+  // Ensure |p| does not underflow by using saturating subtraction.
+  const uint32x4_t p = vqsubq_u32(axn, dxd);
+  const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (narrowing shift) can only shift by 16, and
+  // kSgrProjScaleBits is 20.
+  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+  return vmovn_u32(shifted);
+}
+
+template <int n>
+inline void CalculateIntermediate(const uint16x8_t sum,
+                                  const uint32x4x2_t sum_sq,
+                                  const uint32_t scale, uint8x8_t* const ma,
+                                  uint16x8_t* const b) {
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  const uint16x4_t z0 =
+      CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+  const uint16x4_t z1 =
+      CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+  const uint16x8_t z01 = vcombine_u16(z0, z1);
+  // Using vqmovn_u16() needs an extra sign extension instruction.
+  const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
+  // Using vgetq_lane_s16() can save the sign extension instruction.
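+  // NEON has no vector gather, so the eight kSgrMaLookup values are gathered
+  // on the stack and reloaded as a vector with vld1_u8().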
+ const uint8_t lookup[8] = { + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)], + kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]}; + *ma = vld1_u8(lookup); + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const uint16x8_t maq = vmovl_u8(*ma); + const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum)); + const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum)); + const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n); + const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n); + const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits); + const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits); + *b = vcombine_u16(b_lo, b_hi); +} + +inline void CalculateIntermediate5(const uint16x8_t s5[5], + const uint32x4x2_t sq5[5], + const uint32_t scale, uint8x8_t* const ma, + uint16x8_t* const b) { + const uint16x8_t sum = Sum5_16(s5); + const uint32x4x2_t sum_sq = Sum5_32(sq5); + CalculateIntermediate<25>(sum, sum_sq, scale, ma, b); +} + +inline void CalculateIntermediate3(const uint16x8_t s3[3], + const uint32x4x2_t sq3[3], + const uint32_t scale, uint8x8_t* const ma, + uint16x8_t* const b) { + const uint16x8_t sum = Sum3_16(s3); + const uint32x4x2_t sum_sq = Sum3_32(sq3); + CalculateIntermediate<9>(sum, sum_sq, scale, ma, b); +} + +inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, + const ptrdiff_t x, uint16x8_t* const sum_ma343, + uint16x8_t* const sum_ma444, + uint32x4x2_t* const sum_b343, + uint32x4x2_t* const sum_b444, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + uint8x8_t s[3]; + Prepare3_8(ma3, s); + const uint16x8_t sum_ma111 = Sum3W_16(s); + *sum_ma444 = vshlq_n_u16(sum_ma111, 2); + const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111); + *sum_ma343 = vaddw_u8(sum333, s[1]); + uint16x4_t low[3], high[3]; + uint32x4x2_t sum_b111; + Prepare3_16(b3, low, high); + sum_b111.val[0] = Sum3W_32(low); + sum_b111.val[1] = Sum3W_32(high); + sum_b444->val[0] = vshlq_n_u32(sum_b111.val[0], 2); + sum_b444->val[1] = vshlq_n_u32(sum_b111.val[1], 2); + sum_b343->val[0] = vsubq_u32(sum_b444->val[0], sum_b111.val[0]); + sum_b343->val[1] = vsubq_u32(sum_b444->val[1], sum_b111.val[1]); + sum_b343->val[0] = vaddw_u16(sum_b343->val[0], low[1]); + sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]); + vst1q_u16(ma343 + x, *sum_ma343); + vst1q_u16(ma444 + x, *sum_ma444); + vst1q_u32(b343 + x + 0, sum_b343->val[0]); + vst1q_u32(b343 + x + 4, sum_b343->val[1]); + vst1q_u32(b444 + x + 0, sum_b444->val[0]); + vst1q_u32(b444 + x + 4, sum_b444->val[1]); +} + +inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t 
b3, + const ptrdiff_t x, uint16x8_t* const sum_ma343, + uint32x4x2_t* const sum_b343, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + uint16x8_t sum_ma444; + uint32x4x2_t sum_b444; + Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3, + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + uint16x8_t sum_ma343; + uint32x4x2_t sum_b343; + Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, + const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], uint8x8x2_t s[2], uint16x8x2_t sq[2], + uint8x8_t* const ma, uint16x8_t* const b) { + uint16x8_t s5[5]; + uint32x4x2_t sq5[5]; + s[0].val[1] = vld1_u8(src0 + x + 8); + s[1].val[1] = vld1_u8(src1 + x + 8); + sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); + sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); + s5[3] = Sum5Horizontal(s[0]); + s5[4] = Sum5Horizontal(s[1]); + sq5[3] = Sum5WHorizontal(sq[0]); + sq5[4] = Sum5WHorizontal(sq[1]); + vst1q_u16(sum5[3] + x, s5[3]); + vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); + s5[0] = vld1q_u16(sum5[0] + x); + s5[1] = vld1q_u16(sum5[1] + x); + s5[2] = vld1q_u16(sum5[2] + x); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); + CalculateIntermediate5(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, + const uint16_t* const sum5[5], const uint32_t* const square_sum5[5], + uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, + uint16x8_t* const b) { + uint16x8_t s5[5]; + uint32x4x2_t sq5[5]; + s->val[1] = vld1_u8(src + x + 8); + sq->val[1] = vmull_u8(s->val[1], s->val[1]); + s5[3] = s5[4] = Sum5Horizontal(*s); + sq5[3] = sq5[4] = Sum5WHorizontal(*sq); + s5[0] = vld1q_u16(sum5[0] + x); + s5[1] = vld1q_u16(sum5[1] + x); + s5[2] = vld1q_u16(sum5[2] + x); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); + CalculateIntermediate5(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const uint8_t* const src, const ptrdiff_t x, const uint32_t scale, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma, + uint16x8_t* const b) { + uint16x8_t s3[3]; + uint32x4x2_t sq3[3]; + s->val[1] = vld1_u8(src + x + 8); + sq->val[1] = vmull_u8(s->val[1], s->val[1]); + s3[2] = Sum3Horizontal(*s); + sq3[2] = Sum3WHorizontal(*sq); + vst1q_u16(sum3[2] + x, s3[2]); + vst1q_u32(square_sum3[2] + x + 
0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); + s3[0] = vld1q_u16(sum3[0] + x); + s3[1] = vld1q_u16(sum3[1] + x); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x, + const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint8x8x2_t s[2], uint16x8x2_t sq[2], uint8x8_t* const ma3_0, + uint8x8_t* const ma3_1, uint16x8_t* const b3_0, uint16x8_t* const b3_1, + uint8x8_t* const ma5, uint16x8_t* const b5) { + uint16x8_t s3[4], s5[5]; + uint32x4x2_t sq3[4], sq5[5]; + s[0].val[1] = vld1_u8(src0 + x + 8); + s[1].val[1] = vld1_u8(src1 + x + 8); + sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]); + sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]); + SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]); + SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]); + vst1q_u16(sum3[2] + x, s3[2]); + vst1q_u16(sum3[3] + x, s3[3]); + vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]); + vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]); + vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]); + vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]); + vst1q_u16(sum5[3] + x, s5[3]); + vst1q_u16(sum5[4] + x, s5[4]); + vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]); + vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]); + vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]); + vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]); + s3[0] = vld1q_u16(sum3[0] + x); + s3[1] = vld1q_u16(sum3[1] + x); + sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0); + sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4); + sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0); + sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4); + s5[0] = vld1q_u16(sum5[0] + x); + s5[1] = vld1q_u16(sum5[1] + x); + s5[2] = vld1q_u16(sum5[2] + x); + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); + sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4); + sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0); + sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4); + CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0); + CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1); + CalculateIntermediate5(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2], + const uint16_t* const sum3[4], const uint16_t* const sum5[5], + const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5], + uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3, + uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) { + uint16x8_t s3[3], s5[5]; + uint32x4x2_t sq3[3], sq5[5]; + s->val[1] = vld1_u8(src + x + 8); + sq->val[1] = vmull_u8(s->val[1], s->val[1]); + SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]); + s5[0] = vld1q_u16(sum5[0] + x); + s5[1] = vld1q_u16(sum5[1] + x); + s5[2] = vld1q_u16(sum5[2] + x); + s5[4] = s5[3]; + sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0); + sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4); + sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0); + 
sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+  s3[0] = vld1q_u16(sum3[0] + x);
+  s3[1] = vld1q_u16(sum3[1] + x);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    uint16_t* ma565, uint32_t* b565) {
+  uint8x8x2_t s[2], mas;
+  uint16x8x2_t sq[2], bs;
+  s[0].val[0] = vld1_u8(src0);
+  s[1].val[0] = vld1_u8(src1);
+  sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+  sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+  BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq,
+                       &mas.val[0], &bs.val[0]);
+
+  int x = 0;
+  do {
+    s[0].val[0] = s[0].val[1];
+    s[1].val[0] = s[1].val[1];
+    sq[0].val[0] = sq[0].val[1];
+    sq[1].val[0] = sq[1].val[1];
+    BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq,
+                         &mas.val[1], &bs.val[1]);
+    const uint16x8_t ma = Sum565(mas);
+    const uint32x4x2_t b = Sum565W(bs);
+    vst1q_u16(ma565, ma);
+    vst1q_u32(b565 + 0, b.val[0]);
+    vst1q_u32(b565 + 4, b.val[1]);
+    mas.val[0] = mas.val[1];
+    bs.val[0] = bs.val[1];
+    ma565 += 8;
+    b565 += 8;
+    x += 8;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+    uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+  uint8x8x2_t s, mas;
+  uint16x8x2_t sq, bs;
+  s.val[0] = vld1_u8(src);
+  sq.val[0] = vmull_u8(s.val[0], s.val[0]);
+  BoxFilterPreProcess3(src, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0],
+                       &bs.val[0]);
+
+  int x = 0;
+  do {
+    s.val[0] = s.val[1];
+    sq.val[0] = sq.val[1];
+    BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, &s, &sq,
+                         &mas.val[1], &bs.val[1]);
+    if (calculate444) {
+      Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
+      ma444 += 8;
+      b444 += 8;
+    } else {
+      const uint16x8_t ma = Sum343(mas);
+      const uint32x4x2_t b = Sum343W(bs);
+      vst1q_u16(ma343, ma);
+      vst1q_u32(b343 + 0, b.val[0]);
+      vst1q_u32(b343 + 4, b.val[1]);
+    }
+    mas.val[0] = mas.val[1];
+    bs.val[0] = bs.val[1];
+    ma343 += 8;
+    b343 += 8;
+    x += 8;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
+    uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
+  uint8x8x2_t s[2];
+  uint8x8x2_t ma3[2], ma5;
+  uint16x8x2_t sq[2], b3[2], b5;
+  s[0].val[0] = vld1_u8(src0);
+  s[1].val[0] = vld1_u8(src1);
+  sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
+  sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
+  BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3,
+                      square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0],
+                      &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
+
+  int x = 0;
+  do {
+    s[0].val[0] = s[0].val[1];
+    s[1].val[0] = s[1].val[1];
+    sq[0].val[0] = sq[0].val[1];
+    sq[1].val[0] = sq[1].val[1];
+    BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3,
+                        square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1],
+                        &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]);
+    uint16x8_t ma = Sum343(ma3[0]);
+    uint32x4x2_t b = Sum343W(b3[0]);
+    vst1q_u16(ma343[0] + x, ma);
+    vst1q_u32(b343[0] + x, b.val[0]);
+    vst1q_u32(b343[0] + x + 4, b.val[1]);
+    Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+    ma = Sum565(ma5);
+    b = Sum565W(b5);
+    vst1q_u16(ma565, ma);
+    vst1q_u32(b565 + 0, b.val[0]);
+    vst1q_u32(b565 + 4, b.val[1]);
+    ma3[0].val[0] = ma3[0].val[1];
+    ma3[1].val[0] = ma3[1].val[1];
+    b3[0].val[0] = b3[0].val[1];
+    b3[1].val[0] = b3[1].val[1];
+    ma5.val[0] = ma5.val[1];
+    b5.val[0] = b5.val[1];
+    ma565 += 8;
+    b565 += 8;
+    x += 8;
+  } while (x < width);
+}
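
The filter-output helpers that follow fold the box-sum pair (ma, b) into the 13-bit filtered value. As a plain-C++ reference for one lane (a sketch, not part of the patch; the function name is hypothetical, and the constants kSgrProjSgrBits = 8 and kSgrProjRestoreBits = 4 are taken from the comments below):

    #include <cstdint>

    // Scalar model of one lane of FilterOutput<shift>: v = b - ma * src,
    // then a rounding right shift, mirroring vmlsl_u16 + vrshrn_n_s32.
    // Assumes arithmetic right shift of negative values.
    inline int16_t FilterOutputScalar(uint16_t src, uint16_t ma, uint32_t b,
                                      int shift) {
      const int32_t v = static_cast<int32_t>(b) - static_cast<int32_t>(ma) * src;
      const int n = 8 + shift - 4;  // kSgrProjSgrBits + shift - kSgrProjRestoreBits
      return static_cast<int16_t>((v + (1 << (n - 1))) >> n);
    }
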
+
+template <int shift>
+inline int16x4_t FilterOutput(const uint16x4_t src, const uint16x4_t ma,
+                              const uint32x4_t b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const int32x4_t v = vreinterpretq_s32_u32(vmlsl_u16(b, ma, src));
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return vrshrn_n_s32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline int16x8_t CalculateFilteredOutput(const uint8x8_t src,
+                                         const uint16x8_t ma,
+                                         const uint32x4x2_t b) {
+  const uint16x8_t src_u16 = vmovl_u8(src);
+  const int16x4_t dst_lo =
+      FilterOutput<shift>(vget_low_u16(src_u16), vget_low_u16(ma), b.val[0]);
+  const int16x4_t dst_hi =
+      FilterOutput<shift>(vget_high_u16(src_u16), vget_high_u16(ma), b.val[1]);
+  return vcombine_s16(dst_lo, dst_hi);  // 13 bits
+}
+
+inline int16x8_t CalculateFilteredOutputPass1(const uint8x8_t s,
+                                              uint16x8_t ma[2],
+                                              uint32x4x2_t b[2]) {
+  const uint16x8_t ma_sum = vaddq_u16(ma[0], ma[1]);
+  uint32x4x2_t b_sum;
+  b_sum.val[0] = vaddq_u32(b[0].val[0], b[1].val[0]);
+  b_sum.val[1] = vaddq_u32(b[0].val[1], b[1].val[1]);
+  return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline int16x8_t CalculateFilteredOutputPass2(const uint8x8_t s,
+                                              uint16x8_t ma[3],
+                                              uint32x4x2_t b[3]) {
+  const uint16x8_t ma_sum = Sum3_16(ma);
+  const uint32x4x2_t b_sum = Sum3_32(b);
+  return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
+}
+
+inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
+                            uint8_t* const dst) {
+  const int16x4_t v_lo =
+      vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x4_t v_hi =
+      vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const int16x8_t vv = vcombine_s16(v_lo, v_hi);
+  const int16x8_t s = ZeroExtend(src);
+  const int16x8_t d = vaddq_s16(s, vv);
+  vst1_u8(dst, vqmovun_s16(d));
+}
+
+inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
+                                       const int16x8_t filter[2], const int w0,
+                                       const int w2, uint8_t* const dst) {
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
+  v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
+  v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
+  SelfGuidedFinal(src, v, dst);
+}
+
+inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
+                                       const int16x8_t filter, const int w0,
+                                       uint8_t* const dst) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  int32x4_t v[2];
+  v[0] = vmull_n_s16(vget_low_s16(filter), w0);
+  v[1] = vmull_n_s16(vget_high_s16(filter), w0);
+  SelfGuidedFinal(src,
v, dst); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass1( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], const int width, const uint32_t scale, + const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2], + uint8_t* const dst) { + uint8x8x2_t s[2], mas; + uint16x8x2_t sq[2], bs; + s[0].val[0] = vld1_u8(src0); + s[1].val[0] = vld1_u8(src1); + sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); + sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); + BoxFilterPreProcess5(src0, src1, 0, scale, sum5, square_sum5, s, sq, + &mas.val[0], &bs.val[0]); + + int x = 0; + do { + s[0].val[0] = s[0].val[1]; + s[1].val[0] = s[1].val[1]; + sq[0].val[0] = sq[0].val[1]; + sq[1].val[0] = sq[1].val[1]; + BoxFilterPreProcess5(src0, src1, x + 8, scale, sum5, square_sum5, s, sq, + &mas.val[1], &bs.val[1]); + uint16x8_t ma[2]; + uint32x4x2_t b[2]; + ma[1] = Sum565(mas); + b[1] = Sum565W(bs); + vst1q_u16(ma565[1] + x, ma[1]); + vst1q_u32(b565[1] + x + 0, b[1].val[0]); + vst1q_u32(b565[1] + x + 4, b[1].val[1]); + const uint8x8_t sr0 = vld1_u8(src + x); + const uint8x8_t sr1 = vld1_u8(src + stride + x); + int16x8_t p0, p1; + ma[0] = vld1q_u16(ma565[0] + x); + b[0].val[0] = vld1q_u32(b565[0] + x + 0); + b[0].val[1] = vld1q_u32(b565[0] + x + 4); + p0 = CalculateFilteredOutputPass1(sr0, ma, b); + p1 = CalculateFilteredOutput<4>(sr1, ma[1], b[1]); + SelfGuidedSingleMultiplier(sr0, p0, w0, dst + x); + SelfGuidedSingleMultiplier(sr1, p1, w0, dst + stride + x); + mas.val[0] = mas.val[1]; + bs.val[0] = bs.val[1]; + x += 8; + } while (x < width); +} + +inline void BoxFilterPass1LastRow(const uint8_t* const src, + const uint8_t* const src0, const int width, + const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], + uint32_t* const square_sum5[5], + uint16_t* ma565, uint32_t* b565, + uint8_t* const dst) { + uint8x8x2_t s, mas; + uint16x8x2_t sq, bs; + s.val[0] = vld1_u8(src0); + sq.val[0] = vmull_u8(s.val[0], s.val[0]); + BoxFilterPreProcess5LastRow(src0, 0, scale, sum5, square_sum5, &s, &sq, + &mas.val[0], &bs.val[0]); + + int x = 0; + do { + s.val[0] = s.val[1]; + sq.val[0] = sq.val[1]; + BoxFilterPreProcess5LastRow(src0, x + 8, scale, sum5, square_sum5, &s, &sq, + &mas.val[1], &bs.val[1]); + uint16x8_t ma[2]; + uint32x4x2_t b[2]; + ma[1] = Sum565(mas); + b[1] = Sum565W(bs); + mas.val[0] = mas.val[1]; + bs.val[0] = bs.val[1]; + ma[0] = vld1q_u16(ma565); + b[0].val[0] = vld1q_u32(b565 + 0); + b[0].val[1] = vld1q_u32(b565 + 4); + const uint8x8_t sr = vld1_u8(src + x); + const int16x8_t p = CalculateFilteredOutputPass1(sr, ma, b); + SelfGuidedSingleMultiplier(sr, p, w0, dst + x); + ma565 += 8; + b565 += 8; + x += 8; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint8_t* const src, const uint8_t* const src0, const int width, + const uint32_t scale, const int16_t w0, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], uint16_t* const ma343[3], + uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2], + uint8_t* const dst) { + uint8x8x2_t s, mas; + uint16x8x2_t sq, bs; + s.val[0] = vld1_u8(src0); + sq.val[0] = vmull_u8(s.val[0], s.val[0]); + BoxFilterPreProcess3(src0, 0, scale, sum3, square_sum3, &s, &sq, &mas.val[0], + &bs.val[0]); + + int x = 0; + do { + s.val[0] = s.val[1]; + sq.val[0] = sq.val[1]; + BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, &s, &sq, + &mas.val[1], &bs.val[1]); + uint16x8_t ma[3]; + uint32x4x2_t 
b[3]; + Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const uint8x8_t sr = vld1_u8(src + x); + ma[0] = vld1q_u16(ma343[0] + x); + ma[1] = vld1q_u16(ma444[0] + x); + b[0].val[0] = vld1q_u32(b343[0] + x + 0); + b[0].val[1] = vld1q_u32(b343[0] + x + 4); + b[1].val[0] = vld1q_u32(b444[0] + x + 0); + b[1].val[1] = vld1q_u32(b444[0] + x + 4); + const int16x8_t p = CalculateFilteredOutputPass2(sr, ma, b); + SelfGuidedSingleMultiplier(sr, p, w0, dst + x); + mas.val[0] = mas.val[1]; + bs.val[0] = bs.val[1]; + x += 8; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343[4], uint16_t* const ma444[3], + uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], + uint32_t* const b565[2], uint8_t* const dst) { + uint8x8x2_t s[2], ma3[2], ma5; + uint16x8x2_t sq[2], b3[2], b5; + s[0].val[0] = vld1_u8(src0); + s[1].val[0] = vld1_u8(src1); + sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]); + sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]); + BoxFilterPreProcess(src0, src1, 0, scales, sum3, sum5, square_sum3, + square_sum5, s, sq, &ma3[0].val[0], &ma3[1].val[0], + &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]); + + int x = 0; + do { + s[0].val[0] = s[0].val[1]; + s[1].val[0] = s[1].val[1]; + sq[0].val[0] = sq[0].val[1]; + sq[1].val[0] = sq[1].val[1]; + BoxFilterPreProcess(src0, src1, x + 8, scales, sum3, sum5, square_sum3, + square_sum5, s, sq, &ma3[0].val[1], &ma3[1].val[1], + &b3[0].val[1], &b3[1].val[1], &ma5.val[1], &b5.val[1]); + uint16x8_t ma[3][3]; + uint32x4x2_t b[3][3]; + Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565(ma5); + b[0][1] = Sum565W(b5); + vst1q_u16(ma565[1] + x, ma[0][1]); + vst1q_u32(b565[1] + x, b[0][1].val[0]); + vst1q_u32(b565[1] + x + 4, b[0][1].val[1]); + ma3[0].val[0] = ma3[0].val[1]; + ma3[1].val[0] = ma3[1].val[1]; + b3[0].val[0] = b3[0].val[1]; + b3[1].val[0] = b3[1].val[1]; + ma5.val[0] = ma5.val[1]; + b5.val[0] = b5.val[1]; + int16x8_t p[2][2]; + const uint8x8_t sr0 = vld1_u8(src + x); + const uint8x8_t sr1 = vld1_u8(src + stride + x); + ma[0][0] = vld1q_u16(ma565[0] + x); + b[0][0].val[0] = vld1q_u32(b565[0] + x); + b[0][0].val[1] = vld1q_u32(b565[0] + x + 4); + p[0][0] = CalculateFilteredOutputPass1(sr0, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1, ma[0][1], b[0][1]); + ma[1][0] = vld1q_u16(ma343[0] + x); + ma[1][1] = vld1q_u16(ma444[0] + x); + b[1][0].val[0] = vld1q_u32(b343[0] + x); + b[1][0].val[1] = vld1q_u32(b343[0] + x + 4); + b[1][1].val[0] = vld1q_u32(b444[0] + x); + b[1][1].val[1] = vld1q_u32(b444[0] + x + 4); + p[0][1] = CalculateFilteredOutputPass2(sr0, ma[1], b[1]); + ma[2][0] = vld1q_u16(ma343[1] + x); + b[2][0].val[0] = vld1q_u32(b343[1] + x); + b[2][0].val[1] = vld1q_u32(b343[1] + x + 4); + p[1][1] = CalculateFilteredOutputPass2(sr1, ma[2], b[2]); + SelfGuidedDoubleMultiplier(sr0, p[0], w0, w2, dst + x); + SelfGuidedDoubleMultiplier(sr1, p[1], w0, w2, dst + stride + x); + x += 8; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint8_t* const 
src, const uint8_t* const src0, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343[4], uint16_t* const ma444[3], + uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], + uint32_t* const b565[2], uint8_t* const dst) { + uint8x8x2_t s, ma3, ma5; + uint16x8x2_t sq, b3, b5; + uint16x8_t ma[3]; + uint32x4x2_t b[3]; + s.val[0] = vld1_u8(src0); + sq.val[0] = vmull_u8(s.val[0], s.val[0]); + BoxFilterPreProcessLastRow(src0, 0, scales, sum3, sum5, square_sum3, + square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0], + &b3.val[0], &b5.val[0]); + + int x = 0; + do { + s.val[0] = s.val[1]; + sq.val[0] = sq.val[1]; + BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3, + square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1], + &b3.val[1], &b5.val[1]); + ma[1] = Sum565(ma5); + b[1] = Sum565W(b5); + ma5.val[0] = ma5.val[1]; + b5.val[0] = b5.val[1]; + ma[2] = Sum343(ma3); + b[2] = Sum343W(b3); + ma3.val[0] = ma3.val[1]; + b3.val[0] = b3.val[1]; + const uint8x8_t sr = vld1_u8(src + x); + int16x8_t p[2]; + ma[0] = vld1q_u16(ma565[0] + x); + b[0].val[0] = vld1q_u32(b565[0] + x + 0); + b[0].val[1] = vld1q_u32(b565[0] + x + 4); + p[0] = CalculateFilteredOutputPass1(sr, ma, b); + ma[0] = vld1q_u16(ma343[0] + x); + ma[1] = vld1q_u16(ma444[0] + x); + b[0].val[0] = vld1q_u32(b343[0] + x + 0); + b[0].val[1] = vld1q_u32(b343[0] + x + 4); + b[1].val[0] = vld1q_u32(b444[0] + x + 0); + b[1].val[1] = vld1q_u32(b444[0] + x + 4); + p[1] = CalculateFilteredOutputPass2(sr, ma, b); + SelfGuidedDoubleMultiplier(sr, p, w0, w2, dst + x); + x += 8; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterProcess( + const RestorationUnitInfo& restoration_info, const uint8_t* src, + const uint8_t* const top_border, const uint8_t* bottom_border, + const ptrdiff_t stride, const int width, const int height, + SgrBuffer* const sgr_buffer, uint8_t* dst) { + const auto temp_stride = Align(width, 8); + const ptrdiff_t sum_stride = temp_stride + 8; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12. 
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1; + uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2]; + uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2]; + sum3[0] = sgr_buffer->sum3; + square_sum3[0] = sgr_buffer->square_sum3; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 3; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma444[0] = sgr_buffer->ma444; + b444[0] = sgr_buffer->b444; + for (int i = 1; i <= 2; ++i) { + ma444[i] = ma444[i - 1] + temp_stride; + b444[i] = b444[i - 1] + temp_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scales[0] != 0); + assert(scales[1] != 0); + BoxSum(top_border, stride, 2, sum_stride, sum3[0], sum5[1], square_sum3[0], + square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint8_t* const s = (height > 1) ? src + stride : bottom_border; + BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3, + square_sum5, ma343, ma444, ma565[0], b343, b444, + b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width, + scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, + ma444, ma565, b343, b444, b565, dst); + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint8_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5, + square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565, + dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + Circulate4PointersBy2(sum3); + Circulate4PointersBy2(square_sum3); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + Circulate4PointersBy2(ma343); + Circulate4PointersBy2(b343); + std::swap(ma444[0], ma444[2]); + std::swap(b444[0], b444[2]); + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + BoxFilterLastRow(src + 3, bottom_border + stride, width, scales, w0, w2, + sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565, + b343, b444, b565, dst); + } +} + +inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info, + const uint8_t* src, + const uint8_t* const top_border, + const uint8_t* 
bottom_border, + const ptrdiff_t stride, const int width, + const int height, SgrBuffer* const sgr_buffer, + uint8_t* dst) { + const auto temp_stride = Align(width, 8); + const ptrdiff_t sum_stride = temp_stride + 8; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12. + const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0]; + uint16_t *sum5[5], *ma565[2]; + uint32_t *square_sum5[5], *b565[2]; + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + for (int i = 1; i <= 4; ++i) { + sum5[i] = sum5[i - 1] + sum_stride; + square_sum5[i] = square_sum5[i - 1] + sum_stride; + } + ma565[0] = sgr_buffer->ma565; + ma565[1] = ma565[0] + temp_stride; + b565[0] = sgr_buffer->b565; + b565[1] = b565[0] + temp_stride; + assert(scale != 0); + BoxSum<5>(top_border, stride, 2, sum_stride, sum5[1], square_sum5[1]); + sum5[0] = sum5[1]; + square_sum5[0] = square_sum5[1]; + const uint8_t* const s = (height > 1) ? src + stride : bottom_border; + BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0], + b565[0]); + sum5[0] = sgr_buffer->sum5; + square_sum5[0] = sgr_buffer->square_sum5; + + for (int y = (height >> 1) - 1; y > 0; --y) { + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5, + square_sum5, width, scale, w0, ma565, b565, dst); + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + } + + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + if ((height & 1) == 0 || height > 1) { + const uint8_t* sr[2]; + if ((height & 1) == 0) { + sr[0] = bottom_border; + sr[1] = bottom_border + stride; + } else { + sr[0] = src + 2 * stride; + sr[1] = bottom_border; + } + BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width, + scale, w0, ma565, b565, dst); + } + if ((height & 1) != 0) { + if (height > 1) { + src += 2 * stride; + dst += 2 * stride; + std::swap(ma565[0], ma565[1]); + std::swap(b565[0], b565[1]); + Circulate5PointersBy2(sum5); + Circulate5PointersBy2(square_sum5); + } + BoxFilterPass1LastRow(src + 3, bottom_border + stride, width, scale, w0, + sum5, square_sum5, ma565[0], b565[0], dst); + } +} + +inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info, + const uint8_t* src, + const uint8_t* const top_border, + const uint8_t* bottom_border, + const ptrdiff_t stride, const int width, + const int height, SgrBuffer* const sgr_buffer, + uint8_t* dst) { + assert(restoration_info.sgr_proj_info.multiplier[0] == 0); + const auto temp_stride = Align(width, 8); + const ptrdiff_t sum_stride = temp_stride + 8; + const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1]; + const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1; + const int sgr_proj_index = restoration_info.sgr_proj_info.index; + const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12. 
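
Pass 2 runs only the 3x3 filter, so its weight w0 is the full-precision complement of w1, as computed just above. In scalar terms the final combine that SelfGuidedSingleMultiplier performs per pixel is (a sketch; kSgrProjRestoreBits = 4 and kSgrProjPrecisionBits = 7 in libgav1, giving a total shift of 11):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the single-weight self-guided combine: |filter| is the
    // 13-bit filtered residual produced by CalculateFilteredOutput.
    inline uint8_t SelfGuidedCombineScalar(uint8_t src, int16_t filter, int w0) {
      const int32_t v = w0 * filter;
      const int32_t d = src + ((v + (1 << 10)) >> 11);  // rounding shift by 4 + 7
      return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(d, 0), 255));
    }
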
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, stride, 2, sum_stride, sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 ma343[0], nullptr, b343[0], nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+                                ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+                   ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
+// part of the visible frame.
+void SelfGuidedFilter_NEON(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const void* const top_border, const void* const bottom_border,
+    const ptrdiff_t stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, + stride, width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, + stride, width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, + width, height, sgr_buffer, dst); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->loop_restorations[0] = WienerFilter_NEON; + dsp->loop_restorations[1] = SelfGuidedFilter_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void LoopRestorationInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/loop_restoration_neon.h b/src/dsp/arm/loop_restoration_neon.h new file mode 100644 index 0000000..b551610 --- /dev/null +++ b/src/dsp/arm/loop_restoration_neon.h @@ -0,0 +1,40 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_restorations, see the defines below for specifics. +// This function is not thread-safe. +void LoopRestorationInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON + +#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_NEON + +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_LOOP_RESTORATION_NEON_H_ diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc new file mode 100644 index 0000000..084f42f --- /dev/null +++ b/src/dsp/arm/mask_blend_neon.cc @@ -0,0 +1,444 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/mask_blend.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +// TODO(b/150461164): Consider combining with GetInterIntraMask4x2(). +// Compound predictors use int16_t values and need to multiply long because the +// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by +// int8_t and accumulate into int32_t instruction. +template +inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) { + if (subsampling_x == 1) { + const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask))); + const int16x4_t mask_val1 = vreinterpret_s16_u16( + vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y)))); + int16x8_t final_val; + if (subsampling_y == 1) { + const int16x4_t next_mask_val0 = + vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride))); + const int16x4_t next_mask_val1 = + vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3))); + final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1), + vcombine_s16(next_mask_val0, next_mask_val1)); + } else { + final_val = vreinterpretq_s16_u16( + vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1)))); + } + return vrshrq_n_s16(final_val, subsampling_y + 1); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const uint8x8_t mask_val0 = Load4(mask); + const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0); + return vreinterpretq_s16_u16(vmovl_u8(mask_val)); +} + +template +inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) { + if (subsampling_x == 1) { + int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask))); + if (subsampling_y == 1) { + const int16x8_t next_mask_val = + vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride))); + mask_val = vaddq_s16(mask_val, next_mask_val); + } + return vrshrq_n_s16(mask_val, 1 + subsampling_y); + } + assert(subsampling_y == 0 && subsampling_x == 0); + const uint8x8_t mask_val = vld1_u8(mask); + return vreinterpretq_s16_u16(vmovl_u8(mask_val)); +} + +inline void WriteMaskBlendLine4x2(const int16_t* const pred_0, + const int16_t* const pred_1, + const int16x8_t pred_mask_0, + const int16x8_t pred_mask_1, uint8_t* dst, + const ptrdiff_t dst_stride) { + const int16x8_t pred_val_0 = vld1q_s16(pred_0); + const int16x8_t pred_val_1 = vld1q_s16(pred_1); + // int res = (mask_value * prediction_0[x] + + // (64 - mask_value) * prediction_1[x]) >> 6; + const int32x4_t weighted_pred_0_lo = + vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0)); + const int32x4_t weighted_pred_0_hi = + vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0)); + const int32x4_t weighted_combo_lo = vmlal_s16( + weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1)); + const int32x4_t weighted_combo_hi = + vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1), + vget_high_s16(pred_val_1)); + // dst[x] = static_cast( + // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0, + // (1 << kBitdepth8) - 1)); + const uint8x8_t result = + vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6), + vshrn_n_s32(weighted_combo_hi, 6)), + 4); + StoreLo4(dst, result); + StoreHi4(dst + dst_stride, result); +} + +template +inline void MaskBlending4x4_NEON(const int16_t* pred_0, const int16_t* 
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_NEON(const int16_t* pred_0, const int16_t* pred_1,
+                                 const uint8_t* mask,
+                                 const ptrdiff_t mask_stride, uint8_t* dst,
+                                 const ptrdiff_t dst_stride) {
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int16x8_t pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+  // TODO(b/150461164): Arm tends to do better with load(val); val += stride
+  // It may be possible to turn this into a loop with a templated height.
+  pred_0 += 4 << 1;
+  pred_1 += 4 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_NEON(const int16_t* pred_0, const int16_t* pred_1,
+                                 const uint8_t* const mask_ptr,
+                                 const ptrdiff_t mask_stride, const int height,
+                                 uint8_t* dst, const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = 0;
+  do {
+    int16x8_t pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+    y += 8;
+  } while (y < height);
+}
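
When the plane is subsampled, the GetMask loaders above fold two or four mask bytes into one weight with a rounded average. For the 4:2:0 case the scalar equivalent per output weight is (a sketch):

    #include <cstddef>
    #include <cstdint>

    // Rounded mean of a 2x2 block of mask values, mirroring the
    // vpaddl/vadd/vrshr sequence in GetMask8<1, 1> above.
    inline int16_t SubsampledMask420(const uint8_t* mask, ptrdiff_t stride) {
      const int sum = mask[0] + mask[1] + mask[stride] + mask[stride + 1];
      return static_cast<int16_t>((sum + 2) >> 2);
    }
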
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_NEON(const void* prediction_0, const void* prediction_1,
+                           const ptrdiff_t /*prediction_stride_1*/,
+                           const uint8_t* const mask_ptr,
+                           const ptrdiff_t mask_stride, const int width,
+                           const int height, void* dest,
+                           const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  if (width == 4) {
+    MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const int16x8_t mask_inverter = vdupq_n_s16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+      const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
+      const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+      uint8x8_t result;
+      // int res = (mask_value * prediction_0[x] +
+      //            (64 - mask_value) * prediction_1[x]) >> 6;
+      const int32x4_t weighted_pred_0_lo =
+          vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+      const int32x4_t weighted_pred_0_hi =
+          vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+      const int32x4_t weighted_combo_lo =
+          vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
+                    vget_low_s16(pred_val_1));
+      const int32x4_t weighted_combo_hi =
+          vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+                    vget_high_s16(pred_val_1));
+
+      // dst[x] = static_cast<Pixel>(
+      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+      //           (1 << kBitdepth8) - 1));
+      result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+                                           vshrn_n_s32(weighted_combo_hi, 6)),
+                              4);
+      vst1_u8(dst + x, result);
+
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
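
The inter-intra variants that follow differ from the compound path above: both inputs are already 8-bit pixels, so the blend needs only unsigned 8x8-to-16 multiplies and a single rounding narrow by 6, writing the result back into |prediction_1|. A scalar sketch of one pixel:

    #include <cstdint>

    // Scalar model of the 8bpp inter-intra blend: |mask| weights pred_1 and
    // its complement weights pred_0, as in the NEON functions that follow.
    inline uint8_t InterIntraBlendScalar(uint8_t pred_0, uint8_t pred_1,
                                         int mask) {
      return static_cast<uint8_t>(
          ((64 - mask) * pred_0 + mask * pred_1 + 32) >> 6);
    }
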
+
+// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
+// values) but regresses compound versions (input is int16_t). Try to
+// consolidate these.
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
+                                      ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const uint8x8_t mask_val =
+        vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
+    if (subsampling_y == 1) {
+      const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
+                                               vld1_u8(mask + mask_stride * 3));
+
+      // Use a saturating add to work around the case where all |mask| values
+      // are 64. Together with the rounding shift this ensures the correct
+      // result.
+      const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
+      return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+    }
+
+    return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+  }
+
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const uint8x8_t mask_val0 = Load4(mask);
+  // TODO(b/150461164): Investigate the source of |mask| and see if the stride
+  // can be removed.
+  // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
+  return Load4<1>(mask + mask_stride, mask_val0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
+                                    ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const uint8x16_t mask_val = vld1q_u8(mask);
+    const uint8x8_t mask_paired =
+        vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
+    if (subsampling_y == 1) {
+      const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
+      const uint8x8_t next_mask_paired =
+          vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
+
+      // Use a saturating add to work around the case where all |mask| values
+      // are 64. Together with the rounding shift this ensures the correct
+      // result.
+      const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired);
+      return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
+    }
+
+    return vrshr_n_u8(mask_paired, /*subsampling_x=*/1);
+  }
+
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  return vld1_u8(mask);
+}
+
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+                                                uint8_t* const pred_1,
+                                                const ptrdiff_t pred_stride_1,
+                                                const uint8x8_t pred_mask_0,
+                                                const uint8x8_t pred_mask_1) {
+  const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+  uint8x8_t pred_val_1 = Load4(pred_1);
+  pred_val_1 = Load4<1>(pred_1 + pred_stride_1, pred_val_1);
+
+  const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+  const uint16x8_t weighted_combo =
+      vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+  const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+  StoreLo4(pred_1, result);
+  StoreHi4(pred_1 + pred_stride_1, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_NEON(const uint8_t* pred_0,
+                                               uint8_t* pred_1,
+                                               const ptrdiff_t pred_stride_1,
+                                               const uint8_t* mask,
+                                               const ptrdiff_t mask_stride) {
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  uint8x8_t pred_mask_1 =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+
+  pred_mask_1 =
+      GetInterIntraMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_NEON(
+    const uint8_t* pred_0, uint8_t* pred_1, const ptrdiff_t pred_stride_1,
+    const uint8_t* mask, const ptrdiff_t mask_stride, const int height) {
+  if (height == 4) {
+    InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    return;
+  }
+  int y = 0;
+  do {
+    InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+
+    InterIntraMaskBlending8bpp4x4_NEON<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+    y += 8;
+  } while (y < height);
+}
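
One subtlety in the mask loaders above deserves a worked case: with 4:2:0 subsampling the two pairwise row sums can each reach 128, and 128 + 128 wraps to 0 in a uint8 lane. The saturating vqadd_u8 pins the sum at 255, and the rounding shift by 2 still produces the correct 64. A sketch of the arithmetic only:

    #include <algorithm>
    #include <cstdint>

    // Why GetInterIntraMask8 uses vqadd_u8 when every mask value is 64:
    //   wrapping add:    (128 + 128) & 0xff = 0  -> (0 + 2) >> 2   = 0   (wrong)
    //   saturating add:  min(128 + 128, 255)     -> (255 + 2) >> 2 = 64  (right)
    inline uint8_t AverageMaskPairs(uint8_t row0_pair_sum, uint8_t row1_pair_sum) {
      const int sum = std::min(row0_pair_sum + row1_pair_sum, 255);  // vqadd_u8
      return static_cast<uint8_t>((sum + 2) >> 2);                   // vrshr_n_u8
    }
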
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend8bpp_NEON(const uint8_t* prediction_0,
+                                         uint8_t* prediction_1,
+                                         const ptrdiff_t prediction_stride_1,
+                                         const uint8_t* const mask_ptr,
+                                         const ptrdiff_t mask_stride,
+                                         const int width, const int height) {
+  if (width == 4) {
+    InterIntraMaskBlending8bpp4xH_NEON<subsampling_x, subsampling_y>(
+        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+        height);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // TODO(b/150461164): Consider a 16 wide specialization (at least for the
+      // unsampled version) to take advantage of vld1q_u8().
+      const uint8x8_t pred_mask_1 =
+          GetInterIntraMask8<subsampling_x, subsampling_y>(
+              mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+      const uint8x8_t pred_val_0 = vld1_u8(prediction_0);
+      prediction_0 += 8;
+      const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x);
+      const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+      // weighted_pred0 + weighted_pred1
+      const uint16x8_t weighted_combo =
+          vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+      const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+      vst1_u8(prediction_1 + x, result);
+
+      x += 8;
+    } while (x < width);
+    prediction_1 += prediction_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->mask_blend[0][0] = MaskBlend_NEON<0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_NEON<1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_NEON<1, 1>;
+  // The is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[] in 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_NEON<0, 0>;
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_NEON<1, 0>;
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_NEON<1, 1>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void MaskBlendInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void MaskBlendInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/mask_blend_neon.h b/src/dsp/arm/mask_blend_neon.h
new file mode 100644
index 0000000..3829274
--- /dev/null
+++ b/src/dsp/arm/mask_blend_neon.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MASK_BLEND_NEON_H_
diff --git a/src/dsp/arm/motion_field_projection_neon.cc b/src/dsp/arm/motion_field_projection_neon.cc
new file mode 100644
index 0000000..8caba7d
--- /dev/null
+++ b/src/dsp/arm/motion_field_projection_neon.cc
@@ -0,0 +1,393 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x8_t LoadDivision(const int8x8x2_t division_table,
+                              const int8x8_t reference_offset) {
+  const int8x8_t kOne = vcreate_s8(0x0100010001000100);
+  const int8x16_t kOneQ = vcombine_s8(kOne, kOne);
+  const int8x8_t t = vadd_s8(reference_offset, reference_offset);
+  const int8x8x2_t tt = vzip_s8(t, t);
+  const int8x16_t t1 = vcombine_s8(tt.val[0], tt.val[1]);
+  const int8x16_t idx = vaddq_s8(t1, kOneQ);
+  const int8x8_t idx_low = vget_low_s8(idx);
+  const int8x8_t idx_high = vget_high_s8(idx);
+  const int16x4_t d0 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_low));
+  const int16x4_t d1 = vreinterpret_s16_s8(vtbl2_s8(division_table, idx_high));
+  return vcombine_s16(d0, d1);
+}
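
MvProjection below scales a motion vector by (numerator x division), where |division| is the 16-bit reciprocal of the reference distance fetched by LoadDivision above, rounds toward zero at 14 fractional bits, and clamps. A scalar sketch (kProjectionMvClamp is assumed to be 16383, libgav1's projection clamp; the caller is assumed to bound the ranges so the 32-bit products cannot overflow):

    #include <algorithm>
    #include <cstdint>

    inline int16_t ProjectMvScalar(int16_t mv, int16_t division, int numerator) {
      int32_t m = static_cast<int32_t>(mv) * division * numerator;
      m += m >> 31;                       // add 0 or -1: halves round toward zero
      int32_t r = (m + (1 << 13)) >> 14;  // vqrshrn_n_s32(..., 14)
      r = std::min<int32_t>(r, 16383);    // kProjectionMvClamp (assumed)
      return static_cast<int16_t>(std::max<int32_t>(r, -16383));
    }
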
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+                              const int numerator) {
+  const int32x4_t m0 = vmull_s16(mv, denominator);
+  const int32x4_t m = vmulq_n_s32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
+  const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+  return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x8_t MvProjectionClip(const int16x8_t mv,
+                                  const int16x8_t denominator,
+                                  const int numerator) {
+  const int16x4_t mv0 = vget_low_s16(mv);
+  const int16x4_t mv1 = vget_high_s16(mv);
+  const int16x4_t s0 = MvProjection(mv0, vget_low_s16(denominator), numerator);
+  const int16x4_t s1 = MvProjection(mv1, vget_high_s16(denominator), numerator);
+  const int16x8_t projection = vcombine_s16(s0, s1);
+  const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+  const int16x8_t clamp = vminq_s16(projection, projection_mv_clamp);
+  return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int8x8_t Project_NEON(const int16x8_t delta, const int16x8_t dst_sign) {
+  // Add 63 to negative delta so that it shifts towards zero.
+  const int16x8_t delta_sign = vshrq_n_s16(delta, 15);
+  const uint16x8_t delta_u = vreinterpretq_u16_s16(delta);
+  const uint16x8_t delta_sign_u = vreinterpretq_u16_s16(delta_sign);
+  const uint16x8_t delta_adjust_u = vsraq_n_u16(delta_u, delta_sign_u, 10);
+  const int16x8_t delta_adjust = vreinterpretq_s16_u16(delta_adjust_u);
+  const int16x8_t offset0 = vshrq_n_s16(delta_adjust, 6);
+  const int16x8_t offset1 = veorq_s16(offset0, dst_sign);
+  const int16x8_t offset2 = vsubq_s16(offset1, dst_sign);
+  return vqmovn_s16(offset2);
+}
+
+inline void GetPosition(
+    const int8x8x2_t division_table, const MotionVector* const mv,
+    const int numerator, const int x8_start, const int x8_end, const int x8,
+    const int8x8_t r_offsets, const int8x8_t source_reference_type8,
+    const int8x8_t skip_r, const int8x8_t y8_floor8, const int8x8_t y8_ceiling8,
+    const int16x8_t d_sign, const int delta, int8x8_t* const r,
+    int8x8_t* const position_y8, int8x8_t* const position_x8,
+    int64_t* const skip_64, int32x4_t mvs[2]) {
+  const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+  *r = vtbl1_s8(r_offsets, source_reference_type8);
+  const int16x8_t denorm = LoadDivision(division_table, source_reference_type8);
+  int16x8_t projection_mv[2];
+  mvs[0] = vld1q_s32(mv_int + 0);
+  mvs[1] = vld1q_s32(mv_int + 4);
+  // Deinterlace x and y components
+  const int16x8_t mv0 = vreinterpretq_s16_s32(mvs[0]);
+  const int16x8_t mv1 = vreinterpretq_s16_s32(mvs[1]);
+  const int16x8x2_t mv_yx = vuzpq_s16(mv0, mv1);
+  // numerator could be 0.
+  projection_mv[0] = MvProjectionClip(mv_yx.val[0], denorm, numerator);
+  projection_mv[1] = MvProjectionClip(mv_yx.val[1], denorm, numerator);
+  // Do not update the motion vector if the block position is not valid or
+  // if position_x8 is outside the current range of x8_start and x8_end.
+  // Note that position_y8 will always be within the range of y8_start and
+  // y8_end.
+  // After subtracting the base, valid projections are within 8-bit.
+  *position_y8 = Project_NEON(projection_mv[0], d_sign);
+  const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+  const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+  *position_x8 = vqadd_s8(position_x, k01234567);
+  const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling = std::min(
+      x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset);  // [0, 16]
+  const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+  const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+  const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+  const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+  const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+  const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+  const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+  const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+  const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+  *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
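
NEON has no scatter store, so the projected results must be written out one lane at a time; the templated Store/CheckStore pair below unrolls what would otherwise be the following scalar loop (a sketch with simplified types; MotionVector is modeled as a packed int32):

    #include <cstdint>

    // Scalar shape of the eight CheckStore<i> calls: each non-skipped lane
    // writes its motion vector and reference offset at a data-dependent cell.
    inline void ScatterProjection(const int8_t skips[8], const int16_t position[8],
                                  const int8_t reference_offset[8],
                                  const int32_t mv[8], int8_t* dst_reference_offset,
                                  int32_t* dst_mv /* packed MotionVector */) {
      for (int i = 0; i < 8; ++i) {
        if (skips[i] == 0) {
          dst_mv[position[i]] = mv[i];
          dst_reference_offset[position[i]] = reference_offset[i];
        }
      }
    }
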
+  *position_y8 = Project_NEON(projection_mv[0], d_sign);
+  const int8x8_t position_x = Project_NEON(projection_mv[1], d_sign);
+  const int8x8_t k01234567 = vcreate_s8(uint64_t{0x0706050403020100});
+  *position_x8 = vqadd_s8(position_x, k01234567);
+  const int8x16_t position_xy = vcombine_s8(*position_x8, *position_y8);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling = std::min(
+      x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset);  // [0, 16]
+  const int8x8_t x8_floor8 = vdup_n_s8(x8_floor);
+  const int8x8_t x8_ceiling8 = vdup_n_s8(x8_ceiling);
+  const int8x16_t floor_xy = vcombine_s8(x8_floor8, y8_floor8);
+  const int8x16_t ceiling_xy = vcombine_s8(x8_ceiling8, y8_ceiling8);
+  const uint8x16_t underflow = vcltq_s8(position_xy, floor_xy);
+  const uint8x16_t overflow = vcgeq_s8(position_xy, ceiling_xy);
+  const int8x16_t out = vreinterpretq_s8_u8(vorrq_u8(underflow, overflow));
+  const int8x8_t skip_low = vorr_s8(skip_r, vget_low_s8(out));
+  const int8x8_t skip = vorr_s8(skip_low, vget_high_s8(out));
+  *skip_64 = vget_lane_s64(vreinterpret_s64_s8(skip), 0);
+}
+
+template <int idx>
+inline void Store(const int16x8_t position, const int8x8_t reference_offset,
+                  const int32x4_t mv, int8_t* dst_reference_offset,
+                  MotionVector* dst_mv) {
+  const ptrdiff_t offset = vgetq_lane_s16(position, idx);
+  auto* const d_mv = reinterpret_cast<int32_t*>(&dst_mv[offset]);
+  vst1q_lane_s32(d_mv, mv, idx & 3);
+  vst1_lane_s8(&dst_reference_offset[offset], reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const int16x8_t position,
+                       const int8x8_t reference_offset, const int32x4_t mv,
+                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
+  if (skips[idx] == 0) {
+    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+  }
+}
+
+// Section 7.9.2 in the AV1 spec.
+void MotionFieldProjectionKernel_NEON(const ReferenceInfo& reference_info,
+                                      const int reference_to_current_with_sign,
+                                      const int dst_sign, const int y8_start,
+                                      const int y8_end, const int x8_start,
+                                      const int x8_end,
+                                      TemporalMotionField* const motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be offset by kProjectionMvMaxHorizontalOffset
+  // since coordinates in that range could end up being position_x8 because of
+  // projection.
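+  // For example, assuming kProjectionMvMaxHorizontalOffset is 8 (see
+  // src/utils/constants.h): with x8_start == 16, the loop below starts at
+  // adjusted_x8_start == 8, because a motion vector projected from column 8
+  // may still land inside [x8_start, x8_end).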
+  const int adjusted_x8_start =
+      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+  const int adjusted_x8_end = std::min(
+      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+  const int adjusted_x8_end8 = adjusted_x8_end & ~7;
+  const int leftover = adjusted_x8_end - adjusted_x8_end8;
+  const int8_t* const reference_offsets =
+      reference_info.relative_distance_to.data();
+  const bool* const skip_references = reference_info.skip_references.data();
+  const int16_t* const projection_divisions =
+      reference_info.projection_divisions.data();
+  const ReferenceFrameType* source_reference_types =
+      &reference_info.motion_field_reference_frame[y8_start][0];
+  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+  MotionVector* dst_mv = motion_field->mv[y8_start];
+  const int16x8_t d_sign = vdupq_n_s16(dst_sign);
+
+  static_assert(sizeof(int8_t) == sizeof(bool), "");
+  static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), "");
+  static_assert(sizeof(int32_t) == sizeof(MotionVector), "");
+  assert(dst_sign == 0 || dst_sign == -1);
+  assert(stride == motion_field->reference_offset.columns());
+  assert((y8_start & 7) == 0);
+  assert((adjusted_x8_start & 7) == 0);
+  // The final position calculation is represented with int16_t. Valid
+  // position_y8 from its base is at most 7. After considering the horizontal
+  // offset which is at most |stride - 1|, we have the following assertion,
+  // which means this optimization works for frame width up to 32K (each
+  // position is an 8x8 block).
+  assert(8 * stride <= 32768);
+  const int8x8_t skip_reference =
+      vld1_s8(reinterpret_cast<const int8_t*>(skip_references));
+  const int8x8_t r_offsets = vld1_s8(reference_offsets);
+  const int8x16_t table = vreinterpretq_s8_s16(vld1q_s16(projection_divisions));
+  int8x8x2_t division_table;
+  division_table.val[0] = vget_low_s8(table);
+  division_table.val[1] = vget_high_s8(table);
+
+  int y8 = y8_start;
+  do {
+    const int y8_floor = (y8 & ~7) - y8;                         // [-7, 0]
+    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);  // [1, 8]
+    const int8x8_t y8_floor8 = vdup_n_s8(y8_floor);
+    const int8x8_t y8_ceiling8 = vdup_n_s8(y8_ceiling);
+    int x8;
+
+    for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) {
+      const int8x8_t source_reference_type8 =
+          vld1_s8(reinterpret_cast<const int8_t*>(source_reference_types + x8));
+      const int8x8_t skip_r = vtbl1_s8(skip_reference, source_reference_type8);
+      const int64_t early_skip = vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+      // Early termination #1 if all are skips. Chance is typically ~30-40%.
+      if (early_skip == -1) continue;
+      int64_t skip_64;
+      int8x8_t r, position_x8, position_y8;
+      int32x4_t mvs[2];
+      GetPosition(division_table, mv, reference_to_current_with_sign, x8_start,
+                  x8_end, x8, r_offsets, source_reference_type8, skip_r,
+                  y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_y8,
+                  &position_x8, &skip_64, mvs);
+      // Early termination #2 if all are skips.
+      // Chance is typically ~15-25% after Early termination #1.
+      if (skip_64 == -1) continue;
+      const int16x8_t p_y = vmovl_s8(position_y8);
+      const int16x8_t p_x = vmovl_s8(position_x8);
+      const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+      const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+      if (skip_64 == 0) {
+        // Store all. Chance is typically ~70-85% after Early termination #2.
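+        // Each byte of |skip_64| is a per-lane flag (0 or -1) gathered in
+        // GetPosition(), so 0 here means all eight lanes are valid; -1 in
+        // every byte was already handled by Early termination #2.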
+        Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+        Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+        Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+      } else {
+        // Check and store each.
+        // Chance is typically ~15-30% after Early termination #2.
+        // The compiler is smart enough to not create the local buffer skips[].
+        int8_t skips[8];
+        memcpy(skips, &skip_64, sizeof(skips));
+        CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv);
+        CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+        CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv);
+      }
+    }
+
+    // The following leftover processing cannot be moved out of the do...while
+    // loop. Doing so may change the order in which results are stored to the
+    // same position.
+    if (leftover > 0) {
+      // Use SIMD only when leftover is at least 4, and there are at least 8
+      // elements in a row.
+      if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) {
+        // Process the last 8 elements to avoid loading invalid memory. Some
+        // elements may have been processed in the above loop, which is OK.
+        const int delta = 8 - leftover;
+        x8 = adjusted_x8_end - 8;
+        const int8x8_t source_reference_type8 = vld1_s8(
+            reinterpret_cast<const int8_t*>(source_reference_types + x8));
+        const int8x8_t skip_r =
+            vtbl1_s8(skip_reference, source_reference_type8);
+        const int64_t early_skip =
+            vget_lane_s64(vreinterpret_s64_s8(skip_r), 0);
+        // Early termination #1 if all are skips.
+        if (early_skip != -1) {
+          int64_t skip_64;
+          int8x8_t r, position_x8, position_y8;
+          int32x4_t mvs[2];
+          GetPosition(division_table, mv, reference_to_current_with_sign,
+                      x8_start, x8_end, x8, r_offsets, source_reference_type8,
+                      skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r,
+                      &position_y8, &position_x8, &skip_64, mvs);
+          // Early termination #2 if all are skips.
+          if (skip_64 != -1) {
+            const int16x8_t p_y = vmovl_s8(position_y8);
+            const int16x8_t p_x = vmovl_s8(position_x8);
+            const int16x8_t pos = vmlaq_n_s16(p_x, p_y, stride);
+            const int16x8_t position = vaddq_s16(pos, vdupq_n_s16(x8));
+            // Store up to 7 elements since leftover is at most 7.
+            if (skip_64 == 0) {
+              // Store all.
+              Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv);
+              Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv);
+              Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv);
+            } else {
+              // Check and store each.
+              // The compiler is smart enough to not create the local buffer
+              // skips[].
+ int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + } + } + } + } else { + for (; x8 < adjusted_x8_end; ++x8) { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid + // or if position_x8 is outside the current range of x8_start and + // x8_end. Note that position_y8 will always be within the range of + // y8_start and y8_end. + const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } + } + } + + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_NEON; +} +#endif + +} // namespace + +void MotionFieldProjectionInit_NEON() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON +namespace libgav1 { +namespace dsp { + +void MotionFieldProjectionInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/motion_field_projection_neon.h b/src/dsp/arm/motion_field_projection_neon.h new file mode 100644 index 0000000..41ab6a6 --- /dev/null +++ b/src/dsp/arm/motion_field_projection_neon.h @@ -0,0 +1,39 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MOTION_FIELD_PROJECTION_NEON_H_
diff --git a/src/dsp/arm/motion_vector_search_neon.cc b/src/dsp/arm/motion_vector_search_neon.cc
new file mode 100644
index 0000000..8a403a6
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.cc
@@ -0,0 +1,267 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline int16x4_t MvProjection(const int16x4_t mv, const int16x4_t denominator,
+                              const int32x4_t numerator) {
+  const int32x4_t m0 = vmull_s16(mv, denominator);
+  const int32x4_t m = vmulq_s32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
+  const int32x4_t add_sign = vsraq_n_s32(m, m, 31);
+  return vqrshrn_n_s32(add_sign, 14);
+}
+
+inline int16x4_t MvProjectionCompound(const int16x4_t mv,
+                                      const int temporal_reference_offsets,
+                                      const int reference_offsets[2]) {
+  const int16x4_t denominator =
+      vdup_n_s16(kProjectionMvDivisionLookup[temporal_reference_offsets]);
+  const int32x2_t offset = vld1_s32(reference_offsets);
+  const int32x2x2_t offsets = vzip_s32(offset, offset);
+  const int32x4_t numerator = vcombine_s32(offsets.val[0], offsets.val[1]);
+  return MvProjection(mv, denominator, numerator);
+}
+
+inline int16x8_t ProjectionClip(const int16x4_t mv0, const int16x4_t mv1) {
+  const int16x8_t projection_mv_clamp = vdupq_n_s16(kProjectionMvClamp);
+  const int16x8_t mv = vcombine_s16(mv0, mv1);
+  const int16x8_t clamp = vminq_s16(mv, projection_mv_clamp);
+  return vmaxq_s16(clamp, vnegq_s16(projection_mv_clamp));
+}
+
+inline int16x8_t MvProjectionCompoundClip(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets,
+    const int reference_offsets[2]) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const int32x2_t temporal_mv = vld1_s32(tmvs);
+  const int16x4_t tmv0 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 0));
+  const int16x4_t tmv1 = vreinterpret_s16_s32(vdup_lane_s32(temporal_mv, 1));
+  const int16x4_t mv0 = MvProjectionCompound(
+      tmv0, temporal_reference_offsets[0], reference_offsets);
+  const int16x4_t mv1 = MvProjectionCompound(
+      tmv1, temporal_reference_offsets[1], reference_offsets);
+  return ProjectionClip(mv0, mv1);
+}
+
+inline int16x8_t MvProjectionSingleClip(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets, const int reference_offset,
+    int16x4_t* const lookup) {
+  const auto* const tmvs = reinterpret_cast<const int16_t*>(temporal_mvs);
+  const int16x8_t temporal_mv = vld1q_s16(tmvs);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[0]], *lookup, 0);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[1]], *lookup, 1);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[2]], *lookup, 2);
+  *lookup = vld1_lane_s16(
+      &kProjectionMvDivisionLookup[temporal_reference_offsets[3]], *lookup, 3);
+  const int16x4x2_t denominator = vzip_s16(*lookup, *lookup);
+  const int16x4_t tmv0 = vget_low_s16(temporal_mv);
+  const int16x4_t tmv1 = vget_high_s16(temporal_mv);
+  const int32x4_t numerator = vdupq_n_s32(reference_offset);
+  const int16x4_t mv0 = MvProjection(tmv0, denominator.val[0], numerator);
+  const int16x4_t mv1 = MvProjection(tmv1, denominator.val[1], numerator);
+  return ProjectionClip(mv0, mv1);
+}
+
+inline void LowPrecision(const int16x8_t mv, void* const candidate_mvs) {
+  const int16x8_t kRoundDownMask = vdupq_n_s16(1);
+  const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+  const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+  const int16x8_t mv1 = vbicq_s16(mv0, kRoundDownMask);
+  vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv1);
+}
+
+inline void ForceInteger(const int16x8_t mv, void* const candidate_mvs) {
+  const int16x8_t kRoundDownMask = vdupq_n_s16(7);
+  const uint16x8_t mvu = vreinterpretq_u16_s16(mv);
+  const int16x8_t mv0 = vreinterpretq_s16_u16(vsraq_n_u16(mvu, mvu, 15));
+  const int16x8_t mv1 = vaddq_s16(mv0, vdupq_n_s16(3));
+  const int16x8_t mv2 = vbicq_s16(mv1, kRoundDownMask);
+  vst1q_s16(static_cast<int16_t*>(candidate_mvs), mv2);
+}
+
+void MvProjectionCompoundLowPrecision_NEON(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // A non-zero check on |reference_offsets| would almost always pass, so it
+  // is skipped. To help the compiler, make a local copy of
+  // |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int loop_count = (count + 1) >> 1;
+  do {
+    const int16x8_t mv = MvProjectionCompoundClip(
+        temporal_mvs, temporal_reference_offsets, offsets);
+    LowPrecision(mv, candidate_mvs);
+    temporal_mvs += 2;
+    temporal_reference_offsets += 2;
+    candidate_mvs += 2;
+  } while (--loop_count);
+}
+
+void MvProjectionCompoundForceInteger_NEON(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // A non-zero check on |reference_offsets| would almost always pass, so it
+  // is skipped. To help the compiler, make a local copy of
+  // |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int loop_count = (count + 1) >> 1;
+  do {
+    const int16x8_t mv = MvProjectionCompoundClip(
+        temporal_mvs, temporal_reference_offsets, offsets);
+    ForceInteger(mv, candidate_mvs);
+    temporal_mvs += 2;
+    temporal_reference_offsets += 2;
+    candidate_mvs += 2;
+  } while (--loop_count);
+}
+
+void MvProjectionCompoundHighPrecision_NEON(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // A non-zero check on |reference_offsets| would almost always pass, so it
+  // is skipped. To help the compiler, make a local copy of
+  // |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int loop_count = (count + 1) >> 1;
+  do {
+    const int16x8_t mv = MvProjectionCompoundClip(
+        temporal_mvs, temporal_reference_offsets, offsets);
+    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+    temporal_mvs += 2;
+    temporal_reference_offsets += 2;
+    candidate_mvs += 2;
+  } while (--loop_count);
+}
+
+void MvProjectionSingleLowPrecision_NEON(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int loop_count = (count + 3) >> 2;
+  int16x4_t lookup = vdup_n_s16(0);
+  do {
+    const int16x8_t mv = MvProjectionSingleClip(
+        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+    LowPrecision(mv, candidate_mvs);
+    temporal_mvs += 4;
+    temporal_reference_offsets += 4;
+    candidate_mvs += 4;
+  } while (--loop_count);
+}
+
+void MvProjectionSingleForceInteger_NEON(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Up to three more elements could be calculated.
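+  // E.g. count == 5 gives loop_count == 2, so eight mvs are computed for
+  // five requested; the extra writes are assumed (as in the other kernels
+  // here) to land in padding at the end of |candidate_mvs|.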
+  int loop_count = (count + 3) >> 2;
+  int16x4_t lookup = vdup_n_s16(0);
+  do {
+    const int16x8_t mv = MvProjectionSingleClip(
+        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+    ForceInteger(mv, candidate_mvs);
+    temporal_mvs += 4;
+    temporal_reference_offsets += 4;
+    candidate_mvs += 4;
+  } while (--loop_count);
+}
+
+void MvProjectionSingleHighPrecision_NEON(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int loop_count = (count + 3) >> 2;
+  int16x4_t lookup = vdup_n_s16(0);
+  do {
+    const int16x8_t mv = MvProjectionSingleClip(
+        temporal_mvs, temporal_reference_offsets, reference_offset, &lookup);
+    vst1q_s16(reinterpret_cast<int16_t*>(candidate_mvs), mv);
+    temporal_mvs += 4;
+    temporal_reference_offsets += 4;
+    candidate_mvs += 4;
+  } while (--loop_count);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_NEON;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_NEON;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_NEON;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_NEON;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_NEON;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_NEON;
+}
+#endif
+
+}  // namespace
+
+void MotionVectorSearchInit_NEON() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void MotionVectorSearchInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/motion_vector_search_neon.h b/src/dsp/arm/motion_vector_search_neon.h
new file mode 100644
index 0000000..19b4519
--- /dev/null
+++ b/src/dsp/arm/motion_vector_search_neon.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_MOTION_VECTOR_SEARCH_NEON_H_
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
new file mode 100644
index 0000000..66ad663
--- /dev/null
+++ b/src/dsp/arm/obmc_neon.cc
@@ -0,0 +1,392 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void WriteObmcLine4(uint8_t* const pred, const uint8_t* const obmc_pred,
+                           const uint8x8_t pred_mask,
+                           const uint8x8_t obmc_pred_mask) {
+  const uint8x8_t pred_val = Load4(pred);
+  const uint8x8_t obmc_pred_val = Load4(obmc_pred);
+  const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+  const uint8x8_t result =
+      vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+  StoreLo4(pred, result);
+}
+
+template <bool from_left>
+inline void OverlapBlend2xH_NEON(uint8_t* const prediction,
+                                 const ptrdiff_t prediction_stride,
+                                 const int height,
+                                 const uint8_t* const obmc_prediction,
+                                 const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8_t* obmc_pred = obmc_prediction;
+  uint8x8_t pred_mask;
+  uint8x8_t obmc_pred_mask;
+  int compute_height;
+  const int mask_offset = height - 2;
+  if (from_left) {
+    pred_mask = Load2(kObmcMask);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    compute_height = height;
+  } else {
+    // Weights for the last line are all 64, which is a no-op.
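+    // With a mask value of 64 the blend computes
+    //   (64 * pred + 0 * obmc_pred + 32) >> 6 == pred,
+    // so skipping that line leaves it bit-identical.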
+    compute_height = height - 1;
+  }
+  uint8x8_t pred_val = vdup_n_u8(0);
+  uint8x8_t obmc_pred_val = vdup_n_u8(0);
+  int y = 0;
+  do {
+    if (!from_left) {
+      pred_mask = vdup_n_u8(kObmcMask[mask_offset + y]);
+      obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    }
+    pred_val = Load2<0>(pred, pred_val);
+    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+    obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
+    const uint8x8_t result =
+        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+    Store2<0>(pred, result);
+
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y != compute_height);
+}
+
+inline void OverlapBlendFromLeft4xH_NEON(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = Load4(kObmcMask + 2);
+  // 64 - mask
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    y += 2;
+  } while (y != height);
+}
+
+inline void OverlapBlendFromLeft8xH_NEON(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+  // 64 - mask
+  const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  int y = 0;
+  do {
+    const uint8x8_t pred_val = vld1_u8(pred);
+    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+    const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
+    const uint8x8_t result =
+        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+
+    vst1_u8(pred, result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y != height);
+}
+
+void OverlapBlendFromLeft_NEON(void* const prediction,
+                               const ptrdiff_t prediction_stride,
+                               const int width, const int height,
+                               const void* const obmc_prediction,
+                               const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+  if (width == 2) {
+    OverlapBlend2xH_NEON<true>(pred, prediction_stride, height, obmc_pred,
+                               obmc_prediction_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
+    return;
+  }
+  if (width == 8) {
+    OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                 obmc_prediction_stride);
+    return;
+  }
+  const uint8x16_t mask_inverter = vdupq_n_u8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint8_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+    const uint8x16_t pred_mask = vld1q_u8(mask + x);
+    // 64 - mask
+    const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
+    int y = 0;
+    do {
+      const uint8x16_t pred_val = vld1q_u8(pred);
+      const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+      const uint16x8_t weighted_pred_lo =
+          vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
+      const uint8x8_t result_lo =
+          vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
+                                vget_low_u8(obmc_pred_val)),
+                       6);
+      const uint16x8_t weighted_pred_hi =
+          vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
+      const uint8x8_t result_hi =
+          vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
+                                vget_high_u8(obmc_pred_val)),
+                       6);
+      vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
+
+      pred += prediction_stride;
+      obmc_pred += obmc_prediction_stride;
+    } while (++y < height);
+    x += 16;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4x4_NEON(uint8_t* const prediction,
+                                        const ptrdiff_t prediction_stride,
+                                        const uint8_t* const obmc_prediction,
+                                        const ptrdiff_t obmc_prediction_stride,
+                                        const int height) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  if (height == 2) {
+    return;
+  }
+
+  pred_mask = vdup_n_u8(kObmcMask[3]);
+  obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+  pred += prediction_stride;
+  obmc_pred += obmc_prediction_stride;
+
+  pred_mask = vdup_n_u8(kObmcMask[4]);
+  obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+  WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+}
+
+inline void OverlapBlendFromTop4xH_NEON(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  if (height < 8) {
+    OverlapBlendFromTop4x4_NEON(prediction, prediction_stride, obmc_prediction,
+                                obmc_prediction_stride, height);
+    return;
+  }
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const uint8_t* mask = kObmcMask + height - 2;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  int y = 0;
+  // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
+  // lines are unchanged as the corresponding mask value is 64.
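+  // E.g. height == 8 runs the loop body once (rows 0..5) and height == 16
+  // runs it twice (rows 0..11); the bound y < height - 4 with the step of 6
+  // gives the row counts quoted above.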
+  do {
+    uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+    uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 1]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 2]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 3]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 4]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    pred_mask = vdup_n_u8(mask[y + 5]);
+    obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    // Advance the mask index past the six lines just blended.
+    y += 6;
+  } while (y < height - 4);
+}
+
+inline void OverlapBlendFromTop8xH_NEON(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+    // 64 - mask
+    const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
+    const uint8x8_t pred_val = vld1_u8(pred);
+    const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+    const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
+    const uint8x8_t result =
+        vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+
+    vst1_u8(pred, result);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (++y != compute_height);
+}
+
+void OverlapBlendFromTop_NEON(void* const prediction,
+                              const ptrdiff_t prediction_stride,
+                              const int width, const int height,
+                              const void* const obmc_prediction,
+                              const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+  if (width == 2) {
+    OverlapBlend2xH_NEON<false>(pred, prediction_stride, height, obmc_pred,
+                                obmc_prediction_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                obmc_prediction_stride);
+    return;
+  }
+
+  if (width == 8) {
+    OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
+                                obmc_prediction_stride);
+    return;
+  }
+
+  const uint8_t* mask = kObmcMask + height - 2;
+  const uint8x8_t mask_inverter = vdup_n_u8(64);
+  // Stop when the mask value becomes 64. This is inferred for 4xH.
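+  // E.g. height == 32 gives compute_height == 24: the bottom eight rows
+  // would use mask value 64 and be copied through unchanged, so they are
+  // left untouched.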
+ const int compute_height = height - (height >> 2); + int y = 0; + do { + const uint8x8_t pred_mask = vdup_n_u8(mask[y]); + // 64 - mask + const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask); + int x = 0; + do { + const uint8x16_t pred_val = vld1q_u8(pred + x); + const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x); + const uint16x8_t weighted_pred_lo = + vmull_u8(pred_mask, vget_low_u8(pred_val)); + const uint8x8_t result_lo = + vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask, + vget_low_u8(obmc_pred_val)), + 6); + const uint16x8_t weighted_pred_hi = + vmull_u8(pred_mask, vget_high_u8(pred_val)); + const uint8x8_t result_hi = + vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask, + vget_high_u8(obmc_pred_val)), + 6); + vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi)); + + x += 16; + } while (x < width); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < compute_height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON; + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON; +} + +} // namespace + +void ObmcInit_NEON() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void ObmcInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/obmc_neon.h b/src/dsp/arm/obmc_neon.h new file mode 100644 index 0000000..d5c9d9c --- /dev/null +++ b/src/dsp/arm/obmc_neon.h @@ -0,0 +1,38 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::obmc_blend. This function is not thread-safe. +void ObmcInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +// If NEON is enabled, signal the NEON implementation should be used. +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_OBMC_NEON_H_ diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc new file mode 100644 index 0000000..1680450 --- /dev/null +++ b/src/dsp/arm/super_res_neon.cc @@ -0,0 +1,166 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint8x8_t filter[8];
+    uint8x16_t d[kSuperResFilterTaps / 2];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      filter[i] =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
+    }
+    Transpose8x8(filter, d);
+    vst1q_u8(dst, d[0]);
+    dst += 16;
+    vst1q_u8(dst, d[1]);
+    dst += 16;
+    vst1q_u8(dst, d[2]);
+    dst += 16;
+    vst1q_u8(dst, d[3]);
+    dst += 16;
+  } while (--x != 0);
+}
+
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive and then
+// subtracting all negative with saturation is sufficient.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+                          const uint8_t** coefficients) {
+  uint8x16_t f[kSuperResFilterTaps / 2];
+  for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+    f[i] = vld1q_u8(*coefficients);
+  }
+  uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+  res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+  res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+  res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+  uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+  temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+  temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+  temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+  res = vqsubq_u16(res, temp);
+  return vqrshrn_n_u16(res, kFilterBits);
+}
+
+void SuperRes_NEON(const void* const coefficients, void* const source,
+                   const ptrdiff_t stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* const dest) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    uint8x8_t sr[8];
+    uint8x16_t s[8];
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
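+    // Sketch of the gather below: each of the 16 output pixels loads its own
+    // 8-tap window starting at src[subpixel_x >> kSuperResScaleBits], and
+    // Transpose8x16() regroups those windows so that tap k of every pixel
+    // sits in one vector, letting SuperRes() run vmull/vmlal across 8 pixels
+    // at a time.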
+ do { + for (int i = 0; i < 8; ++i, subpixel_x += step) { + sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]); + } + for (int i = 0; i < 8; ++i, subpixel_x += step) { + const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]); + s[i] = vcombine_u8(sr[i], s_hi); + } + Transpose8x16(s); + // Do not use loop for the following 8 instructions, since the compiler + // will generate redundant code. + sr[0] = vget_low_u8(s[0]); + sr[1] = vget_low_u8(s[1]); + sr[2] = vget_low_u8(s[2]); + sr[3] = vget_low_u8(s[3]); + sr[4] = vget_low_u8(s[4]); + sr[5] = vget_low_u8(s[5]); + sr[6] = vget_low_u8(s[6]); + sr[7] = vget_low_u8(s[7]); + const uint8x8_t d0 = SuperRes(sr, &filter); + // Do not use loop for the following 8 instructions, since the compiler + // will generate redundant code. + sr[0] = vget_high_u8(s[0]); + sr[1] = vget_high_u8(s[1]); + sr[2] = vget_high_u8(s[2]); + sr[3] = vget_high_u8(s[3]); + sr[4] = vget_high_u8(s[4]); + sr[5] = vget_high_u8(s[5]); + sr[6] = vget_high_u8(s[6]); + sr[7] = vget_high_u8(s[7]); + const uint8x8_t d1 = SuperRes(sr, &filter); + vst1q_u8(dst_ptr, vcombine_u8(d0, d1)); + dst_ptr += 16; + } while (--x != 0); + src += stride; + dst += stride; + } while (--y != 0); +} + +void Init8bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + dsp->super_res_coefficients = SuperResCoefficients_NEON; + dsp->super_res = SuperRes_NEON; +} + +} // namespace +} // namespace low_bitdepth + +void SuperResInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void SuperResInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h new file mode 100644 index 0000000..f51785d --- /dev/null +++ b/src/dsp/arm/super_res_neon.h @@ -0,0 +1,37 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::super_res. This function is not thread-safe. +void SuperResInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_ diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc new file mode 100644 index 0000000..7a41998 --- /dev/null +++ b/src/dsp/arm/warp_neon.cc @@ -0,0 +1,453 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+constexpr int kFirstPassOffset = 1 << 14;
+constexpr int kOffsetRemoval =
+    (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
+
+// Applies the horizontal filter to one source row and stores the result in
+// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
+// |intermediate_result| two-dimensional array.
+//
+// |src_row_centered| contains 16 "centered" samples of a source row. (We
+// center the samples by subtracting 128 from the samples.)
+void HorizontalFilter(const int sx4, const int16_t alpha,
+                      const int8x16_t src_row_centered,
+                      int16_t intermediate_result_row[8]) {
+  int sx = sx4 - MultiplyBy4(alpha);
+  int8x8_t filter[8];
+  for (int x = 0; x < 8; ++x) {
+    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
+                       kWarpedPixelPrecisionShifts;
+    filter[x] = vld1_s8(kWarpedFilters8[offset]);
+    sx += alpha;
+  }
+  Transpose8x8(filter);
+  // Add kFirstPassOffset to ensure |sum| stays within uint16_t.
+  // Add 128 (offset) * 128 (filter sum) (also 1 << 14) to account for the
+  // centering of the source samples. These combined are 1 << 15 or -32768.
+  int16x8_t sum =
+      vdupq_n_s16(static_cast<int16_t>(kFirstPassOffset + 128 * 128));
+  // Unrolled k = 0..7 loop. We need to manually unroll the loop because the
+  // third argument (an index value) to vextq_s8() must be a constant
+  // (immediate). src_row_window is a sliding window of length 8 into
+  // src_row_centered.
+  // k = 0.
+  int8x8_t src_row_window = vget_low_s8(src_row_centered);
+  sum = vmlal_s8(sum, filter[0], src_row_window);
+  // k = 1.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 1));
+  sum = vmlal_s8(sum, filter[1], src_row_window);
+  // k = 2.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 2));
+  sum = vmlal_s8(sum, filter[2], src_row_window);
+  // k = 3.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 3));
+  sum = vmlal_s8(sum, filter[3], src_row_window);
+  // k = 4.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 4));
+  sum = vmlal_s8(sum, filter[4], src_row_window);
+  // k = 5.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 5));
+  sum = vmlal_s8(sum, filter[5], src_row_window);
+  // k = 6.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 6));
+  sum = vmlal_s8(sum, filter[6], src_row_window);
+  // k = 7.
+  src_row_window =
+      vget_low_s8(vextq_s8(src_row_centered, src_row_centered, 7));
+  sum = vmlal_s8(sum, filter[7], src_row_window);
+  // End of unrolled k = 0..7 loop.
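+  // Net effect of the centering above, given that the warp filter taps sum
+  // to 128:
+  //   sum = kFirstPassOffset + 128 * 128 + sum_k f[k] * (s[k] - 128)
+  //       = kFirstPassOffset + sum_k f[k] * s[k]
+  // so the centered accumulation equals the uncentered sum plus
+  // kFirstPassOffset.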
+  // Due to the offset, |sum| is guaranteed to be non-negative, so it can be
+  // treated as unsigned.
+  uint16x8_t sum_unsigned = vreinterpretq_u16_s16(sum);
+  sum_unsigned = vrshrq_n_u16(sum_unsigned, kInterRoundBitsHorizontal);
+  // After the shift |sum_unsigned| will fit into int16_t.
+  vst1q_s16(intermediate_result_row, vreinterpretq_s16_u16(sum_unsigned));
+}
+
+template <bool is_compound>
+void Warp_NEON(const void* const source, const ptrdiff_t source_stride,
+               const int source_width, const int source_height,
+               const int* const warp_params, const int subsampling_x,
+               const int subsampling_y, const int block_start_x,
+               const int block_start_y, const int block_width,
+               const int block_height, const int16_t alpha, const int16_t beta,
+               const int16_t gamma, const int16_t delta, void* dest,
+               const ptrdiff_t dest_stride) {
+  constexpr int kRoundBitsVertical =
+      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
+  union {
+    // Intermediate_result is the output of the horizontal filtering and
+    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
+    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
+    // type so that we can multiply it by kWarpedFilters (which has signed
+    // values) using vmlal_s16().
+    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
+    // In the simple special cases where the samples in each row are all the
+    // same, store one sample per row in a column vector.
+    int16_t intermediate_result_column[15];
+  };
+
+  const auto* const src = static_cast<const uint8_t*>(source);
+  using DestType =
+      typename std::conditional<is_compound, int16_t, uint8_t>::type;
+  auto* dst = static_cast<DestType*>(dest);
+
+  assert(block_width >= 8);
+  assert(block_height >= 8);
+
+  // The warp process is applied to each 8x8 block.
+  int start_y = block_start_y;
+  do {
+    int start_x = block_start_x;
+    do {
+      const int src_x = (start_x + 4) << subsampling_x;
+      const int src_y = (start_y + 4) << subsampling_y;
+      const int dst_x =
+          src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
+      const int dst_y =
+          src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
+      const int x4 = dst_x >> subsampling_x;
+      const int y4 = dst_y >> subsampling_y;
+      const int ix4 = x4 >> kWarpedModelPrecisionBits;
+      const int iy4 = y4 >> kWarpedModelPrecisionBits;
+      // A prediction block may fall outside the frame's boundaries. If a
+      // prediction block is calculated using only samples outside the frame's
+      // boundary, the filtering can be simplified. We can divide the plane
+      // into several regions and handle them differently.
+      //
+      //                |           |
+      //     1          |     3     |     1
+      //                |           |
+      //         -------+-----------+-------
+      //                |***********|
+      //     2          |*****4*****|     2
+      //                |***********|
+      //         -------+-----------+-------
+      //                |           |
+      //     1          |     3     |     1
+      //                |           |
+      //
+      // At the center, region 4 represents the frame and is the general case.
+      //
+      // In regions 1 and 2, the prediction block is outside the frame's
+      // boundary horizontally. Therefore the horizontal filtering can be
+      // simplified. Furthermore, in region 1 (at the four corners), the
+      // prediction is outside the frame's boundary both horizontally and
+      // vertically, so we get a constant prediction block.
+      //
+      // In region 3, the prediction block is outside the frame's boundary
+      // vertically. Unfortunately, because we apply the horizontal filters
+      // first, by the time we apply the vertical filters, they no longer see
+      // simple inputs. So the only simplification is that all the rows are
+      // the same, but we still need to apply all the horizontal and vertical
+      // filters.
+
+      // Check for two simple special cases, where the horizontal filter can
+      // be significantly simplified.
+      //
+      // In general, for each row, the horizontal filter is calculated as
+      // follows:
+      //   for (int x = -4; x < 4; ++x) {
+      //     const int offset = ...;
+      //     int sum = first_pass_offset;
+      //     for (int k = 0; k < 8; ++k) {
+      //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
+      //       sum += kWarpedFilters[offset][k] * src_row[column];
+      //     }
+      //     ...
+      //   }
+      // The column index before clipping, ix4 + x + k - 3, varies in the range
+      // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
+      // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
+      // border index (source_width - 1 or 0, respectively). Then for each x,
+      // the inner for loop of the horizontal filter is reduced to multiplying
+      // the border pixel by the sum of the filter coefficients.
+      if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+        // Regions 1 and 2.
+        // Points to the left or right border of the first row of |src|.
+        const uint8_t* first_row_border =
+            (ix4 + 7 <= 0) ? src : src + source_width - 1;
+        // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+        //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+        // In two special cases, iy4 + y is clipped to either 0 or
+        // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+        // bounded and we can avoid clipping iy4 + y by relying on a reference
+        // frame's boundary extension on the top and bottom.
+        if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+          // Region 1.
+          // Every sample used to calculate the prediction block has the same
+          // value. So the whole prediction block has the same value.
+          const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+          const uint8_t row_border_pixel =
+              first_row_border[row * source_stride];
+
+          DestType* dst_row = dst + start_x - block_start_x;
+          for (int y = 0; y < 8; ++y) {
+            if (is_compound) {
+              const int16x8_t sum =
+                  vdupq_n_s16(row_border_pixel << (kInterRoundBitsVertical -
+                                                   kRoundBitsVertical));
+              vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+            } else {
+              memset(dst_row, row_border_pixel, 8);
+            }
+            dst_row += dest_stride;
+          }
+          // End of region 1. Continue the |start_x| do-while loop.
+          start_x += 8;
+          continue;
+        }
+
+        // Region 2.
+        // Horizontal filter.
+        // The input values in this region are generated by extending the
+        // border which makes them identical in the horizontal direction. This
+        // computation could be inlined in the vertical pass but most
+        // implementations will need a transpose of some sort.
+        // It is not necessary to use the offset values here because the
+        // horizontal pass is a simple shift and the vertical pass will always
+        // require using 32 bits.
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = iy4 + y;
+          int sum = first_row_border[row * source_stride];
+          sum <<= (kFilterBits - kInterRoundBitsHorizontal);
+          intermediate_result_column[y + 7] = sum;
+        }
+        // Vertical filter.
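+        // In this region each of the 15 rows produced a single value, so the
+        // vertical filter below is an 8-tap dot product sliding down the
+        // 15-entry |intermediate_result_column| instead of running over full
+        // 8-sample rows.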
+        // Vertical filter.
+        DestType* dst_row = dst + start_x - block_start_x;
+        int sy4 =
+            (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+        for (int y = 0; y < 8; ++y) {
+          int sy = sy4 - MultiplyBy4(gamma);
+#if defined(__aarch64__)
+          const int16x8_t intermediate =
+              vld1q_s16(&intermediate_result_column[y]);
+          int16_t tmp[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            const int16x8_t filter = vld1q_s16(kWarpedFilters[offset]);
+            const int32x4_t product_low =
+                vmull_s16(vget_low_s16(filter), vget_low_s16(intermediate));
+            const int32x4_t product_high =
+                vmull_s16(vget_high_s16(filter), vget_high_s16(intermediate));
+            // vaddvq_s32 is only available on __aarch64__.
+            const int32_t sum =
+                vaddvq_s32(product_low) + vaddvq_s32(product_high);
+            const int16_t sum_descale =
+                RightShiftWithRounding(sum, kRoundBitsVertical);
+            if (is_compound) {
+              dst_row[x] = sum_descale;
+            } else {
+              tmp[x] = sum_descale;
+            }
+            sy += gamma;
+          }
+          if (!is_compound) {
+            const int16x8_t sum = vld1q_s16(tmp);
+            vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+          }
+#else   // !defined(__aarch64__)
+          int16x8_t filter[8];
+          for (int x = 0; x < 8; ++x) {
+            const int offset =
+                RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                kWarpedPixelPrecisionShifts;
+            filter[x] = vld1q_s16(kWarpedFilters[offset]);
+            sy += gamma;
+          }
+          Transpose8x8(filter);
+          int32x4_t sum_low = vdupq_n_s32(0);
+          int32x4_t sum_high = sum_low;
+          for (int k = 0; k < 8; ++k) {
+            const int16_t intermediate = intermediate_result_column[y + k];
+            sum_low =
+                vmlal_n_s16(sum_low, vget_low_s16(filter[k]), intermediate);
+            sum_high =
+                vmlal_n_s16(sum_high, vget_high_s16(filter[k]), intermediate);
+          }
+          const int16x8_t sum =
+              vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                           vrshrn_n_s32(sum_high, kRoundBitsVertical));
+          if (is_compound) {
+            vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+          } else {
+            vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+          }
+#endif  // defined(__aarch64__)
+          dst_row += dest_stride;
+          sy4 += delta;
+        }
+        // End of region 2. Continue the |start_x| do-while loop.
+        start_x += 8;
+        continue;
+      }
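
Both branches of the #if above evaluate the same recurrence: the AArch64 path sums each product vector directly with vaddvq_s32, while the fallback transposes the eight filter vectors so the accumulation can use vmlal_n_s16. A scalar reference for one output row (illustrative only; the helper name is hypothetical, the constants are the libgav1 ones used above, and |round_bits| stands in for kRoundBitsVertical):

    // Scalar equivalent of the region 2 vertical filter. |column| holds the
    // 15 per-row values produced by the horizontal pass.
    void WarpRegion2RowScalar(const int16_t column[15], int y, int sy4,
                              int16_t gamma, int round_bits, int32_t out[8]) {
      int sy = sy4 - MultiplyBy4(gamma);
      for (int x = 0; x < 8; ++x) {
        const int offset =
            RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
            kWarpedPixelPrecisionShifts;
        int32_t sum = 0;
        for (int k = 0; k < 8; ++k) {
          sum += kWarpedFilters[offset][k] * column[y + k];
        }
        out[x] = RightShiftWithRounding(sum, round_bits);
        sy += gamma;
      }
    }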
+
+      // Regions 3 and 4.
+      // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
+
+      // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+      //   const int row = Clip3(iy4 + y, 0, source_height - 1);
+      // In two special cases, iy4 + y is clipped to either 0 or
+      // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+      // bounded and we can avoid clipping iy4 + y by relying on a reference
+      // frame's boundary extension on the top and bottom.
+      if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+        // Region 3.
+        // Horizontal filter.
+        const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+        const uint8_t* const src_row = src + row * source_stride;
+        // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+        // read but is ignored.
+        //
+        // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+        // bytes after src_row[source_width - 1]. We assume the source frame
+        // has left and right borders of at least 13 bytes that extend the
+        // frame boundary pixels. We also assume there is at least one extra
+        // padding byte after the right border of the last source row.
+        const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+        // Convert src_row_v to int8 (subtract 128).
+        const int8x16_t src_row_centered =
+            vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          HorizontalFilter(sx4, alpha, src_row_centered,
+                           intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      } else {
+        // Region 4.
+        // Horizontal filter.
+        int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+        for (int y = -7; y < 8; ++y) {
+          // We may over-read up to 13 pixels above the top source row, or up
+          // to 13 pixels below the bottom source row. This is proved in
+          // warp.cc.
+          const int row = iy4 + y;
+          const uint8_t* const src_row = src + row * source_stride;
+          // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
+          // read but is ignored.
+          //
+          // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
+          // bytes after src_row[source_width - 1]. We assume the source frame
+          // has left and right borders of at least 13 bytes that extend the
+          // frame boundary pixels. We also assume there is at least one extra
+          // padding byte after the right border of the last source row.
+          const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+          // Convert src_row_v to int8 (subtract 128).
+          const int8x16_t src_row_centered =
+              vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
+          HorizontalFilter(sx4, alpha, src_row_centered,
+                           intermediate_result[y + 7]);
+          sx4 += beta;
+        }
+      }
+
+      // Regions 3 and 4.
+      // Vertical filter.
+      DestType* dst_row = dst + start_x - block_start_x;
+      int sy4 =
+          (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+      for (int y = 0; y < 8; ++y) {
+        int sy = sy4 - MultiplyBy4(gamma);
+        int16x8_t filter[8];
+        for (int x = 0; x < 8; ++x) {
+          const int offset =
+              RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+              kWarpedPixelPrecisionShifts;
+          filter[x] = vld1q_s16(kWarpedFilters[offset]);
+          sy += gamma;
+        }
+        Transpose8x8(filter);
+        int32x4_t sum_low = vdupq_n_s32(-kOffsetRemoval);
+        int32x4_t sum_high = sum_low;
+        for (int k = 0; k < 8; ++k) {
+          const int16x8_t intermediate = vld1q_s16(intermediate_result[y + k]);
+          sum_low = vmlal_s16(sum_low, vget_low_s16(filter[k]),
+                              vget_low_s16(intermediate));
+          sum_high = vmlal_s16(sum_high, vget_high_s16(filter[k]),
+                               vget_high_s16(intermediate));
+        }
+        const int16x8_t sum =
+            vcombine_s16(vrshrn_n_s32(sum_low, kRoundBitsVertical),
+                         vrshrn_n_s32(sum_high, kRoundBitsVertical));
+        if (is_compound) {
+          vst1q_s16(reinterpret_cast<int16_t*>(dst_row), sum);
+        } else {
+          vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
+        }
+        dst_row += dest_stride;
+        sy4 += delta;
+      }
+      start_x += 8;
+    } while (start_x < block_start_x + block_width);
+    dst += 8 * dest_stride;
+    start_y += 8;
+  } while (start_y < block_start_y + block_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->warp = Warp_NEON</*is_compound=*/false>;
+  dsp->warp_compound = Warp_NEON</*is_compound=*/true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void WarpInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void WarpInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/warp_neon.h b/src/dsp/arm/warp_neon.h
new file mode 100644
index 0000000..dbcaa23
--- /dev/null
+++ b/src/dsp/arm/warp_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::warp. This function is not thread-safe. +void WarpInit_NEON(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_ENABLE_NEON +#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_NEON +#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_NEON +#endif // LIBGAV1_ENABLE_NEON + +#endif // LIBGAV1_SRC_DSP_ARM_WARP_NEON_H_ diff --git a/src/dsp/arm/weight_mask_neon.cc b/src/dsp/arm/weight_mask_neon.cc new file mode 100644 index 0000000..49d3be0 --- /dev/null +++ b/src/dsp/arm/weight_mask_neon.cc @@ -0,0 +1,463 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/arm/weight_mask_neon.h" + +#include "src/dsp/weight_mask.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON + +#include + +#include +#include +#include + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +constexpr int kRoundingBits8bpp = 4; + +template +inline void WeightMask8_NEON(const int16_t* prediction_0, + const int16_t* prediction_1, uint8_t* mask) { + const int16x8_t pred_0 = vld1q_s16(prediction_0); + const int16x8_t pred_1 = vld1q_s16(prediction_1); + const uint8x8_t difference_offset = vdup_n_u8(38); + const uint8x8_t mask_ceiling = vdup_n_u8(64); + const uint16x8_t difference = vrshrq_n_u16( + vreinterpretq_u16_s16(vabdq_s16(pred_0, pred_1)), kRoundingBits8bpp); + const uint8x8_t adjusted_difference = + vqadd_u8(vqshrn_n_u16(difference, 4), difference_offset); + const uint8x8_t mask_value = vmin_u8(adjusted_difference, mask_ceiling); + if (mask_is_inverse) { + const uint8x8_t inverted_mask_value = vsub_u8(mask_ceiling, mask_value); + vst1_u8(mask, inverted_mask_value); + } else { + vst1_u8(mask, mask_value); + } +} + +#define WEIGHT8_WITHOUT_STRIDE \ + WeightMask8_NEON(pred_0, pred_1, mask) + +#define WEIGHT8_AND_STRIDE \ + WEIGHT8_WITHOUT_STRIDE; \ + pred_0 += 8; \ + pred_1 += 8; \ + mask += mask_stride + +template +void WeightMask8x8_NEON(const void* prediction_0, const void* prediction_1, + uint8_t* mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = 0; + do { + WEIGHT8_AND_STRIDE; + } while (++y < 7); + WEIGHT8_WITHOUT_STRIDE; +} + +template +void WeightMask8x16_NEON(const void* prediction_0, const void* prediction_1, + uint8_t* mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + } while (++y3 < 5); + WEIGHT8_WITHOUT_STRIDE; +} + +template +void WeightMask8x32_NEON(const void* prediction_0, const void* prediction_1, + uint8_t* mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y5 = 0; + do { + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + WEIGHT8_AND_STRIDE; + } while (++y5 < 6); + WEIGHT8_AND_STRIDE; + WEIGHT8_WITHOUT_STRIDE; +} + +#define WEIGHT16_WITHOUT_STRIDE \ + WeightMask8_NEON(pred_0, pred_1, mask); \ + WeightMask8_NEON(pred_0 + 8, pred_1 + 8, mask + 8) + +#define WEIGHT16_AND_STRIDE \ + WEIGHT16_WITHOUT_STRIDE; \ + pred_0 += 16; \ + pred_1 += 16; \ + mask += mask_stride + +template +void WeightMask16x8_NEON(const void* prediction_0, const void* prediction_1, + uint8_t* mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y = 0; + do { + WEIGHT16_AND_STRIDE; + } while (++y < 7); + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x16_NEON(const void* prediction_0, const void* prediction_1, + uint8_t* mask, ptrdiff_t mask_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + int y3 = 0; + do { + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + WEIGHT16_AND_STRIDE; + } while (++y3 < 5); + WEIGHT16_WITHOUT_STRIDE; +} + +template +void WeightMask16x32_NEON(const 
+
+template <bool mask_is_inverse>
+void WeightMask8x8_NEON(const void* prediction_0, const void* prediction_1,
+                        uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+  } while (++y < 7);
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_NEON(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_NEON(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT8_AND_STRIDE;
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE                            \
+  WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask); \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+  WEIGHT16_WITHOUT_STRIDE;  \
+  pred_0 += 16;             \
+  pred_1 += 16;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_NEON(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+  } while (++y < 7);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT16_AND_STRIDE;
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE                                            \
+  WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask);                 \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8);     \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16);  \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+  WEIGHT32_WITHOUT_STRIDE;  \
+  pred_0 += 32;             \
+  pred_1 += 32;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_NEON(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE                                            \
+  WeightMask8_NEON<mask_is_inverse>(pred_0, pred_1, mask);                 \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8);     \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16);  \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24);  \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32);  \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40);  \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48);  \
+  WeightMask8_NEON<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+  WEIGHT64_WITHOUT_STRIDE;  \
+  pred_0 += 64;             \
+  pred_1 += 64;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_NEON(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_NEON(const void* prediction_0, const void* prediction_1,
+                           uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 42);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_NEON(const void* prediction_0, const void* prediction_1,
+                           uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_NEON(const void* prediction_0, const void* prediction_1,
+                            uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 42);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += adjusted_mask_stride;
+
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+
dsp->weight_mask[w_index][h_index][0] = \ + WeightMask##width##x##height##_NEON<0>; \ + dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_NEON<1> +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0); + INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1); + INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2); + INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0); + INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1); + INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2); + INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3); + INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0); + INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1); + INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2); + INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3); + INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1); + INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2); + INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3); + INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4); + INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3); + INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4); +} + +} // namespace +} // namespace low_bitdepth + +void WeightMaskInit_NEON() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void WeightMaskInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON diff --git a/src/dsp/arm/weight_mask_neon.h b/src/dsp/arm/weight_mask_neon.h new file mode 100644 index 0000000..b4749ec --- /dev/null +++ b/src/dsp/arm/weight_mask_neon.h @@ -0,0 +1,52 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_ +#define LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::weight_mask. This function is not thread-safe. 
+void WeightMaskInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_NEON_H_
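
These defines are the dispatch contract used throughout src/dsp: a SIMD header defines a LIBGAV1_Dsp8bpp_* macro for every table entry it implements, and the portable init code installs its C fallback only when the macro is absent (Init8bpp in average_blend.cc just below shows the pattern). A minimal sketch of the convention, with hypothetical Foo/foo names:

    // Sketch only: SIMD headers #define LIBGAV1_Dsp8bpp_Foo when they cover
    // the entry; the C init then skips installing the portable version.
    void FooInitSketch(Dsp* const dsp) {
    #ifndef LIBGAV1_Dsp8bpp_Foo
      dsp->foo = Foo_C<8, uint8_t>;  // only if no SIMD version claimed it
    #endif
    }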
diff --git a/src/dsp/average_blend.cc b/src/dsp/average_blend.cc
new file mode 100644
index 0000000..a59abb0
--- /dev/null
+++ b/src/dsp/average_blend.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/average_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void AverageBlend_C(const void* prediction_0, const void* prediction_1,
+                    const int width, const int height, void* const dest,
+                    const ptrdiff_t dest_stride) {
+  // 7.11.3.2 Rounding variables derivation process
+  //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // See warp.cc and convolve.cc for detailed prediction ranges.
+      int res = pred_0[x] + pred_1[x];
+      res -= (bitdepth == 8) ? 0 : kCompoundOffset + kCompoundOffset;
+      dst[x] = static_cast<Pixel>(
+          Clip3(RightShiftWithRounding(res, inter_post_round_bits + 1), 0,
+                (1 << bitdepth) - 1));
+    } while (++x < width);
+
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+  dsp->average_blend = AverageBlend_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void AverageBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/average_blend.h b/src/dsp/average_blend.h
new file mode 100644
index 0000000..02ecd09
--- /dev/null
+++ b/src/dsp/average_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+#define LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/average_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/average_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::average_blend. This function is not thread-safe.
+void AverageBlendInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_AVERAGE_BLEND_H_
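
A hand-worked 8bpp instance of AverageBlend_C's arithmetic (illustrative values, not from the test suite): at 8bpp the first-pass predictors carry inter_post_round_bits = 4 extra fractional bits, so the blend descales by 4 + 1 = 5, the extra 1 dividing the two-predictor sum by two.

    // pred_0 = 1600 (pixel 100.00 << 4), pred_1 = 1620 (pixel 101.25 << 4).
    // res = 3220; RightShiftWithRounding(res, 5) == (3220 + 16) >> 5 == 101,
    // the true average 100.625 rounded to the nearest integer.
    static_assert(((1600 + 1620) + 16) >> 5 == 101, "blend rounds to 101");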
diff --git a/src/dsp/cdef.cc b/src/dsp/cdef.cc
new file mode 100644
index 0000000..0b50517
--- /dev/null
+++ b/src/dsp/cdef.cc
@@ -0,0 +1,306 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Silence unused function warnings when CdefDirection_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||        \
+    !defined(LIBGAV1_Dsp8bpp_CdefDirection) || \
+    (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefDirection))
+constexpr int16_t kDivisionTable[] = {840, 420, 280, 210, 168, 140, 120, 105};
+
+int32_t Square(int32_t x) { return x * x; }
+
+template <int bitdepth, typename Pixel>
+void CdefDirection_C(const void* const source, ptrdiff_t stride,
+                     uint8_t* const direction, int* const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const Pixel*>(source);
+  stride /= sizeof(Pixel);
+  int32_t cost[8] = {};
+  // |partial| does not have to be int32_t for 8bpp. int16_t will suffice. We
+  // use int32_t to keep it simple since |cost| will have to be int32_t.
+  int32_t partial[8][15] = {};
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      const int x = (src[j] >> (bitdepth - 8)) - 128;
+      partial[0][i + j] += x;
+      partial[1][i + j / 2] += x;
+      partial[2][i] += x;
+      partial[3][3 + i - j / 2] += x;
+      partial[4][7 + i - j] += x;
+      partial[5][3 - i / 2 + j] += x;
+      partial[6][j] += x;
+      partial[7][i / 2 + j] += x;
+    }
+    src += stride;
+  }
+  for (int i = 0; i < 8; ++i) {
+    cost[2] += Square(partial[2][i]);
+    cost[6] += Square(partial[6][i]);
+  }
+  cost[2] *= kDivisionTable[7];
+  cost[6] *= kDivisionTable[7];
+  for (int i = 0; i < 7; ++i) {
+    cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+               kDivisionTable[i];
+    cost[4] += (Square(partial[4][i]) + Square(partial[4][14 - i])) *
+               kDivisionTable[i];
+  }
+  cost[0] += Square(partial[0][7]) * kDivisionTable[7];
+  cost[4] += Square(partial[4][7]) * kDivisionTable[7];
+  for (int i = 1; i < 8; i += 2) {
+    for (int j = 0; j < 5; ++j) {
+      cost[i] += Square(partial[i][3 + j]);
+    }
+    cost[i] *= kDivisionTable[7];
+    for (int j = 0; j < 3; ++j) {
+      cost[i] += (Square(partial[i][j]) + Square(partial[i][10 - j])) *
+                 kDivisionTable[2 * j + 1];
+    }
+  }
+  int32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_CdefDirection) ||
+        // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+        // !defined(LIBGAV1_Dsp10bpp_CdefDirection))
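
The weighting table above is simply 840 / n for line lengths n = 1..8, so squared line sums taken along directions with different line lengths compare on an equal footing while staying in integer arithmetic (840 is divisible by every length from 1 to 8). Illustrative check:

    // kDivisionTable[n - 1] == 840 / n for n = 1..8.
    static_assert(840 / 1 == 840 && 840 / 2 == 420 && 840 / 3 == 280 &&
                      840 / 7 == 120 && 840 / 8 == 105,
                  "each entry normalizes a squared sum by its line length");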
+
+// Silence unused function warnings when CdefFilter_C is obviated.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||      \
+    !defined(LIBGAV1_Dsp8bpp_CdefFilters) || \
+    (LIBGAV1_MAX_BITDEPTH >= 10 && !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+
+int Constrain(int diff, int threshold, int damping) {
+  assert(threshold != 0);
+  damping = std::max(0, damping - FloorLog2(threshold));
+  const int sign = (diff < 0) ? -1 : 1;
+  return sign *
+         Clip3(threshold - (std::abs(diff) >> damping), 0, std::abs(diff));
+}
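
A few hand-worked values show how Constrain() tapers the correction (illustrative only). With threshold = 4 and damping = 3, the effective shift is damping - FloorLog2(4) = 1:

    assert(Constrain(5, 4, 3) == 2);    // 4 - (5 >> 1) = 2: nearby value kept
    assert(Constrain(-5, 4, 3) == -2);  // the sign of the difference survives
    assert(Constrain(40, 4, 3) == 0);   // 4 - (40 >> 1) < 0: outlier ignored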
+
+// Filters the source block. It doesn't check whether the candidate pixel is
+// inside the frame. However it requires the source input to be padded with a
+// constant large value (kCdefLargeValue) if at the boundary.
+template <int block_width, int bitdepth, typename Pixel,
+          bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_C(const uint16_t* src, const ptrdiff_t src_stride,
+                  const int block_height, const int primary_strength,
+                  const int secondary_strength, const int damping,
+                  const int direction, void* const dest,
+                  const ptrdiff_t dest_stride) {
+  static_assert(block_width == 4 || block_width == 8, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  assert(block_height == 4 || block_height == 8);
+  assert(direction >= 0 && direction <= 7);
+  constexpr int coeff_shift = bitdepth - 8;
+  // Section 5.9.19. CDEF params syntax.
+  assert(primary_strength >= 0 && primary_strength <= 15 << coeff_shift);
+  assert(secondary_strength >= 0 && secondary_strength <= 4 << coeff_shift &&
+         secondary_strength != 3 << coeff_shift);
+  assert(primary_strength != 0 || secondary_strength != 0);
+  // damping is decreased by 1 for chroma.
+  assert((damping >= 3 && damping <= 6 + coeff_shift) ||
+         (damping >= 2 && damping <= 5 + coeff_shift));
+  // When only primary_strength or secondary_strength are non-zero, the number
+  // of pixels inspected (4 for primary_strength, 8 for secondary_strength)
+  // and the taps used don't exceed the amount the sum is descaled by (16), so
+  // we can skip tracking and clipping to the minimum and maximum value
+  // observed.
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  static constexpr int kCdefSecondaryTaps[2] = {kCdefSecondaryTap0,
+                                                kCdefSecondaryTap1};
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+  int y = block_height;
+  do {
+    int x = 0;
+    do {
+      int16_t sum = 0;
+      const uint16_t pixel_value = src[x];
+      uint16_t max_value = pixel_value;
+      uint16_t min_value = pixel_value;
+      for (int k = 0; k < 2; ++k) {
+        static constexpr int signs[] = {-1, 1};
+        for (const int& sign : signs) {
+          if (enable_primary) {
+            const int dy = sign * kCdefDirections[direction][k][0];
+            const int dx = sign * kCdefDirections[direction][k][1];
+            const uint16_t value = src[dy * src_stride + dx + x];
+            // Note: the summation can ignore the condition check in SIMD
+            // implementation, because Constrain() will return 0 when
+            // value == kCdefLargeValue.
+            if (value != kCdefLargeValue) {
+              sum += Constrain(value - pixel_value, primary_strength, damping) *
+                     kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][k];
+              if (clipping_required) {
+                max_value = std::max(value, max_value);
+                min_value = std::min(value, min_value);
+              }
+            }
+          }
+
+          if (enable_secondary) {
+            static constexpr int offsets[] = {-2, 2};
+            for (const int& offset : offsets) {
+              const int dy = sign * kCdefDirections[direction + offset][k][0];
+              const int dx = sign * kCdefDirections[direction + offset][k][1];
+              const uint16_t value = src[dy * src_stride + dx + x];
+              // Note: the summation can ignore the condition check in SIMD
+              // implementation.
+              if (value != kCdefLargeValue) {
+                sum += Constrain(value - pixel_value, secondary_strength,
+                                 damping) *
+                       kCdefSecondaryTaps[k];
+                if (clipping_required) {
+                  max_value = std::max(value, max_value);
+                  min_value = std::min(value, min_value);
+                }
+              }
+            }
+          }
+        }
+      }
+
+      const int offset = (8 + sum - (sum < 0)) >> 4;
+      if (clipping_required) {
+        dst[x] = static_cast<Pixel>(
+            Clip3(pixel_value + offset, min_value, max_value));
+      } else {
+        dst[x] = static_cast<Pixel>(pixel_value + offset);
+      }
+    } while (++x < block_width);
+
+    src += src_stride;
+    dst += dst_stride;
+  } while (--y != 0);
+}
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_CdefFilters) ||
+        // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+        // !defined(LIBGAV1_Dsp10bpp_CdefFilters))
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+  dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+  dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+  dsp->cdef_direction = CdefDirection_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 8, uint8_t>;
+  dsp->cdef_filters[0][1] = CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 8, uint8_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 8, uint8_t>;
+  dsp->cdef_filters[1][1] = CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/true,
+                                         /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 8, uint8_t, /*enable_primary=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] =
+      CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_CdefDirection
+  dsp->cdef_direction = CdefDirection_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_CdefFilters
+  dsp->cdef_filters[0][0] = CdefFilter_C<4, 10, uint16_t>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/true,
+                   /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] =
+      CdefFilter_C<4, 10, uint16_t, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_C<8, 10, uint16_t>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_C<8,
10, uint16_t, /*enable_primary=*/true, + /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = + CdefFilter_C<8, 10, uint16_t, /*enable_primary=*/false>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void CdefInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/cdef.h b/src/dsp/cdef.h new file mode 100644 index 0000000..2d70d2c --- /dev/null +++ b/src/dsp/cdef.h @@ -0,0 +1,47 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_CDEF_H_ +#define LIBGAV1_SRC_DSP_CDEF_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/cdef_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/cdef_sse4.h" +// clang-format on +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. +void CdefInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_CDEF_H_ diff --git a/src/dsp/cdef.inc b/src/dsp/cdef.inc new file mode 100644 index 0000000..c1a3136 --- /dev/null +++ b/src/dsp/cdef.inc @@ -0,0 +1,29 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Constants used for cdef implementations. +// This will be included inside an anonymous namespace on files where these are +// necessary. + +const int8_t (*const kCdefDirections)[2][2] = kCdefDirectionsPadded + 2; + +// Mirror values and pad to 16 elements. +alignas(16) constexpr uint32_t kCdefDivisionTable[] = { + 840, 420, 280, 210, 168, 140, 120, 105, + 120, 140, 168, 210, 280, 420, 840, 0}; + +// Used when calculating odd |cost[x]| values to mask off unwanted elements. 
+// Holds elements 1 3 5 X 5 3 1 X
+alignas(16) constexpr uint32_t kCdefDivisionTableOdd[] = {420, 210, 140, 0,
+                                                          140, 210, 420, 0};
diff --git a/src/dsp/common.h b/src/dsp/common.h
new file mode 100644
index 0000000..d614a81
--- /dev/null
+++ b/src/dsp/common.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_COMMON_H_
+#define LIBGAV1_SRC_DSP_COMMON_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum { kSgrStride = kRestorationUnitWidth + 32 };  // anonymous enum
+
+// Self guided projection filter.
+struct SgrProjInfo {
+  int index;
+  int multiplier[2];
+};
+
+struct WienerInfo {
+  static const int kVertical = 0;
+  static const int kHorizontal = 1;
+  int16_t number_leading_zero_coefficients[2];
+  alignas(kMaxAlignment) int16_t filter[2][(kWienerFilterTaps + 1) / 2];
+};
+
+struct RestorationUnitInfo : public MaxAlignedAllocable {
+  LoopRestorationType type;
+  SgrProjInfo sgr_proj_info;
+  WienerInfo wiener_info;
+};
+
+struct SgrBuffer {
+  alignas(kMaxAlignment) uint16_t sum3[4 * kSgrStride];
+  alignas(kMaxAlignment) uint16_t sum5[5 * kSgrStride];
+  alignas(kMaxAlignment) uint32_t square_sum3[4 * kSgrStride];
+  alignas(kMaxAlignment) uint32_t square_sum5[5 * kSgrStride];
+  alignas(kMaxAlignment) uint16_t ma343[4 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint16_t ma444[3 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint16_t ma565[2 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
+  alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
+  // The following 2 buffers are only used by the C functions. Since SgrBuffer
+  // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+  // it's OK to always keep the following 2 buffers.
+  alignas(kMaxAlignment) uint8_t ma[kSgrStride];  // [0, 255]
+  // b is less than 2^16 for 8-bit. However, making it a template slows down
+  // the C function by 5%. So b is fixed to 32-bit.
+  alignas(kMaxAlignment) uint32_t b[kSgrStride];
+};
+
+union RestorationBuffer {
+  // For self-guided filter.
+  SgrBuffer sgr_buffer;
+  // For wiener filter.
+  // The array |intermediate| in Section 7.17.4, the intermediate results
+  // between the horizontal and vertical filters.
+  alignas(kMaxAlignment) int16_t
+      wiener_buffer[(kRestorationUnitHeight + kWienerFilterTaps - 1) *
+                    kRestorationUnitWidth];
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_COMMON_H_
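
The comment inside SgrBuffer relies on a size relationship between the union's two members. If that reading is right, the relationship could be stated explicitly as follows (illustrative only; no such assertion exists in the imported header):

    // The trailing |ma|/|b| arrays are safe to keep in SgrBuffer as long as
    // the wiener_buffer member still dominates the union's size.
    static_assert(sizeof(SgrBuffer) <= sizeof(RestorationBuffer),
                  "SgrBuffer must not outgrow the wiener_buffer footprint");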
diff --git a/src/dsp/constants.cc b/src/dsp/constants.cc
new file mode 100644
index 0000000..0099ca3
--- /dev/null
+++ b/src/dsp/constants.cc
@@ -0,0 +1,103 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/constants.h"
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// Each set of 7 taps is padded with a 0 to easily align and pack into the high
+// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
+const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+    {{-6, 10, 0, 0, 0, 12, 0, 0},
+     {-5, 2, 10, 0, 0, 9, 0, 0},
+     {-3, 1, 1, 10, 0, 7, 0, 0},
+     {-3, 1, 1, 2, 10, 5, 0, 0},
+     {-4, 6, 0, 0, 0, 2, 12, 0},
+     {-3, 2, 6, 0, 0, 2, 9, 0},
+     {-3, 2, 2, 6, 0, 2, 7, 0},
+     {-3, 1, 2, 2, 6, 3, 5, 0}},
+    {{-10, 16, 0, 0, 0, 10, 0, 0},
+     {-6, 0, 16, 0, 0, 6, 0, 0},
+     {-4, 0, 0, 16, 0, 4, 0, 0},
+     {-2, 0, 0, 0, 16, 2, 0, 0},
+     {-10, 16, 0, 0, 0, 0, 10, 0},
+     {-6, 0, 16, 0, 0, 0, 6, 0},
+     {-4, 0, 0, 16, 0, 0, 4, 0},
+     {-2, 0, 0, 0, 16, 0, 2, 0}},
+    {{-8, 8, 0, 0, 0, 16, 0, 0},
+     {-8, 0, 8, 0, 0, 16, 0, 0},
+     {-8, 0, 0, 8, 0, 16, 0, 0},
+     {-8, 0, 0, 0, 8, 16, 0, 0},
+     {-4, 4, 0, 0, 0, 0, 16, 0},
+     {-4, 0, 4, 0, 0, 0, 16, 0},
+     {-4, 0, 0, 4, 0, 0, 16, 0},
+     {-4, 0, 0, 0, 4, 0, 16, 0}},
+    {{-2, 8, 0, 0, 0, 10, 0, 0},
+     {-1, 3, 8, 0, 0, 6, 0, 0},
+     {-1, 2, 3, 8, 0, 4, 0, 0},
+     {0, 1, 2, 3, 8, 2, 0, 0},
+     {-1, 4, 0, 0, 0, 3, 10, 0},
+     {-1, 3, 4, 0, 0, 4, 6, 0},
+     {-1, 2, 3, 4, 0, 4, 4, 0},
+     {-1, 2, 2, 3, 4, 3, 3, 0}},
+    {{-12, 14, 0, 0, 0, 14, 0, 0},
+     {-10, 0, 14, 0, 0, 12, 0, 0},
+     {-9, 0, 0, 14, 0, 11, 0, 0},
+     {-8, 0, 0, 0, 14, 10, 0, 0},
+     {-10, 12, 0, 0, 0, 0, 14, 0},
+     {-9, 1, 12, 0, 0, 0, 12, 0},
+     {-8, 0, 0, 12, 0, 1, 11, 0},
+     {-7, 0, 0, 1, 12, 1, 9, 0}}};
+
+// A lookup table replacing the calculation of the variable s in Section 7.17.3
+// (Box filter process). The first index is sgr_proj_index (the lr_sgr_set
+// syntax element in the Spec, saved in the sgr_proj_info.index field of a
+// RestorationUnitInfo struct). The second index is pass (0 or 1).
+//
+//   const uint8_t scale = kSgrProjParams[sgr_proj_index][pass * 2 + 1];
+//   const uint32_t n2_with_scale = n * n * scale;
+//   const uint32_t s =
+//       ((1 << kSgrProjScaleBits) + (n2_with_scale >> 1)) / n2_with_scale;
+//
+// 0 is an invalid value, corresponding to radius = 0, where the filter is
+// skipped.
+const uint16_t kSgrScaleParameter[16][2] = {
+    {140, 3236}, {112, 2158}, {93, 1618}, {80, 1438}, {70, 1295}, {58, 1177},
+    {47, 1079},  {37, 996},   {30, 925},  {25, 863},  {0, 2589},  {0, 1618},
+    {0, 1177},   {0, 925},    {56, 0},    {22, 0},
+};
+
+const uint8_t kCdefPrimaryTaps[2][2] = {{4, 2}, {3, 3}};
+
+// This is Cdef_Directions (section 7.15.3) with 2 padding entries at the
+// beginning and end of the table. The cdef direction range is [0, 7] and the
+// first index is offset +/-2. This removes the need to constrain the first
+// index to the same range using e.g., & 7.
+const int8_t kCdefDirectionsPadded[12][2][2] = { + {{1, 0}, {2, 0}}, // Padding: Cdef_Directions[6] + {{1, 0}, {2, -1}}, // Padding: Cdef_Directions[7] + {{-1, 1}, {-2, 2}}, // Begin Cdef_Directions + {{0, 1}, {-1, 2}}, // + {{0, 1}, {0, 2}}, // + {{0, 1}, {1, 2}}, // + {{1, 1}, {2, 2}}, // + {{1, 0}, {2, 1}}, // + {{1, 0}, {2, 0}}, // + {{1, 0}, {2, -1}}, // End Cdef_Directions + {{-1, 1}, {-2, 2}}, // Padding: Cdef_Directions[0] + {{0, 1}, {-1, 2}}, // Padding: Cdef_Directions[1] +}; + +} // namespace libgav1 diff --git a/src/dsp/constants.h b/src/dsp/constants.h new file mode 100644 index 0000000..7c1b62c --- /dev/null +++ b/src/dsp/constants.h @@ -0,0 +1,71 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_CONSTANTS_H_ +#define LIBGAV1_SRC_DSP_CONSTANTS_H_ + +// This file contains DSP related constants that have a direct relationship with +// a DSP component. + +#include + +#include "src/utils/constants.h" + +namespace libgav1 { + +enum { + // Documentation variables. + kBitdepth8 = 8, + kBitdepth10 = 10, + kBitdepth12 = 12, + // Weights are quadratic from '1' to '1 / block_size', scaled by + // 2^kSmoothWeightScale. + kSmoothWeightScale = 8, + kCflLumaBufferStride = 32, + // InterRound0, Section 7.11.3.2. + kInterRoundBitsHorizontal = 3, // 8 & 10-bit. + kInterRoundBitsHorizontal12bpp = 5, + kInterRoundBitsCompoundVertical = 7, // 8, 10 & 12-bit compound prediction. + kInterRoundBitsVertical = 11, // 8 & 10-bit, single prediction. + kInterRoundBitsVertical12bpp = 9, + // Offset applied to 10bpp and 12bpp predictors to allow storing them in + // uint16_t. Removed before blending. + kCompoundOffset = (1 << 14) + (1 << 13), + kCdefSecondaryTap0 = 2, + kCdefSecondaryTap1 = 1, +}; // anonymous enum + +extern const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8]; + +// Values in this enum can be derived as the sum of subsampling_x and +// subsampling_y (since subsampling_x == 0 && subsampling_y == 1 case is never +// allowed by the bitstream). +enum SubsamplingType : uint8_t { + kSubsamplingType444, // subsampling_x = 0, subsampling_y = 0. + kSubsamplingType422, // subsampling_x = 1, subsampling_y = 0. + kSubsamplingType420, // subsampling_x = 1, subsampling_y = 1. + kNumSubsamplingTypes +}; + +extern const uint16_t kSgrScaleParameter[16][2]; + +extern const uint8_t kCdefPrimaryTaps[2][2]; + +extern const int8_t kCdefDirectionsPadded[12][2][2]; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_CONSTANTS_H_ diff --git a/src/dsp/convolve.cc b/src/dsp/convolve.cc new file mode 100644 index 0000000..8c6f68f --- /dev/null +++ b/src/dsp/convolve.cc @@ -0,0 +1,876 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kHorizontalOffset = 3;
+constexpr int kVerticalOffset = 3;
+
+// Compound prediction output ranges from ConvolveTest.ShowRange.
+// Bitdepth:  8 Input range:            [       0,     255]
+//   intermediate range:                [   -7140,   23460]
+//   first pass output range:           [   -1785,    5865]
+//   intermediate range:                [ -328440,  589560]
+//   second pass output range:          [       0,     255]
+//   compound second pass output range: [   -5132,    9212]
+//
+// Bitdepth: 10 Input range:            [       0,    1023]
+//   intermediate range:                [  -28644,   94116]
+//   first pass output range:           [   -7161,   23529]
+//   intermediate range:                [-1317624, 2365176]
+//   second pass output range:          [       0,    1023]
+//   compound second pass output range: [    3988,   61532]
+//
+// Bitdepth: 12 Input range:            [       0,    4095]
+//   intermediate range:                [ -114660,  376740]
+//   first pass output range:           [   -7166,   23546]
+//   intermediate range:                [-1318560, 2366880]
+//   second pass output range:          [       0,    4095]
+//   compound second pass output range: [    3974,   61559]
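
In the scaled convolutions below, the running position p is a fixed-point value with kScaleSubPixelBits (10) fractional bits: the integer part picks the source sample and bits 6..9 pick one of the 16 subpixel filter phases. A scalar sketch of the walk performed by the horizontal pass (illustrative only; the helper name is hypothetical):

    // How ConvolveScale2D_C steps along a source row.
    void ScaleWalkSketch(int subpixel_x, int step_x, int width) {
      int p = subpixel_x;
      for (int x = 0; x < width; ++x) {
        const int source_index = p >> 10;     // p >> kScaleSubPixelBits
        const int filter_id = (p >> 6) & 15;  // (p >> 6) & kSubPixelMask
        // The 8-tap filter kHalfSubPixelFilters[filter_index][filter_id]
        // would be applied at source column |source_index| here.
        static_cast<void>(source_index);
        static_cast<void>(filter_id);
        p += step_x;  // fixed-point advance per output pixel
      }
    }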
+
+template <int bitdepth, typename Pixel>
+void ConvolveScale2D_C(const void* const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index, const int subpixel_x,
+                       const int subpixel_y, const int step_x,
+                       const int step_y, const int width, const int height,
+                       void* prediction, const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      kSubPixelTaps;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+  const int max_pixel_value = (1 << bitdepth) - 1;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  // Note: assume the input src is already aligned to the correct start
+  // position.
+  int y = 0;
+  do {
+    int p = subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      p += step_x;
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  int p = subpixel_y & 1023;
+  y = 0;
+  do {
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum +=
+            kHalfSubPixelFilters[filter_index][filter_id][k] *
+            intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+                         x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    dest += dest_stride;
+    p += step_y;
+  } while (++y < height);
+}
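
The compound variants below add kCompoundOffset for 10-bit and 12-bit input so the signed second-pass output can be stored in uint16_t; the ranges quoted at the top of the file confirm the arithmetic (illustrative check):

    // Raw 10bpp compound values span [-20588, 36956]; adding
    // (1 << 14) + (1 << 13) = 24576 gives the quoted [3988, 61532] range.
    constexpr int kOffset = (1 << 14) + (1 << 13);  // kCompoundOffset
    static_assert(-20588 + kOffset == 3988, "");
    static_assert(36956 + kOffset == 61532, "fits in uint16_t");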
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundScale2D_C(const void* const reference,
+                               const ptrdiff_t reference_stride,
+                               const int horizontal_filter_index,
+                               const int vertical_filter_index,
+                               const int subpixel_x, const int subpixel_y,
+                               const int step_x, const int step_y,
+                               const int width, const int height,
+                               void* prediction, const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      kSubPixelTaps;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (2 * kMaxSuperBlockSizeInPixels + 8)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  // Note: assume the input src is already aligned to the correct start
+  // position.
+  int y = 0;
+  do {
+    int p = subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+      const int filter_id = (p >> 6) & kSubPixelMask;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src_x[k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      p += step_x;
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  int p = subpixel_y & 1023;
+  y = 0;
+  do {
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum +=
+            kHalfSubPixelFilters[filter_index][filter_id][k] *
+            intermediate[((p >> kScaleSubPixelBits) + k) * intermediate_stride +
+                         x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    dest += pred_stride;
+    p += step_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompound2D_C(const void* const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index,
+                          const int horizontal_filter_id,
+                          const int vertical_filter_id, const int width,
+                          const int height, void* prediction,
+                          const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical = kInterRoundBitsCompoundVertical;
+  const int intermediate_height = height + kSubPixelTaps - 1;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src = static_cast<const Pixel*>(reference) -
+                    kVerticalOffset * src_stride - kHorizontalOffset;
+  auto* dest = static_cast<uint16_t*>(prediction);
+
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               intermediate[k * intermediate_stride + x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    dest += pred_stride;
+    intermediate += intermediate_stride;
+  } while (++y < height);
+}
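+
+// Note on kCompoundOffset (illustrative): for 10/12bpp the compound paths bias
+// the output so it fits in uint16_t (see the ranges at the top of this file).
+// A consumer blending two such predictors removes the bias once per unit of
+// total weight, as DistanceWeightedBlend_C in distance_weighted_blend.cc does:
+//   int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;  // weights sum to 16
+//   res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;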
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called when it is single prediction mode, where both horizontal and
+// vertical filtering are required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
+                  const int horizontal_filter_index,
+                  const int vertical_filter_index,
+                  const int horizontal_filter_id, const int vertical_filter_id,
+                  const int width, const int height, void* prediction,
+                  const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  const int intermediate_height = height + kSubPixelTaps - 1;
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  int16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                              (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
+  const int max_pixel_value = (1 << bitdepth) - 1;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [4, 5].
+  // Similarly for height.
+  int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src = static_cast<const Pixel*>(reference) -
+                    kVerticalOffset * src_stride - kHorizontalOffset;
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += intermediate_stride;
+  } while (++y < intermediate_height);
+
+  // Vertical filter.
+  filter_index = GetFilterIndex(vertical_filter_index, height);
+  intermediate = intermediate_result;
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               intermediate[k * intermediate_stride + x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    dest += dest_stride;
+    intermediate += intermediate_stride;
+  } while (++y < height);
+}
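+
+// Sizing check (illustrative): the vertical pass of the non-scaled 2D paths
+// reads height + kSubPixelTaps - 1 intermediate rows, so with the usual
+// constants kMaxSuperBlockSizeInPixels == 128 and kSubPixelTaps == 8:
+static_assert(128 + 8 - 1 == 135, "8-tap filtering needs 7 extra rows");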
+
+// This function is a simplified version of Convolve2D_C.
+// It is called when it is single prediction mode, where only horizontal
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void ConvolveHorizontal_C(const void* const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int /*vertical_filter_index*/,
+                          const int horizontal_filter_id,
+                          const int /*vertical_filter_id*/, const int width,
+                          const int height, void* prediction,
+                          const ptrdiff_t pred_stride) {
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int bits = kFilterBits - kRoundBitsHorizontal;
+  const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int max_pixel_value = (1 << bitdepth) - 1;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
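+
+// Rounding budget (illustrative): kHalfSubPixelFilters taps sum to 64, not
+// 128, which is why every rounding constant above is reduced by one. With the
+// usual 8/10bpp constants, kInterRoundBitsHorizontal == 3 and kFilterBits == 7,
+// the two shifts in ConvolveHorizontal_C combine to kFilterBits - 1:
+static_assert((3 - 1) + (7 - 3) == 7 - 1, "half filters drop one shift bit");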
+
+// This function is a simplified version of Convolve2D_C.
+// It is called when it is single prediction mode, where only vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void ConvolveVertical_C(const void* const reference,
+                        const ptrdiff_t reference_stride,
+                        const int /*horizontal_filter_index*/,
+                        const int vertical_filter_index,
+                        const int /*horizontal_filter_id*/,
+                        const int vertical_filter_id, const int width,
+                        const int height, void* prediction,
+                        const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src =
+      static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  // Copy filters must call ConvolveCopy().
+  assert(vertical_filter_id != 0);
+
+  const int max_pixel_value = (1 << bitdepth) - 1;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               src[k * src_stride + x];
+      }
+      dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
+                      max_pixel_value);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCopy_C(const void* const reference,
+                    const ptrdiff_t reference_stride,
+                    const int /*horizontal_filter_index*/,
+                    const int /*vertical_filter_index*/,
+                    const int /*horizontal_filter_id*/,
+                    const int /*vertical_filter_id*/, const int width,
+                    const int height, void* prediction,
+                    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  int y = 0;
+  do {
+    memcpy(dest, src, width * sizeof(Pixel));
+    src += reference_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundCopy_C(const void* const reference,
+                            const ptrdiff_t reference_stride,
+                            const int /*horizontal_filter_index*/,
+                            const int /*vertical_filter_index*/,
+                            const int /*horizontal_filter_id*/,
+                            const int /*vertical_filter_id*/, const int width,
+                            const int height, void* prediction,
+                            const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsVertical =
+      ((bitdepth == 12) ? kInterRoundBitsVertical12bpp
+                        : kInterRoundBitsVertical) -
+      kInterRoundBitsCompoundVertical;
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = (bitdepth == 8) ? 0 : ((1 << bitdepth) + (1 << (bitdepth - 1)));
+      sum += src[x];
+      dest[x] = sum << kRoundBitsVertical;
+    } while (++x < width);
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
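+
+// Worked example for the compound copy (illustrative, 10bpp, assuming the
+// usual constants kInterRoundBitsVertical == 11 and
+// kInterRoundBitsCompoundVertical == 7, i.e. a shift of 4): the offset is
+// (1 << 10) + (1 << 9) == 1536, so outputs span [24576, 40944], well inside
+// the compound ranges quoted at the top of this file.
+static_assert((0 + (1 << 10) + (1 << 9)) << 4 == 24576, "copy minimum");
+static_assert((1023 + (1 << 10) + (1 << 9)) << 4 == 40944, "copy maximum");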
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called when it is compound prediction mode, where only horizontal
+// filtering is required.
+// The output is not clipped to valid pixel range. Its output will be
+// blended with another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundHorizontal_C(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int horizontal_filter_index, const int /*vertical_filter_index*/,
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // Copy filters must call ConvolveCopy().
+  assert(horizontal_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is a simplified version of ConvolveCompound2D_C.
+// It is called when it is compound prediction mode, where only vertical
+// filtering is required.
+// The output is not clipped to valid pixel range. Its output will be
+// blended with another predictor to generate the final prediction of the block.
+template <int bitdepth, typename Pixel>
+void ConvolveCompoundVertical_C(const void* const reference,
+                                const ptrdiff_t reference_stride,
+                                const int /*horizontal_filter_index*/,
+                                const int vertical_filter_index,
+                                const int /*horizontal_filter_id*/,
+                                const int vertical_filter_id, const int width,
+                                const int height, void* prediction,
+                                const ptrdiff_t pred_stride) {
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  const auto* src =
+      static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
+  auto* dest = static_cast<uint16_t*>(prediction);
+  // Copy filters must call ConvolveCopy().
+  assert(vertical_filter_id != 0);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      int sum = 0;
+      for (int k = 0; k < kSubPixelTaps; ++k) {
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
+               src[k * src_stride + x];
+      }
+      sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
+      sum += (bitdepth == 8) ? 0 : kCompoundOffset;
+      dest[x] = sum;
+    } while (++x < width);
+    src += src_stride;
+    dest += pred_stride;
+  } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called when it is single prediction mode for U/V plane, where the
+// reference block is from current frame and both horizontal and vertical
+// filtering are required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+template <int bitdepth, typename Pixel>
+void ConvolveIntraBlockCopy2D_C(const void* const reference,
+                                const ptrdiff_t reference_stride,
+                                const int /*horizontal_filter_index*/,
+                                const int /*vertical_filter_index*/,
+                                const int /*horizontal_filter_id*/,
+                                const int /*vertical_filter_id*/,
+                                const int width, const int height,
+                                void* prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const int intermediate_height = height + 1;
+  uint16_t intermediate_result[kMaxSuperBlockSizeInPixels *
+                               (kMaxSuperBlockSizeInPixels + 1)];
+  uint16_t* intermediate = intermediate_result;
+  // Note: allow vertical access to height + 1. Because this function is only
+  // for u/v plane of intra block copy, such access is guaranteed to be within
+  // the prediction block.
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      intermediate[x] = src[x] + src[x + 1];
+    } while (++x < width);
+
+    src += src_stride;
+    intermediate += width;
+  } while (++y < intermediate_height);
+
+  intermediate = intermediate_result;
+  y = 0;
+  do {
+    int x = 0;
+    do {
+      dest[x] =
+          RightShiftWithRounding(intermediate[x] + intermediate[x + width], 2);
+    } while (++x < width);
+
+    intermediate += width;
+    dest += dest_stride;
+  } while (++y < height);
+}
+
+// This function is used when intra block copy is present.
+// It is called when it is single prediction mode for U/V plane, where the
+// reference block is from the current frame and only horizontal or vertical
+// filtering is required.
+// The output is the single prediction of the block, clipped to valid pixel
+// range.
+// The filtering of intra block copy is simply the average of current and
+// the next pixel.
+template <int bitdepth, typename Pixel, bool is_horizontal>
+void ConvolveIntraBlockCopy1D_C(const void* const reference,
+                                const ptrdiff_t reference_stride,
+                                const int /*horizontal_filter_index*/,
+                                const int /*vertical_filter_index*/,
+                                const int /*horizontal_filter_id*/,
+                                const int /*vertical_filter_id*/,
+                                const int width, const int height,
+                                void* prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const Pixel*>(reference);
+  const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
+  auto* dest = static_cast<Pixel*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
+  const ptrdiff_t offset = is_horizontal ? 1 : src_stride;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      dest[x] = RightShiftWithRounding(src[x] + src[x + offset], 1);
+    } while (++x < width);
+
+    src += src_stride;
+    dest += dest_stride;
+  } while (++y < height);
+}
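+
+// In other words (illustrative): intra block copy filtering is a rounded
+// 2-tap average per direction, and the 2D variant averages four neighbors
+// with a final shift of 2. For example, src values 10 and 13 average to 12:
+static_assert((10 + 13 + 1) >> 1 == 12, "RightShiftWithRounding(10 + 13, 1)");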
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<8, uint8_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<8, uint8_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<8, uint8_t>;
+#endif
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+  dsp->convolve_scale[0] = ConvolveScale2D_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
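+
+// The #ifndef guards above cooperate with the optimized headers pulled in by
+// convolve.h: a header providing a specialization advertises it, e.g.
+// (paraphrasing the convention used by src/dsp/arm/convolve_neon.h):
+//   #define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+// so the C fallback is skipped here and ConvolveInit_NEON() fills the slot.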
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+  dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#else  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCopy
+  dsp->convolve[0][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveHorizontal
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveVertical
+  dsp->convolve[0][0][1][0] = ConvolveVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_Convolve2D
+  dsp->convolve[0][0][1][1] = Convolve2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundCopy
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundHorizontal
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundVertical
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompound2D
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_C<10, uint16_t>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockCopy
+  dsp->convolve[1][0][0][0] = ConvolveCopy_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockHorizontal
+  dsp->convolve[1][0][0][1] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlockVertical
+  dsp->convolve[1][0][1][0] =
+      ConvolveIntraBlockCopy1D_C<10, uint16_t, /*is_horizontal=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveIntraBlock2D
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_C<10, uint16_t>;
+#endif
+
+  dsp->convolve[1][1][0][0] = nullptr;
+  dsp->convolve[1][1][0][1] = nullptr;
+  dsp->convolve[1][1][1][0] = nullptr;
+  dsp->convolve[1][1][1][1] = nullptr;
+
+#ifndef LIBGAV1_Dsp10bpp_ConvolveScale2D
+  dsp->convolve_scale[0] = ConvolveScale2D_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ConvolveCompoundScale2D
+  dsp->convolve_scale[1] = ConvolveCompoundScale2D_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void ConvolveInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/convolve.h b/src/dsp/convolve.h
new file mode 100644
index 0000000..5bc0bad
--- /dev/null
+++ b/src/dsp/convolve.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_CONVOLVE_H_
+#define LIBGAV1_SRC_DSP_CONVOLVE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/convolve_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
+#include "src/dsp/x86/convolve_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve and Dsp::convolve_scale. This function is not
+// thread-safe.
+void ConvolveInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_CONVOLVE_H_
diff --git a/src/dsp/convolve.inc b/src/dsp/convolve.inc
new file mode 100644
index 0000000..140648b
--- /dev/null
+++ b/src/dsp/convolve.inc
@@ -0,0 +1,50 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for convolve implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+int GetNumTapsInFilter(const int filter_index) {
+  if (filter_index < 2) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
+constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
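
For reference, the mapping implemented by GetNumTapsInFilter() above can be
summarized with a few assertions (usage sketch, not part of the sources):

    assert(GetNumTapsInFilter(0) == 6);  // kInterpolationFilterEightTap
    assert(GetNumTapsInFilter(1) == 6);  // kInterpolationFilterEightTapSmooth
    assert(GetNumTapsInFilter(2) == 8);  // kInterpolationFilterEightTapSharp
    assert(GetNumTapsInFilter(3) == 2);  // kInterpolationFilterBilinear
    assert(GetNumTapsInFilter(5) == 4);  // 4-tap small-block replacement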
diff --git a/src/dsp/distance_weighted_blend.cc b/src/dsp/distance_weighted_blend.cc
new file mode 100644
index 0000000..a035fbe
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.cc
@@ -0,0 +1,101 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void DistanceWeightedBlend_C(const void* prediction_0, const void* prediction_1,
+                             const uint8_t weight_0, const uint8_t weight_1,
+                             const int width, const int height,
+                             void* const dest, const ptrdiff_t dest_stride) {
+  // 7.11.3.2 Rounding variables derivation process
+  //   2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, int32_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      // See warp.cc and convolve.cc for detailed prediction ranges.
+      // weight_0 + weight_1 = 16.
+      int res = pred_0[x] * weight_0 + pred_1[x] * weight_1;
+      res -= (bitdepth == 8) ? 0 : kCompoundOffset * 16;
+      dst[x] = static_cast<Pixel>(
+          Clip3(RightShiftWithRounding(res, inter_post_round_bits + 4), 0,
+                (1 << bitdepth) - 1));
+    } while (++x < width);
+
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (++y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+  dsp->distance_weighted_blend = DistanceWeightedBlend_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void DistanceWeightedBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/distance_weighted_blend.h b/src/dsp/distance_weighted_blend.h
new file mode 100644
index 0000000..1a782b6
--- /dev/null
+++ b/src/dsp/distance_weighted_blend.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+#define LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/distance_weighted_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/distance_weighted_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::distance_weighted_blend. This function is not thread-safe.
+void DistanceWeightedBlendInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_DISTANCE_WEIGHTED_BLEND_H_
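
A worked 8bpp example of the blend above, with hypothetical weights (the spec
derives them from reference-frame distances and they always sum to 16; at 8bpp
inter_post_round_bits + 4 == 8):

    // pred_0[x] = 1000, pred_1[x] = 2000, weight_0 = 12, weight_1 = 4:
    static_assert((1000 * 12 + 2000 * 4 + 128) >> 8 == 78, "78 before clipping");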
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
new file mode 100644
index 0000000..5b54c4e
--- /dev/null
+++ b/src/dsp/dsp.cc
@@ -0,0 +1,150 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/dsp.h"
+
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/dsp/arm/weight_mask_neon.h"
+#include "src/dsp/average_blend.h"
+#include "src/dsp/cdef.h"
+#include "src/dsp/convolve.h"
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/dsp/film_grain.h"
+#include "src/dsp/intra_edge.h"
+#include "src/dsp/intrapred.h"
+#include "src/dsp/inverse_transform.h"
+#include "src/dsp/loop_filter.h"
+#include "src/dsp/loop_restoration.h"
+#include "src/dsp/mask_blend.h"
+#include "src/dsp/motion_field_projection.h"
+#include "src/dsp/motion_vector_search.h"
+#include "src/dsp/obmc.h"
+#include "src/dsp/super_res.h"
+#include "src/dsp/warp.h"
+#include "src/dsp/weight_mask.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp_internal {
+
+dsp::Dsp* GetWritableDspTable(int bitdepth) {
+  switch (bitdepth) {
+    case 8: {
+      static dsp::Dsp dsp_8bpp;
+      return &dsp_8bpp;
+    }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    case 10: {
+      static dsp::Dsp dsp_10bpp;
+      return &dsp_10bpp;
+    }
+#endif
+  }
+  return nullptr;
+}
+
+}  // namespace dsp_internal
+
+namespace dsp {
+
+void DspInit() {
+  static std::once_flag once;
+  std::call_once(once, []() {
+    AverageBlendInit_C();
+    CdefInit_C();
+    ConvolveInit_C();
+    DistanceWeightedBlendInit_C();
+    FilmGrainInit_C();
+    IntraEdgeInit_C();
+    IntraPredInit_C();
+    InverseTransformInit_C();
+    LoopFilterInit_C();
+    LoopRestorationInit_C();
+    MaskBlendInit_C();
+    MotionFieldProjectionInit_C();
+    MotionVectorSearchInit_C();
+    ObmcInit_C();
+    SuperResInit_C();
+    WarpInit_C();
+    WeightMaskInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+    const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
+    if ((cpu_features & kSSE4_1) != 0) {
+      AverageBlendInit_SSE4_1();
+      CdefInit_SSE4_1();
+      ConvolveInit_SSE4_1();
+      DistanceWeightedBlendInit_SSE4_1();
+      IntraEdgeInit_SSE4_1();
+      IntraPredInit_SSE4_1();
+      IntraPredCflInit_SSE4_1();
+      IntraPredSmoothInit_SSE4_1();
+      InverseTransformInit_SSE4_1();
+      LoopFilterInit_SSE4_1();
+      LoopRestorationInit_SSE4_1();
+      MaskBlendInit_SSE4_1();
+      MotionFieldProjectionInit_SSE4_1();
+      MotionVectorSearchInit_SSE4_1();
+      ObmcInit_SSE4_1();
+      SuperResInit_SSE4_1();
+      WarpInit_SSE4_1();
+      WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_SSE4_1();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+    if ((cpu_features & kAVX2) != 0) {
+      ConvolveInit_AVX2();
+      LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_AVX2();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_AVX2
+#endif  // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
+#if LIBGAV1_ENABLE_NEON
+    AverageBlendInit_NEON();
+    CdefInit_NEON();
+    ConvolveInit_NEON();
+    DistanceWeightedBlendInit_NEON();
+    FilmGrainInit_NEON();
+    IntraEdgeInit_NEON();
+    IntraPredCflInit_NEON();
+    IntraPredDirectionalInit_NEON();
+    IntraPredFilterIntraInit_NEON();
+    IntraPredInit_NEON();
+    IntraPredSmoothInit_NEON();
+    InverseTransformInit_NEON();
+    LoopFilterInit_NEON();
+    LoopRestorationInit_NEON();
+    MaskBlendInit_NEON();
+    MotionFieldProjectionInit_NEON();
+    MotionVectorSearchInit_NEON();
+    ObmcInit_NEON();
+    SuperResInit_NEON();
+    WarpInit_NEON();
+    WeightMaskInit_NEON();
+#endif  // LIBGAV1_ENABLE_NEON
+  });
+}
+
+const Dsp* GetDspTable(int bitdepth) {
+  return dsp_internal::GetWritableDspTable(bitdepth);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
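
Typical startup order for a caller, as a sketch using only the functions
declared in these two files:

    libgav1::dsp::DspInit();  // one-time, thread-safe; wires up C + SIMD tables
    const libgav1::dsp::Dsp* table = libgav1::dsp::GetDspTable(8);
    assert(table != nullptr);  // the 8bpp table is always compiled in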
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
new file mode 100644
index 0000000..fcbac3a
--- /dev/null
+++ b/src/dsp/dsp.h
@@ -0,0 +1,910 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_DSP_H_
+#define LIBGAV1_SRC_DSP_DSP_H_
+
+#include <cstddef>  // ptrdiff_t
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/cpu.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+
+#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
+#endif
+
+enum IntraPredictor : uint8_t {
+  kIntraPredictorDcFill,
+  kIntraPredictorDcTop,
+  kIntraPredictorDcLeft,
+  kIntraPredictorDc,
+  kIntraPredictorVertical,
+  kIntraPredictorHorizontal,
+  kIntraPredictorPaeth,
+  kIntraPredictorSmooth,
+  kIntraPredictorSmoothVertical,
+  kIntraPredictorSmoothHorizontal,
+  kNumIntraPredictors
+};
+
+// List of valid 1D transforms.
+enum Transform1D : uint8_t {
+  k1DTransformDct,   // Discrete Cosine Transform.
+  k1DTransformAdst,  // Asymmetric Discrete Sine Transform.
+  k1DTransformIdentity,
+  k1DTransformWht,  // Walsh Hadamard Transform.
+  kNum1DTransforms
+};
+
+// List of valid 1D transform sizes. Not all transforms may be available for all
+// the sizes.
+enum TransformSize1D : uint8_t {
+  k1DTransformSize4,
+  k1DTransformSize8,
+  k1DTransformSize16,
+  k1DTransformSize32,
+  k1DTransformSize64,
+  kNum1DTransformSizes
+};
+
+// The maximum width of the loop filter, fewer pixels may be filtered depending
+// on strength thresholds.
+enum LoopFilterSize : uint8_t {
+  kLoopFilterSize4,
+  kLoopFilterSize6,
+  kLoopFilterSize8,
+  kLoopFilterSize14,
+  kNumLoopFilterSizes
+};
+
+enum : uint8_t {
+  kRow = 0,
+  kColumn = 1,
+};
+
+//------------------------------------------------------------------------------
+// ToString()
+//
+// These functions are meant to be used only in debug logging and within tests.
+// They are defined inline to avoid including the strings in the release
+// library when logging is disabled; unreferenced functions will not be added to
+// any object file in that case.
+
+inline const char* ToString(const IntraPredictor predictor) {
+  switch (predictor) {
+    case kIntraPredictorDcFill:
+      return "kIntraPredictorDcFill";
+    case kIntraPredictorDcTop:
+      return "kIntraPredictorDcTop";
+    case kIntraPredictorDcLeft:
+      return "kIntraPredictorDcLeft";
+    case kIntraPredictorDc:
+      return "kIntraPredictorDc";
+    case kIntraPredictorVertical:
+      return "kIntraPredictorVertical";
+    case kIntraPredictorHorizontal:
+      return "kIntraPredictorHorizontal";
+    case kIntraPredictorPaeth:
+      return "kIntraPredictorPaeth";
+    case kIntraPredictorSmooth:
+      return "kIntraPredictorSmooth";
+    case kIntraPredictorSmoothVertical:
+      return "kIntraPredictorSmoothVertical";
+    case kIntraPredictorSmoothHorizontal:
+      return "kIntraPredictorSmoothHorizontal";
+    case kNumIntraPredictors:
+      return "kNumIntraPredictors";
+  }
+  abort();
+}
+
+inline const char* ToString(const Transform1D transform) {
+  switch (transform) {
+    case k1DTransformDct:
+      return "k1DTransformDct";
+    case k1DTransformAdst:
+      return "k1DTransformAdst";
+    case k1DTransformIdentity:
+      return "k1DTransformIdentity";
+    case k1DTransformWht:
+      return "k1DTransformWht";
+    case kNum1DTransforms:
+      return "kNum1DTransforms";
+  }
+  abort();
+}
+
+inline const char* ToString(const TransformSize1D transform_size) {
+  switch (transform_size) {
+    case k1DTransformSize4:
+      return "k1DTransformSize4";
+    case k1DTransformSize8:
+      return "k1DTransformSize8";
+    case k1DTransformSize16:
+      return "k1DTransformSize16";
+    case k1DTransformSize32:
+      return "k1DTransformSize32";
+    case k1DTransformSize64:
+      return "k1DTransformSize64";
+    case kNum1DTransformSizes:
+      return "kNum1DTransformSizes";
+  }
+  abort();
+}
+
+inline const char* ToString(const LoopFilterSize filter_size) {
+  switch (filter_size) {
+    case kLoopFilterSize4:
+      return "kLoopFilterSize4";
+    case kLoopFilterSize6:
+      return "kLoopFilterSize6";
+    case kLoopFilterSize8:
+      return "kLoopFilterSize8";
+    case kLoopFilterSize14:
+      return "kLoopFilterSize14";
+    case kNumLoopFilterSizes:
+      return "kNumLoopFilterSizes";
+  }
+  abort();
+}
+
+inline const char* ToString(const LoopFilterType filter_type) {
+  switch (filter_type) {
+    case kLoopFilterTypeVertical:
+      return "kLoopFilterTypeVertical";
+    case kLoopFilterTypeHorizontal:
+      return "kLoopFilterTypeHorizontal";
+    case kNumLoopFilterTypes:
+      return "kNumLoopFilterTypes";
+  }
+  abort();
+}
+
+//------------------------------------------------------------------------------
+// Intra predictors. Section 7.11.2.
+// These require access to one or both of the top row and left column. Some may
+// access the top-left (top[-1]), top-right (top[width+N]), bottom-left
+// (left[height+N]) or upper-left (left[-1]).
+
+// Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
+// 7.11.2.5, 7.11.2.6.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. top-left and bottom-left may be accessed.
+using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+                                    const void* top, const void* left);
+using IntraPredictorFuncs =
+    IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
+
+// Directional intra predictor function signature, zone 1 (0 < angle < 90).
+// Section 7.11.2.4 (#7).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |width| and |height| give the dimensions of the block.
+// |xstep| is the scaled starting index to |top| from
+// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
+// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. top-right
+// is accessed.
+using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
+                                                    const void* top, int width,
+                                                    int height, int xstep,
+                                                    bool upsampled_top);
+
+// Directional intra predictor function signature, zone 2 (90 < angle < 180).
+// Section 7.11.2.4 (#8).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left of
+// |dst|. |width| and |height| give the dimensions of the block. |xstep| and
+// |ystep| are the scaled starting index to |top| and |left|, respectively,
+// from kDirectionalIntraPredictorDerivative. |upsampled_top| and
+// |upsampled_left| indicate whether |top| and |left| have been upsampled as
+// described in '7.11.2.11. Intra edge upsample process'. This can occur in
+// cases with |width| + |height| <= 16. top-left and upper-left are accessed,
+// up to [-2] in each if |upsampled_top/left| are set.
+using DirectionalIntraPredictorZone2Func = void (*)(
+    void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
+    int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
+
+// Directional intra predictor function signature, zone 3 (180 < angle < 270).
+// Section 7.11.2.4 (#9).
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
+// column to the left of |dst|. |width| and |height| give the dimensions of the
+// block. |ystep| is the scaled starting index to |left| from
+// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
+// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
+// process'. This can occur in cases with |width| + |height| <= 16. bottom-left
+// is accessed.
+using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
+                                                    const void* left, int width,
+                                                    int height, int ystep,
+                                                    bool upsampled_left);
+
+// Filter intra predictor function signature. Section 7.11.2.3.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
+// the row above |dst|. |left| is an aligned vector of the column to the left
+// of |dst|. |width| and |height| are the size of the block in pixels.
+using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
+                                          const void* top, const void* left,
+                                          FilterIntraPredictor pred, int width,
+                                          int height);
+
+//------------------------------------------------------------------------------
+// Chroma from Luma (Cfl) prediction. Section 7.11.5.
+
+// Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
+// unaligned pointer to the output block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
+// fractional bits of precision. |alpha| is the signed Cfl alpha value for the
+// appropriate plane.
+using CflIntraPredictorFunc = void (*)(
+    void* dst, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
+using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
+
+// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
+// pointer to the output block. |src| is an unaligned pointer to the input
+// block. Pixel size is determined by bitdepth with |stride| given in bytes.
+using CflSubsamplerFunc =
+    void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+             int max_luma_width, int max_luma_height, const void* source,
+             ptrdiff_t stride);
+using CflSubsamplerFuncs =
+    CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
+
+//------------------------------------------------------------------------------
+// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.
+
+// Intra edge filter function signature. |buffer| is a pointer to the top_row or
+// left_column that needs to be filtered. Typically the -1'th index of |top_row|
+// and |left_column| need to be filtered as well, so the caller can merely pass
+// the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be filtered. |strength| is the
+// filter strength. Section 7.11.2.12 in the spec.
+using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
+
+// Intra edge upsampler function signature. |buffer| is a pointer to the top_row
+// or left_column that needs to be upsampled. Pixel size is determined by
+// bitdepth. |size| is the number of pixels to be upsampled; valid values are:
+// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
+// the |buffer|. Section 7.11.2.11 in the spec.
+using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
+
+//------------------------------------------------------------------------------
+// Inverse transform add function signature.
+//
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the destination frame
+// for the transform type and block size |tx_size| starting at position
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
+// |adjusted_tx_height| is the number of rows to process based on the non-zero
+// coefficient count in the block. It will be 1 (non-zero coefficient count ==
+// 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less.
+using InverseTransformAddFunc = void (*)(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* src_buffer, int start_x,
+                                         int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
+using InverseTransformAddFuncs =
+    InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
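+
+// Selection sketch (assuming |dsp| is a const Dsp* and |inverse_transforms| is
+// the Dsp member holding InverseTransformAddFuncs): the row pass of a 16-point
+// DCT is reached as
+//   dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow];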
+
+//------------------------------------------------------------------------------
+// Post processing.
+
+// Loop filter function signature. Section 7.14.
+// |dst| is an unaligned pointer to the output block. Pixel size is determined
+// by bitdepth with |stride| given in bytes.
+using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
+                                int inner_thresh, int hev_thresh);
+using LoopFilterFuncs =
+    LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
+
+// Cdef direction function signature. Section 7.15.2.
+// |src| is a pointer to the source block. Pixel size is determined by bitdepth
+// with |stride| given in bytes. |direction| and |variance| are output
+// parameters and must not be nullptr.
+using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
+                                   uint8_t* direction, int* variance);
+
+// Cdef filtering function signature. Section 7.15.3.
+// |source| is a pointer to the input block padded with kCdefLargeValue if at a
+// frame border. |source_stride| is given in units of uint16_t.
+// |block_width|, |block_height| are the width/height of the input block.
+// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
+// parameters.
+// |direction| is the filtering direction.
+// |dest| is the output buffer. |dest_stride| is given in bytes.
+using CdefFilteringFunc = void (*)(const uint16_t* source,
+                                   ptrdiff_t source_stride, int block_height,
+                                   int primary_strength, int secondary_strength,
+                                   int damping, int direction, void* dest,
+                                   ptrdiff_t dest_stride);
+
+// The first index is block width: [0]: 4, [1]: 8. The second is based on
+// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
+// |primary_strength| only, [2]: |secondary_strength| only.
+using CdefFilteringFuncs = CdefFilteringFunc[2][3];
+
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no corresponding
+// C function. Different SIMD versions may have different outputs. So it must
+// pair with the corresponding version of SuperResFunc.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+                                          int initial_subpixel_x, int step,
+                                          void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |dest| is the output buffer.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+                              ptrdiff_t stride, int height,
+                              int downscaled_width, int upscaled_width,
+                              int initial_subpixel_x, int step, void* dest);
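+
+// Selection sketch for the Cdef filters above (assuming |cdef_filters| is the
+// Dsp member of type CdefFilteringFuncs): an 8x8 block filtered with only a
+// primary strength uses
+//   dsp->cdef_filters[1][1];  // [block width 8][primary strength only]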
+
+// Loop restoration function signature. Sections 7.16, 7.17.
+// |restoration_info| contains loop restoration information, such as filter
+// type, strength.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source|, |top_border|,
+// |bottom_border| and |dest|.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
+using LoopRestorationFunc = void (*)(
+    const RestorationUnitInfo& restoration_info, const void* source,
+    const void* top_border, const void* bottom_border, ptrdiff_t stride,
+    int width, int height, RestorationBuffer* restoration_buffer, void* dest);
+
+// Index 0 is Wiener Filter.
+// Index 1 is Self Guided Restoration Filter.
+// This can be accessed as LoopRestorationType - 2.
+using LoopRestorationFuncs = LoopRestorationFunc[2];
+
+// Convolve function signature. Section 7.11.3.4.
+// This function applies a horizontal filter followed by a vertical filter.
+// |reference| is the input block (reference frame buffer). |reference_stride|
+// is the corresponding frame stride.
+// |vertical_filter_index|/|horizontal_filter_index| is the index to
+// retrieve the type of filter to be applied for vertical/horizontal direction
+// from the filter lookup table 'kSubPixelFilters'.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
+// |width| and |height| are width and height of the block to be filtered.
+// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
+// x/y direction.
+// |prediction| is the output block (output frame buffer).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
+// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
+// be used.
+using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
+                              int horizontal_filter_index,
+                              int vertical_filter_index,
+                              int horizontal_filter_id, int vertical_filter_id,
+                              int width, int height, void* prediction,
+                              ptrdiff_t pred_stride);
+
+// Convolve functions signature. Each points to one convolve function with
+// a specific setting:
+// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
+// [has_horizontal_filter].
+// If is_compound is false, the prediction is clipped to Pixel.
+// If is_compound is true, the range of prediction is:
+//   8bpp:  [-5132,  9212] (int16_t)
+//   10bpp: [ 3988, 61532] (uint16_t)
+//   12bpp: [ 3974, 61559] (uint16_t)
+// See src/dsp/convolve.cc
+using ConvolveFuncs = ConvolveFunc[2][2][2][2];
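+
+// Selection sketch (assuming |convolve| is the Dsp member of type
+// ConvolveFuncs): a plain single-reference prediction with both filter ids
+// nonzero resolves to Convolve2D:
+//   dsp->convolve[/*is_intra_block_copy=*/0][/*is_compound=*/0]
+//                [/*has_vertical_filter=*/1][/*has_horizontal_filter=*/1];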
+using ConvolveScaleFunc = void (*)(const void* reference, + ptrdiff_t reference_stride, + int horizontal_filter_index, + int vertical_filter_index, int subpixel_x, + int subpixel_y, int step_x, int step_y, + int width, int height, void* prediction, + ptrdiff_t pred_stride); + +// Convolve functions signature for scaling version. +// 0: single predictor. 1: compound predictor. +using ConvolveScaleFuncs = ConvolveScaleFunc[2]; + +// Weight mask function signature. Section 7.11.3.12. +// |prediction_0| is the first input block. +// |prediction_1| is the second input block. Both blocks are int16_t* when +// bitdepth == 8 and uint16_t* otherwise. +// |width| and |height| are the prediction width and height. +// The stride for the input buffers is equal to |width|. +// The valid range of block size is [8x8, 128x128] for the luma plane. +// |mask| is the output buffer. |mask_stride| is the output buffer stride. +using WeightMaskFunc = void (*)(const void* prediction_0, + const void* prediction_1, uint8_t* mask, + ptrdiff_t mask_stride); + +// Weight mask functions signature. The dimensions (in order) are: +// * Width index (4 => 0, 8 => 1, 16 => 2 and so on). +// * Height index (4 => 0, 8 => 1, 16 => 2 and so on). +// * mask_is_inverse. +using WeightMaskFuncs = WeightMaskFunc[6][6][2]; + +// Average blending function signature. +// Two predictors are averaged to generate the output. +// Input predictor values are int16_t. Output type is uint8_t, with actual +// range of Pixel value. +// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE). +// |prediction_0| is the first input block. +// |prediction_1| is the second input block. Both blocks are int16_t* when +// bitdepth == 8 and uint16_t* otherwise. +// |width| and |height| are the same for the first and second input blocks. +// The stride for the input buffers is equal to |width|. +// The valid range of block size is [8x8, 128x128] for the luma plane. +// |dest| is the output buffer. |dest_stride| is the output buffer stride. +using AverageBlendFunc = void (*)(const void* prediction_0, + const void* prediction_1, int width, + int height, void* dest, + ptrdiff_t dest_stride); + +// Distance weighted blending function signature. +// Weights are generated in Section 7.11.3.15. +// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE). +// This function takes two blocks (inter frame prediction) and produces a +// weighted output. +// |prediction_0| is the first input block. +// |prediction_1| is the second input block. Both blocks are int16_t* when +// bitdepth == 8 and uint16_t* otherwise. +// |weight_0| is the weight for the first block. It is derived from the relative +// distance of the first reference frame and the current frame. +// |weight_1| is the weight for the second block. It is derived from the +// relative distance of the second reference frame and the current frame. +// |width| and |height| are the same for the first and second input blocks. +// The stride for the input buffers is equal to |width|. +// The valid range of block size is [8x8, 128x128] for the luma plane. +// |dest| is the output buffer. |dest_stride| is the output buffer stride. +using DistanceWeightedBlendFunc = void (*)(const void* prediction_0, + const void* prediction_1, + uint8_t weight_0, uint8_t weight_1, + int width, int height, void* dest, + ptrdiff_t dest_stride); + +// Mask blending function signature. Section 7.11.3.14. 
+// This function takes two blocks and produces a blended output stored into the
+// output block |dest|. The blending is a weighted average process, controlled
+// by the values of the mask.
+// |prediction_0| is the first input block. When the prediction mode is
+// inter_intra (or wedge_inter_intra), this refers to the inter frame
+// prediction. It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// The stride for |prediction_0| is equal to |width|.
+// |prediction_1| is the second input block. When the prediction mode is
+// inter_intra (or wedge_inter_intra), this refers to the intra frame
+// prediction and uses Pixel values. It is only used for intra frame prediction
+// when bitdepth >= 10.
+// It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
+// |prediction_stride_1| is the stride, given in units of [u]int16_t. When
+// |is_inter_intra| is false (compound prediction), |prediction_stride_1| is
+// equal to |width|.
+// |mask| is an integer array whose values indicate the weight of the blending.
+// |mask_stride| is the corresponding stride.
+// |width|, |height| are the same for both input blocks.
+// If it's inter_intra (or wedge_inter_intra), the valid range of block size is
+// [8x8, 32x32]. Otherwise (including difference weighted prediction and
+// compound average prediction), the valid range is [8x8, 128x128].
+// If there's subsampling, the corresponding width and height are halved for
+// chroma planes.
+// |subsampling_x|, |subsampling_y| are the subsampling factors.
+// |is_inter_intra| indicates the prediction mode. If it is true, one of the
+// prediction blocks comes from intra prediction of the current frame.
+// Otherwise, both prediction blocks are inter frame predictions.
+// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
+// |dest| is the output block.
+// |dest_stride| is the corresponding stride for |dest|.
+using MaskBlendFunc = void (*)(const void* prediction_0,
+                               const void* prediction_1,
+                               ptrdiff_t prediction_stride_1,
+                               const uint8_t* mask, ptrdiff_t mask_stride,
+                               int width, int height, void* dest,
+                               ptrdiff_t dest_stride);
+
+// Mask blending functions signature. Each entry points to one function with
+// a specific setting:
+// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
+using MaskBlendFuncs = MaskBlendFunc[3][2];
+
+// This function is similar to the MaskBlendFunc. It is only used when
+// |is_inter_intra| is true and |bitdepth| == 8.
+// |prediction_[01]| are Pixel values (uint8_t).
+// |prediction_1| is also the output buffer.
+using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
+                                             uint8_t* prediction_1,
+                                             ptrdiff_t prediction_stride_1,
+                                             const uint8_t* mask,
+                                             ptrdiff_t mask_stride, int width,
+                                             int height);
+
+// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
+// is false, the function at index 0 must be used. Otherwise, the function at
+// index subsampling_x + subsampling_y must be used.
+using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
+
+// Obmc (overlapped block motion compensation) blending function signature.
+// Section 7.11.3.10.
+// This function takes two blocks and produces a blended output stored into the
+// first input block. The blending is a weighted average process, controlled by
+// the values of the mask.
+// Obmc is not a compound mode; it differs from the other compound blending
+// modes in terms of precision. The current block is computed using convolution
+// with clipping to the range of pixel values.
+// Its above and left blocks are also clipped, so the obmc blending process
+// does not need to clip the output.
+// |prediction| is the first input block, which will be overwritten.
+// |prediction_stride| is the stride, given in bytes.
+// |width|, |height| are the same for both input blocks.
+// |obmc_prediction| is the second input block.
+// |obmc_prediction_stride| is its stride, given in bytes.
+using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
+                               int width, int height,
+                               const void* obmc_prediction,
+                               ptrdiff_t obmc_prediction_stride);
+using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
+
+// Warp function signature. Section 7.11.3.5.
+// This function applies warp filtering for each 8x8 block inside the current
+// coding block. The filtering process is similar to 2d convolve filtering.
+// The horizontal filter is applied followed by the vertical filter.
+// The function has to calculate corresponding pixel positions before and
+// after warping.
+// |source| is the input reference frame buffer.
+// |source_stride|, |source_width|, |source_height| are the corresponding frame
+// stride, width, and height. |source_stride| is given in bytes.
+// |warp_params| is the matrix of warp motion: warp_params[i] = mN.
+//     [x'    (m2 m3 m0   [x
+// z .  y'  =  m4 m5 m1 *  y
+//     1]      m6 m7 1)    1]
+// |subsampling_x/y| is the current frame's plane subsampling factor.
+// |block_start_x| and |block_start_y| are the starting position of the current
+// coding block.
+// |block_width| and |block_height| are the width and height of the current
+// coding block. |block_width| and |block_height| are at least 8.
+// |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
+// comments in the definition of struct GlobalMotion for the range of their
+// values.
+// |dest| is the output buffer of type Pixel. The output values are clipped to
+// Pixel values.
+// |dest_stride| is the stride, in units of bytes.
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsVertical &
+// kInterRoundBitsVertical12bpp will be used.
+//
+// NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
+// borders that extend the frame boundary pixels.
+// * The left and right borders must be at least 13 pixels wide. In addition,
+//   Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
+//   Therefore, there must be at least one extra padding byte after the right
+//   border of the last row in the source buffer.
+// * The top and bottom borders must be at least 13 pixels high.
+using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
+                          int source_width, int source_height,
+                          const int* warp_params, int subsampling_x,
+                          int subsampling_y, int block_start_x,
+                          int block_start_y, int block_width, int block_height,
+                          int16_t alpha, int16_t beta, int16_t gamma,
+                          int16_t delta, void* dest, ptrdiff_t dest_stride);
+
+// Warp for compound predictions. Section 7.11.3.5.
+// Similar to WarpFunc, but |dest| is a uint16_t predictor buffer,
+// |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical|
+// is always 7 (kInterRoundBitsCompoundVertical).
+// Rounding precision is derived from the function being called. For horizontal
+// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
+// used. For vertical filtering kInterRoundBitsCompoundVertical will be used.
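+//
+// For example (illustrative, in real-number form following the matrix above):
+// when m6 == m7 == 0 the denominator z is 1 and the warp is affine, so
+//   x' = m2 * x + m3 * y + m0
+//   y' = m4 * x + m5 * y + m1
+// and an identity warp has m2 == m5 == 1 with the other parameters zero. (In
+// the actual |warp_params| array these values are stored in fixed point, so
+// the numbers here only illustrate the mapping.)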
+using WarpCompoundFunc = WarpFunc; + +constexpr int kNumAutoRegressionLags = 4; +// Applies an auto-regressive filter to the white noise in |luma_grain_buffer|. +// Section 7.18.3.3, second code block +// |params| are parameters read from frame header, mainly providing +// auto_regression_coeff_y for the filter and auto_regression_shift to right +// shift the filter sum by. Note: This method assumes +// params.auto_regression_coeff_lag is not 0. Do not call this method if +// params.auto_regression_coeff_lag is 0. +using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params, + void* luma_grain_buffer); +// Function index is auto_regression_coeff_lag - 1. +using LumaAutoRegressionFuncs = + LumaAutoRegressionFunc[kNumAutoRegressionLags - 1]; + +// Applies an auto-regressive filter to the white noise in u_grain and v_grain. +// Section 7.18.3.3, third code block +// The |luma_grain_buffer| provides samples that are added to the autoregressive +// sum when num_y_points > 0. +// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise +// that were generated from the stored Gaussian sequence, and are overwritten +// with the results of the autoregressive filter. |params| are parameters read +// from frame header, mainly providing auto_regression_coeff_u and +// auto_regression_coeff_v for each chroma plane's filter, and +// auto_regression_shift to right shift the filter sums by. +using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params, + const void* luma_grain_buffer, + int subsampling_x, int subsampling_y, + void* u_grain_buffer, + void* v_grain_buffer); +using ChromaAutoRegressionFuncs = + ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags]; + +// Build an image-wide "stripe" of grain noise for every 32 rows in the image. +// Section 7.18.3.5, first code block. +// Each 32x32 luma block is copied at a random offset specified via +// |grain_seed| from the grain template produced by autoregression, and the same +// is done for chroma grains, subject to subsampling. +// |width| and |height| are the dimensions of the overall image. +// |noise_stripes_buffer| points to an Array2DView with one row for each stripe. +// Because this function treats all planes identically and independently, it is +// simplified to take one grain buffer at a time. This means duplicating some +// random number generations, but that work can be reduced in other ways. +using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer, + int grain_seed, int width, + int height, int subsampling_x, + int subsampling_y, + void* noise_stripes_buffer); +using ConstructNoiseStripesFuncs = + ConstructNoiseStripesFunc[/*overlap_flag*/ 2]; + +// Compute the one or two overlap rows for each stripe copied to the noise +// image. +// Section 7.18.3.5, second code block. |width| and |height| are the +// dimensions of the overall image. |noise_stripes_buffer| points to an +// Array2DView with one row for each stripe. |noise_image_buffer| points to an +// Array2D containing the allocated plane for this frame. Because this function +// treats all planes identically and independently, it is simplified to take one +// grain buffer at a time. 
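+//
+// For example (illustrative): with subsampling_y == 0 each stripe after the
+// first contributes two overlap rows to the noise image, blended with weight
+// pairs 17/27 and 27/17; with subsampling_y == 1 there is only one overlap row
+// per stripe (weights 22/23), as the C implementation later in this patch
+// shows.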
+using ConstructNoiseImageOverlapFunc = + void (*)(const void* noise_stripes_buffer, int width, int height, + int subsampling_x, int subsampling_y, void* noise_image_buffer); + +// Populate a scaling lookup table with interpolated values of a piecewise +// linear function where values in |point_value| are mapped to the values in +// |point_scaling|. +// |num_points| can be between 0 and 15. When 0, the lookup table is set to +// zero. +// |point_value| and |point_scaling| have |num_points| valid elements. +using InitializeScalingLutFunc = void (*)( + int num_points, const uint8_t point_value[], const uint8_t point_scaling[], + uint8_t scaling_lut[kScalingLookupTableSize]); + +// Blend noise with image. Section 7.18.3.5, third code block. +// |width| is the width of each row, while |height| is how many rows to compute. +// |start_height| is an offset for the noise image, to support multithreading. +// |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these +// functions, according to the code in the spec. +// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded +// frame. They are blended with the film grain noise and written to +// |dest_plane_y| and |dest_plane_uv| as final output for display. +// source_plane_* and dest_plane_* may point to the same buffer, in which case +// the film grain noise is added in place. +// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from +// the frame's raw pixel value, to a scaling factor for the noise sample. +// |scaling_shift| is applied as a right shift after scaling, so that scaling +// down is possible. It is found in FilmGrainParams, but supplied directly to +// BlendNoiseWithImageLumaFunc because it's the only member used. +using BlendNoiseWithImageLumaFunc = + void (*)(const void* noise_image_ptr, int min_value, int max_value, + int scaling_shift, int width, int height, int start_height, + const uint8_t scaling_lut_y[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + void* dest_plane_y, ptrdiff_t dest_stride_y); + +using BlendNoiseWithImageChromaFunc = void (*)( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_value, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv); + +using BlendNoiseWithImageChromaFuncs = + BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2]; + +//------------------------------------------------------------------------------ + +struct FilmGrainFuncs { + LumaAutoRegressionFuncs luma_auto_regression; + ChromaAutoRegressionFuncs chroma_auto_regression; + ConstructNoiseStripesFuncs construct_noise_stripes; + ConstructNoiseImageOverlapFunc construct_noise_image_overlap; + InitializeScalingLutFunc initialize_scaling_lut; + BlendNoiseWithImageLumaFunc blend_noise_luma; + BlendNoiseWithImageChromaFuncs blend_noise_chroma; +}; + +// Motion field projection function signature. Section 7.9. +// |reference_info| provides reference information for motion field projection. +// |reference_to_current_with_sign| is the precalculated reference frame id +// distance from current frame. +// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others. 
+// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile. +// |x8_start| and |x8_end| are the start and end 8x8 columns of the current +// tile. +// |motion_field| is the output which saves the projected motion field +// information. +using MotionFieldProjectionKernelFunc = void (*)( + const ReferenceInfo& reference_info, int reference_to_current_with_sign, + int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end, + TemporalMotionField* motion_field); + +// Compound temporal motion vector projection function signature. +// Section 7.9.3 and 7.10.2.10. +// |temporal_mvs| is the set of temporal reference motion vectors. +// |temporal_reference_offsets| specifies the number of frames covered by the +// original motion vector. +// |reference_offsets| specifies the number of frames to be covered by the +// projected motion vector. +// |count| is the number of the temporal motion vectors. +// |candidate_mvs| is the set of projected motion vectors. +using MvProjectionCompoundFunc = void (*)( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + const int reference_offsets[2], int count, + CompoundMotionVector* candidate_mvs); + +// Single temporal motion vector projection function signature. +// Section 7.9.3 and 7.10.2.10. +// |temporal_mvs| is the set of temporal reference motion vectors. +// |temporal_reference_offsets| specifies the number of frames covered by the +// original motion vector. +// |reference_offset| specifies the number of frames to be covered by the +// projected motion vector. +// |count| is the number of the temporal motion vectors. +// |candidate_mvs| is the set of projected motion vectors. +using MvProjectionSingleFunc = void (*)( + const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets, + int reference_offset, int count, MotionVector* candidate_mvs); + +struct Dsp { + AverageBlendFunc average_blend; + CdefDirectionFunc cdef_direction; + CdefFilteringFuncs cdef_filters; + CflIntraPredictorFuncs cfl_intra_predictors; + CflSubsamplerFuncs cfl_subsamplers; + ConvolveFuncs convolve; + ConvolveScaleFuncs convolve_scale; + DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1; + DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2; + DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3; + DistanceWeightedBlendFunc distance_weighted_blend; + FilmGrainFuncs film_grain; + FilterIntraPredictorFunc filter_intra_predictor; + InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp; + IntraEdgeFilterFunc intra_edge_filter; + IntraEdgeUpsamplerFunc intra_edge_upsampler; + IntraPredictorFuncs intra_predictors; + InverseTransformAddFuncs inverse_transforms; + LoopFilterFuncs loop_filters; + LoopRestorationFuncs loop_restorations; + MaskBlendFuncs mask_blend; + MotionFieldProjectionKernelFunc motion_field_projection_kernel; + MvProjectionCompoundFunc mv_projection_compound[3]; + MvProjectionSingleFunc mv_projection_single[3]; + ObmcBlendFuncs obmc_blend; + SuperResCoefficientsFunc super_res_coefficients; + SuperResFunc super_res; + WarpCompoundFunc warp_compound; + WarpFunc warp; + WeightMaskFuncs weight_mask; +}; + +// Initializes function pointers based on build config and runtime +// environment. Must be called once before first use. This function is +// thread-safe. +void DspInit(); + +// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't +// exist. 
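+//
+// A usage sketch (illustrative; the prediction buffers and their sizes are
+// assumed to come from the caller):
+//   DspInit();
+//   const Dsp* const dsp = GetDspTable(8);
+//   assert(dsp != nullptr);
+//   dsp->average_blend(prediction_0, prediction_1, width, height, dest,
+//                      dest_stride);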
+const Dsp* GetDspTable(int bitdepth);
+
+}  // namespace dsp
+
+namespace dsp_internal {
+
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+    (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
+// Returns true if a more highly optimized version of |func| is not defined for
+// the associated bitdepth or if it is forcibly enabled with
+// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
+// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
+// with the module.
+// |func| is one of:
+//   - FunctionName, e.g., SelfGuidedFilter.
+//   - [sub-table-index1][...-indexN] e.g.,
+//     TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
+//     used as lookups with leading 'k' removed.
+//
+// NEON support is the only extension available for ARM and it is always
+// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
+// true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func)    \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func)   \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_8BPP_SSE4_1(func)  \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
+#define DSP_ENABLED_10BPP_SSE4_1(func) \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
+
+// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
+// exist. This version is meant for use by test or dsp/*Init() functions only.
+dsp::Dsp* GetWritableDspTable(int bitdepth);
+
+}  // namespace dsp_internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_DSP_H_
diff --git a/src/dsp/film_grain.cc b/src/dsp/film_grain.cc
new file mode 100644
index 0000000..41d1dd0
--- /dev/null
+++ b/src/dsp/film_grain.cc
@@ -0,0 +1,870 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Making this a template function prevents it from adding to code size when it
+// is not placed in the DSP table. Most functions in the dsp directory change
+// behavior by bitdepth, but because this one doesn't, it receives a dummy
+// parameter with one enforced value, ensuring only one copy is made.
+template <int singleton>
+void InitializeScalingLookupTable_C(
+    int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
+    uint8_t scaling_lut[kScalingLookupTableSize]) {
+  static_assert(singleton == 0,
+                "Improper instantiation of InitializeScalingLookupTable_C. "
+                "There should be only one copy of this function.");
+  if (num_points == 0) {
+    memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize);
+    return;
+  }
+  static_assert(sizeof(scaling_lut[0]) == 1, "");
+  memset(scaling_lut, point_scaling[0], point_value[0]);
+  for (int i = 0; i < num_points - 1; ++i) {
+    const int delta_y = point_scaling[i + 1] - point_scaling[i];
+    const int delta_x = point_value[i + 1] - point_value[i];
+    const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+    for (int x = 0; x < delta_x; ++x) {
+      const int v = point_scaling[i] + ((x * delta + 32768) >> 16);
+      assert(v >= 0 && v <= UINT8_MAX);
+      scaling_lut[point_value[i] + x] = v;
+    }
+  }
+  const uint8_t last_point_value = point_value[num_points - 1];
+  memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
+         kScalingLookupTableSize - last_point_value);
+}
+
+// Section 7.18.3.5.
+// Performs a piecewise linear interpolation into the scaling table.
+template <int bitdepth>
+int ScaleLut(const uint8_t scaling_lut[kScalingLookupTableSize], int index) {
+  const int shift = bitdepth - 8;
+  const int quotient = index >> shift;
+  const int remainder = index - (quotient << shift);
+  if (bitdepth == 8) {
+    assert(quotient < kScalingLookupTableSize);
+    return scaling_lut[quotient];
+  }
+  assert(quotient + 1 < kScalingLookupTableSize);
+  const int start = scaling_lut[quotient];
+  const int end = scaling_lut[quotient + 1];
+  return start + RightShiftWithRounding((end - start) * remainder, shift);
+}
+
+// Applies an auto-regressive filter to the white noise in luma_grain.
+template <int bitdepth, typename GrainType>
+void ApplyAutoRegressiveFilterToLumaGrain_C(const FilmGrainParams& params,
+                                            void* luma_grain_buffer) {
+  auto* luma_grain = static_cast<GrainType*>(luma_grain_buffer);
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int grain_max = GetGrainMax<bitdepth>();
+  const int auto_regression_coeff_lag = params.auto_regression_coeff_lag;
+  assert(auto_regression_coeff_lag > 0 && auto_regression_coeff_lag <= 3);
+  // A pictorial representation of the auto-regressive filter for various
+  // values of auto_regression_coeff_lag. The letter 'O' represents the current
+  // sample. (The filter always operates on the current sample with filter
+  // coefficient 1.) The letters 'X' represent the neighboring samples that the
+  // filter operates on.
+  //
+  // auto_regression_coeff_lag == 3:
+  //   X X X X X X X
+  //   X X X X X X X
+  //   X X X X X X X
+  //   X X X O
+  // auto_regression_coeff_lag == 2:
+  //     X X X X X
+  //     X X X X X
+  //     X X O
+  // auto_regression_coeff_lag == 1:
+  //       X X X
+  //       X O
+  // auto_regression_coeff_lag == 0:
+  //       O
+  //
+  // Note that if auto_regression_coeff_lag is 0, the filter is the identity
+  // filter and therefore can be skipped. This implementation assumes it is not
+  // called in that case.
+  const int shift = params.auto_regression_shift;
+  for (int y = kAutoRegressionBorder; y < kLumaHeight; ++y) {
+    for (int x = kAutoRegressionBorder; x < kLumaWidth - kAutoRegressionBorder;
+         ++x) {
+      int sum = 0;
+      int pos = 0;
+      int delta_row = -auto_regression_coeff_lag;
+      // The last iteration (delta_row == 0) is shorter and is handled
+      // separately.
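+      // (Illustration with assumed numbers: for auto_regression_coeff_lag ==
+      // 2, the loops below visit 2 * 2 * (2 + 1) = 12 neighboring samples in
+      // raster order, matching the 'X' diagram above, and |pos| walks
+      // params.auto_regression_coeff_y in that same order.)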
+      do {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          const int coeff = params.auto_regression_coeff_y[pos];
+          sum += luma_grain[(y + delta_row) * kLumaWidth + (x + delta_column)] *
+                 coeff;
+          ++pos;
+        } while (++delta_column <= auto_regression_coeff_lag);
+      } while (++delta_row < 0);
+      // Last iteration: delta_row == 0.
+      {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          const int coeff = params.auto_regression_coeff_y[pos];
+          sum += luma_grain[y * kLumaWidth + (x + delta_column)] * coeff;
+          ++pos;
+        } while (++delta_column < 0);
+      }
+      luma_grain[y * kLumaWidth + x] = Clip3(
+          luma_grain[y * kLumaWidth + x] + RightShiftWithRounding(sum, shift),
+          grain_min, grain_max);
+    }
+  }
+}
+
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
+          bool use_luma>
+void ApplyAutoRegressiveFilterToChromaGrains_C(const FilmGrainParams& params,
+                                               const void* luma_grain_buffer,
+                                               int subsampling_x,
+                                               int subsampling_y,
+                                               void* u_grain_buffer,
+                                               void* v_grain_buffer) {
+  static_assert(
+      auto_regression_coeff_lag >= 0 && auto_regression_coeff_lag <= 3,
+      "Unsupported autoregression lag for chroma.");
+  const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
+  const int grain_min = GetGrainMin<bitdepth>();
+  const int grain_max = GetGrainMax<bitdepth>();
+  auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
+  auto* v_grain = static_cast<GrainType*>(v_grain_buffer);
+  const int shift = params.auto_regression_shift;
+  const int chroma_height =
+      (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight;
+  const int chroma_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  for (int y = kAutoRegressionBorder; y < chroma_height; ++y) {
+    const int luma_y =
+        ((y - kAutoRegressionBorder) << subsampling_y) + kAutoRegressionBorder;
+    for (int x = kAutoRegressionBorder;
+         x < chroma_width - kAutoRegressionBorder; ++x) {
+      int sum_u = 0;
+      int sum_v = 0;
+      int pos = 0;
+      int delta_row = -auto_regression_coeff_lag;
+      do {
+        int delta_column = -auto_regression_coeff_lag;
+        do {
+          if (delta_row == 0 && delta_column == 0) {
+            break;
+          }
+          const int coeff_u = params.auto_regression_coeff_u[pos];
+          const int coeff_v = params.auto_regression_coeff_v[pos];
+          sum_u +=
+              u_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+              coeff_u;
+          sum_v +=
+              v_grain[(y + delta_row) * chroma_width + (x + delta_column)] *
+              coeff_v;
+          ++pos;
+        } while (++delta_column <= auto_regression_coeff_lag);
+      } while (++delta_row <= 0);
+      if (use_luma) {
+        int luma = 0;
+        const int luma_x = ((x - kAutoRegressionBorder) << subsampling_x) +
+                           kAutoRegressionBorder;
+        int i = 0;
+        do {
+          int j = 0;
+          do {
+            luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
+          } while (++j <= subsampling_x);
+        } while (++i <= subsampling_y);
+        luma = SubsampledValue(luma, subsampling_x + subsampling_y);
+        const int coeff_u = params.auto_regression_coeff_u[pos];
+        const int coeff_v = params.auto_regression_coeff_v[pos];
+        sum_u += luma * coeff_u;
+        sum_v += luma * coeff_v;
+      }
+      u_grain[y * chroma_width + x] = Clip3(
+          u_grain[y * chroma_width + x] + RightShiftWithRounding(sum_u, shift),
+          grain_min, grain_max);
+      v_grain[y * chroma_width + x] = Clip3(
+          v_grain[y * chroma_width + x] + RightShiftWithRounding(sum_v, shift),
+          grain_min, grain_max);
+    }
+  }
+}
+
+// This implementation is for the condition overlap_flag == false.
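+// For example (illustrative numbers): with width == 128, height == 128 and no
+// subsampling, half_width == half_height == 64, so the loops below advance in
+// steps of 16 and produce four luma noise stripes. Each stripe is 34 rows tall
+// and corresponds to a 32-row band of the image, so consecutive stripes
+// overlap by two rows (the overlap is only blended when overlap_flag is true).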
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripes_C(const void* grain_buffer, int grain_seed,
+                             int width, int height, int subsampling_x,
+                             int subsampling_y, void* noise_stripes_buffer) {
+  auto* noise_stripes =
+      static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+  const auto* grain = static_cast<const GrainType*>(grain_buffer);
+  const int half_width = DivideBy2(width + 1);
+  const int half_height = DivideBy2(height + 1);
+  assert(half_width > 0);
+  assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth should be equal to kMaxChromaWidth");
+  const int grain_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  constexpr int kNoiseStripeHeight = 34;
+  int luma_num = 0;
+  int y = 0;
+  do {
+    GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+    uint16_t seed = grain_seed;
+    seed ^= ((luma_num * 37 + 178) & 255) << 8;
+    seed ^= ((luma_num * 173 + 105) & 255);
+    int x = 0;
+    do {
+      const int rand = GetFilmGrainRandomNumber(8, &seed);
+      const int offset_x = rand >> 4;
+      const int offset_y = rand & 15;
+      const int plane_offset_x =
+          (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+      const int plane_offset_y =
+          (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+      int i = 0;
+      do {
+        // Section 7.18.3.5 says:
+        //   noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+        //   wide (a few additional samples across are actually written to
+        //   the array, but these are never read) ...
+        //
+        // Note: The warning in the parentheses also applies to
+        // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+        //
+        // Writes beyond the width of each row could happen below. To
+        // prevent those writes, we clip the number of pixels to copy against
+        // the remaining width.
+        // TODO(petersonab): Allocate aligned stripes with extra width to cover
+        // the size of the final stripe block, then remove this call to min.
+        const int copy_size =
+            std::min(kNoiseStripeHeight >> subsampling_x,
+                     plane_width - (x << (1 - subsampling_x)));
+        memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x))],
+               &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+               copy_size * sizeof(noise_stripe[0]));
+      } while (++i < (kNoiseStripeHeight >> subsampling_y));
+      x += 16;
+    } while (x < half_width);
+
+    ++luma_num;
+    y += 16;
+  } while (y < half_height);
+}
+
+// This implementation is for the condition overlap_flag == true.
+template <int bitdepth, typename GrainType>
+void ConstructNoiseStripesWithOverlap_C(const void* grain_buffer,
+                                        int grain_seed, int width, int height,
+                                        int subsampling_x, int subsampling_y,
+                                        void* noise_stripes_buffer) {
+  auto* noise_stripes =
+      static_cast<Array2DView<GrainType>*>(noise_stripes_buffer);
+  const auto* grain = static_cast<const GrainType*>(grain_buffer);
+  const int half_width = DivideBy2(width + 1);
+  const int half_height = DivideBy2(height + 1);
+  assert(half_width > 0);
+  assert(half_height > 0);
+  static_assert(kLumaWidth == kMaxChromaWidth,
+                "kLumaWidth should be equal to kMaxChromaWidth");
+  const int grain_width =
+      (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth;
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  constexpr int kNoiseStripeHeight = 34;
+  int luma_num = 0;
+  int y = 0;
+  do {
+    GrainType* const noise_stripe = (*noise_stripes)[luma_num];
+    uint16_t seed = grain_seed;
+    seed ^= ((luma_num * 37 + 178) & 255) << 8;
+    seed ^= ((luma_num * 173 + 105) & 255);
+    // Begin special iteration for x == 0.
+    const int rand = GetFilmGrainRandomNumber(8, &seed);
+    const int offset_x = rand >> 4;
+    const int offset_y = rand & 15;
+    const int plane_offset_x =
+        (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+    const int plane_offset_y =
+        (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+    // The overlap computation only occurs when x > 0, so it is omitted here.
+    int i = 0;
+    do {
+      // TODO(petersonab): Allocate aligned stripes with extra width to cover
+      // the size of the final stripe block, then remove this call to min.
+      const int copy_size =
+          std::min(kNoiseStripeHeight >> subsampling_x, plane_width);
+      memcpy(&noise_stripe[i * plane_width],
+             &grain[(plane_offset_y + i) * grain_width + plane_offset_x],
+             copy_size * sizeof(noise_stripe[0]));
+    } while (++i < (kNoiseStripeHeight >> subsampling_y));
+    // End special iteration for x == 0.
+    for (int x = 16; x < half_width; x += 16) {
+      const int rand = GetFilmGrainRandomNumber(8, &seed);
+      const int offset_x = rand >> 4;
+      const int offset_y = rand & 15;
+      const int plane_offset_x =
+          (subsampling_x != 0) ? 6 + offset_x : 9 + offset_x * 2;
+      const int plane_offset_y =
+          (subsampling_y != 0) ? 6 + offset_y : 9 + offset_y * 2;
+      int i = 0;
+      do {
+        int j = 0;
+        int grain_sample =
+            grain[(plane_offset_y + i) * grain_width + plane_offset_x];
+        // The first pixel(s) of each segment of the noise_stripe are subject
+        // to the "overlap" computation.
+        if (subsampling_x == 0) {
+          // Corresponds to the line in the spec:
+          // if (j < 2 && x > 0)
+          // j = 0
+          int old = noise_stripe[i * plane_width + x * 2];
+          grain_sample = old * 27 + grain_sample * 17;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x * 2] = grain_sample;
+
+          // This check prevents overwriting for the iteration j = 1. The
+          // continue applies to the i-loop.
+          if (x * 2 + 1 >= plane_width) continue;
+          // j = 1
+          grain_sample =
+              grain[(plane_offset_y + i) * grain_width + plane_offset_x + 1];
+          old = noise_stripe[i * plane_width + x * 2 + 1];
+          grain_sample = old * 17 + grain_sample * 27;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x * 2 + 1] = grain_sample;
+          j = 2;
+        } else {
+          // Corresponds to the line in the spec:
+          // if (j == 0 && x > 0)
+          const int old = noise_stripe[i * plane_width + x];
+          grain_sample = old * 23 + grain_sample * 22;
+          grain_sample =
+              Clip3(RightShiftWithRounding(grain_sample, 5),
+                    GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+          noise_stripe[i * plane_width + x] = grain_sample;
+          j = 1;
+        }
+        // The following covers the rest of the loop over j as described in the
+        // spec.
+        //
+        // Section 7.18.3.5 says:
+        //   noiseStripe[ lumaNum ][ 0 ] is 34 samples high and w samples
+        //   wide (a few additional samples across are actually written to
+        //   the array, but these are never read) ...
+        //
+        // Note: The warning in the parentheses also applies to
+        // noiseStripe[ lumaNum ][ 1 ] and noiseStripe[ lumaNum ][ 2 ].
+        //
+        // Writes beyond the width of each row could happen below. To
+        // prevent those writes, we clip the number of pixels to copy against
+        // the remaining width.
+        // TODO(petersonab): Allocate aligned stripes with extra width to cover
+        // the size of the final stripe block, then remove this call to min.
+        const int copy_size =
+            std::min(kNoiseStripeHeight >> subsampling_x,
+                     plane_width - (x << (1 - subsampling_x))) -
+            j;
+        memcpy(&noise_stripe[i * plane_width + (x << (1 - subsampling_x)) + j],
+               &grain[(plane_offset_y + i) * grain_width + plane_offset_x + j],
+               copy_size * sizeof(noise_stripe[0]));
+      } while (++i < (kNoiseStripeHeight >> subsampling_y));
+    }
+
+    ++luma_num;
+    y += 16;
+  } while (y < half_height);
+}
+
+template <int bitdepth, typename GrainType>
+inline void WriteOverlapLine_C(const GrainType* noise_stripe_row,
+                               const GrainType* noise_stripe_row_prev,
+                               int plane_width, int grain_coeff, int old_coeff,
+                               GrainType* noise_image_row) {
+  int x = 0;
+  do {
+    int grain = noise_stripe_row[x];
+    const int old = noise_stripe_row_prev[x];
+    grain = old * old_coeff + grain * grain_coeff;
+    grain = Clip3(RightShiftWithRounding(grain, 5), GetGrainMin<bitdepth>(),
+                  GetGrainMax<bitdepth>());
+    noise_image_row[x] = grain;
+  } while (++x < plane_width);
+}
+
+template <int bitdepth, typename GrainType>
+void ConstructNoiseImageOverlap_C(const void* noise_stripes_buffer, int width,
+                                  int height, int subsampling_x,
+                                  int subsampling_y, void* noise_image_buffer) {
+  const auto* noise_stripes =
+      static_cast<const Array2DView<GrainType>*>(noise_stripes_buffer);
+  auto* noise_image = static_cast<Array2D<GrainType>*>(noise_image_buffer);
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
+  int y = stripe_height;
+  int luma_num = 1;
+  if (subsampling_y == 0) {
+    // Begin complete stripes section. This is when we are guaranteed to have
+    // two overlap rows in each stripe.
+    for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+      const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+      const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      // First overlap row.
+      WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                   &noise_stripe_prev[32 * plane_width],
+                                   plane_width, 17, 27, (*noise_image)[y]);
+      // Second overlap row.
+      WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+                                   &noise_stripe_prev[(32 + 1) * plane_width],
+                                   plane_width, 27, 17, (*noise_image)[y + 1]);
+    }
+    // End complete stripes section.
+
+    const int remaining_height = plane_height - y;
+    // Either one partial stripe remains (remaining_height > 0),
+    // OR image is less than one stripe high (remaining_height < 0),
+    // OR all stripes are completed (remaining_height == 0).
+    if (remaining_height <= 0) {
+      return;
+    }
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+    WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                 &noise_stripe_prev[32 * plane_width],
+                                 plane_width, 17, 27, (*noise_image)[y]);
+
+    // Check if second overlap row is in the image.
+    if (remaining_height > 1) {
+      WriteOverlapLine_C<bitdepth>(&noise_stripe[plane_width],
+                                   &noise_stripe_prev[(32 + 1) * plane_width],
+                                   plane_width, 27, 17, (*noise_image)[y + 1]);
+    }
+  } else {  // |subsampling_y| == 1
+    // No special checks needed for partial stripes, because if one exists, the
+    // first and only overlap row is guaranteed to exist.
+    for (; y < plane_height; ++luma_num, y += stripe_height) {
+      const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+      const GrainType* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+      WriteOverlapLine_C<bitdepth>(noise_stripe,
+                                   &noise_stripe_prev[16 * plane_width],
+                                   plane_width, 22, 23, (*noise_image)[y]);
+    }
+  }
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_C(
+    const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+    int width, int height, int start_height,
+    const uint8_t scaling_lut_y[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+    ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int orig = in_y[y * source_stride_y + x];
+      int noise = noise_image[kPlaneY][y + start_height][x];
+      noise = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut_y, orig) * noise, scaling_shift);
+      out_y[y * dest_stride_y + x] = Clip3(orig + noise, min_value, max_luma);
+    } while (++x < width);
+  } while (++y < height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChroma_C(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y,
+    const uint8_t scaling_lut_uv[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
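+
+  // A worked example of the blend below, with assumed values (8bpp, plane ==
+  // kPlaneU, u_luma_multiplier == 64, u_multiplier == 0, u_offset == 64):
+  // for average_luma == 128 and orig == 100,
+  //   combined = 128 * 64 + 100 * 0 = 8192
+  //   merged   = Clip3((8192 >> 6) + (64 << 0), 0, 255) = 192
+  // and the noise sample is then scaled by scaling_lut_uv[192] and right
+  // shifted by params.chroma_scaling before being added to orig.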
+  const int scaling_shift = params.chroma_scaling;
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int luma_y = y << subsampling_y;
+      const int luma_next_x = std::min(luma_x + 1, width - 1);
+      int average_luma;
+      if (subsampling_x != 0) {
+        average_luma = RightShiftWithRounding(
+            in_y[luma_y * source_stride_y + luma_x] +
+                in_y[luma_y * source_stride_y + luma_next_x],
+            1);
+      } else {
+        average_luma = in_y[luma_y * source_stride_y + luma_x];
+      }
+      const int orig = in_uv[y * source_stride_uv + x];
+      const int combined = average_luma * luma_multiplier + orig * multiplier;
+      const int merged =
+          Clip3((combined >> 6) + LeftShift(offset, bitdepth - 8), 0,
+                (1 << bitdepth) - 1);
+      int noise = noise_image[plane][y + start_height][x];
+      noise = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut_uv, merged) * noise, scaling_shift);
+      out_uv[y * dest_stride_uv + x] =
+          Clip3(orig + noise, min_value, max_chroma);
+    } while (++x < chroma_width);
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_C(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int scaling_shift = params.chroma_scaling;
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const int luma_x = x << subsampling_x;
+      const int luma_y = y << subsampling_y;
+      const int luma_next_x = std::min(luma_x + 1, width - 1);
+      int average_luma;
+      if (subsampling_x != 0) {
+        average_luma = RightShiftWithRounding(
+            in_y[luma_y * source_stride_y + luma_x] +
+                in_y[luma_y * source_stride_y + luma_next_x],
+            1);
+      } else {
+        average_luma = in_y[luma_y * source_stride_y + luma_x];
+      }
+      const int orig_uv = in_uv[y * source_stride_uv + x];
+      int noise_uv = noise_image[plane][y + start_height][x];
+      noise_uv = RightShiftWithRounding(
+          ScaleLut<bitdepth>(scaling_lut, average_luma) * noise_uv,
+          scaling_shift);
+      out_uv[y * dest_stride_uv + x] =
+          Clip3(orig_uv + noise_uv, min_value, max_chroma);
+    } while (++x < chroma_width);
+  } while (++y < chroma_height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // LumaAutoRegressionFunc
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+
+  // ChromaAutoRegressionFunc
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<8, int8_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<8, int8_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<8, int8_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<8, int8_t, uint8_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<8, int8_t, uint8_t>;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<8, int8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
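+  // (Illustration: the first index below is use_luma and the second is
+  // auto_regression_coeff_lag, so a frame whose grain parameters have
+  // num_y_points > 0 and auto_regression_coeff_lag == 2 dispatches to
+  // chroma_auto_regression[1][2].)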
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_C<8, int8_t, 3, true>; +#endif +#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseStripes + dsp->film_grain.construct_noise_stripes[0] = + ConstructNoiseStripes_C<8, int8_t>; + dsp->film_grain.construct_noise_stripes[1] = + ConstructNoiseStripesWithOverlap_C<8, int8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_FilmGrainConstructNoiseImageOverlap + dsp->film_grain.construct_noise_image_overlap = + ConstructNoiseImageOverlap_C<8, int8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc + dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_C<8, int8_t, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma + dsp->film_grain.blend_noise_chroma[0] = + BlendNoiseWithImageChroma_C<8, int8_t, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_C<8, int8_t, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>; + + // ChromaAutoRegressionFunc + // Chroma autoregression should never be called when lag is 0 and use_luma is + // false. 
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, true>;
+
+  // ConstructNoiseStripesFunc
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<10, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<10, int16_t>;
+
+  // ConstructNoiseImageOverlapFunc
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<10, int16_t>;
+
+  // InitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+
+  // BlendNoiseWithImageLumaFunc
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<10, int16_t, uint16_t>;
+
+  // BlendNoiseWithImageChromaFunc
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<10, int16_t, uint16_t>;
+#else   // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionLuma
+  dsp->film_grain.luma_auto_regression[0] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+  dsp->film_grain.luma_auto_regression[1] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+  dsp->film_grain.luma_auto_regression[2] =
+      ApplyAutoRegressiveFilterToLumaGrain_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainAutoregressionChroma
+  // Chroma autoregression should never be called when lag is 0 and use_luma
+  // is false.
+  dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
+  dsp->film_grain.chroma_auto_regression[0][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, false>;
+  dsp->film_grain.chroma_auto_regression[0][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, false>;
+  dsp->film_grain.chroma_auto_regression[0][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, false>;
+  dsp->film_grain.chroma_auto_regression[1][0] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 0, true>;
+  dsp->film_grain.chroma_auto_regression[1][1] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 1, true>;
+  dsp->film_grain.chroma_auto_regression[1][2] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 2, true>;
+  dsp->film_grain.chroma_auto_regression[1][3] =
+      ApplyAutoRegressiveFilterToChromaGrains_C<10, int16_t, 3, true>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseStripes
+  dsp->film_grain.construct_noise_stripes[0] =
+      ConstructNoiseStripes_C<10, int16_t>;
+  dsp->film_grain.construct_noise_stripes[1] =
+      ConstructNoiseStripesWithOverlap_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainConstructNoiseImageOverlap
+  dsp->film_grain.construct_noise_image_overlap =
+      ConstructNoiseImageOverlap_C<10, int16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc
+  dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_C<0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_C<10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma
+  dsp->film_grain.blend_noise_chroma[0] =
+      BlendNoiseWithImageChroma_C<10, int16_t, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_C<10, int16_t, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+}  // namespace film_grain
+
+void FilmGrainInit_C() {
+  film_grain::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/film_grain.h b/src/dsp/film_grain.h
new file mode 100644
index 0000000..fe93270
--- /dev/null
+++ b/src/dsp/film_grain.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/film_grain_neon.h"
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::film_grain. This function is not thread-safe.
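+// (For illustration only, a hypothetical sketch of how this initializer is
+// typically wired up; the real call site is DspInit() in src/dsp/dsp.cc and
+// may differ:
+//   void DspInit() {
+//     static std::once_flag once;  // requires <mutex>
+//     std::call_once(once, []() {
+//       FilmGrainInit_C();  // among the other per-module initializers
+//     });
+//   }
+// This pattern is what makes DspInit() safe to call from multiple threads.)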
+void FilmGrainInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_FILM_GRAIN_H_
diff --git a/src/dsp/film_grain_common.h b/src/dsp/film_grain_common.h
new file mode 100644
index 0000000..64e3e8e
--- /dev/null
+++ b/src/dsp/film_grain_common.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+#define LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+
+template <int bitdepth>
+int GetGrainMax() {
+  return (1 << (bitdepth - 1)) - 1;
+}
+
+template <int bitdepth>
+int GetGrainMin() {
+  return -(1 << (bitdepth - 1));
+}
+
+inline int GetFilmGrainRandomNumber(int bits, uint16_t* seed) {
+  uint16_t s = *seed;
+  uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+  s = (s >> 1) | (bit << 15);
+  *seed = s;
+  return s >> (16 - bits);
+}
+
+enum {
+  kAutoRegressionBorder = 3,
+  // The width of the luma noise array.
+  kLumaWidth = 82,
+  // The height of the luma noise array.
+  kLumaHeight = 73,
+  // The two possible widths of the chroma noise array.
+  kMinChromaWidth = 44,
+  kMaxChromaWidth = 82,
+  // The two possible heights of the chroma noise array.
+  kMinChromaHeight = 38,
+  kMaxChromaHeight = 73,
+  // The scaling lookup table maps bytes to bytes, so only uses 256 elements,
+  // plus one for overflow in 10bit lookups.
+  kScalingLookupTableSize = 257,
+  // Padding is added to the scaling lookup table to permit overwrites by
+  // InitializeScalingLookupTable_NEON.
+  kScalingLookupTablePadding = 6,
+  // Padding is added to each row of the noise image to permit overreads by
+  // BlendNoiseWithImageLuma_NEON and overwrites by WriteOverlapLine8bpp_NEON.
+  kNoiseImagePadding = 7,
+  // Padding is added to the end of the |noise_stripes_| buffer to permit
+  // overreads by WriteOverlapLine8bpp_NEON.
+  kNoiseStripePadding = 7,
+};  // anonymous enum
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_FILM_GRAIN_COMMON_H_
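GetFilmGrainRandomNumber above is a 16-bit LFSR: each call shifts one feedback bit (the XOR of state bits 0, 1, 3 and 12) into the top of the state and returns the top |bits| bits. A standalone sketch of the same update rule (an editor's illustration with an arbitrary seed, not part of the patch):

#include <cstdint>
#include <cstdio>

// Same update rule as GetFilmGrainRandomNumber in film_grain_common.h.
int NextGrainRandom(int bits, uint16_t* seed) {
  uint16_t s = *seed;
  const uint16_t bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;  // taps 0,1,3,12
  s = (s >> 1) | (bit << 15);  // shift right; feedback bit becomes bit 15
  *seed = s;
  return s >> (16 - bits);     // top |bits| bits of the new state
}

int main() {
  uint16_t seed = 0x5d3e;  // arbitrary nonzero starting state
  for (int i = 0; i < 4; ++i) std::printf("%d\n", NextGrainRandom(11, &seed));
}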
diff --git a/src/dsp/intra_edge.cc b/src/dsp/intra_edge.cc
new file mode 100644
index 0000000..fe66db2
--- /dev/null
+++ b/src/dsp/intra_edge.cc
@@ -0,0 +1,115 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxUpsampleSize = 16;
+
+template <typename Pixel>
+void IntraEdgeFilter_C(void* buffer, int size, int strength) {
+  assert(strength > 0);
+  Pixel edge[129];
+  memcpy(edge, buffer, sizeof(edge[0]) * size);
+  auto* const dst_buffer = static_cast<Pixel*>(buffer);
+  const int kernel_index = strength - 1;
+  for (int i = 1; i < size; ++i) {
+    int sum = 0;
+    for (int j = 0; j < kKernelTaps; ++j) {
+      const int k = Clip3(i + j - 2, 0, size - 1);
+      sum += kKernels[kernel_index][j] * edge[k];
+    }
+    dst_buffer[i] = RightShiftWithRounding(sum, 4);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void IntraEdgeUpsampler_C(void* buffer, int size) {
+  assert(size % 4 == 0 && size <= kMaxUpsampleSize);
+  auto* const pixel_buffer = static_cast<Pixel*>(buffer);
+  Pixel temp[kMaxUpsampleSize + 3];
+  temp[0] = temp[1] = pixel_buffer[-1];
+  memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size);
+  temp[size + 2] = pixel_buffer[size - 1];
+
+  pixel_buffer[-2] = temp[0];
+  for (int i = 0; i < size; ++i) {
+    const int sum =
+        -temp[i] + (9 * temp[i + 1]) + (9 * temp[i + 2]) - temp[i + 3];
+    pixel_buffer[2 * i - 1] =
+        Clip3(RightShiftWithRounding(sum, 4), 0, (1 << bitdepth) - 1);
+    pixel_buffer[2 * i] = temp[i + 2];
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeFilter
+  dsp->intra_edge_filter = IntraEdgeFilter_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_IntraEdgeUpsampler
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void IntraEdgeInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
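IntraEdgeUpsampler_C above doubles the edge by inserting samples with the 4-tap kernel (-1, 9, 9, -1), a 16-way rounded descale, and a clip to the bitdepth range. A quick numeric check of that kernel (a sketch, not part of the patch):

#include <cstdio>

int main() {
  // A smooth edge ramp: the new sample lands between p1 = 10 and p2 = 20.
  const int p0 = 10, p1 = 10, p2 = 20, p3 = 20;
  const int sum = -p0 + 9 * p1 + 9 * p2 - p3;  // 240
  const int interpolated = (sum + 8) >> 4;     // RightShiftWithRounding(sum, 4)
  std::printf("%d\n", interpolated);           // 15, i.e. halfway up the ramp
}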
diff --git a/src/dsp/intra_edge.h b/src/dsp/intra_edge.h
new file mode 100644
index 0000000..172ecbb
--- /dev/null
+++ b/src/dsp/intra_edge.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+#define LIBGAV1_SRC_DSP_INTRA_EDGE_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intra_edge_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intra_edge_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This
+// function is not thread-safe.
+void IntraEdgeInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRA_EDGE_H_
diff --git a/src/dsp/intrapred.cc b/src/dsp/intrapred.cc
new file mode 100644
index 0000000..4bcb580
--- /dev/null
+++ b/src/dsp/intrapred.cc
@@ -0,0 +1,2911 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>  // memset
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+    kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+    kTransformSize64x32, kTransformSize64x64};
+
+template <int block_width, int block_height, typename Pixel>
+struct IntraPredFuncs_C {
+  IntraPredFuncs_C() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+                       const void* left_column);
+  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+                         const void* left_column);
+  static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+                             const void* left_column);
+  static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+                               const void* top_row, const void* left_column);
+};
+
+// Intra-predictors that require bitdepth.
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+struct IntraPredBppFuncs_C {
+  IntraPredBppFuncs_C() = delete;
+
+  static void DcFill(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+};
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C::DcPred
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcTop(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* /*left_column*/) {
+  int sum = block_width >> 1;  // rounder
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  for (int x = 0; x < block_width; ++x) sum += top[x];
+  const int dc = sum >> FloorLog2(block_width);
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::DcLeft(
+    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+    const void* const left_column) {
+  int sum = block_height >> 1;  // rounder
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  for (int y = 0; y < block_height; ++y) sum += left[y];
+  const int dc = sum >> FloorLog2(block_height);
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
+// Note for square blocks the divide in the Dc() function reduces to a shift.
+// For rectangular block sizes the following multipliers can be used with the
+// corresponding shifts.
+// 8-bit
+//  1:2 (e.g., 4x8):  scale = 0x5556
+//  1:4 (e.g., 4x16): scale = 0x3334
+//  final_descale = 16
+// 10/12-bit
+//  1:2: scale = 0xaaab
+//  1:4: scale = 0x6667
+//  final_descale = 17
+// Note these may be halved to the values used in 8-bit in all cases except
+// when bitdepth == 12 and block_width + block_height is divisible by 5 (as
+// opposed to 3).
+//
+// The calculation becomes:
+//  ((dc_sum >> intermediate_descale) * scale) >> final_descale
+// where intermediate_descale is:
+// sum = block_width + block_height
+// intermediate_descale =
+//     (sum <= 20) ? 2 : (sum <= 40) ? 3 : (sum <= 80) ? 4 : 5
+//
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum = block width + block height
+// - Shift 'sum' right until we reach an odd number
+// - Let the number of shifts for that block size be called
+//   'intermediate_descale' and let the odd number be 'd' (d has only 2
+//   possible values: d = 3 for a 1:2 rectangular block and d = 5 for a 1:4
+//   rectangular block).
+// - Find multipliers by dividing by 'd' using "Algorithm 1" in:
+//   http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+//   by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+//   shift will be 16, regardless of the block size.
+// TODO(jzern): the base implementation could be updated to use this method.
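The constants quoted above can be checked exhaustively for one case. For an 8-bit 4x8 block, divisor = 12, intermediate_descale = 2, scale = 0x5556 and final_descale = 16; a sketch (not part of the patch) verifying the fast path against the plain division used by Dc() below:

#include <cstdio>

int main() {
  const int divisor = 4 + 8;  // block_width + block_height
  // dc_sum already includes the rounder, so it can reach 255 * 12 + 6.
  for (int dc_sum = 0; dc_sum <= 255 * divisor + (divisor >> 1); ++dc_sum) {
    const int exact = dc_sum / divisor;
    const int fast = ((dc_sum >> 2) * 0x5556) >> 16;  // intermediate_descale = 2
    if (exact != fast) {
      std::printf("mismatch at %d\n", dc_sum);
      return 1;
    }
  }
  std::printf("0x5556 matches /12 over the full 8-bit range\n");
  return 0;
}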
+
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Dc(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const int divisor = block_width + block_height;
+  int sum = divisor >> 1;  // rounder
+
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  for (int x = 0; x < block_width; ++x) sum += top[x];
+  for (int y = 0; y < block_height; ++y) sum += left[y];
+
+  const int dc = sum / divisor;
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, dc, block_width);
+    dst += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredFuncs_C directional predictors
+
+// IntraPredFuncs_C::Vertical -- apply top row vertically
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Vertical(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* /*left_column*/) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < block_height; ++y) {
+    memcpy(dst, top_row, block_width * sizeof(Pixel));
+    dst += stride;
+  }
+}
+
+// IntraPredFuncs_C::Horizontal -- apply left column horizontally
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Horizontal(
+    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+    const void* const left_column) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, left[y], block_width);
+    dst += stride;
+  }
+}
+
+template <typename Pixel>
+inline Pixel Average(Pixel a, Pixel b) {
+  return static_cast<Pixel>((a + b + 1) >> 1);
+}
+
+template <typename Pixel>
+inline Pixel Average(Pixel a, Pixel b, Pixel c) {
+  return static_cast<Pixel>((a + 2 * b + c + 2) >> 2);
+}
+
+// IntraPredFuncs_C::Paeth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_left = top[-1];
+  const int top_left_x2 = top_left + top_left;
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    const int left_pixel = left[y];
+    for (int x = 0; x < block_width; ++x) {
+      // The Paeth filter selects the value closest to:
+      // top[x] + left[y] - top_left
+      // To calculate the absolute distance for the left value this would be:
+      // abs((top[x] + left[y] - top_left) - left[y])
+      // or, because left[y] cancels out:
+      // abs(top[x] - top_left)
+      const int left_dist = std::abs(top[x] - top_left);
+      const int top_dist = std::abs(left_pixel - top_left);
+      const int top_left_dist = std::abs(top[x] + left_pixel - top_left_x2);
+
+      // Select the closest value to the initial estimate of 'T + L - TL'.
+      if (left_dist <= top_dist && left_dist <= top_left_dist) {
+        dst[x] = left_pixel;
+      } else if (top_dist <= top_left_dist) {
+        dst[x] = top[x];
+      } else {
+        dst[x] = top_left;
+      }
+    }
+    dst += stride;
+  }
+}
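A worked instance of the Paeth selection above (a sketch, not part of the patch): with top = 60, left = 100 and top_left = 50 the estimate is top + left - top_left = 110, and the reduced distances pick the left pixel:

#include <cstdio>
#include <cstdlib>

int main() {
  const int top = 60, left = 100, top_left = 50;
  const int left_dist = std::abs(top - top_left);                 // 10
  const int top_dist = std::abs(left - top_left);                 // 50
  const int top_left_dist = std::abs(top + left - 2 * top_left);  // 60
  int pred;
  if (left_dist <= top_dist && left_dist <= top_left_dist) {
    pred = left;  // closest to the estimate 110
  } else if (top_dist <= top_left_dist) {
    pred = top;
  } else {
    pred = top_left;
  }
  std::printf("%d\n", pred);  // 100
}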
+
+constexpr uint8_t kSmoothWeights[] = {
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83,
+    74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18,
+    16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+// IntraPredFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_right = top[block_width - 1];
+  const Pixel bottom_left = left[block_height - 1];
+  static_assert(
+      block_width >= 4 && block_height >= 4,
+      "Weights for smooth predictor undefined for block width/height < 4");
+  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+      uint32_t pred = weights_y[y] * top[x];
+      pred += weights_x[x] * left[y];
+      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+      // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+      // + 256. With the descale there's no need for saturation.
+      dst[x] = static_cast<Pixel>(
+          RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+    }
+    dst += stride;
+  }
+}
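The smooth predictor above sums a vertical blend (top against bottom_left) and a horizontal blend (left against top_right), each at scale 256, then descales by 9 bits. With the 4x4 weights a single pixel works out as follows (a sketch, not part of the patch; the pixel values are arbitrary):

#include <cstdio>

int main() {
  const unsigned top = 80, left = 120, top_right = 90, bottom_left = 130;
  const unsigned w_y = 149, w_x = 149;  // kSmoothWeights[1] for a 4x4 block
  unsigned pred = w_y * top + w_x * left;
  pred += (256 - w_y) * bottom_left + (256 - w_x) * top_right;
  // RightShiftWithRounding(pred, kSmoothWeightScale + 1) with scale 8.
  std::printf("%u\n", (pred + 256) >> 9);  // 104
}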
+
+// IntraPredFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel bottom_left = left[block_height - 1];
+  static_assert(block_height >= 4,
+                "Weights for smooth predictor undefined for block height < 4");
+  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_y[y]);
+      uint32_t pred = weights_y[y] * top[x];
+      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+      dst[x] =
+          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+    }
+    dst += stride;
+  }
+}
+
+// IntraPredFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_right = top[block_width - 1];
+  static_assert(block_width >= 4,
+                "Weights for smooth predictor undefined for block width < 4");
+  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_x[x]);
+      uint32_t pred = weights_x[x] * left[y];
+      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+      dst[x] =
+          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+    }
+    dst += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// IntraPredBppFuncs_C
+template <int fill, typename Pixel>
+inline void DcFill_C(void* const dest, ptrdiff_t stride, const int block_width,
+                     const int block_height) {
+  static_assert(sizeof(Pixel) == 1 || sizeof(Pixel) == 2,
+                "Only 1 & 2 byte pixels are supported");
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int y = 0; y < block_height; ++y) {
+    Memset(dst, fill, block_width);
+    dst += stride;
+  }
+}
+
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void IntraPredBppFuncs_C<block_width, block_height, bitdepth, Pixel>::DcFill(
+    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+    const void* /*left_column*/) {
+  DcFill_C<0x80 << (bitdepth - 8), Pixel>(dest, stride, block_width,
+                                          block_height);
+}
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
+                            const void* const top_row,
+                            const void* const left_column,
+                            const FilterIntraPredictor pred, const int width,
+                            const int height) {
+  const int kMaxPixel = (1 << bitdepth) - 1;
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+
+  assert(width <= 32 && height <= 32);
+
+  Pixel buffer[3][33];  // cache 2 rows + top & left boundaries
+  memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  int row0 = 0, row2 = 2;
+  int ystep = 1;
+  int y = 0;
+  do {
+    buffer[1][0] = left[y];
+    buffer[row2][0] = left[y + 1];
+    int x = 1;
+    do {
+      const Pixel p0 = buffer[row0][x - 1];  // top-left
+      const Pixel p1 = buffer[row0][x + 0];  // top 0
+      const Pixel p2 = buffer[row0][x + 1];  // top 1
+      const Pixel p3 = buffer[row0][x + 2];  // top 2
+      const Pixel p4 = buffer[row0][x + 3];  // top 3
+      const Pixel p5 = buffer[1][x - 1];     // left 0
+      const Pixel p6 = buffer[row2][x - 1];  // left 1
+      for (int i = 0; i < 8; ++i) {
+        const int xoffset = i & 0x03;
+        const int yoffset = (i >> 2) * ystep;
+        const int value = kFilterIntraTaps[pred][i][0] * p0 +
+                          kFilterIntraTaps[pred][i][1] * p1 +
+                          kFilterIntraTaps[pred][i][2] * p2 +
+                          kFilterIntraTaps[pred][i][3] * p3 +
+                          kFilterIntraTaps[pred][i][4] * p4 +
+                          kFilterIntraTaps[pred][i][5] * p5 +
+                          kFilterIntraTaps[pred][i][6] * p6;
+        buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+            Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+      }
+      x += 4;
+    } while (x < width);
+    memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+    dst += stride;
+
+    // The final row becomes the top for the next pass.
+    row0 ^= 2;
+    row2 ^= 2;
+    ystep = -ystep;
+    y += 2;
+  } while (y < height);
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+    void* const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<Pixel*>(dest);
+  const int dc = dst[0];
+  stride /= sizeof(Pixel);
+  const int max_value = (1 << bitdepth) - 1;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+      assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+      dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+                     0, max_value);
+    }
+    dst += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+          int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+                     const int max_luma_width, const int max_luma_height,
+                     const void* const source, ptrdiff_t stride) {
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const auto* src = static_cast<const Pixel*>(source);
+  stride /= sizeof(Pixel);
+  int sum = 0;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      const ptrdiff_t luma_x =
+          std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+      const ptrdiff_t luma_x_next = luma_x + stride;
+      luma[y][x] =
+          (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+           ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+                                 : 0))
+          << (3 - subsampling_x - subsampling_y);
+      sum += luma[y][x];
+    }
+    if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+      src += stride << subsampling_y;
+    }
+  }
+  const int average = RightShiftWithRounding(
+      sum, FloorLog2(block_width) + FloorLog2(block_height));
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      luma[y][x] -= average;
+    }
+  }
+}
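CflIntraPredictor_C above adds a scaled copy of the zero-mean luma (the "AC" part produced by CflSubsampler_C) to the chroma DC prediction: dst = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma, 6), 0, max). One step with concrete numbers (a sketch, not part of the patch, mirroring the signed rounded shift):

#include <algorithm>
#include <cstdio>

int main() {
  const int dc = 128, alpha = -8, luma_ac = 96;  // arbitrary in-range inputs
  const int scaled = alpha * luma_ac;  // -768
  // RightShiftWithRoundingSigned(scaled, 6): round half away from zero.
  const int offset =
      (scaled >= 0) ? (scaled + 32) >> 6 : -((-scaled + 32) >> 6);  // -12
  std::printf("%d\n", std::min(std::max(dc + offset, 0), 255));    // 116
}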
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
+                                      const void* const top_row,
+                                      const int width, const int height,
+                                      const int xstep,
+                                      const bool upsampled_top) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+
+  // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+  if (xstep == 64) {
+    // 7.11.2.10. Intra edge upsample selection process
+    // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+    // |predictor_angle| is 45 the delta is also 45.
+    assert(!upsampled_top);
+    const Pixel* top_ptr = top + 1;
+    for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+      memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+    }
+    return;
+  }
+
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        Memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+    int x = 0;
+    do {
+      if (top_base_x >= max_base_x) {
+        Memset(dst + x, top[max_base_x], width - x);
+        break;
+      }
+
+      const int val =
+          top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5);
+      top_base_x += base_step;
+    } while (++x < width);
+
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
+                                      const void* const top_row,
+                                      const void* const left_column,
+                                      const int width, const int height,
+                                      const int xstep, const int ystep,
+                                      const bool upsampled_top,
+                                      const bool upsampled_left) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+  assert(ystep > 0);
+
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int scale_bits_x = 6 - upsample_top_shift;
+  const int scale_bits_y = 6 - upsample_left_shift;
+  const int min_base_x = -(1 << upsample_top_shift);
+  const int base_step_x = 1 << upsample_top_shift;
+  int y = 0;
+  int top_x = -xstep;
+  do {
+    int top_base_x = top_x >> scale_bits_x;
+    int left_y = (y << 6) - ystep;
+    int x = 0;
+    do {
+      int val;
+      if (top_base_x >= min_base_x) {
+        const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+        val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      } else {
+        // Note this assumes an arithmetic shift to handle negative values.
+        const int left_base_y = left_y >> scale_bits_y;
+        const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+        assert(left_base_y >= -(1 << upsample_left_shift));
+        val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      }
+      dst[x] = RightShiftWithRounding(val, 5);
+      top_base_x += base_step_x;
+      left_y -= ystep;
+    } while (++x < width);
+
+    top_x -= xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
+                                      const void* const left_column,
+                                      const int width, const int height,
+                                      const int ystep,
+                                      const bool upsampled_left) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  stride /= sizeof(Pixel);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             base_step * (height - 1));  // left_base_y
+
+  int left_y = ystep;
+  int x = 0;
+  do {
+    auto* dst = static_cast<Pixel*>(dest);
+
+    int left_base_y = left_y >> scale_bits;
+    int y = 0;
+    do {
+      const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+      const int val =
+          left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5);
+      dst += stride;
+      left_base_y += base_step;
+    } while (++y < height);
+
+    left_y += ystep;
+  } while (++x < width);
+}
+
+//------------------------------------------------------------------------------
+
+template <typename Pixel>
+struct IntraPredDefs {
+  IntraPredDefs() = delete;
+
+  using _4x4 = IntraPredFuncs_C<4, 4, Pixel>;
+  using _4x8 = IntraPredFuncs_C<4, 8, Pixel>;
+  using _4x16 = IntraPredFuncs_C<4, 16, Pixel>;
+  using _8x4 = IntraPredFuncs_C<8, 4, Pixel>;
+  using _8x8 = IntraPredFuncs_C<8, 8, Pixel>;
+  using _8x16 = IntraPredFuncs_C<8, 16, Pixel>;
+  using _8x32 = IntraPredFuncs_C<8, 32, Pixel>;
+  using _16x4 = IntraPredFuncs_C<16, 4, Pixel>;
+  using _16x8 = IntraPredFuncs_C<16, 8, Pixel>;
+  using _16x16 = IntraPredFuncs_C<16, 16, Pixel>;
+  using _16x32 = IntraPredFuncs_C<16, 32, Pixel>;
+  using _16x64 = IntraPredFuncs_C<16, 64, Pixel>;
+  using _32x8 = IntraPredFuncs_C<32, 8, Pixel>;
+  using _32x16 = IntraPredFuncs_C<32, 16, Pixel>;
+  using _32x32 = IntraPredFuncs_C<32, 32, Pixel>;
+  using _32x64 = IntraPredFuncs_C<32, 64, Pixel>;
+  using _64x16 = IntraPredFuncs_C<64, 16, Pixel>;
+  using _64x32 = IntraPredFuncs_C<64, 32, Pixel>;
+  using _64x64 = IntraPredFuncs_C<64, 64, Pixel>;
+};
+
+template <int bitdepth, typename Pixel>
+struct IntraPredBppDefs {
+  IntraPredBppDefs() = delete;
+
+  using _4x4 = IntraPredBppFuncs_C<4, 4, bitdepth, Pixel>;
+  using _4x8 = IntraPredBppFuncs_C<4, 8, bitdepth, Pixel>;
+  using _4x16 = IntraPredBppFuncs_C<4, 16, bitdepth, Pixel>;
+  using _8x4 = IntraPredBppFuncs_C<8, 4, bitdepth, Pixel>;
+  using _8x8 = IntraPredBppFuncs_C<8, 8, bitdepth, Pixel>;
+  using _8x16 = IntraPredBppFuncs_C<8, 16, bitdepth, Pixel>;
+  using _8x32 = IntraPredBppFuncs_C<8, 32, bitdepth, Pixel>;
+  using _16x4 = IntraPredBppFuncs_C<16, 4, bitdepth, Pixel>;
+  using _16x8 = IntraPredBppFuncs_C<16, 8, bitdepth, Pixel>;
+  using _16x16 = IntraPredBppFuncs_C<16, 16, bitdepth, Pixel>;
+  using _16x32 = IntraPredBppFuncs_C<16, 32, bitdepth, Pixel>;
+  using _16x64 = IntraPredBppFuncs_C<16, 64, bitdepth, Pixel>;
+  using _32x8 = IntraPredBppFuncs_C<32, 8, bitdepth, Pixel>;
+  using _32x16 = IntraPredBppFuncs_C<32, 16, bitdepth, Pixel>;
+  using _32x32 = IntraPredBppFuncs_C<32, 32, bitdepth, Pixel>;
+  using _32x64 = IntraPredBppFuncs_C<32, 64, bitdepth, Pixel>;
+  using _64x16 = IntraPredBppFuncs_C<64, 16, bitdepth, Pixel>;
+  using _64x32 = IntraPredBppFuncs_C<64, 32, bitdepth, Pixel>;
+  using _64x64 = IntraPredBppFuncs_C<64, 64, bitdepth, Pixel>;
+};
+
+using Defs = IntraPredDefs<uint8_t>;
+using Defs8bpp = IntraPredBppDefs<8, uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS|/|DEFSBPP| of
+// the same size.
+#define INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, W, H)                         \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcFill] =     \
+      DEFSBPP::_##W##x##H::DcFill;                                            \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcTop] =      \
+      DEFS::_##W##x##H::DcTop;                                                \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDcLeft] =     \
+      DEFS::_##W##x##H::DcLeft;                                               \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorDc] =         \
+      DEFS::_##W##x##H::Dc;                                                   \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorVertical] =   \
+      DEFS::_##W##x##H::Vertical;                                             \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
+      DEFS::_##W##x##H::Horizontal;                                           \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] =      \
+      DEFS::_##W##x##H::Paeth;                                                \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] =     \
+      DEFS::_##W##x##H::Smooth;                                               \
+  dsp->intra_predictors[kTransformSize##W##x##H]                              \
+                       [kIntraPredictorSmoothVertical] =                      \
+      DEFS::_##W##x##H::SmoothVertical;                                       \
+  dsp->intra_predictors[kTransformSize##W##x##H]                              \
+                       [kIntraPredictorSmoothHorizontal] =                    \
+      DEFS::_##W##x##H::SmoothHorizontal
+
+#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP)        \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 8);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 16);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 4);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 8);   \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 16);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 8, 32);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 4);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 8);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 16, 64); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 8);  \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 32, 64); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 16); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
+  INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
+
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL)             \
+  dsp->cfl_intra_predictors[kTransformSize##W##x##H] =                 \
+      CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>;                      \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL)        \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL);    \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL);    \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL);    \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL);    \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_INTRAPREDICTORS(Defs, Defs8bpp);
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+  INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
+      Defs8bpp::_4x4::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      Defs::_4x4::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      Defs::_4x4::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = Defs::_4x4::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      Defs::_4x4::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      Defs::_4x4::Horizontal;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
+      Defs::_4x4::Paeth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
+      Defs8bpp::_4x8::DcFill;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
+      Defs::_4x8::DcTop;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
+      Defs::_4x8::DcLeft;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = Defs::_4x8::Dc;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      Defs::_4x8::Vertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal
+
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + Defs::_4x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + Defs::_4x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + Defs::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + Defs::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + Defs::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = + Defs8bpp::_4x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + Defs::_4x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + Defs::_4x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + Defs::_4x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + Defs::_4x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + Defs::_4x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + Defs::_4x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + Defs::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + Defs::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + Defs::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = + Defs8bpp::_8x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + Defs::_8x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + Defs::_8x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = Defs::_8x4::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + Defs::_8x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + Defs::_8x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth + 
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + Defs::_8x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + Defs::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + Defs::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + Defs::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = + Defs8bpp::_8x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + Defs::_8x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + Defs::_8x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = Defs::_8x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + Defs::_8x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + Defs::_8x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + Defs::_8x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + Defs::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + Defs::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + Defs::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = + Defs8bpp::_8x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + Defs::_8x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + Defs::_8x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + Defs::_8x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + Defs::_8x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + Defs::_8x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + Defs::_8x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth + 
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + Defs::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + Defs::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + Defs::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = + Defs8bpp::_8x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + Defs::_8x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + Defs::_8x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + Defs::_8x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + Defs::_8x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + Defs::_8x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + Defs::_8x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + Defs::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + Defs::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + Defs::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = + Defs8bpp::_16x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + Defs::_16x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + Defs::_16x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + Defs::_16x4::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + Defs::_16x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + Defs::_16x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + Defs::_16x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + Defs::_16x4::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + Defs::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + Defs::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = + Defs8bpp::_16x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + Defs::_16x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + Defs::_16x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + Defs::_16x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + Defs::_16x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + Defs::_16x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + Defs::_16x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + Defs::_16x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + Defs::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + Defs::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = + Defs8bpp::_16x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + Defs::_16x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + Defs::_16x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + Defs::_16x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + Defs::_16x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + Defs::_16x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + Defs::_16x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + Defs::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical + 
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + Defs::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + Defs::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = + Defs8bpp::_16x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + Defs::_16x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + Defs::_16x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + Defs::_16x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + Defs::_16x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + Defs::_16x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + Defs::_16x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + Defs::_16x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + Defs::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + Defs::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = + Defs8bpp::_16x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + Defs::_16x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + Defs::_16x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + Defs::_16x64::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + Defs::_16x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + Defs::_16x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + Defs::_16x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + Defs::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + 
Defs::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + Defs::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = + Defs8bpp::_32x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + Defs::_32x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + Defs::_32x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + Defs::_32x8::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + Defs::_32x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] = + Defs::_32x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + Defs::_32x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + Defs::_32x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + Defs::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + Defs::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = + Defs8bpp::_32x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + Defs::_32x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + Defs::_32x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + Defs::_32x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + Defs::_32x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + Defs::_32x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + Defs::_32x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + Defs::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + Defs::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + 
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + Defs::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = + Defs8bpp::_32x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + Defs::_32x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + Defs::_32x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + Defs::_32x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + Defs::_32x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + Defs::_32x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + Defs::_32x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + Defs::_32x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + Defs::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + Defs::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = + Defs8bpp::_32x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + Defs::_32x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + Defs::_32x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + Defs::_32x64::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + Defs::_32x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + Defs::_32x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + Defs::_32x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + Defs::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + Defs::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + 
Defs::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = + Defs8bpp::_64x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + Defs::_64x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + Defs::_64x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + Defs::_64x16::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + Defs::_64x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + Defs::_64x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + Defs::_64x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + Defs::_64x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + Defs::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + Defs::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = + Defs8bpp::_64x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + Defs::_64x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + Defs::_64x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + Defs::_64x32::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + Defs::_64x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + Defs::_64x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + Defs::_64x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + Defs::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + Defs::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + Defs::_64x32::SmoothHorizontal; +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = + Defs8bpp::_64x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + Defs::_64x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + Defs::_64x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + Defs::_64x64::Dc; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + Defs::_64x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + Defs::_64x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + Defs::_64x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + Defs::_64x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + Defs::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + Defs::_64x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint8_t>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 + 
dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 8, 
uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>; +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>; +#endif +#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) + +#if LIBGAV1_MAX_BITDEPTH >= 10 +using DefsHbd = IntraPredDefs; +using Defs10bpp = IntraPredBppDefs<10, uint16_t>; + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp); + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint16_t>; + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; + INIT_CFL_INTRAPREDICTORS(10, uint16_t); +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] = + Defs10bpp::_4x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DefsHbd::_4x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DefsHbd::_4x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DefsHbd::_4x4::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] = + DefsHbd::_4x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] = + DefsHbd::_4x4::Horizontal; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + DefsHbd::_4x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = + DefsHbd::_4x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] = + DefsHbd::_4x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] = + Defs10bpp::_4x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DefsHbd::_4x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DefsHbd::_4x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DefsHbd::_4x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] = + DefsHbd::_4x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + DefsHbd::_4x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + DefsHbd::_4x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] = + DefsHbd::_4x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] = + DefsHbd::_4x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] = + Defs10bpp::_4x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DefsHbd::_4x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DefsHbd::_4x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DefsHbd::_4x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] = + DefsHbd::_4x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + DefsHbd::_4x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] 
= + DefsHbd::_4x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] = + DefsHbd::_4x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] = + DefsHbd::_4x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_4x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] = + Defs10bpp::_8x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DefsHbd::_8x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DefsHbd::_8x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DefsHbd::_8x4::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] = + DefsHbd::_8x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DefsHbd::_8x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + DefsHbd::_8x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] = + DefsHbd::_8x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] = + DefsHbd::_8x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] = + Defs10bpp::_8x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DefsHbd::_8x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DefsHbd::_8x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DefsHbd::_8x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] = + DefsHbd::_8x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DefsHbd::_8x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + DefsHbd::_8x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth + 
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] = + DefsHbd::_8x8::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] = + DefsHbd::_8x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] = + Defs10bpp::_8x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DefsHbd::_8x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DefsHbd::_8x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DefsHbd::_8x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] = + DefsHbd::_8x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DefsHbd::_8x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + DefsHbd::_8x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] = + DefsHbd::_8x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] = + DefsHbd::_8x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] = + Defs10bpp::_8x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DefsHbd::_8x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DefsHbd::_8x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DefsHbd::_8x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] = + DefsHbd::_8x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DefsHbd::_8x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + DefsHbd::_8x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] = + 
DefsHbd::_8x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] = + DefsHbd::_8x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_8x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] = + Defs10bpp::_16x4::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DefsHbd::_16x4::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DefsHbd::_16x4::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DefsHbd::_16x4::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] = + DefsHbd::_16x4::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + DefsHbd::_16x4::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + DefsHbd::_16x4::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] = + DefsHbd::_16x4::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] = + DefsHbd::_16x4::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x4::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] = + Defs10bpp::_16x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DefsHbd::_16x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DefsHbd::_16x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DefsHbd::_16x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] = + DefsHbd::_16x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + DefsHbd::_16x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + DefsHbd::_16x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] = + DefsHbd::_16x8::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] = + DefsHbd::_16x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] = + Defs10bpp::_16x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DefsHbd::_16x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DefsHbd::_16x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DefsHbd::_16x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] = + DefsHbd::_16x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + DefsHbd::_16x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + DefsHbd::_16x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] = + DefsHbd::_16x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] = + DefsHbd::_16x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] = + Defs10bpp::_16x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DefsHbd::_16x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DefsHbd::_16x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DefsHbd::_16x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] = + DefsHbd::_16x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + DefsHbd::_16x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + DefsHbd::_16x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] = + DefsHbd::_16x32::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] = + DefsHbd::_16x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] = + Defs10bpp::_16x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DefsHbd::_16x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DefsHbd::_16x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DefsHbd::_16x64::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] = + DefsHbd::_16x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + DefsHbd::_16x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + DefsHbd::_16x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] = + DefsHbd::_16x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + DefsHbd::_16x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_16x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] = + Defs10bpp::_32x8::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DefsHbd::_32x8::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DefsHbd::_32x8::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DefsHbd::_32x8::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] = + DefsHbd::_32x8::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] = + DefsHbd::_32x8::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + DefsHbd::_32x8::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] = + DefsHbd::_32x8::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + DefsHbd::_32x8::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x8::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] = + Defs10bpp::_32x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DefsHbd::_32x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DefsHbd::_32x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DefsHbd::_32x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] = + DefsHbd::_32x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + DefsHbd::_32x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + DefsHbd::_32x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] = + DefsHbd::_32x16::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + DefsHbd::_32x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] = + Defs10bpp::_32x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DefsHbd::_32x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DefsHbd::_32x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DefsHbd::_32x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] = + DefsHbd::_32x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + DefsHbd::_32x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + DefsHbd::_32x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] = + DefsHbd::_32x32::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + DefsHbd::_32x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] = + Defs10bpp::_32x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DefsHbd::_32x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DefsHbd::_32x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DefsHbd::_32x64::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] = + DefsHbd::_32x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + DefsHbd::_32x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + DefsHbd::_32x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] = + DefsHbd::_32x64::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + DefsHbd::_32x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_32x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] = + Defs10bpp::_64x16::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DefsHbd::_64x16::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DefsHbd::_64x16::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DefsHbd::_64x16::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] = + DefsHbd::_64x16::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + DefsHbd::_64x16::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + DefsHbd::_64x16::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] = + DefsHbd::_64x16::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + DefsHbd::_64x16::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x16::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] = + Defs10bpp::_64x32::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DefsHbd::_64x32::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DefsHbd::_64x32::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DefsHbd::_64x32::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] = + DefsHbd::_64x32::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + DefsHbd::_64x32::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + DefsHbd::_64x32::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] = + DefsHbd::_64x32::Smooth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + DefsHbd::_64x32::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x32::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] = + Defs10bpp::_64x64::DcFill; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DefsHbd::_64x64::DcTop; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DefsHbd::_64x64::DcLeft; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DefsHbd::_64x64::Dc; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] = + DefsHbd::_64x64::Vertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + DefsHbd::_64x64::Horizontal; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorPaeth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + DefsHbd::_64x64::Paeth; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] = + DefsHbd::_64x64::Smooth; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + DefsHbd::_64x64::SmoothVertical; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] = + DefsHbd::_64x64::SmoothHorizontal; +#endif + +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2 + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_C<uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_C<uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor + dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x4] = + CflIntraPredictor_C<4, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x8] = + CflIntraPredictor_C<4, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_C<4, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x4] = + CflIntraPredictor_C<8, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] = + CflSubsampler_C<8, 4, 10, uint16_t, 
1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x8] = + CflIntraPredictor_C<8, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_C<8, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_C<8, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_C<16, 4, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_C<16, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_C<16, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_C<16, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_C<32, 8, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_C<32, 16, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>; +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_C<32, 32, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] = + CflSubsampler_C<32, 32, 
10, uint16_t, 1, 0>; +#endif +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>; +#endif + +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + // Cfl predictors are available only for transform sizes with max(width, + // height) <= 32. Set all others to nullptr. + for (const auto i : kTransformSizesLargerThan32x32) { + dsp->cfl_intra_predictors[i] = nullptr; + for (int j = 0; j < kNumSubsamplingTypes; ++j) { + dsp->cfl_subsamplers[i][j] = nullptr; + } + } +} // NOLINT(readability/fn_size) +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +#undef INIT_CFL_INTRAPREDICTOR_WxH +#undef INIT_CFL_INTRAPREDICTORS +#undef INIT_INTRAPREDICTORS_WxH +#undef INIT_INTRAPREDICTORS + +} // namespace + +void IntraPredInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 diff --git a/src/dsp/intrapred.h b/src/dsp/intrapred.h new file mode 100644 index 0000000..c5286ef --- /dev/null +++ b/src/dsp/intrapred.h @@ -0,0 +1,49 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_INTRAPRED_H_ +#define LIBGAV1_SRC_DSP_INTRAPRED_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/intrapred_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/intrapred_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, +// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and +// Dsp::filter_intra_predictor. This function is not thread-safe. +void IntraPredInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_INTRAPRED_H_ diff --git a/src/dsp/inverse_transform.cc b/src/dsp/inverse_transform.cc new file mode 100644 index 0000000..a03fad2 --- /dev/null +++ b/src/dsp/inverse_transform.cc @@ -0,0 +1,1636 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/inverse_transform.h" + +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> + +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Include the constants and utility functions inside the anonymous namespace. +#include "src/dsp/inverse_transform.inc" + +constexpr uint8_t kTransformColumnShift = 4; + +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +#undef LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK +#endif + +int32_t RangeCheckValue(int32_t value, int8_t range) { +#if defined(LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK) && \ + LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK + assert(range <= 32); + const int32_t min = -(1 << (range - 1)); + const int32_t max = (1 << (range - 1)) - 1; + if (min > value || value > max) { + LIBGAV1_DLOG(ERROR, "coeff out of bit range, value: %d bit range %d\n", + value, range); + assert(min <= value && value <= max); + } +#endif  // LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK + static_cast<void>(range); + return value; +} + +template <typename Residual> +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_C(Residual* const dst, int a, + int b, int angle, bool flip, + int8_t range) { + // Note that we multiply in 32 bits and then add/subtract the products in 64 + // bits. The 32-bit multiplications do not overflow. Please see the comment + // and assert() in Cos128(). + const int64_t x = static_cast<int64_t>(dst[a] * Cos128(angle)) - + static_cast<int64_t>(dst[b] * Sin128(angle)); + const int64_t y = static_cast<int64_t>(dst[a] * Sin128(angle)) + + static_cast<int64_t>(dst[b] * Cos128(angle)); + // Section 7.13.2.1: It is a requirement of bitstream conformance that the + // values saved into the array T by this function are representable by a + // signed integer using |range| bits of precision. + dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range); + dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range); +} + +template <typename Residual> +void ButterflyRotationFirstIsZero_C(Residual* const dst, int a, int b, + int angle, bool flip, int8_t range) { + // Note that we multiply in 32 bits and then add/subtract the products in 64 + // bits. The 32-bit multiplications do not overflow. Please see the comment + // and assert() in Cos128(). + const auto x = static_cast<int64_t>(dst[b] * -Sin128(angle)); + const auto y = static_cast<int64_t>(dst[b] * Cos128(angle)); + // Section 7.13.2.1: It is a requirement of bitstream conformance that the + // values saved into the array T by this function are representable by a + // signed integer using |range| bits of precision. + dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range); + dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range); +} + +template <typename Residual> +void ButterflyRotationSecondIsZero_C(Residual* const dst, int a, int b, + int angle, bool flip, int8_t range) { + // Note that we multiply in 32 bits and then add/subtract the products in 64 + // bits. The 32-bit multiplications do not overflow. Please see the comment + // and assert() in Cos128(). + const auto x = static_cast<int64_t>(dst[a] * Cos128(angle)); + const auto y = static_cast<int64_t>(dst[a] * Sin128(angle)); + + // Section 7.13.2.1: It is a requirement of bitstream conformance that the + // values saved into the array T by this function are representable by a + // signed integer using |range| bits of precision. + dst[a] = RangeCheckValue(RightShiftWithRounding(flip ? y : x, 12), range); + dst[b] = RangeCheckValue(RightShiftWithRounding(flip ? x : y, 12), range); +}
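+ +// Informative sketch: all three ButterflyRotation variants above compute the +// same 12-bit fixed-point plane rotation +// (x, y) -> (x * cos(a) - y * sin(a), x * sin(a) + y * cos(a)), +// merely dropping terms known to be zero. For example (values taken from the +// cosine table, not normative), angle == 32 gives +// Cos128(32) == Sin128(32) == 2896, so rotating (dst[a], dst[b]) == (4096, 0) +// yields RightShiftWithRounding(4096 * 2896, 12) == 2896 in both outputs, a +// 45 degree rotation scaled by 1/sqrt(2).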
+ +template <typename Residual> +void HadamardRotation_C(Residual* const dst, int a, int b, bool flip, + int8_t range) { + if (flip) std::swap(a, b); + --range; + // For Adst and Dct, the maximum possible value for range is 20. So min and + // max should always fit into int32_t. + const int32_t min = -(1 << range); + const int32_t max = (1 << range) - 1; + const int32_t x = dst[a] + dst[b]; + const int32_t y = dst[a] - dst[b]; + dst[a] = Clip3(x, min, max); + dst[b] = Clip3(y, min, max); +} + +template <int bitdepth, typename Residual> +void ClampIntermediate(Residual* const dst, int size) { + // If Residual is int16_t (which implies bitdepth is 8), we don't need to + // clip residual[i][j] to 16 bits. + if (sizeof(Residual) > 2) { + const Residual intermediate_clamp_max = + (1 << (std::max(bitdepth + 6, 16) - 1)) - 1; + const Residual intermediate_clamp_min = -intermediate_clamp_max - 1; + for (int j = 0; j < size; ++j) { + dst[j] = Clip3(dst[j], intermediate_clamp_min, intermediate_clamp_max); + } + } +}
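+ +// Informative: the bounds this produces are, for bitdepth 10, +// [-(1 << 15), (1 << 15) - 1] (exactly the int16_t range) and, for bitdepth +// 12, [-(1 << 17), (1 << 17) - 1]; for bitdepth 8, Residual is int16_t and +// the function is a no-op by design.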
+ +//------------------------------------------------------------------------------ +// Discrete Cosine Transforms (DCT). + +// Value for index (i, j) is computed as bitreverse(j) and interpreting that as +// an integer with bit-length i + 2. +// For example, index (2, 3) is computed as follows: +// * bitreverse(3) = bitreverse(..000011) = 110000... +// * interpreting that as an integer with bit-length 2+2 = 4 will be 1100 = 12 +constexpr uint8_t kBitReverseLookup[kNum1DTransformSizes][64] = { + {0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, + 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, + 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3}, + {0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, + 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, + 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7, 0, 4, 2, 6, 1, 5, 3, 7}, + {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15}, + {0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31, + 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31}, + {0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60, + 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62, + 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61, + 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}}; + +template <typename Residual, int size_log2> +void Dct_C(void* dest, int8_t range) { + static_assert(size_log2 >= 2 && size_log2 <= 6, ""); + auto* const dst = static_cast<Residual*>(dest); + // stage 1. + const int size = 1 << size_log2; + Residual temp[size]; + memcpy(temp, dst, sizeof(temp)); + for (int i = 0; i < size; ++i) { + dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]]; + } + // stages 2-32 are dependent on the value of size_log2. + // stage 2. + if (size_log2 == 6) { + for (int i = 0; i < 16; ++i) { + ButterflyRotation_C(dst, i + 32, 63 - i, + 63 - MultiplyBy4(kBitReverseLookup[2][i]), false, + range); + } + } + // stage 3 + if (size_log2 >= 5) { + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(dst, i + 16, 31 - i, + 6 + MultiplyBy8(kBitReverseLookup[1][7 - i]), false, + range); + } + } + // stage 4. + if (size_log2 == 6) { + for (int i = 0; i < 16; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 32, MultiplyBy2(i) + 33, + static_cast<bool>(i & 1), range); + } + } + // stage 5. + if (size_log2 >= 4) { + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(dst, i + 8, 15 - i, + 12 + MultiplyBy16(kBitReverseLookup[0][3 - i]), false, + range); + } + } + // stage 6. + if (size_log2 >= 5) { + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 16, MultiplyBy2(i) + 17, + static_cast<bool>(i & 1), range); + } + } + // stage 7. + if (size_log2 == 6) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + ButterflyRotation_C( + dst, 62 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 33, + 60 - MultiplyBy16(kBitReverseLookup[0][i]) + MultiplyBy64(j), true, + range); + } + } + } + // stage 8. + if (size_log2 >= 3) { + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, i + 4, 7 - i, 56 - 32 * i, false, range); + } + } + // stage 9. + if (size_log2 >= 4) { + for (int i = 0; i < 4; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9, + static_cast<bool>(i & 1), range); + } + } + // stage 10. + if (size_log2 >= 5) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + ButterflyRotation_C( + dst, 30 - MultiplyBy4(i) - j, MultiplyBy4(i) + j + 17, + 24 + MultiplyBy64(j) + MultiplyBy32(1 - i), true, range); + } + } + } + // stage 11. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(dst, MultiplyBy4(i) + j + 32, + MultiplyBy4(i) - j + 35, static_cast<bool>(i & 1), + range); + } + } + } + // stage 12. + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, MultiplyBy2(i), MultiplyBy2(i) + 1, 32 + 16 * i, + i == 0, range); + } + // stage 13. + if (size_log2 >= 3) { + for (int i = 0; i < 2; ++i) { + HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5, + /*flip=*/i != 0, range); + } + } + // stage 14. + if (size_log2 >= 4) { + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, 14 - i, i + 9, 48 + 64 * i, true, range); + } + } + // stage 15. + if (size_log2 >= 5) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(dst, MultiplyBy4(i) + j + 16, + MultiplyBy4(i) - j + 19, static_cast<bool>(i & 1), + range); + } + } + } + // stage 16. + if (size_log2 == 6) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + ButterflyRotation_C( + dst, 61 - MultiplyBy8(i) - j, MultiplyBy8(i) + j + 34, + 56 - MultiplyBy32(i) + MultiplyBy64(DivideBy2(j)), true, range); + } + } + } + // stage 17. + for (int i = 0; i < 2; ++i) { + HadamardRotation_C(dst, i, 3 - i, false, range); + } + // stage 18. + if (size_log2 >= 3) { + ButterflyRotation_C(dst, 6, 5, 32, true, range); + } + // stage 19. + if (size_log2 >= 4) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11, + /*flip=*/i != 0, range); + } + } + } + // stage 20. + if (size_log2 >= 5) { + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(dst, 29 - i, i + 18, 48 + 64 * DivideBy2(i), true, + range); + } + } + // stage 21. + if (size_log2 == 6) { + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + HadamardRotation_C(dst, MultiplyBy8(i) + j + 32, + MultiplyBy8(i) - j + 39, static_cast<bool>(i & 1), + range); + } + } + } + // stage 22. + if (size_log2 >= 3) { + for (int i = 0; i < 4; ++i) { + HadamardRotation_C(dst, i, 7 - i, false, range); + } + } + // stage 23. + if (size_log2 >= 4) { + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(dst, 13 - i, i + 10, 32, true, range); + } + } + // stage 24. + if (size_log2 >= 5) { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + HadamardRotation_C(dst, MultiplyBy8(i) + j + 16, + MultiplyBy8(i) - j + 23, i == 1, range); + } + } + } + // stage 25. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(dst, 59 - i, i + 36, (i < 4) ? 48 : 112, true, range); + } + } + // stage 26. + if (size_log2 >= 4) { + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(dst, i, 15 - i, false, range); + } + } + // stage 27. + if (size_log2 >= 5) { + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(dst, 27 - i, i + 20, 32, true, range); + } + } + // stage 28. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(dst, i + 32, 47 - i, false, range); + HadamardRotation_C(dst, i + 48, 63 - i, true, range); + } + } + // stage 29. + if (size_log2 >= 5) { + for (int i = 0; i < 16; ++i) { + HadamardRotation_C(dst, i, 31 - i, false, range); + } + } + // stage 30. + if (size_log2 == 6) { + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(dst, 55 - i, i + 40, 32, true, range); + } + } + // stage 31. + if (size_log2 == 6) { + for (int i = 0; i < 32; ++i) { + HadamardRotation_C(dst, i, 63 - i, false, range); + } + } +}
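+ +// Informative: for a given size_log2 only a subset of the stages above runs. +// For instance, with size_log2 == 2 (the 4-point DCT) only stages 1, 12 and +// 17 execute, which is the familiar 4-point butterfly, while size_log2 == 6 +// enables all 31 stages.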
+ +template <int bitdepth, typename Residual, int size_log2> +void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift, + bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + if (is_row && should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + ButterflyRotationSecondIsZero_C(dst, 0, 1, 32, true, range); + + if (is_row && row_shift > 0) { + dst[0] = RightShiftWithRounding(dst[0], row_shift); + } + + ClampIntermediate<bitdepth, Residual>(dst, 1); + + const int size = 1 << size_log2; + for (int i = 1; i < size; ++i) { + dst[i] = dst[0]; + } +}
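+ +// Informative: on this path only the DC coefficient is non-zero, and the +// inverse DCT of a DC-only input is a constant vector, so a single rotation +// computes dst[0] (net effect dst[0] = Round2(dst[0] * 2896, 12), before the +// optional row rounding and shift) and the loop simply replicates it.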
+ +//------------------------------------------------------------------------------ +// Asymmetric Discrete Sine Transforms (ADST). + +/* + * Row transform max range in bits for bitdepths 8/10/12: 28/30/32. + * Column transform max range in bits for bitdepths 8/10/12: 28/28/30. + */ +template <typename Residual> +void Adst4_C(void* dest, int8_t range) { + auto* const dst = static_cast<Residual*>(dest); + if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) { + return; + } + + // stage 1. + // Section 7.13.2.6: It is a requirement of bitstream conformance that all + // values stored in the s and x arrays by this process are representable by + // a signed integer using range + 12 bits of precision. + int32_t s[7]; + s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12); + s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12); + s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12); + s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12); + s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12); + s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12); + s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12); + // stage 2. + // Section 7.13.2.6: It is a requirement of bitstream conformance that + // values stored in the variable a7 by this process are representable by a + // signed integer using range + 1 bits of precision. + const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1); + // Section 7.13.2.6: It is a requirement of bitstream conformance that + // values stored in the variable b7 by this process are representable by a + // signed integer using |range| bits of precision. + const int32_t b7 = RangeCheckValue(a7 + dst[3], range); + // stage 3. + s[0] = RangeCheckValue(s[0] + s[3], range + 12); + s[1] = RangeCheckValue(s[1] - s[4], range + 12); + s[3] = s[2]; + s[2] = RangeCheckValue(kAdst4Multiplier[2] * b7, range + 12); + // stage 4. + s[0] = RangeCheckValue(s[0] + s[5], range + 12); + s[1] = RangeCheckValue(s[1] - s[6], range + 12); + // stages 5 and 6. + const int32_t x0 = RangeCheckValue(s[0] + s[3], range + 12); + const int32_t x1 = RangeCheckValue(s[1] + s[3], range + 12); + int32_t x3 = RangeCheckValue(s[0] + s[1], range + 12); + x3 = RangeCheckValue(x3 - s[3], range + 12); + int32_t dst_0 = RightShiftWithRounding(x0, 12); + int32_t dst_1 = RightShiftWithRounding(x1, 12); + int32_t dst_2 = RightShiftWithRounding(s[2], 12); + int32_t dst_3 = RightShiftWithRounding(x3, 12); + if (sizeof(Residual) == 2) { + // If the first argument to RightShiftWithRounding(..., 12) is only + // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it + // in RightShiftWithRounding(..., 12) will cause the function to return + // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff. + dst_0 -= (dst_0 == 0x8000); + dst_1 -= (dst_1 == 0x8000); + dst_3 -= (dst_3 == 0x8000); + } + dst[0] = dst_0; + dst[1] = dst_1; + dst[2] = dst_2; + dst[3] = dst_3; +} + +template <int bitdepth, typename Residual> +void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift, + bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + if (is_row && should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + // stage 1. + // Section 7.13.2.6: It is a requirement of bitstream conformance that all + // values stored in the s and x arrays by this process are representable by + // a signed integer using range + 12 bits of precision. + int32_t s[3]; + s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12); + s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12); + s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[0], range + 12); + // stage 3. + // stage 4. + // stages 5 and 6. + int32_t dst_0 = RightShiftWithRounding(s[0], 12); + int32_t dst_1 = RightShiftWithRounding(s[1], 12); + int32_t dst_2 = RightShiftWithRounding(s[2], 12); + int32_t dst_3 = + RightShiftWithRounding(RangeCheckValue(s[0] + s[1], range + 12), 12); + if (sizeof(Residual) == 2) { + // If the first argument to RightShiftWithRounding(..., 12) is only + // slightly smaller than 2^27 - 1 (e.g., 0x7fffe4e), adding 2^11 to it + // in RightShiftWithRounding(..., 12) will cause the function to return + // 0x8000, which cannot be represented as an int16_t. Change it to 0x7fff. + dst_0 -= (dst_0 == 0x8000); + dst_1 -= (dst_1 == 0x8000); + dst_3 -= (dst_3 == 0x8000); + } + dst[0] = dst_0; + dst[1] = dst_1; + dst[2] = dst_2; + dst[3] = dst_3; + + const int size = 4; + if (is_row && row_shift > 0) { + for (int j = 0; j < size; ++j) { + dst[j] = RightShiftWithRounding(dst[j], row_shift); + } + } + + ClampIntermediate<bitdepth, Residual>(dst, 4); +} + +template <typename Residual> +void AdstInputPermutation(int32_t* const dst, const Residual* const src, + int n) { + assert(n == 8 || n == 16); + for (int i = 0; i < n; ++i) { + dst[i] = src[((i & 1) == 0) ? n - i - 1 : i - 1]; + } +}
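+ +// Informative example of the permutation above: for n == 8 the result is +// dst = {src[7], src[0], src[5], src[2], src[3], src[4], src[1], src[6]}, +// i.e. even outputs read the reversed input and odd outputs read the +// preceding even index.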
+ +constexpr int8_t kAdstOutputPermutationLookup[16] = { + 0, 8, 12, 4, 6, 14, 10, 2, 3, 11, 15, 7, 5, 13, 9, 1}; + +template <typename Residual> +void AdstOutputPermutation(Residual* const dst, const int32_t* const src, + int n) { + assert(n == 8 || n == 16); + const auto shift = static_cast<int>(n == 8); + for (int i = 0; i < n; ++i) { + const int8_t index = kAdstOutputPermutationLookup[i] >> shift; + int32_t dst_i = ((i & 1) == 0) ? src[index] : -src[index]; + if (sizeof(Residual) == 2) { + // If i is odd and src[index] is -32768, dst_i will be 32768, which + // cannot be represented as an int16_t. + dst_i -= (dst_i == 0x8000); + } + dst[i] = dst_i; + } +} + +template <typename Residual> +void Adst8_C(void* dest, int8_t range) { + auto* const dst = static_cast<Residual*>(dest); + // stage 1. + int32_t temp[8]; + AdstInputPermutation(temp, dst, 8); + // stage 2. + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i, + true, range); + } + // stage 3. + for (int i = 0; i < 4; ++i) { + HadamardRotation_C(temp, i, i + 4, false, range); + } + // stage 4. + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(temp, i * 3 + 4, i + 5, 48 - 32 * i, true, range); + } + // stage 5. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2, + false, range); + } + } + // stage 6. + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true, + range); + } + // stage 7. + AdstOutputPermutation(dst, temp, 8); +} + +template <int bitdepth, typename Residual> +void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift, + bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + // stage 1. + int32_t temp[8]; + // After the permutation, the dc value is in temp[1]. The remaining are zero. + AdstInputPermutation(temp, dst, 8); + + if (is_row && should_round) { + temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12); + } + + // stage 2. + ButterflyRotationFirstIsZero_C(temp, 0, 1, 60, true, range); + + // stage 3. + temp[4] = temp[0]; + temp[5] = temp[1]; + + // stage 4. + ButterflyRotation_C(temp, 4, 5, 48, true, range); + + // stage 5. + temp[2] = temp[0]; + temp[3] = temp[1]; + temp[6] = temp[4]; + temp[7] = temp[5]; + + // stage 6. + ButterflyRotation_C(temp, 2, 3, 32, true, range); + ButterflyRotation_C(temp, 6, 7, 32, true, range); + + // stage 7. + AdstOutputPermutation(dst, temp, 8); + + const int size = 8; + if (is_row && row_shift > 0) { + for (int j = 0; j < size; ++j) { + dst[j] = RightShiftWithRounding(dst[j], row_shift); + } + } + + ClampIntermediate<bitdepth, Residual>(dst, 8); +}
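+ +// Informative: for n == 8 the lookup is consumed with shift == 1, so the +// output order is {0, 4, 6, 2, 3, 7, 5, 1} with alternating signs, e.g. +// dst[1] = -src[4].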
+ +template <typename Residual> +void Adst16_C(void* dest, int8_t range) { + auto* const dst = static_cast<Residual*>(dest); + // stage 1. + int32_t temp[16]; + AdstInputPermutation(temp, dst, 16); + // stage 2. + for (int i = 0; i < 8; ++i) { + ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i, + true, range); + } + // stage 3. + for (int i = 0; i < 8; ++i) { + HadamardRotation_C(temp, i, i + 8, false, range); + } + // stage 4. + for (int i = 0; i < 2; ++i) { + ButterflyRotation_C(temp, MultiplyBy2(i) + 8, MultiplyBy2(i) + 9, + 56 - 32 * i, true, range); + ButterflyRotation_C(temp, MultiplyBy2(i) + 13, MultiplyBy2(i) + 12, + 8 + 32 * i, true, range); + } + // stage 5. + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + HadamardRotation_C(temp, i + MultiplyBy8(j), i + MultiplyBy8(j) + 4, + false, range); + } + } + // stage 6. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + ButterflyRotation_C(temp, i * 3 + MultiplyBy8(j) + 4, + i + MultiplyBy8(j) + 5, 48 - 32 * i, true, range); + } + } + // stage 7. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 4; ++j) { + HadamardRotation_C(temp, i + MultiplyBy4(j), i + MultiplyBy4(j) + 2, + false, range); + } + } + // stage 8. + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true, + range); + } + // stage 9. + AdstOutputPermutation(dst, temp, 16); +} + +template <int bitdepth, typename Residual> +void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift, + bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + // stage 1. + int32_t temp[16]; + // After the permutation, the dc value is in temp[1]. The remaining are zero. + AdstInputPermutation(temp, dst, 16); + + if (is_row && should_round) { + temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12); + } + + // stage 2. + ButterflyRotationFirstIsZero_C(temp, 0, 1, 62, true, range); + + // stage 3. + temp[8] = temp[0]; + temp[9] = temp[1]; + + // stage 4. + ButterflyRotation_C(temp, 8, 9, 56, true, range); + + // stage 5. + temp[4] = temp[0]; + temp[5] = temp[1]; + temp[12] = temp[8]; + temp[13] = temp[9]; + + // stage 6. + ButterflyRotation_C(temp, 4, 5, 48, true, range); + ButterflyRotation_C(temp, 12, 13, 48, true, range); + + // stage 7. + temp[2] = temp[0]; + temp[3] = temp[1]; + temp[10] = temp[8]; + temp[11] = temp[9]; + + temp[6] = temp[4]; + temp[7] = temp[5]; + temp[14] = temp[12]; + temp[15] = temp[13]; + + // stage 8. + for (int i = 0; i < 4; ++i) { + ButterflyRotation_C(temp, MultiplyBy4(i) + 2, MultiplyBy4(i) + 3, 32, true, + range); + } + + // stage 9. + AdstOutputPermutation(dst, temp, 16); + + const int size = 16; + if (is_row && row_shift > 0) { + for (int j = 0; j < size; ++j) { + dst[j] = RightShiftWithRounding(dst[j], row_shift); + } + } + + ClampIntermediate<bitdepth, Residual>(dst, 16); +}
+ +//------------------------------------------------------------------------------ +// Identity Transforms. +// +// In the spec, the inverse identity transform is followed by a Round2() call: +// The row transforms with i = 0..(h-1) are applied as follows: +// ... +// * Otherwise, invoke the inverse identity transform process specified in +// section 7.13.2.15 with the input variable n equal to log2W. +// * Set Residual[ i ][ j ] equal to Round2( T[ j ], rowShift ) +// for j = 0..(w-1). +// ... +// The column transforms with j = 0..(w-1) are applied as follows: +// ... +// * Otherwise, invoke the inverse identity transform process specified in +// section 7.13.2.15 with the input variable n equal to log2H. +// * Residual[ i ][ j ] is set equal to Round2( T[ i ], colShift ) +// for i = 0..(h-1). +// +// Therefore, we define the identity transform functions to perform both the +// inverse identity transform and the Round2() call. This has two advantages: +// 1. The outputs of the inverse identity transform do not need to be stored +// in the Residual array. They can be stored in int32_t local variables, +// which have a larger range if Residual is an int16_t array. +// 2. The inverse identity transform and the Round2() call can be jointly +// optimized. +// +// The identity transform functions have the following prototype: +// void Identity_C(void* dest, int8_t shift); +// +// The |shift| parameter is the amount of shift for the Round2() call. For row +// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always +// 4. Therefore, an identity transform function can detect whether it is being +// invoked as a row transform or a column transform by checking whether |shift| +// is equal to 4. +// +// Input Range +// +// The inputs of row transforms, stored in the 2D array Dequant, are +// representable by a signed integer using 8 + BitDepth bits of precision: +// f. Dequant[ i ][ j ] is set equal to +// Clip3( - ( 1 << ( 7 + BitDepth ) ), ( 1 << ( 7 + BitDepth ) ) - 1, dq2 ). +// +// The inputs of column transforms are representable by a signed integer using +// Max( BitDepth + 6, 16 ) bits of precision: +// Set the variable colClampRange equal to Max( BitDepth + 6, 16 ). +// ... +// Between the row and column transforms, Residual[ i ][ j ] is set equal to +// Clip3( - ( 1 << ( colClampRange - 1 ) ), +// ( 1 << (colClampRange - 1 ) ) - 1, +// Residual[ i ][ j ] ) +// for i = 0..(h-1), for j = 0..(w-1). +// +// Output Range +// +// The outputs of row transforms are representable by a signed integer using +// 8 + BitDepth + 1 = 9 + BitDepth bits of precision, because the net effect +// of the multiplicative factor of inverse identity transforms minus the +// smallest row shift is an increase of at most one bit. +// +// Transform | Multiplicative factor | Smallest row | Net increase +// width | (in bits) | shift | in bits +// --------------------------------------------------------------- +// 4 | sqrt(2) (0.5 bits) | 0 | +0.5 +// 8 | 2 (1 bit) | 0 | +1 +// 16 | 2*sqrt(2) (1.5 bits) | 1 | +0.5 +// 32 | 4 (2 bits) | 1 | +1 +// +// If BitDepth is 8 and Residual is an int16_t array, to avoid truncation we +// clip the outputs (which have 17 bits of precision) to the range of int16_t +// before storing them in the Residual array. This clipping happens to be the +// same as the required clipping after the row transform (see the spec quoted +// above), so we remain compliant with the spec. (In this case, +// TransformLoop_C() skips clipping the outputs of row transforms to avoid +// duplication of effort.) +// +// The outputs of column transforms are representable by a signed integer using +// Max( BitDepth + 6, 16 ) + 2 - 4 = Max( BitDepth + 4, 14 ) bits of precision, +// because the multiplicative factor of inverse identity transforms is at most +// 4 (2 bits) and |shift| is always 4. + +template <typename Residual> +void Identity4Row_C(void* dest, int8_t shift) { + assert(shift == 0 || shift == 1); + auto* const dst = static_cast<Residual*>(dest); + // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding| + // should be (1 + (1 << 1)) << 11. The following expression works for both + // values of |shift|. + const int32_t rounding = (1 + (shift << 1)) << 11; + for (int i = 0; i < 4; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity4Multiplier as int32_t. + int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast<Residual>(dst_i); + } +} + +template <typename Residual> +void Identity4Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast<Residual*>(dest); + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + for (int i = 0; i < 4; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity4Multiplier as int32_t. + dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >> + (12 + kTransformColumnShift)); + } +}
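+ +// Worked example of the row rounding above (informative, assuming the usual +// value kIdentity4Multiplier == 5793, i.e. sqrt(2) in 12-bit fixed point): +// with shift == 0, an input of 1000 maps to +// (1000 * 5793 + 2048) >> 12 == 1414.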
+ +template <int bitdepth, typename Residual> +void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + const int32_t rounding = (1 + (row_shift << 1)) << 11; + int32_t dst_i = + (dst[0] * kIdentity4Multiplier + rounding) >> (12 + row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast<Residual>(dst_i); + + ClampIntermediate<bitdepth, Residual>(dst, 1); + return; + } + + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >> + (12 + kTransformColumnShift)); +} + +template <typename Residual> +void Identity8Row_C(void* dest, int8_t shift) { + assert(shift == 0 || shift == 1 || shift == 2); + auto* const dst = static_cast<Residual*>(dest); + for (int i = 0; i < 8; ++i) { + int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast<Residual>(dst_i); + } +} + +template <typename Residual> +void Identity8Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast<Residual*>(dest); + for (int i = 0; i < 8; ++i) { + dst[i] = static_cast<Residual>( + RightShiftWithRounding(dst[i], kTransformColumnShift - 1)); + } +} + +template <int bitdepth, typename Residual> +void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[0]), row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast<Residual>(dst_i); + + // If Residual is int16_t (which implies bitdepth is 8), we don't need to + // clip residual[i][j] to 16 bits. + if (sizeof(Residual) > 2) { + const Residual intermediate_clamp_max = + (1 << (std::max(bitdepth + 6, 16) - 1)) - 1; + const Residual intermediate_clamp_min = -intermediate_clamp_max - 1; + dst[0] = Clip3(dst[0], intermediate_clamp_min, intermediate_clamp_max); + } + return; + } + + dst[0] = static_cast<Residual>( + RightShiftWithRounding(dst[0], kTransformColumnShift - 1)); +}
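+ +// Design note (informative): the width-8 identity transform is an exact +// multiplication by 2, so rather than using a 12-bit multiplier the column +// path folds the factor into the shift and uses +// kTransformColumnShift - 1 == 3.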
+ +template <typename Residual> +void Identity16Row_C(void* dest, int8_t shift) { + assert(shift == 1 || shift == 2); + auto* const dst = static_cast<Residual*>(dest); + const int32_t rounding = (1 + (1 << shift)) << 11; + for (int i = 0; i < 16; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity16Multiplier as int32_t. + int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast<Residual>(dst_i); + } +} + +template <typename Residual> +void Identity16Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast<Residual*>(dest); + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + for (int i = 0; i < 16; ++i) { + // The intermediate value here will have to fit into an int32_t for it to be + // bitstream conformant. The multiplication is promoted to int32_t by + // defining kIdentity16Multiplier as int32_t. + dst[i] = + static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >> + (12 + kTransformColumnShift)); + } +} + +template <int bitdepth, typename Residual> +void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + const int32_t rounding = (1 + (1 << row_shift)) << 11; + int32_t dst_i = + (dst[0] * kIdentity16Multiplier + rounding) >> (12 + row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast<Residual>(dst_i); + + ClampIntermediate<bitdepth, Residual>(dst, 1); + return; + } + + const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11; + dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >> + (12 + kTransformColumnShift)); +} + +template <typename Residual> +void Identity32Row_C(void* dest, int8_t shift) { + assert(shift == 1 || shift == 2); + auto* const dst = static_cast<Residual*>(dest); + for (int i = 0; i < 32; ++i) { + int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[i] = static_cast<Residual>(dst_i); + } +} + +template <typename Residual> +void Identity32Column_C(void* dest, int8_t /*shift*/) { + auto* const dst = static_cast<Residual*>(dest); + for (int i = 0; i < 32; ++i) { + dst[i] = static_cast<Residual>( + RightShiftWithRounding(dst[i], kTransformColumnShift - 2)); + } +} + +template <int bitdepth, typename Residual> +void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round, + int row_shift, bool is_row) { + auto* const dst = static_cast<Residual*>(dest); + + if (is_row) { + if (should_round) { + dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12); + } + + int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[0]), row_shift); + if (sizeof(Residual) == 2) { + dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX); + } + dst[0] = static_cast<Residual>(dst_i); + + ClampIntermediate<bitdepth, Residual>(dst, 1); + return; + } + + dst[0] = static_cast<Residual>( + RightShiftWithRounding(dst[0], kTransformColumnShift - 2)); +} + +//------------------------------------------------------------------------------ +// Walsh Hadamard Transform. + +template <typename Residual> +void Wht4_C(void* dest, int8_t shift) { + auto* const dst = static_cast<Residual*>(dest); + Residual temp[4]; + temp[0] = dst[0] >> shift; + temp[2] = dst[1] >> shift; + temp[3] = dst[2] >> shift; + temp[1] = dst[3] >> shift; + temp[0] += temp[2]; + temp[3] -= temp[1]; + // This signed right shift must be an arithmetic shift. + Residual e = (temp[0] - temp[3]) >> 1; + dst[1] = e - temp[1]; + dst[2] = e - temp[2]; + dst[0] = temp[0] - dst[1]; + dst[3] = temp[3] + dst[2]; +} + +template <int bitdepth, typename Residual> +void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/, + int /*row_shift*/, bool /*is_row*/) { + auto* const dst = static_cast<Residual*>(dest); + const int shift = range; + + Residual temp = dst[0] >> shift; + // This signed right shift must be an arithmetic shift. + Residual e = temp >> 1; + dst[0] = temp - e; + dst[1] = e; + dst[2] = e; + dst[3] = e; + + ClampIntermediate<bitdepth, Residual>(dst, 4); +}
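+ +// Informative sketch of the DC-only path above: with dst[0] == 8 and range +// (the shift) == 2, temp == 2 and e == 1, so the outputs become {1, 1, 1, 1}.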
+ +//------------------------------------------------------------------------------ +// row/column transform loop + +using InverseTransform1DFunc = void (*)(void* dst, int8_t range); +using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range, + bool should_round, int row_shift, + bool is_row); + +template <int bitdepth, typename Residual, typename Pixel, + Transform1D transform1d_type, + InverseTransformDcOnlyFunc dconly_transform1d, + InverseTransform1DFunc transform1d_func, bool is_row> +void TransformLoop_C(TransformType tx_type, TransformSize tx_size, + int adjusted_tx_height, void* src_buffer, int start_x, + int start_y, void* dst_frame) { + constexpr bool lossless = transform1d_type == k1DTransformWht; + constexpr bool is_identity = transform1d_type == k1DTransformIdentity; + // The transform size of the WHT is always 4x4. Setting tx_width and + // tx_height to the constant 4 for the WHT speeds the code up. + assert(!lossless || tx_size == kTransformSize4x4); + const int tx_width = lossless ? 4 : kTransformWidth[tx_size]; + const int tx_height = lossless ? 4 : kTransformHeight[tx_size]; + const int tx_width_log2 = kTransformWidthLog2[tx_size]; + const int tx_height_log2 = kTransformHeightLog2[tx_size]; + auto* frame = static_cast<Array2DView<Pixel>*>(dst_frame); + + // Initially this points to the dequantized values. After the transforms are + // applied, this buffer contains the residual. + Array2DView<Residual> residual(tx_height, tx_width, + static_cast<Residual*>(src_buffer)); + + if (is_row) { + // Row transform. + const uint8_t row_shift = lossless ? 0 : kTransformRowShift[tx_size]; + // This is the |range| parameter of the InverseTransform1DFunc. For lossy + // transforms, this will be equal to the clamping range. + const int8_t row_clamp_range = lossless ? 2 : (bitdepth + 8); + // If the width:height ratio of the transform size is 2:1 or 1:2, multiply + // the input to the row transform by 1 / sqrt(2), which is approximated by + // the fraction 2896 / 2^12. + const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1; + + if (adjusted_tx_height == 1) { + dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift, + true); + return; + } + + // Row transforms need to be done only up to 32 because the rest of the rows + // are always all zero if |tx_height| is 64. Otherwise, only process the + // rows that have non-zero coefficients. + for (int i = 0; i < adjusted_tx_height; ++i) { + // If lossless, the transform size is 4x4, so should_round is false. + if (!lossless && should_round) { + // The last 32 values of every row are always zero if the |tx_width| is + // 64. + for (int j = 0; j < std::min(tx_width, 32); ++j) { + residual[i][j] = RightShiftWithRounding( + residual[i][j] * kTransformRowMultiplier, 12); + } + } + // For identity transform, |transform1d_func| also performs the + // Round2(T[j], rowShift) call in the spec. + transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range); + if (!lossless && !is_identity && row_shift > 0) { + for (int j = 0; j < tx_width; ++j) { + residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift); + } + } + + ClampIntermediate<bitdepth, Residual>(residual[i], tx_width); + } + return; + }
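+ + // Usage sketch (informative): a caller is expected to run the row pass + // first and then the column pass over the same residual buffer, e.g. + // dsp.inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow]( + // tx_type, tx_size, adjusted_tx_height, residual, x, y, &frame); + // followed by the identical call through [kColumn].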
+ + assert(!is_row); + constexpr uint8_t column_shift = lossless ? 0 : kTransformColumnShift; + // This is the |range| parameter of the InverseTransform1DFunc. For lossy + // transforms, this will be equal to the clamping range. + const int8_t column_clamp_range = lossless ? 0 : std::max(bitdepth + 6, 16); + const bool flip_rows = transform1d_type == k1DTransformAdst && + kTransformFlipRowsMask.Contains(tx_type); + const bool flip_columns = + !lossless && kTransformFlipColumnsMask.Contains(tx_type); + const int min_value = 0; + const int max_value = (1 << bitdepth) - 1; + // Note: 64 is the maximum size of a 1D transform buffer (the largest + // transform size is kTransformSize64x64). + Residual tx_buffer[64]; + for (int j = 0; j < tx_width; ++j) { + const int flipped_j = flip_columns ? tx_width - j - 1 : j; + for (int i = 0; i < tx_height; ++i) { + tx_buffer[i] = residual[i][flipped_j]; + } + if (adjusted_tx_height == 1) { + dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false); + } else { + // For identity transform, |transform1d_func| also performs the + // Round2(T[i], colShift) call in the spec. + transform1d_func(tx_buffer, + is_identity ? column_shift : column_clamp_range); + } + const int x = start_x + j; + for (int i = 0; i < tx_height; ++i) { + const int y = start_y + i; + const int index = flip_rows ? tx_height - i - 1 : i; + Residual residual_value = tx_buffer[index]; + if (!lossless && !is_identity) { + residual_value = RightShiftWithRounding(residual_value, column_shift); + } + (*frame)[y][x] = + Clip3((*frame)[y][x] + residual_value, min_value, max_value); + } + } +} + +//------------------------------------------------------------------------------
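+ +// Informative: inverse_transforms is indexed as +// [1D transform type][1D transform size][kRow/kColumn]; InitAll below fills +// every entry with the generic TransformLoop_C instantiation for one +// bitdepth/Residual/Pixel combination.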
+ +template <int bitdepth, typename Residual, typename Pixel> +void InitAll(Dsp* const dsp) { + // Maximum transform size for Dct is 64. + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>, + /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>, + /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>, + /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>, + /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct, + DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>, + /*is_row=*/false>; + + // Maximum transform size for Adst is 16. + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst, + Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst, + Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>, + /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst, + Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst, + Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>, + /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst, + Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst, + Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>, + /*is_row=*/false>; + + // Maximum transform size for Identity transform is 32. + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity4DcOnly_C<bitdepth, Residual>, + Identity4Row_C<Residual>, /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity4DcOnly_C<bitdepth, Residual>, + Identity4Column_C<Residual>, /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity8DcOnly_C<bitdepth, Residual>, + Identity8Row_C<Residual>, /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity8DcOnly_C<bitdepth, Residual>, + Identity8Column_C<Residual>, /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity16DcOnly_C<bitdepth, Residual>, + Identity16Row_C<Residual>, /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity16DcOnly_C<bitdepth, Residual>, + Identity16Column_C<Residual>, /*is_row=*/false>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity32DcOnly_C<bitdepth, Residual>, + Identity32Row_C<Residual>, /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity, + Identity32DcOnly_C<bitdepth, Residual>, + Identity32Column_C<Residual>, /*is_row=*/false>; + + // Maximum transform size for Wht is 4. + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht, + Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = + TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht, + Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>, + /*is_row=*/false>; +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + for (auto& inverse_transform_by_size : dsp->inverse_transforms) { + for (auto& inverse_transform : inverse_transform_by_size) { + inverse_transform[kRow] = nullptr; + inverse_transform[kColumn] = nullptr; + } + } +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + InitAll<8, int16_t, uint8_t>(dsp); +#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>, + /*is_row=*/false>; +#endif
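+// Informative: each LIBGAV1_Dsp8bpp_* / LIBGAV1_Dsp10bpp_* macro below is +// defined by an optimization header (e.g. to LIBGAV1_CPU_NEON) when a +// specialized implementation of that function exists, which suppresses the +// matching C registration; building with LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +// installs the C versions unconditionally.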
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct, + DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst, + Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst, + Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst, + Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst, + Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst, + Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst, + Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>, + /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity16DcOnly_C<8, int16_t>, + Identity16Column_C<int16_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>, + /*is_row=*/true>; +
dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity, + Identity32DcOnly_C<8, int16_t>, + Identity32Column_C<int16_t>, /*is_row=*/false>; +#endif +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht, + Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>, + /*is_row=*/true>; + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = + TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht, + Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>, + /*is_row=*/false>; +#endif +#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +}
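+ +// Informative: the 10bpp table below uses int32_t residuals and uint16_t +// pixels because, per the range notes above, 10-bit intermediates can exceed +// the int16_t range that suffices for 8-bit streams.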
+                      Adst4DcOnly_C<10, int32_t>, Adst4_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst4DcOnly_C<10, int32_t>, Adst4_C,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst8DcOnly_C<10, int32_t>, Adst8_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst8DcOnly_C<10, int32_t>, Adst8_C,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst16DcOnly_C<10, int32_t>, Adst16_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst16DcOnly_C<10, int32_t>, Adst16_C,
+                      /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity4DcOnly_C<10, int32_t>, Identity4Row_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity4DcOnly_C<10, int32_t>,
+                      Identity4Column_C, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity8DcOnly_C<10, int32_t>, Identity8Row_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity8DcOnly_C<10, int32_t>,
+                      Identity8Column_C, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity16DcOnly_C<10, int32_t>, Identity16Row_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity16DcOnly_C<10, int32_t>,
+                      Identity16Column_C, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformIdentity
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity32DcOnly_C<10, int32_t>, Identity32Row_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity32DcOnly_C<10, int32_t>,
+                      Identity32Column_C, /*is_row=*/false>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformWht
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+                      Wht4DcOnly_C<10, int32_t>, Wht4_C,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+                      Wht4DcOnly_C<10, int32_t>, Wht4_C,
+                      /*is_row=*/false>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+void InverseTransformInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+
+  // Local functions that may be unused depending on the optimizations
+  // available.
+  static_cast<void>(RangeCheckValue);
+  static_cast<void>(kBitReverseLookup);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/inverse_transform.h b/src/dsp/inverse_transform.h
new file mode 100644
index 0000000..0916665
--- /dev/null
+++ b/src/dsp/inverse_transform.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+#define LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/inverse_transform_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/inverse_transform_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::inverse_transforms. This function is not thread-safe.
+void InverseTransformInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INVERSE_TRANSFORM_H_
diff --git a/src/dsp/inverse_transform.inc b/src/dsp/inverse_transform.inc
new file mode 100644
index 0000000..55e68b6
--- /dev/null
+++ b/src/dsp/inverse_transform.inc
@@ -0,0 +1,64 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for inverse transform implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// The value at index i is derived as: round(cos(pi * i / 128) * (1 << 12)).
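The derivation quoted in the comment above is easy to check mechanically. A minimal standalone sketch, not part of the patch (the file name and build line are illustrative only), that recomputes a few entries of the table that follows:

```cpp
// cos128_check.cc -- recompute kCos128 entries per the documented formula:
//   kCos128[i] = round(cos(pi * i / 128) * (1 << 12))
// Build: g++ -std=c++11 cos128_check.cc && ./a.out
#include <cmath>
#include <cstdio>

int main() {
  const double kPi = 3.14159265358979323846;
  // Spot-check against the table: indices 0, 1, 32 and 64 should give
  // 4096, 4095, 2896 and 0 respectively.
  const int indices[] = {0, 1, 32, 64};
  const long expected[] = {4096, 4095, 2896, 0};
  for (int j = 0; j < 4; ++j) {
    const int i = indices[j];
    const long derived = std::lround(std::cos(kPi * i / 128) * (1 << 12));
    std::printf("i=%2d derived=%4ld expected=%4ld\n", i, derived, expected[j]);
  }
  return 0;
}
```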
+constexpr int16_t kCos128[65] = {
+    4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+    3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+    3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+    2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+    1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+    897,  799,  700,  601,  501,  401,  301,  201,  101,  0};
+
+inline int16_t Cos128(int angle) {
+  angle &= 0xff;
+
+  // If |angle| is 128, this function returns -4096 (= -2^12), which will
+  // cause the 32-bit multiplications in ButterflyRotation() to overflow if
+  // dst[a] or dst[b] is -2^19 (a possible corner case when |range| is 20):
+  //
+  //   (-2^12) * (-2^19) = 2^31, which cannot be represented as an int32_t.
+  //
+  // Note: |range| is 20 when bitdepth is 12 and a row transform is performed.
+  //
+  // Assert that this angle is never used by DCT or ADST.
+  assert(angle != 128);
+  if (angle <= 64) return kCos128[angle];
+  if (angle <= 128) return -kCos128[128 - angle];
+  if (angle <= 192) return -kCos128[angle - 128];
+  return kCos128[256 - angle];
+}
+
+inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
+
+// The value for index i is derived as:
+// round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)).
+constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
+
+constexpr uint8_t kTransformRowShift[kNumTransformSizes] = {
+    0, 0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2};
+
+constexpr bool kShouldRound[kNumTransformSizes] = {
+    false, true,  false, true, false, true, false, false, true, false,
+    true,  false, false, true, false, true, false, true,  false};
+
+constexpr int16_t kIdentity4Multiplier /* round(2^12 * sqrt(2)) */ = 0x16A1;
+constexpr int16_t kIdentity4MultiplierFraction /* round(2^12 * (sqrt(2) - 1))*/
+    = 0x6A1;
+constexpr int16_t kIdentity16Multiplier /* 2 * round(2^12 * sqrt(2)) */ = 11586;
+constexpr int16_t kTransformRowMultiplier /* round(2^12 / sqrt(2)) */ = 2896;
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
new file mode 100644
index 0000000..960d5a7
--- /dev/null
+++ b/src/dsp/libgav1_dsp.cmake
@@ -0,0 +1,176 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
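The fixed-point multipliers at the end of inverse_transform.inc follow the derivations given in their inline comments. A small standalone sketch (again not part of the patch) that reproduces those values:

```cpp
// Reproduce the identity/row multiplier constants from their documented
// derivations: 0x16A1 = 5793, 0x6A1 = 1697, 11586, 2896.
#include <cmath>
#include <cstdio>

int main() {
  const double kSqrt2 = std::sqrt(2.0);
  // kIdentity4Multiplier: round(2^12 * sqrt(2)).
  std::printf("%ld (expect 5793)\n", std::lround((1 << 12) * kSqrt2));
  // kIdentity4MultiplierFraction: round(2^12 * (sqrt(2) - 1)).
  std::printf("%ld (expect 1697)\n", std::lround((1 << 12) * (kSqrt2 - 1)));
  // kIdentity16Multiplier: 2 * round(2^12 * sqrt(2)).
  std::printf("%ld (expect 11586)\n", 2 * std::lround((1 << 12) * kSqrt2));
  // kTransformRowMultiplier: round(2^12 / sqrt(2)).
  std::printf("%ld (expect 2896)\n", std::lround((1 << 12) / kSqrt2));
  return 0;
}
```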
+
+if(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_)
+  return()
+endif() # LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_
+set(LIBGAV1_SRC_DSP_LIBGAV1_DSP_CMAKE_ 1)
+
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+
+list(APPEND libgav1_dsp_sources
+            "${libgav1_source}/dsp/average_blend.cc"
+            "${libgav1_source}/dsp/average_blend.h"
+            "${libgav1_source}/dsp/cdef.cc"
+            "${libgav1_source}/dsp/cdef.h"
+            "${libgav1_source}/dsp/cdef.inc"
+            "${libgav1_source}/dsp/common.h"
+            "${libgav1_source}/dsp/constants.cc"
+            "${libgav1_source}/dsp/constants.h"
+            "${libgav1_source}/dsp/convolve.cc"
+            "${libgav1_source}/dsp/convolve.h"
+            "${libgav1_source}/dsp/convolve.inc"
+            "${libgav1_source}/dsp/distance_weighted_blend.cc"
+            "${libgav1_source}/dsp/distance_weighted_blend.h"
+            "${libgav1_source}/dsp/dsp.cc"
+            "${libgav1_source}/dsp/dsp.h"
+            "${libgav1_source}/dsp/film_grain.cc"
+            "${libgav1_source}/dsp/film_grain.h"
+            "${libgav1_source}/dsp/film_grain_common.h"
+            "${libgav1_source}/dsp/intra_edge.cc"
+            "${libgav1_source}/dsp/intra_edge.h"
+            "${libgav1_source}/dsp/intrapred.cc"
+            "${libgav1_source}/dsp/intrapred.h"
+            "${libgav1_source}/dsp/inverse_transform.cc"
+            "${libgav1_source}/dsp/inverse_transform.h"
+            "${libgav1_source}/dsp/inverse_transform.inc"
+            "${libgav1_source}/dsp/loop_filter.cc"
+            "${libgav1_source}/dsp/loop_filter.h"
+            "${libgav1_source}/dsp/loop_restoration.cc"
+            "${libgav1_source}/dsp/loop_restoration.h"
+            "${libgav1_source}/dsp/mask_blend.cc"
+            "${libgav1_source}/dsp/mask_blend.h"
+            "${libgav1_source}/dsp/motion_field_projection.cc"
+            "${libgav1_source}/dsp/motion_field_projection.h"
+            "${libgav1_source}/dsp/motion_vector_search.cc"
+            "${libgav1_source}/dsp/motion_vector_search.h"
+            "${libgav1_source}/dsp/obmc.cc"
+            "${libgav1_source}/dsp/obmc.h"
+            "${libgav1_source}/dsp/obmc.inc"
+            "${libgav1_source}/dsp/super_res.cc"
+            "${libgav1_source}/dsp/super_res.h"
+            "${libgav1_source}/dsp/warp.cc"
+            "${libgav1_source}/dsp/warp.h"
+            "${libgav1_source}/dsp/weight_mask.cc"
+            "${libgav1_source}/dsp/weight_mask.h")
+
+list(APPEND libgav1_dsp_sources_avx2
+            ${libgav1_dsp_sources_avx2}
+            "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+            "${libgav1_source}/dsp/x86/convolve_avx2.h"
+            "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
+list(APPEND libgav1_dsp_sources_neon
+            ${libgav1_dsp_sources_neon}
+            "${libgav1_source}/dsp/arm/average_blend_neon.cc"
+            "${libgav1_source}/dsp/arm/average_blend_neon.h"
+            "${libgav1_source}/dsp/arm/cdef_neon.cc"
+            "${libgav1_source}/dsp/arm/cdef_neon.h"
+            "${libgav1_source}/dsp/arm/common_neon.h"
+            "${libgav1_source}/dsp/arm/convolve_neon.cc"
+            "${libgav1_source}/dsp/arm/convolve_neon.h"
+            "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.cc"
+            "${libgav1_source}/dsp/arm/distance_weighted_blend_neon.h"
+            "${libgav1_source}/dsp/arm/film_grain_neon.cc"
+            "${libgav1_source}/dsp/arm/film_grain_neon.h"
+            "${libgav1_source}/dsp/arm/intra_edge_neon.cc"
+            "${libgav1_source}/dsp/arm/intra_edge_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+            "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
+            "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
+            "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
"${libgav1_source}/dsp/arm/loop_filter_neon.h" + "${libgav1_source}/dsp/arm/loop_restoration_neon.cc" + "${libgav1_source}/dsp/arm/loop_restoration_neon.h" + "${libgav1_source}/dsp/arm/mask_blend_neon.cc" + "${libgav1_source}/dsp/arm/mask_blend_neon.h" + "${libgav1_source}/dsp/arm/motion_field_projection_neon.cc" + "${libgav1_source}/dsp/arm/motion_field_projection_neon.h" + "${libgav1_source}/dsp/arm/motion_vector_search_neon.cc" + "${libgav1_source}/dsp/arm/motion_vector_search_neon.h" + "${libgav1_source}/dsp/arm/obmc_neon.cc" + "${libgav1_source}/dsp/arm/obmc_neon.h" + "${libgav1_source}/dsp/arm/super_res_neon.cc" + "${libgav1_source}/dsp/arm/super_res_neon.h" + "${libgav1_source}/dsp/arm/warp_neon.cc" + "${libgav1_source}/dsp/arm/warp_neon.h" + "${libgav1_source}/dsp/arm/weight_mask_neon.cc" + "${libgav1_source}/dsp/arm/weight_mask_neon.h") + +list(APPEND libgav1_dsp_sources_sse4 + ${libgav1_dsp_sources_sse4} + "${libgav1_source}/dsp/x86/average_blend_sse4.cc" + "${libgav1_source}/dsp/x86/average_blend_sse4.h" + "${libgav1_source}/dsp/x86/common_sse4.h" + "${libgav1_source}/dsp/x86/cdef_sse4.cc" + "${libgav1_source}/dsp/x86/cdef_sse4.h" + "${libgav1_source}/dsp/x86/convolve_sse4.cc" + "${libgav1_source}/dsp/x86/convolve_sse4.h" + "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc" + "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h" + "${libgav1_source}/dsp/x86/intra_edge_sse4.cc" + "${libgav1_source}/dsp/x86/intra_edge_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_sse4.h" + "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc" + "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc" + "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc" + "${libgav1_source}/dsp/x86/inverse_transform_sse4.h" + "${libgav1_source}/dsp/x86/loop_filter_sse4.cc" + "${libgav1_source}/dsp/x86/loop_filter_sse4.h" + "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc" + "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc" + "${libgav1_source}/dsp/x86/loop_restoration_sse4.h" + "${libgav1_source}/dsp/x86/mask_blend_sse4.cc" + "${libgav1_source}/dsp/x86/mask_blend_sse4.h" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.cc" + "${libgav1_source}/dsp/x86/motion_field_projection_sse4.h" + "${libgav1_source}/dsp/x86/motion_vector_search_sse4.cc" + "${libgav1_source}/dsp/x86/motion_vector_search_sse4.h" + "${libgav1_source}/dsp/x86/obmc_sse4.cc" + "${libgav1_source}/dsp/x86/obmc_sse4.h" + "${libgav1_source}/dsp/x86/super_res_sse4.cc" + "${libgav1_source}/dsp/x86/super_res_sse4.h" + "${libgav1_source}/dsp/x86/transpose_sse4.h" + "${libgav1_source}/dsp/x86/warp_sse4.cc" + "${libgav1_source}/dsp/x86/warp_sse4.h" + "${libgav1_source}/dsp/x86/weight_mask_sse4.cc" + "${libgav1_source}/dsp/x86/weight_mask_sse4.h") + +macro(libgav1_add_dsp_targets) + unset(dsp_sources) + list(APPEND dsp_sources ${libgav1_dsp_sources} + ${libgav1_dsp_sources_neon} + ${libgav1_dsp_sources_avx2} + ${libgav1_dsp_sources_sse4}) + + libgav1_add_library(NAME + libgav1_dsp + TYPE + OBJECT + SOURCES + ${dsp_sources} + DEFINES + ${libgav1_defines} + $<$:LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS> + INCLUDES + ${libgav1_include_paths}) +endmacro() diff --git a/src/dsp/loop_filter.cc b/src/dsp/loop_filter.cc new file mode 100644 index 0000000..6cad97d --- /dev/null +++ b/src/dsp/loop_filter.cc @@ -0,0 +1,616 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// 7.14.6.1.
+template <int bitdepth, typename Pixel>
+struct LoopFilterFuncs_C {
+  LoopFilterFuncs_C() = delete;
+
+  static constexpr int kMaxPixel = (1 << bitdepth) - 1;
+  static constexpr int kMinSignedPixel = -(1 << (bitdepth - 1));
+  static constexpr int kMaxSignedPixel = (1 << (bitdepth - 1)) - 1;
+  static constexpr int kFlatThresh = 1 << (bitdepth - 8);
+
+  static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh,
+                        int inner_thresh, int hev_thresh);
+  static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh,
+                          int inner_thresh, int hev_thresh);
+  static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh,
+                         int inner_thresh, int hev_thresh);
+  static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh,
+                           int inner_thresh, int hev_thresh);
+};
+
+inline void AdjustThresholds(const int bitdepth, int* const outer_thresh,
+                             int* const inner_thresh, int* const hev_thresh) {
+  *outer_thresh <<= bitdepth - 8;
+  *inner_thresh <<= bitdepth - 8;
+  *hev_thresh <<= bitdepth - 8;
+}
+
+//------------------------------------------------------------------------------
+// 4-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter4(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step];
+  return std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool Hev(const Pixel* p, ptrdiff_t step, int thresh) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (std::abs(p1 - p0) > thresh) || (std::abs(q1 - q0) > thresh);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 2 pixels out.
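+// Filter2 is the high-edge-variance path: rounded variants of a single
+// clamped delta, derived from 3 * (q0 - p0) plus the clamped difference
+// (p1 - q1), are added to p0 and subtracted from q0; p1 and q1 are left
+// unchanged.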
+template <int bitdepth, typename Pixel>
+inline void Filter2_C(Pixel* p, ptrdiff_t step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int min_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+  const int max_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+  // 8bpp: [-893,892], 10bpp: [-3581,3580], 12bpp [-14333,14332]
+  const int a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+  // 8bpp: [-16,15], 10bpp: [-64,63], 12bpp: [-256,255]
+  const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+  p[-step] = Clip3(p0 + a2, 0, max_unsigned_val);
+  p[0] = Clip3(q0 - a1, 0, max_unsigned_val);
+}
+
+// 7.14.6.3.
+// 4 pixels in, 4 pixels out.
+template <int bitdepth, typename Pixel>
+inline void Filter4_C(Pixel* p, ptrdiff_t step) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int min_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMinSignedPixel;
+  const int max_signed_val =
+      LoopFilterFuncs_C<bitdepth, Pixel>::kMaxSignedPixel;
+  const int a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  const int a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+  const int a3 = (a1 + 1) >> 1;
+  const int max_unsigned_val = LoopFilterFuncs_C<bitdepth, Pixel>::kMaxPixel;
+  p[-2 * step] = Clip3(p1 + a3, 0, max_unsigned_val);
+  p[-1 * step] = Clip3(p0 + a2, 0, max_unsigned_val);
+  p[0 * step] = Clip3(q0 - a1, 0, max_unsigned_val);
+  p[1 * step] = Clip3(q1 - a3, 0, max_unsigned_val);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical4(void* dest, ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter4(dst, 1, outer_thresh, inner_thresh)) {
+      if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal4(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter4(dst, stride, outer_thresh, inner_thresh)) {
+      if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 5-tap (chroma) filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter6(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  return std::abs(p2 - p1) <= inner_thresh &&
+         std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(q2 - q1) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat3(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+         std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter6(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 6 pixels in, 4 pixels out.
+template <typename Pixel>
+inline void Filter6_C(Pixel* p, ptrdiff_t step) {
+  const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
+  const int a1 = 2 * p1;
+  const int a0 = 2 * p0;
+  const int b0 = 2 * q0;
+  const int b1 = 2 * q1;
+  // The max is 8 * max_pixel + 4 for the rounder.
+  // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+  p[-2 * step] = ApplyFilter6<Pixel>(3 * p2 + a1 + a0 + q0);
+  p[-1 * step] = ApplyFilter6<Pixel>(p2 + a1 + a0 + b0 + q1);
+  p[0 * step] = ApplyFilter6<Pixel>(p1 + a0 + b0 + b1 + q2);
+  p[1 * step] = ApplyFilter6<Pixel>(p0 + b0 + b1 + 3 * q2);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical6(void* dest, ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter6(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat3(dst, 1, flat_thresh)) {
+        Filter6_C(dst, 1);
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal6(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter6(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat3(dst, stride, flat_thresh)) {
+        Filter6_C(dst, stride);
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 7-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool NeedsFilter8(const Pixel* p, ptrdiff_t step, int outer_thresh,
+                         int inner_thresh) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  return std::abs(p3 - p2) <= inner_thresh &&
+         std::abs(p2 - p1) <= inner_thresh &&
+         std::abs(p1 - p0) <= inner_thresh &&
+         std::abs(q1 - q0) <= inner_thresh &&
+         std::abs(q2 - q1) <= inner_thresh &&
+         std::abs(q3 - q2) <= inner_thresh &&
+         std::abs(p0 - q0) * 2 + std::abs(p1 - q1) / 2 <= outer_thresh;
+}
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlat4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  return std::abs(p1 - p0) <= flat_thresh && std::abs(q1 - q0) <= flat_thresh &&
+         std::abs(p2 - p0) <= flat_thresh && std::abs(q2 - q0) <= flat_thresh &&
+         std::abs(p3 - p0) <= flat_thresh && std::abs(q3 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter8(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 3));
+}
+
+// 7.14.6.4.
+// 8 pixels in, 6 pixels out.
+template <typename Pixel>
+inline void Filter8_C(Pixel* p, ptrdiff_t step) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  // The max is 8 * max_pixel + 4 for the rounder.
+  // 8bpp: 2044 (11 bits), 10bpp: 8188 (13 bits), 12bpp: 32764 (15 bits)
+  p[-3 * step] = ApplyFilter8<Pixel>(3 * p3 + 2 * p2 + p1 + p0 + q0);
+  p[-2 * step] = ApplyFilter8<Pixel>(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
+  p[-1 * step] = ApplyFilter8<Pixel>(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
+  p[0 * step] = ApplyFilter8<Pixel>(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3);
+  p[1 * step] = ApplyFilter8<Pixel>(p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3);
+  p[2 * step] = ApplyFilter8<Pixel>(p0 + q0 + q1 + 2 * q2 + 3 * q3);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical8(void* dest, ptrdiff_t stride,
+                                                   int outer_thresh,
+                                                   int inner_thresh,
+                                                   int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, 1, flat_thresh)) {
+        Filter8_C(dst, 1);
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal8(void* dest,
+                                                     ptrdiff_t stride,
+                                                     int outer_thresh,
+                                                     int inner_thresh,
+                                                     int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, stride, flat_thresh)) {
+        Filter8_C(dst, stride);
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 13-tap filters
+
+// 7.14.6.2.
+template <typename Pixel>
+inline bool IsFlatOuter4(const Pixel* p, ptrdiff_t step, int flat_thresh) {
+  const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+  return std::abs(p4 - p0) <= flat_thresh && std::abs(q4 - q0) <= flat_thresh &&
+         std::abs(p5 - p0) <= flat_thresh && std::abs(q5 - q0) <= flat_thresh &&
+         std::abs(p6 - p0) <= flat_thresh && std::abs(q6 - q0) <= flat_thresh;
+}
+
+template <typename Pixel>
+inline Pixel ApplyFilter14(int filter_value) {
+  return static_cast<Pixel>(RightShiftWithRounding(filter_value, 4));
+}
+
+// 7.14.6.4.
+// 14 pixels in, 12 pixels out.
+template <typename Pixel>
+inline void Filter14_C(Pixel* p, ptrdiff_t step) {
+  const int p6 = p[-7 * step], p5 = p[-6 * step], p4 = p[-5 * step],
+            p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step],
+            p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step],
+            q4 = p[4 * step], q5 = p[5 * step], q6 = p[6 * step];
+  // The max is 16 * max_pixel + 8 for the rounder.
+  // 8bpp: 4088 (12 bits), 10bpp: 16376 (14 bits), 12bpp: 65528 (16 bits)
+  p[-6 * step] =
+      ApplyFilter14<Pixel>(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0);
+  p[-5 * step] = ApplyFilter14<Pixel>(p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 +
+                                      p1 + p0 + q0 + q1);
+  p[-4 * step] = ApplyFilter14<Pixel>(p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 +
+                                      p1 + p0 + q0 + q1 + q2);
+  p[-3 * step] = ApplyFilter14<Pixel>(p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                      p1 * 2 + p0 + q0 + q1 + q2 + q3);
+  p[-2 * step] = ApplyFilter14<Pixel>(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                      p0 * 2 + q0 + q1 + q2 + q3 + q4);
+  p[-1 * step] = ApplyFilter14<Pixel>(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                      q0 * 2 + q1 + q2 + q3 + q4 + q5);
+  p[0 * step] = ApplyFilter14<Pixel>(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                     q1 * 2 + q2 + q3 + q4 + q5 + q6);
+  p[1 * step] = ApplyFilter14<Pixel>(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                     q2 * 2 + q3 + q4 + q5 + q6 * 2);
+  p[2 * step] = ApplyFilter14<Pixel>(p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                     q3 * 2 + q4 + q5 + q6 * 3);
+  p[3 * step] = ApplyFilter14<Pixel>(p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                     q4 * 2 + q5 + q6 * 4);
+  p[4 * step] = ApplyFilter14<Pixel>(p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                     q5 * 2 + q6 * 5);
+  p[5 * step] =
+      ApplyFilter14<Pixel>(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7);
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Vertical14(void* dest,
+                                                    ptrdiff_t stride,
+                                                    int outer_thresh,
+                                                    int inner_thresh,
+                                                    int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, 1, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, 1, flat_thresh)) {
+        if (IsFlatOuter4(dst, 1, flat_thresh)) {
+          Filter14_C(dst, 1);
+        } else {
+          Filter8_C(dst, 1);
+        }
+      } else if (Hev(dst, 1, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, 1);
+      } else {
+        Filter4_C<bitdepth>(dst, 1);
+      }
+    }
+    dst += stride;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void LoopFilterFuncs_C<bitdepth, Pixel>::Horizontal14(void* dest,
+                                                      ptrdiff_t stride,
+                                                      int outer_thresh,
+                                                      int inner_thresh,
+                                                      int hev_thresh) {
+  const int flat_thresh = LoopFilterFuncs_C<bitdepth, Pixel>::kFlatThresh;
+  AdjustThresholds(bitdepth, &outer_thresh, &inner_thresh, &hev_thresh);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  for (int i = 0; i < 4; ++i) {
+    if (NeedsFilter8(dst, stride, outer_thresh, inner_thresh)) {
+      if (IsFlat4(dst, stride, flat_thresh)) {
+        if (IsFlatOuter4(dst, stride, flat_thresh)) {
+          Filter14_C(dst, stride);
+        } else {
+          Filter8_C(dst, stride);
+        }
+      } else if (Hev(dst, stride, hev_thresh)) {
+        Filter2_C<bitdepth>(dst, stride);
+      } else {
+        Filter4_C<bitdepth>(dst, stride);
+      }
+    }
+    ++dst;
+  }
+}
+
+using Defs8bpp = LoopFilterFuncs_C<8, uint8_t>;
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical14;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs8bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs8bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using Defs10bpp = LoopFilterFuncs_C<10, uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal4;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical4;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal6;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical6;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal8;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical8;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+      Defs10bpp::Horizontal14;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical
+  dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+      Defs10bpp::Vertical14;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+void LoopFilterInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+  // Local functions that may be unused depending on the optimizations
+  // available.
+  static_cast<void>(AdjustThresholds);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_filter.h b/src/dsp/loop_filter.h
new file mode 100644
index 0000000..1ddad71
--- /dev/null
+++ b/src/dsp/loop_filter.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+#define LIBGAV1_SRC_DSP_LOOP_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_filter_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_filters. This function is not thread-safe.
+void LoopFilterInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_LOOP_FILTER_H_
diff --git a/src/dsp/loop_restoration.cc b/src/dsp/loop_restoration.cc
new file mode 100644
index 0000000..0909df0
--- /dev/null
+++ b/src/dsp/loop_restoration.cc
@@ -0,0 +1,936 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
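The self-guided path of this file starts from a 256-entry lookup table, kSgrMaLookup, whose comment (a few lines below) gives the exact integer recipe. A standalone sketch, not part of the patch, that regenerates the table from that recipe; kSgrProjSgrBits = 8 is an assumption made here to match the library's constant:

```cpp
// Regenerate kSgrMaLookup: ma = 256 - a2, with a2 as documented.
#include <cstdio>

int main() {
  const int kSgrProjSgrBits = 8;  // assumed value of the libgav1 constant
  for (int z = 0; z < 256; ++z) {
    int a2;  // range [1, 256]
    if (z >= 255) {
      a2 = 256;
    } else if (z == 0) {
      a2 = 1;
    } else {
      a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
    }
    std::printf("%3d%s", 256 - a2, (z % 16 == 15) ? ",\n" : ", ");
  }
  return 0;
}
```

For example, z = 1 gives a2 = 256 / 2 = 128 and ma = 128, matching the second table entry.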
+
+#include "src/dsp/loop_restoration.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Section 7.17.3.
+// a2: range [1, 256].
+// if (z >= 255)
+//   a2 = 256;
+// else if (z == 0)
+//   a2 = 1;
+// else
+//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
+// ma = 256 - a2;
+alignas(16) const uint8_t kSgrMaLookup[256] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
+    13,  13,  12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,  8,  8,  7,  7,
+    7,   7,   7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,   5,   4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+    4,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+    3,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+    2,   2,   2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+    1,   1,   1,  0};
+
+namespace {
+
+template <int bitdepth, typename Pixel>
+inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
+                             const int width, const int height,
+                             const int16_t* const filter,
+                             const int number_zero_coefficients,
+                             int16_t** wiener_buffer) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  constexpr int kRoundBitsHorizontal = (bitdepth == 12)
+                                           ? kInterRoundBitsHorizontal12bpp
+                                           : kInterRoundBitsHorizontal;
+  constexpr int offset =
+      1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  for (int y = 0; y < height; ++y) {
+    int x = 0;
+    do {
+      // sum fits into 16 bits only when bitdepth = 8.
+      int sum = 0;
+      for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+        sum +=
+            filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
+      }
+      sum += filter[kCenterTap] * source[x + kCenterTap];
+      const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
+      (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
+    } while (++x != width);
+    source += source_stride;
+    *wiener_buffer += width;
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void WienerVertical(const int16_t* wiener_buffer, const int width,
+                           const int height, const int16_t* const filter,
+                           const int number_zero_coefficients, void* const dest,
+                           const ptrdiff_t dest_stride) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  constexpr int kRoundBitsVertical =
+      (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
+  auto* dst = static_cast<Pixel*>(dest);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      // sum needs 32 bits.
+      int sum = 0;
+      for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
+        sum += filter[k] *
+               (wiener_buffer[k * width + x] +
+                wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
+      }
+      sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
+      const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
+      dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
+    } while (++x != width);
+    wiener_buffer += width;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+// Note: bit range for wiener filter.
+// Wiener filter process first applies horizontal filtering to input pixels,
+// followed by rounding with predefined bits (dependent on bitdepth).
+// Then vertical filtering is applied, followed by rounding (dependent on
+// bitdepth).
+// The process is the same as convolution:
+// <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
+// --> <rounding 1>
+// By design:
+// (a). horizontal/vertical filtering adds 7 bits to input.
+// (b). The output of first rounding fits into 16 bits.
+// (c). The output of second rounding fits into 16 bits.
+// If input bitdepth > 8, the accumulator of the horizontal filter is larger
+// than 16 bit and smaller than 32 bits.
+// The accumulator of the vertical filter is larger than 16 bits and smaller
+// than 32 bits.
+// Note: range of wiener filter coefficients.
+// Wiener filter coefficients are symmetric, and their sum is 1 (128).
+// The range of each coefficient:
+// filter[0] = filter[6], 4 bits, min = -5, max = 10.
+// filter[1] = filter[5], 5 bits, min = -23, max = 8.
+// filter[2] = filter[4], 6 bits, min = -17, max = 46.
+// filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
+// The difference from libaom is that in libaom:
+// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
+// Thus in libaom's computation, an offset of 128 is needed for filter[3].
+template <int bitdepth, typename Pixel>
+void WienerFilter_C(const RestorationUnitInfo& restoration_info,
+                    const void* const source, const void* const top_border,
+                    const void* const bottom_border, const ptrdiff_t stride,
+                    const int width, const int height,
+                    RestorationBuffer* const restoration_buffer,
+                    void* const dest) {
+  constexpr int kCenterTap = kWienerFilterTaps / 2;
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
+
+  // horizontal filtering.
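+  // In addition to the |height| rows of |source|, rows taken from
+  // |top_border| and |bottom_border| are run through the horizontal filter so
+  // that the vertical pass below has the context rows it needs; leading zero
+  // vertical coefficients reduce how many such rows are required.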
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const int16_t* const filter_horizontal =
+      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+  const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+  const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+  const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
+  auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
+
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontal<bitdepth>(top + (2 - height_extra) * stride, stride,
+                               width, height_extra, filter_horizontal, 0,
+                               &wiener_buffer);
+    WienerHorizontal<bitdepth>(src, stride, width, height,
+                               filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth>(bottom, stride, width, height_extra,
+                               filter_horizontal, 0, &wiener_buffer);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontal<bitdepth>(top + (2 - height_extra) * stride, stride,
+                               width, height_extra, filter_horizontal, 1,
+                               &wiener_buffer);
+    WienerHorizontal<bitdepth>(src, stride, width, height,
+                               filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth>(bottom, stride, width, height_extra,
+                               filter_horizontal, 1, &wiener_buffer);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    WienerHorizontal<bitdepth>(top + (2 - height_extra) * stride, stride,
+                               width, height_extra, filter_horizontal, 2,
+                               &wiener_buffer);
+    WienerHorizontal<bitdepth>(src, stride, width, height,
+                               filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth>(bottom, stride, width, height_extra,
+                               filter_horizontal, 2, &wiener_buffer);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontal<bitdepth>(top + (2 - height_extra) * stride, stride,
+                               width, height_extra, filter_horizontal, 3,
+                               &wiener_buffer);
+    WienerHorizontal<bitdepth>(src, stride, width, height,
+                               filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth>(bottom, stride, width, height_extra,
+                               filter_horizontal, 3, &wiener_buffer);
+  }
+
+  // vertical filtering.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and the
+    // bottom row of |source| is a duplicate of its above row, we can duplicate
+    // the top and bottom row of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer, wiener_buffer - width,
+           sizeof(*wiener_buffer) * width);
+    memcpy(wiener_buffer_org, wiener_buffer_org + width,
+           sizeof(*wiener_buffer) * width);
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 0, dest, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 1, dest, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 2, dest, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
+                                    filter_vertical, 3, dest, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// When |height| is 1, |src_stride| could be set to arbitrary value.
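+// Computes running box sums and box sums-of-squares over a |size|-wide
+// window: after seeding the first window, each step subtracts the pixel
+// leaving the window and adds the pixel entering it.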
+template <int size, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+                                  const int height, const int width,
+                                  uint16_t* const* sums,
+                                  uint32_t* const* square_sums) {
+  int y = height;
+  do {
+    uint32_t sum = 0;
+    uint32_t square_sum = 0;
+    for (int dx = 0; dx < size; ++dx) {
+      const Pixel source = src[dx];
+      sum += source;
+      square_sum += source * source;
+    }
+    (*sums)[0] = sum;
+    (*square_sums)[0] = square_sum;
+    int x = 1;
+    do {
+      const Pixel source0 = src[x - 1];
+      const Pixel source1 = src[x - 1 + size];
+      sum -= source0;
+      sum += source1;
+      square_sum -= source0 * source0;
+      square_sum += source1 * source1;
+      (*sums)[x] = sum;
+      (*square_sums)[x] = square_sum;
+    } while (++x != width);
+    src += src_stride;
+    ++sums;
+    ++square_sums;
+  } while (--y != 0);
+}
+
+// When |height| is 1, |src_stride| could be set to arbitrary value.
+template <typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
+                                  const int height, const int width,
+                                  uint16_t* const* sum3, uint16_t* const* sum5,
+                                  uint32_t* const* square_sum3,
+                                  uint32_t* const* square_sum5) {
+  int y = height;
+  do {
+    uint32_t sum = 0;
+    uint32_t square_sum = 0;
+    for (int dx = 0; dx < 4; ++dx) {
+      const Pixel source = src[dx];
+      sum += source;
+      square_sum += source * source;
+    }
+    int x = 0;
+    do {
+      const Pixel source0 = src[x];
+      const Pixel source1 = src[x + 4];
+      sum -= source0;
+      square_sum -= source0 * source0;
+      (*sum3)[x] = sum;
+      (*square_sum3)[x] = square_sum;
+      sum += source1;
+      square_sum += source1 * source1;
+      (*sum5)[x] = sum + source0;
+      (*square_sum5)[x] = square_sum + source0 * source0;
+    } while (++x != width);
+    src += src_stride;
+    ++sum3;
+    ++sum5;
+    ++square_sum3;
+    ++square_sum5;
+  } while (--y != 0);
+}
+
+template <int bitdepth, int n>
+inline void CalculateIntermediate(const uint32_t s, uint32_t a,
+                                  const uint32_t b, uint8_t* const ma_ptr,
+                                  uint32_t* const b_ptr) {
+  // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
+  // since max bitdepth = 12, max < 2^31.
+  // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
+  a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
+  // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
+  // d < 2^8 * n < 2^14 regardless of bitdepth
+  const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
+  // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+  // and p itself satisfies p < 2^14 * n^2 < 2^26.
+  // This bound on p is due to:
+  // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+  // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
+  // This is an artifact of rounding, and can only happen if all pixels
+  // are (almost) identical, so in this case we saturate to p=0.
+  const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
+  // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
+  // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
+  // (this holds even after accounting for the rounding in s)
+  const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
+  // ma: range [0, 255].
+  const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
+  const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  // ma < 2^8, b < 2^(bitdepth) * n,
+  // one_over_n = round(2^12 / n)
+  // => the product here is < 2^(20 + bitdepth) <= 2^32,
+  // and b is set to a value < 2^(8 + bitdepth).
+  // This holds even with the rounding in one_over_n and in the overall result,
+  // as long as ma is strictly less than 2^8.
+  const uint32_t b2 = ma * b * one_over_n;
+  *ma_ptr = ma;
+  *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
+}
+
+template <typename T>
+inline uint32_t Sum343(const T* const src) {
+  return 3 * (src[0] + src[2]) + 4 * src[1];
+}
+
+template <typename T>
+inline uint32_t Sum444(const T* const src) {
+  return 4 * (src[0] + src[1] + src[2]);
+}
+
+template <typename T>
+inline uint32_t Sum565(const T* const src) {
+  return 5 * (src[0] + src[2]) + 6 * src[1];
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
+    uint16_t* const ma565, uint32_t* const b565) {
+  int x = 0;
+  do {
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (int dy = 0; dy < 5; ++dy) {
+      a += square_sum5[dy][x];
+      b += sum5[dy][x];
+    }
+    CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
+                                        sgr_buffer->b + x);
+  } while (++x != width + 2);
+  x = 0;
+  do {
+    ma565[x] = Sum565(sgr_buffer->ma + x);
+    b565[x] = Sum565(sgr_buffer->b + x);
+  } while (++x != width);
+}
+
+template <int bitdepth>
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
+    const int width, const uint32_t s, const bool calculate444,
+    SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
+    uint16_t* const ma444, uint32_t* const b444) {
+  int x = 0;
+  do {
+    uint32_t a = 0;
+    uint32_t b = 0;
+    for (int dy = 0; dy < 3; ++dy) {
+      a += square_sum3[dy][x];
+      b += sum3[dy][x];
+    }
+    CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
+                                       sgr_buffer->b + x);
+  } while (++x != width + 2);
+  x = 0;
+  do {
+    ma343[x] = Sum343(sgr_buffer->ma + x);
+    b343[x] = Sum343(sgr_buffer->b + x);
+  } while (++x != width);
+  if (calculate444) {
+    x = 0;
+    do {
+      ma444[x] = Sum444(sgr_buffer->ma + x);
+      b444[x] = Sum444(sgr_buffer->b + x);
+    } while (++x != width);
+  }
+}
+
+template <typename Pixel>
+inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
+                                   const uint32_t b, const int shift) {
+  const int32_t v = b - ma * src;
+  return RightShiftWithRounding(v,
+                                kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <typename Pixel>
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+                                 const uint16_t* const ma565[2],
+                                 const uint32_t* const b565[2],
+                                 const ptrdiff_t x, int p[2]) {
+  p[0] = CalculateFilteredOutput(src0, ma565[0][x] + ma565[1][x],
+                                 b565[0][x] + b565[1][x], 5);
+  p[1] = CalculateFilteredOutput(src1, ma565[1][x], b565[1][x], 4);
+}
+
+template <typename Pixel>
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+                                const uint16_t* const ma444,
+                                const uint32_t* const b343[3],
+                                const uint32_t* const b444, const ptrdiff_t x) {
+  const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
+  const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
+  return CalculateFilteredOutput(src, ma, b, 5);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedFinal(const int src, const int v) {
+  // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
+  // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
+  // Then, range of s is bitdepth + 2. This is a rough estimation, taking the
+  // maximum value of each element.
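+  // With kSgrProjRestoreBits + kSgrProjPrecisionBits = 4 + 7 = 11, the shift
+  // below returns |v| to roughly pixel range (bitdepth + 2 bits) before the
+  // final clip.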
+  const int s = src + RightShiftWithRounding(
+                          v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
+                                        const int filter1, const int16_t w0,
+                                        const int16_t w2) {
+  const int v = w0 * filter0 + w2 * filter1;
+  return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
+                                        const int16_t w0) {
+  const int v = w0 * filter;
+  return SelfGuidedFinal<bitdepth, Pixel>(src, v);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+                           uint16_t* const sum5[5],
+                           uint32_t* const square_sum5[5], const int width,
+                           const uint32_t scale, const int16_t w0,
+                           SgrBuffer* const sgr_buffer,
+                           uint16_t* const ma565[2], uint32_t* const b565[2],
+                           Pixel* dst) {
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                 ma565[1], b565[1]);
+  int x = 0;
+  do {
+    int p[2];
+    BoxFilterPass1Kernel(src[x], src[stride + x], ma565, b565, x, p);
+    dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+    dst[stride + x] =
+        SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+  } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+                           const int width, const uint16_t scale,
+                           const int16_t w0, uint16_t* const sum3[4],
+                           uint32_t* const square_sum3[4],
+                           SgrBuffer* const sgr_buffer,
+                           uint16_t* const ma343[4], uint16_t* const ma444[3],
+                           uint32_t* const b343[4], uint32_t* const b444[3],
+                           Pixel* dst) {
+  BoxSum<3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+                                 sgr_buffer, ma343[2], b343[2], ma444[1],
+                                 b444[1]);
+  int x = 0;
+  do {
+    const int p =
+        BoxFilterPass2Kernel(src[x], ma343, ma444[0], b343, b444[0], x);
+    dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+  } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+                      uint16_t* const sum3[4], uint16_t* const sum5[5],
+                      uint32_t* const square_sum3[4],
+                      uint32_t* const square_sum5[5], const int width,
+                      const uint16_t scales[2], const int16_t w0,
+                      const int16_t w2, SgrBuffer* const sgr_buffer,
+                      uint16_t* const ma343[4], uint16_t* const ma444[3],
+                      uint16_t* const ma565[2], uint32_t* const b343[4],
+                      uint32_t* const b444[3], uint32_t* const b565[2],
+                      Pixel* dst) {
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                 sgr_buffer, ma565[1], b565[1]);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+                                 sgr_buffer, ma343[2], b343[2], ma444[1],
+                                 b444[1]);
+  BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+                                 true, sgr_buffer, ma343[3], b343[3], ma444[2],
+                                 b444[2]);
+  int x = 0;
+  do {
+    int p[2][2];
+    BoxFilterPass1Kernel(src[x], src[stride + x], ma565, b565, x, p[0]);
+    p[1][0] =
+        BoxFilterPass2Kernel(src[x], ma343, ma444[0], b343, b444[0], x);
+    p[1][1] = BoxFilterPass2Kernel(src[stride + x], ma343 + 1, ma444[1],
+                                   b343 + 1, b444[1], x);
+    dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+                                                         p[1][0], w0, w2);
+    dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+        src[stride + x], p[0][1], p[1][1], w0, w2);
+  } while (++x != width);
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
+                             const Pixel* src, const Pixel* const top_border,
+                             const Pixel* bottom_border, const ptrdiff_t stride,
+                             const int width, const int height,
+                             SgrBuffer* const sgr_buffer, Pixel* dst) {
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
+                             const Pixel* src, const Pixel* const top_border,
+                             const Pixel* bottom_border, const ptrdiff_t stride,
+                             const int width, const int height,
+                             SgrBuffer* const sgr_buffer, Pixel* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, stride, 2, width + 2, sum3, sum5 + 1, square_sum3,
+         square_sum5 + 1);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  BoxSum(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+         square_sum5 + 3);
+  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSum(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+         square_sum5 + 4);
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                 sgr_buffer, ma565[0], b565[0]);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+                                 sgr_buffer, ma343[0], b343[0], nullptr,
+                                 nullptr);
+  BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+                                 true, sgr_buffer, ma343[1], b343[1], ma444[0],
+                                 b444[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxSum(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+           square_sum3 + 2, square_sum5 + 3);
+    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+                               square_sum5, width, scales, w0, w2, sgr_buffer,
+                               ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
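The loop above advances two rows per iteration by rotating row-buffer pointers instead of copying sums. A standalone sketch of Circulate5PointersBy2 (the helper is defined in src/dsp/loop_restoration.h later in this patch), showing that after rotation the window has slid down by two rows and the two oldest buffers become scratch for the next input rows:

#include <cassert>

template <typename T>
void Circulate5PointersBy2(T* p[5]) {
  T* const p0 = p[0];
  T* const p1 = p[1];
  p[0] = p[2];
  p[1] = p[3];
  p[2] = p[4];
  p[3] = p0;
  p[4] = p1;
}

int main() {
  int r0, r1, r2, r3, r4;
  int* rows[5] = {&r0, &r1, &r2, &r3, &r4};
  Circulate5PointersBy2(rows);
  // Old rows 2..4 are now rows 0..2; r0/r1 are recycled at the bottom.
  assert(rows[0] == &r2 && rows[3] == &r0 && rows[4] == &r1);
}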
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const Pixel* sr;
+    ptrdiff_t s_stride;
+    if ((height & 1) == 0) {
+      sr = bottom_border;
+      s_stride = stride;
+    } else {
+      sr = src + 2 * stride;
+      s_stride = bottom_border - (src + 2 * stride);
+    }
+    BoxSum(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+           square_sum3 + 2, square_sum5 + 3);
+    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+                               square_sum5, width, scales, w0, w2, sgr_buffer,
+                               ma343, ma444, ma565, b343, b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxSum(bottom_border + stride, stride, 1, width + 2, sum3 + 2,
+           sum5 + 3, square_sum3 + 2, square_sum5 + 3);
+    sum5[4] = sum5[3];
+    square_sum5[4] = square_sum5[3];
+    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                   sgr_buffer, ma565[1], b565[1]);
+    BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
+                                   sgr_buffer, ma343[2], b343[2], nullptr,
+                                   nullptr);
+    int x = 0;
+    do {
+      const int p0 = CalculateFilteredOutput<Pixel>(
+          src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+      const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+                                                 b444[0], x);
+      dst[x] =
+          SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
+    } while (++x != width);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const Pixel* src,
+                                  const Pixel* const top_border,
+                                  const Pixel* bottom_border,
+                                  const ptrdiff_t stride, const int width,
+                                  const int height, SgrBuffer* const sgr_buffer,
+                                  Pixel* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum(top_border, stride, 2, width + 2, sum5 + 1, square_sum5 + 1);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  BoxSum(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSum(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                 ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxSum(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+           square_sum5 + 3);
+    BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+                                    scale, w0, sgr_buffer, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const Pixel* sr;
+    ptrdiff_t s_stride;
+    if ((height & 1) == 0) {
+      sr = bottom_border;
+      s_stride = stride;
+    } else {
+      sr = src + 2 * stride;
+      s_stride = bottom_border - (src + 2 * stride);
+    }
+    BoxSum(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+    BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+                                    scale, w0, sgr_buffer, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxSum(bottom_border + stride, stride, 1, width + 2, sum5 + 3,
+           square_sum5 + 3);
+    sum5[4] = sum5[3];
+    square_sum5[4] = square_sum5[3];
+    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                   ma565[1], b565[1]);
+    int x = 0;
+    do {
+      const int p = CalculateFilteredOutput<Pixel>(
+          src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
+      dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+    } while (++x != width);
+  }
+}
+
+template <int bitdepth, typename Pixel>
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const Pixel* src,
+                                  const Pixel* const top_border,
+                                  const Pixel* bottom_border,
+                                  const ptrdiff_t stride, const int width,
+                                  const int height, SgrBuffer* const sgr_buffer,
+                                  Pixel* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const ptrdiff_t sum_stride = temp_stride + 8;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum(top_border, stride, 2, width + 2, sum3, square_sum3);
+  BoxSum(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+                                 sgr_buffer, ma343[0], b343[0], nullptr,
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const Pixel* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += stride;
+  }
+  BoxSum(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+                                 sgr_buffer, ma343[1], b343[1], ma444[0],
+                                 b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+                                    sum3, square_sum3, sgr_buffer, ma343, ma444,
+                                    b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+                                    square_sum3, sgr_buffer, ma343, ma444, b343,
+                                    b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
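In BoxFilterProcessPass2 only the second SGR pass runs, so its single explicit weight w1 must be paired with a derived w0 that restores unit gain. A small check, assuming kSgrProjPrecisionBits = 7 (AV1's SGRPROJ_PRJ_BITS; the constant itself is defined outside this file):

#include <cstdint>

// With w1 given, the derived w0 makes the weights sum to 1.0 in Q7, so the
// output is a weighted average of the source and the filter output.
constexpr int kSgrProjPrecisionBitsAssumed = 7;  // assumption, see above
constexpr int16_t w1 = 55;
constexpr int16_t w0 = (1 << kSgrProjPrecisionBitsAssumed) - w1;
static_assert(w0 + w1 == 1 << kSgrProjPrecisionBitsAssumed,
              "weights form a weighted average");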
+template <int bitdepth, typename Pixel>
+void SelfGuidedFilter_C(const RestorationUnitInfo& restoration_info,
+                        const void* const source, const void* const top_border,
+                        const void* const bottom_border, const ptrdiff_t stride,
+                        const int width, const int height,
+                        RestorationBuffer* const restoration_buffer,
+                        void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* src = static_cast<const Pixel*>(source);
+  const auto* top = static_cast<const Pixel*>(top_border);
+  const auto* bottom = static_cast<const Pixel*>(bottom_border);
+  auto* dst = static_cast<Pixel*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
+                                           bottom - 3, stride, width, height,
+                                           sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src - 2, top - 2,
+                                           bottom - 2, stride, width, height,
+                                           sgr_buffer, dst);
+  } else {
+    BoxFilterProcess<bitdepth, Pixel>(restoration_info, src - 3, top - 3,
+                                      bottom - 3, stride, width, height,
+                                      sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+  dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+  dsp->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+  dsp->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}  // namespace
+
+void LoopRestorationInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/loop_restoration.h b/src/dsp/loop_restoration.h
new file mode 100644
index 0000000..de80926
--- /dev/null
+++ b/src/dsp/loop_restoration.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+#define LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/loop_restoration_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
+#include "src/dsp/x86/loop_restoration_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+enum {
+  // Precision of a division table (mtable)
+  kSgrProjScaleBits = 20,
+  kSgrProjReciprocalBits = 12,
+  // Core self-guided restoration precision bits.
+  kSgrProjSgrBits = 8,
+  // Precision bits of generated values higher than source before projection.
+  kSgrProjRestoreBits = 4
+};  // anonymous enum
+
+extern const uint8_t kSgrMaLookup[256];
+
+// Initializes Dsp::loop_restorations. This function is not thread-safe.
+void LoopRestorationInit_C();
+
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+  T* const p0 = p[0];
+  p[0] = p[1];
+  p[1] = p[2];
+  p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+  std::swap(p[0], p[2]);
+  std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+  T* const p0 = p[0];
+  T* const p1 = p[1];
+  p[0] = p[2];
+  p[1] = p[3];
+  p[2] = p[4];
+  p[3] = p0;
+  p[4] = p1;
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_LOOP_RESTORATION_H_
diff --git a/src/dsp/mask_blend.cc b/src/dsp/mask_blend.cc
new file mode 100644
index 0000000..101c410
--- /dev/null
+++ b/src/dsp/mask_blend.cc
@@ -0,0 +1,207 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int subsampling_x, int subsampling_y>
+uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) {
+  if ((subsampling_x | subsampling_y) == 0) {
+    return mask[x];
+  }
+  if (subsampling_x == 1 && subsampling_y == 0) {
+    return static_cast<uint8_t>(RightShiftWithRounding(
+        mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1], 1));
+  }
+  assert(subsampling_x == 1 && subsampling_y == 1);
+  return static_cast<uint8_t>(RightShiftWithRounding(
+      mask[MultiplyBy2(x)] + mask[MultiplyBy2(x) + 1] +
+          mask_next_row[MultiplyBy2(x)] + mask_next_row[MultiplyBy2(x) + 1],
+      2));
+}
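A worked instance of the 4:2:0 branch of GetMaskValue above: the four mask samples covering one subsampled position are averaged with rounding. The mask values are illustrative:

#include <cassert>

int main() {
  // Two rows of two mask samples, as seen by GetMaskValue<1, 1> at x == 0.
  const int m00 = 62, m01 = 64, m10 = 60, m11 = 61;
  // RightShiftWithRounding(sum, 2) == (sum + 2) >> 2.
  const int value = (m00 + m01 + m10 + m11 + 2) >> 2;  // 249 >> 2
  assert(value == 62);
}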
+template <int bitdepth, typename Pixel, bool is_inter_intra, int subsampling_x,
+          int subsampling_y>
+void MaskBlend_C(const void* prediction_0, const void* prediction_1,
+                 const ptrdiff_t prediction_stride_1, const uint8_t* mask,
+                 const ptrdiff_t mask_stride, const int width, const int height,
+                 void* dest, const ptrdiff_t dest_stride) {
+  static_assert(!(bitdepth == 8 && is_inter_intra), "");
+  assert(mask != nullptr);
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  auto* dst = static_cast<Pixel*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(Pixel);
+  constexpr int step_y = subsampling_y ? 2 : 1;
+  const uint8_t* mask_next_row = mask + mask_stride;
+  // 7.11.3.2 Rounding variables derivation process
+  // 2 * FILTER_BITS(7) - (InterRound0(3|5) + InterRound1(7))
+  constexpr int inter_post_round_bits = (bitdepth == 12) ? 2 : 4;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t mask_value =
+          GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+      if (is_inter_intra) {
+        dst[x] = static_cast<Pixel>(RightShiftWithRounding(
+            mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
+      } else {
+        assert(prediction_stride_1 == width);
+        int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+        res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+        dst[x] = static_cast<Pixel>(
+            Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+                  (1 << bitdepth) - 1));
+      }
+    }
+    dst += dst_stride;
+    mask += mask_stride * step_y;
+    mask_next_row += mask_stride * step_y;
+    pred_0 += width;
+    pred_1 += prediction_stride_1;
+  }
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_C(const uint8_t* prediction_0,
+                               uint8_t* prediction_1,
+                               const ptrdiff_t prediction_stride_1,
+                               const uint8_t* mask, const ptrdiff_t mask_stride,
+                               const int width, const int height) {
+  assert(mask != nullptr);
+  constexpr int step_y = subsampling_y ? 2 : 1;
+  const uint8_t* mask_next_row = mask + mask_stride;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t mask_value =
+          GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+      prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
+          mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
+          6));
+    }
+    mask += mask_stride * step_y;
+    mask_next_row += mask_stride * step_y;
+    prediction_0 += width;
+    prediction_1 += prediction_stride_1;
+  }
+}
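A worked instance of the 8bpp inter-intra blend above: a 6-bit mask weights the intra prediction against the inter prediction, with rounding. Sample values are made up:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t mask_value = 45;           // 0..64
  const uint8_t intra = 200, inter = 100;  // prediction_1, prediction_0
  // RightShiftWithRounding(sum, 6) == (sum + 32) >> 6.
  const int blended = (mask_value * intra + (64 - mask_value) * inter + 32) >> 6;
  assert(blended == 170);  // 9000 + 1900 + 32 = 10932; 10932 >> 6 = 170
}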
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+  // The is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[] in 8-bit.
+  dsp->mask_blend[0][1] = nullptr;
+  dsp->mask_blend[1][1] = nullptr;
+  dsp->mask_blend[2][1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+  dsp->mask_blend[0][0] = MaskBlend_C<8, uint8_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+  dsp->mask_blend[1][0] = MaskBlend_C<8, uint8_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+  dsp->mask_blend[2][0] = MaskBlend_C<8, uint8_t, false, 1, 1>;
+#endif
+  // The is_inter_intra index of mask_blend[][] is replaced by
+  // inter_intra_mask_blend_8bpp[] in 8-bit.
+  dsp->mask_blend[0][1] = nullptr;
+  dsp->mask_blend[1][1] = nullptr;
+  dsp->mask_blend[2][1] = nullptr;
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+  dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_C<0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+  dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_C<1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+  dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+  dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+  dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+  dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+  dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+  dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+  dsp->mask_blend[0][0] = MaskBlend_C<10, uint16_t, false, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+  dsp->mask_blend[1][0] = MaskBlend_C<10, uint16_t, false, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+  dsp->mask_blend[2][0] = MaskBlend_C<10, uint16_t, false, 1, 1>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+  dsp->mask_blend[0][1] = MaskBlend_C<10, uint16_t, true, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+  dsp->mask_blend[1][1] = MaskBlend_C<10, uint16_t, true, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+  dsp->mask_blend[2][1] = MaskBlend_C<10, uint16_t, true, 1, 1>;
+#endif
+  // These are only used with 8-bit.
+  dsp->inter_intra_mask_blend_8bpp[0] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[1] = nullptr;
+  dsp->inter_intra_mask_blend_8bpp[2] = nullptr;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void MaskBlendInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
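The #ifndef blocks above are libgav1's compile-time dispatch: an architecture-specific header (pulled in through mask_blend.h below) defines LIBGAV1_Dsp8bpp_MaskBlend444 and friends when it provides that entry, so the C fallback is registered only as a last resort. A schematic of the pattern with hypothetical names (Foo, MYLIB_Dsp8bpp_Foo, foo_ptr are all illustrative, not part of the library):

// A hypothetical SIMD header would announce itself with:
//   #define MYLIB_Dsp8bpp_Foo 1
void Foo_C() {}

void (*foo_ptr)() = nullptr;

void InitFoo() {
#ifndef MYLIB_Dsp8bpp_Foo  // no SIMD header claimed this entry
  foo_ptr = Foo_C;
#endif
}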
diff --git a/src/dsp/mask_blend.h b/src/dsp/mask_blend.h
new file mode 100644
index 0000000..41f5e5b
--- /dev/null
+++ b/src/dsp/mask_blend.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MASK_BLEND_H_
+#define LIBGAV1_SRC_DSP_MASK_BLEND_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/mask_blend_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/mask_blend_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mask_blend and Dsp::inter_intra_mask_blend_8bpp. This
+// function is not thread-safe.
+void MaskBlendInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_MASK_BLEND_H_
diff --git a/src/dsp/motion_field_projection.cc b/src/dsp/motion_field_projection.cc
new file mode 100644
index 0000000..b51ec8f
--- /dev/null
+++ b/src/dsp/motion_field_projection.cc
@@ -0,0 +1,138 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when MotionFieldProjectionKernel_C is
+// not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||                     \
+    !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) || \
+    (LIBGAV1_MAX_BITDEPTH >= 10 &&                           \
+     !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
+
+// 7.9.2.
+void MotionFieldProjectionKernel_C(const ReferenceInfo& reference_info,
+                                   int reference_to_current_with_sign,
+                                   int dst_sign, int y8_start, int y8_end,
+                                   int x8_start, int x8_end,
+                                   TemporalMotionField* motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be offset by kProjectionMvMaxHorizontalOffset since
+  // coordinates in that range could end up being position_x8 because of
+  // projection.
+  const int adjusted_x8_start =
+      std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0);
+  const int adjusted_x8_end = std::min(
+      x8_end + kProjectionMvMaxHorizontalOffset, static_cast<int>(stride));
+  const int8_t* const reference_offsets =
+      reference_info.relative_distance_to.data();
+  const bool* const skip_references = reference_info.skip_references.data();
+  const int16_t* const projection_divisions =
+      reference_info.projection_divisions.data();
+  const ReferenceFrameType* source_reference_types =
+      &reference_info.motion_field_reference_frame[y8_start][0];
+  const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0];
+  int8_t* dst_reference_offset = motion_field->reference_offset[y8_start];
+  MotionVector* dst_mv = motion_field->mv[y8_start];
+  assert(stride == motion_field->reference_offset.columns());
+  assert((y8_start & 7) == 0);
+
+  int y8 = y8_start;
+  do {
+    const int y8_floor = (y8 & ~7) - y8;
+    const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8);
+    int x8 = adjusted_x8_start;
+    do {
+      const int source_reference_type = source_reference_types[x8];
+      if (skip_references[source_reference_type]) continue;
+      MotionVector projection_mv;
+      // reference_to_current_with_sign could be 0.
+      GetMvProjection(mv[x8], reference_to_current_with_sign,
+                      projection_divisions[source_reference_type],
+                      &projection_mv);
+      // Do not update the motion vector if the block position is not valid or
+      // if position_x8 is outside the current range of x8_start and x8_end.
+      // Note that position_y8 will always be within the range of y8_start and
+      // y8_end.
+      const int position_y8 = Project(0, projection_mv.mv[0], dst_sign);
+      if (position_y8 < y8_floor || position_y8 >= y8_ceiling) continue;
+      const int x8_base = x8 & ~7;
+      const int x8_floor =
+          std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset);
+      const int x8_ceiling =
+          std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset);
+      const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign);
+      if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue;
+      dst_mv[position_y8 * stride + position_x8] = mv[x8];
+      dst_reference_offset[position_y8 * stride + position_x8] =
+          reference_offsets[source_reference_type];
+    } while (++x8 < adjusted_x8_end);
+    source_reference_types += stride;
+    mv += stride;
+    dst_reference_offset += stride;
+    dst_mv += stride;
+  } while (++y8 < y8_end);
+}
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel) ||
+        // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+        //  !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel))
+
+void Init8bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel)
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp10bpp_MotionFieldProjectionKernel)
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_C;
+#endif
+}
+#endif
+
+}  // namespace
+
+void MotionFieldProjectionInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
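A quick check of the 8-row window arithmetic in the kernel above: for y8 = 11 (inside the second 8-row stripe), projected positions may only land in rows [8, 16) of the destination, expressed relative to y8:

#include <cassert>

int main() {
  const int y8 = 11, y8_end = 32;
  const int y8_floor = (y8 & ~7) - y8;  // -3, i.e. absolute row 8
  const int y8_ceiling =
      (y8_end - y8 < y8_floor + 8) ? (y8_end - y8) : (y8_floor + 8);  // 5
  assert(y8_floor == -3 && y8_ceiling == 5);  // rows 8..15, relative to 11
}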
diff --git a/src/dsp/motion_field_projection.h b/src/dsp/motion_field_projection.h
new file mode 100644
index 0000000..36de459
--- /dev/null
+++ b/src/dsp/motion_field_projection.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+#define LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_field_projection_neon.h"
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_field_projection_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_MOTION_FIELD_PROJECTION_H_
diff --git a/src/dsp/motion_vector_search.cc b/src/dsp/motion_vector_search.cc
new file mode 100644
index 0000000..9402302
--- /dev/null
+++ b/src/dsp/motion_vector_search.cc
@@ -0,0 +1,211 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Silence unused function warnings when the C functions are not used.
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||             \
+    !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) || \
+    (LIBGAV1_MAX_BITDEPTH >= 10 &&                  \
+     !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
+
+void MvProjectionCompoundLowPrecision_C(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* const candidate_mvs) {
+  // To facilitate the compilers, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  int index = 0;
+  do {
+    candidate_mvs[index].mv64 = 0;
+    for (int i = 0; i < 2; ++i) {
+      // |offsets| non-zero check usually equals true and could be ignored.
+      if (offsets[i] != 0) {
+        GetMvProjection(
+            temporal_mvs[index], offsets[i],
+            kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+            &candidate_mvs[index].mv[i]);
+        for (auto& mv : candidate_mvs[index].mv[i].mv) {
+          // The next line is equivalent to:
+          // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+          mv = (mv - (mv >> 15)) & ~1;
+        }
+      }
+    }
+  } while (++index < count);
+}
+
+void MvProjectionCompoundForceInteger_C(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* const candidate_mvs) {
+  // To facilitate the compilers, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  int index = 0;
+  do {
+    candidate_mvs[index].mv64 = 0;
+    for (int i = 0; i < 2; ++i) {
+      // |offsets| non-zero check usually equals true and could be ignored.
+      if (offsets[i] != 0) {
+        GetMvProjection(
+            temporal_mvs[index], offsets[i],
+            kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+            &candidate_mvs[index].mv[i]);
+        for (auto& mv : candidate_mvs[index].mv[i].mv) {
+          // The next line is equivalent to:
+          // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+          // const int sign = mv >> 15;
+          // mv = ApplySign(value, sign);
+          mv = (mv + 3 - (mv >> 15)) & ~7;
+        }
+      }
+    }
+  } while (++index < count);
+}
+
+void MvProjectionCompoundHighPrecision_C(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* const candidate_mvs) {
+  // To facilitate the compilers, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  int index = 0;
+  do {
+    candidate_mvs[index].mv64 = 0;
+    for (int i = 0; i < 2; ++i) {
+      // |offsets| non-zero check usually equals true and could be ignored.
+      if (offsets[i] != 0) {
+        GetMvProjection(
+            temporal_mvs[index], offsets[i],
+            kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+            &candidate_mvs[index].mv[i]);
+      }
+    }
+  } while (++index < count);
+}
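The branchless rounding used above can be verified exhaustively over the int16_t motion-vector range. A small self-contained check that `(mv - (mv >> 15)) & ~1` matches the commented conditional form (rounding odd values toward zero):

#include <cassert>
#include <cstdint>

int main() {
  for (int i = -32768; i <= 32767; ++i) {
    const int16_t mv = static_cast<int16_t>(i);
    int expected = mv;
    if ((expected & 1) != 0) expected += (expected > 0) ? -1 : 1;
    // mv >> 15 is -1 for negative values and 0 otherwise.
    const int16_t actual = static_cast<int16_t>((mv - (mv >> 15)) & ~1);
    assert(actual == expected);
  }
}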
+void MvProjectionSingleLowPrecision_C(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets, const int reference_offset,
+    const int count, MotionVector* const candidate_mvs) {
+  int index = 0;
+  do {
+    GetMvProjection(
+        temporal_mvs[index], reference_offset,
+        kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+        &candidate_mvs[index]);
+    for (auto& mv : candidate_mvs[index].mv) {
+      // The next line is equivalent to:
+      // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+      mv = (mv - (mv >> 15)) & ~1;
+    }
+  } while (++index < count);
+}
+
+void MvProjectionSingleForceInteger_C(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets, const int reference_offset,
+    const int count, MotionVector* const candidate_mvs) {
+  int index = 0;
+  do {
+    GetMvProjection(
+        temporal_mvs[index], reference_offset,
+        kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+        &candidate_mvs[index]);
+    for (auto& mv : candidate_mvs[index].mv) {
+      // The next line is equivalent to:
+      // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+      // const int sign = mv >> 15;
+      // mv = ApplySign(value, sign);
+      mv = (mv + 3 - (mv >> 15)) & ~7;
+    }
+  } while (++index < count);
+}
+
+void MvProjectionSingleHighPrecision_C(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets, const int reference_offset,
+    const int count, MotionVector* const candidate_mvs) {
+  int index = 0;
+  do {
+    GetMvProjection(
+        temporal_mvs[index], reference_offset,
+        kProjectionMvDivisionLookup[temporal_reference_offsets[index]],
+        &candidate_mvs[index]);
+  } while (++index < count);
+}
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS ||
+        // !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch) ||
+        // (LIBGAV1_MAX_BITDEPTH >= 10 &&
+        //  !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch))
+
+void Init8bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp8bpp_MotionVectorSearch)
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+    !defined(LIBGAV1_Dsp10bpp_MotionVectorSearch)
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_C;
+  dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_C;
+  dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_C;
+  dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_C;
+  dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_C;
+  dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_C;
+#endif
+}
+#endif
+
+}  // namespace
+
+void MotionVectorSearchInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/motion_vector_search.h b/src/dsp/motion_vector_search.h
new file mode 100644
index 0000000..ae16726
--- /dev/null
+++ b/src/dsp/motion_vector_search.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+#define LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/motion_vector_search_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+// SSE4_1
+#include "src/dsp/x86/motion_vector_search_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This
+// function is not thread-safe.
+void MotionVectorSearchInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_MOTION_VECTOR_SEARCH_H_
diff --git a/src/dsp/obmc.cc b/src/dsp/obmc.cc
new file mode 100644
index 0000000..46d1b5b
--- /dev/null
+++ b/src/dsp/obmc.cc
@@ -0,0 +1,125 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+// 7.11.3.10 (from top samples).
+template <typename Pixel>
+void OverlapBlendVertical_C(void* const prediction,
+                            const ptrdiff_t prediction_stride, const int width,
+                            const int height, const void* const obmc_prediction,
+                            const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<Pixel*>(prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+  const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+  const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+  const uint8_t* const mask = kObmcMask + height - 2;
+
+  for (int y = 0; y < height; ++y) {
+    const uint8_t mask_value = mask[y];
+    for (int x = 0; x < width; ++x) {
+      pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+          mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+    }
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  }
+}
+
+// 7.11.3.10 (from left samples).
+template <typename Pixel>
+void OverlapBlendHorizontal_C(void* const prediction,
+                              const ptrdiff_t prediction_stride,
+                              const int width, const int height,
+                              const void* const obmc_prediction,
+                              const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<Pixel*>(prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(Pixel);
+  const auto* obmc_pred = static_cast<const Pixel*>(obmc_prediction);
+  const ptrdiff_t obmc_pred_stride = obmc_prediction_stride / sizeof(Pixel);
+  const uint8_t* const mask = kObmcMask + width - 2;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const uint8_t mask_value = mask[x];
+      pred[x] = static_cast<Pixel>(RightShiftWithRounding(
+          mask_value * pred[x] + (64 - mask_value) * obmc_pred[x], 6));
+    }
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendHorizontal_C<uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendVertical_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+  dsp->obmc_blend[kObmcDirectionHorizontal] =
+      OverlapBlendHorizontal_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void ObmcInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/obmc.h b/src/dsp/obmc.h
new file mode 100644
index 0000000..3b826c7
--- /dev/null
+++ b/src/dsp/obmc.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_OBMC_H_
+#define LIBGAV1_SRC_DSP_OBMC_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/obmc_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/obmc_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::obmc_blend. This function is not thread-safe.
+void ObmcInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_OBMC_H_
diff --git a/src/dsp/obmc.inc b/src/dsp/obmc.inc
new file mode 100644
index 0000000..001c6ee
--- /dev/null
+++ b/src/dsp/obmc.inc
@@ -0,0 +1,32 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for overlap blend implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+// This is a flat array of masks for each block dimension from 2 to 32. The
+// starting index for each length is length-2.
+constexpr uint8_t kObmcMask[62] = {
+    // Obmc Mask 2
+    45, 64,
+    // Obmc Mask 4
+    39, 50, 59, 64,
+    // Obmc Mask 8
+    36, 42, 48, 53, 57, 61, 64, 64,
+    // Obmc Mask 16
+    34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64,
+    // Obmc Mask 32
+    33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
+    59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
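A worked instance of how OverlapBlendVertical_C consumes kObmcMask above: for an overlap of height 8, the mask entries start at index 8 - 2 = 6, i.e. {36, 42, 48, 53, 57, 61, 64, 64}. The pixel values are illustrative:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t mask0 = 36;              // kObmcMask[6]: first blended row
  const int pred = 120, obmc_pred = 80;  // current and neighbor predictions
  // RightShiftWithRounding(sum, 6) == (sum + 32) >> 6.
  const int blended = (mask0 * pred + (64 - mask0) * obmc_pred + 32) >> 6;
  assert(blended == 103);  // 4320 + 2240 + 32 = 6592; 6592 >> 6 = 103
}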
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
new file mode 100644
index 0000000..d041bd1
--- /dev/null
+++ b/src/dsp/super_res.cc
@@ -0,0 +1,109 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/super_res.h"
+
+#include <cassert>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int bitdepth, typename Pixel>
+void SuperRes_C(const void* /*coefficients*/, void* const source,
+                const ptrdiff_t stride, const int height,
+                const int downscaled_width, const int upscaled_width,
+                const int initial_subpixel_x, const int step,
+                void* const dest) {
+  assert(step <= 1 << kSuperResScaleBits);
+  auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<Pixel*>(dest);
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                      kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    // If (original) upscaled_width is <= 9, the downscaled_width may be
+    // upscaled_width - 1 (i.e. 8, 9), and become the same (i.e. 4) when
+    // subsampled via RightShiftWithRounding. This leads to an edge case where
+    // |step| == 1 << 14.
+    int subpixel_x = initial_subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+      const int src_x_subpixel =
+          (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+      // The sign of each tap is: - + - + + - + -
+      sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+      sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+      sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+      sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+      sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+      sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+      sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+      sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+      dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+                     (1 << bitdepth) - 1);
+      subpixel_x += step;
+    } while (++x < upscaled_width);
+    src += stride;
+    dst += stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
+  dsp->super_res = SuperRes_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->super_res = SuperRes_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+  dsp->super_res = SuperRes_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void SuperResInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
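A sketch of the fixed-point stepping in SuperRes_C above, assuming kSuperResScaleBits = 14 and kSuperResExtraBits = 8 (the AV1 values; both constants live outside this file). Each output pixel advances subpixel_x by |step|; the integer part selects the source tap window and the fraction selects the filter phase:

#include <cassert>

int main() {
  const int kScaleBits = 14, kExtraBits = 8;       // assumed values
  const int kScaleMask = (1 << kScaleBits) - 1;
  int subpixel_x = 3000;   // illustrative initial_subpixel_x
  const int step = 9830;   // ~0.6 source pixels per output pixel
  subpixel_x += step;      // advance to the second output pixel
  const int src_x = subpixel_x >> kScaleBits;                        // 0
  const int filter_index = (subpixel_x & kScaleMask) >> kExtraBits;  // 50
  assert(src_x == 0 && filter_index == 50);
}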
diff --git a/src/dsp/super_res.h b/src/dsp/super_res.h
new file mode 100644
index 0000000..2ca9d2b
--- /dev/null
+++ b/src/dsp/super_res.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
+#define LIBGAV1_SRC_DSP_SUPER_RES_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/super_res_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_SUPER_RES_H_
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc
new file mode 100644
index 0000000..fbde65a
--- /dev/null
+++ b/src/dsp/warp.cc
@@ -0,0 +1,475 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/warp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Number of extra bits of precision in warped filtering.
+constexpr int kWarpedDiffPrecisionBits = 10;
+
+// Warp prediction output ranges from WarpTest.ShowRange.
+// Bitdepth: 8 Input range:            [       0,      255]
+//   8bpp intermediate offset: 16384.
+//   intermediate range:                [    4399,    61009]
+//   first pass output range:           [     550,     7626]
+//   8bpp intermediate offset removal: 262144.
+//   intermediate range:                [ -620566,  1072406]
+//   second pass output range:          [       0,      255]
+//   compound second pass output range: [   -4848,     8378]
+//
+// Bitdepth: 10 Input range:           [       0,     1023]
+//   intermediate range:                [  -48081,   179025]
+//   first pass output range:           [   -6010,    22378]
+//   intermediate range:                [-2103516,  4198620]
+//   second pass output range:          [       0,     1023]
+//   compound second pass output range: [    8142,    57378]
+//
+// Bitdepth: 12 Input range:           [       0,     4095]
+//   intermediate range:                [ -192465,   716625]
+//   first pass output range:           [   -6015,    22395]
+//   intermediate range:                [-2105190,  4201830]
+//   second pass output range:          [       0,     4095]
+//   compound second pass output range: [    8129,    57403]
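A worked instance of the block-center mapping Warp_C performs below (no subsampling). For the 8x8 block at (block_start_x, block_start_y) = (8, 0), the source position of its center is computed with the affine warp_params, then split into an integer part (ix4, iy4) and a fractional part; kWarpedModelPrecisionBits is AV1's WARPEDMODEL_PREC_BITS = 16, and the model here is an illustrative identity-plus-translation:

#include <cassert>

int main() {
  const int kWarpedModelPrecisionBits = 16;
  // wm[2] = wm[5] = 1.0 in Q16; wm[0], wm[1] are sub-pixel translations.
  const int warp_params[6] = {512, -256, 1 << 16, 0, 0, 1 << 16};
  const int src_x = 8 + 4, src_y = 0 + 4;  // block center, subsampling 0
  const int dst_x =
      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
  const int dst_y =
      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
  const int ix4 = dst_x >> kWarpedModelPrecisionBits;  // 12
  const int iy4 = dst_y >> kWarpedModelPrecisionBits;  // 3 (negative shift)
  assert(ix4 == 12 && iy4 == 3);
}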
1 << 14 : 0; + constexpr int offset_removal = + (first_pass_offset >> kRoundBitsHorizontal) * 128; + + constexpr int kMaxPixel = (1 << bitdepth) - 1; + union { + // |intermediate_result| is the output of the horizontal filtering and + // rounding. The range is within int16_t. + int16_t intermediate_result[15][8]; // 15 rows, 8 columns. + // In the simple special cases where the samples in each row are all the + // same, store one sample per row in a column vector. + int16_t intermediate_result_column[15]; + }; + const auto* const src = static_cast(source); + source_stride /= sizeof(Pixel); + using DestType = + typename std::conditional::type; + auto* dst = static_cast(dest); + if (!is_compound) dest_stride /= sizeof(dst[0]); + + assert(block_width >= 8); + assert(block_height >= 8); + + // Warp process applies for each 8x8 block (or smaller). + for (int start_y = block_start_y; start_y < block_start_y + block_height; + start_y += 8) { + for (int start_x = block_start_x; start_x < block_start_x + block_width; + start_x += 8) { + const int src_x = (start_x + 4) << subsampling_x; + const int src_y = (start_y + 4) << subsampling_y; + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. We can divide the plane + // into several regions and handle them differently. + // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). 
Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + // Regions 1 and 2. + // Points to the left or right border of the first row of |src|. + const Pixel* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const Pixel row_border_pixel = first_row_border[row * source_stride]; + DestType* dst_row = dst + start_x - block_start_x; + if (is_compound) { + int sum = row_border_pixel + << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical); + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + Memset(dst_row, sum, 8); + } else { + Memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } + // End of region 1. Continue the |start_x| for loop. + continue; + } + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved below. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= kFilterBits - kRoundBitsHorizontal; + intermediate_result_column[y + 7] = sum; + } + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + int sum = 0; + for (int k = 0; k < 8; ++k) { + sum += + kWarpedFilters[offset][k] * intermediate_result_column[y + k]; + } + sum = RightShiftWithRounding(sum, kRoundBitsVertical); + if (is_compound) { + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + dst_row[x] = static_cast(sum); + } else { + dst_row[x] = static_cast(Clip3(sum, 0, kMaxPixel)); + } + sy += gamma; + } + dst_row += dest_stride; + sy4 += delta; + } + // End of region 2. Continue the |start_x| for loop. + continue; + } + + // Regions 3 and 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + // It follows that -6 <= ix4 <= source_width + 5. 
This inequality is + // used below. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 3. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const Pixel* const src_row = src + row * source_stride; + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + int sx = sx4 - MultiplyBy4(alpha); + for (int x = -4; x < 4; ++x) { + const int offset = + RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since alpha and beta have been validated by SetupShear(), one + // can prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + // For SIMD optimization: + // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp. + // For 10/12 bit, the range of sum requires 32 bits. + int sum = first_pass_offset; + for (int k = 0; k < 8; ++k) { + // We assume the source frame has left and right borders of at + // least 13 pixels that extend the frame boundary pixels. + // + // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on + // ix4 above, we have + // -13 <= ix4 + x + k - 3 <= source_width + 12, + // or + // -13 <= column <= (source_width - 1) + 13. + // Therefore we may over-read up to 13 pixels before the source + // row, or up to 13 pixels after the source row. + const int column = ix4 + x + k - 3; + sum += kWarpedFilters[offset][k] * src_row[column]; + } + intermediate_result[y + 7][x + 4] = + RightShiftWithRounding(sum, kRoundBitsHorizontal); + sx += alpha; + } + sx4 += beta; + } + } else { + // Region 4. + // Horizontal filter. + // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0. + // It follows that -6 <= iy4 <= source_height + 5. This inequality is + // used below. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We assume the source frame has top and bottom borders of at least + // 13 pixels that extend the frame boundary pixels. + // + // Since -7 <= y <= 7, using the inequality on iy4 above, we have + // -13 <= iy4 + y <= source_height + 12, + // or + // -13 <= row <= (source_height - 1) + 13. + // Therefore we may over-read up to 13 pixels above the top source + // row, or up to 13 pixels below the bottom source row. + const int row = iy4 + y; + const Pixel* const src_row = src + row * source_stride; + int sx = sx4 - MultiplyBy4(alpha); + for (int x = -4; x < 4; ++x) { + const int offset = + RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since alpha and beta have been validated by SetupShear(), one + // can prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + // For SIMD optimization: + // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp. + // For 10/12 bit, the range of sum requires 32 bits. + int sum = first_pass_offset; + for (int k = 0; k < 8; ++k) { + // We assume the source frame has left and right borders of at + // least 13 pixels that extend the frame boundary pixels. 
+ // + // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on + // ix4 above, we have + // -13 <= ix4 + x + k - 3 <= source_width + 12, + // or + // -13 <= column <= (source_width - 1) + 13. + // Therefore we may over-read up to 13 pixels before the source + // row, or up to 13 pixels after the source row. + const int column = ix4 + x + k - 3; + sum += kWarpedFilters[offset][k] * src_row[column]; + } + intermediate_result[y + 7][x + 4] = + RightShiftWithRounding(sum, kRoundBitsHorizontal) - + offset_removal; + sx += alpha; + } + sx4 += beta; + } + } + + // Regions 3 and 4. + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + // The spec says we should use the following loop condition: + // y < std::min(4, block_start_y + block_height - start_y - 4); + // We can prove that block_start_y + block_height - start_y >= 8, which + // implies std::min(4, block_start_y + block_height - start_y - 4) = 4. + // So the loop condition is simply y < 4. + // + // Proof: + // start_y < block_start_y + block_height + // => block_start_y + block_height - start_y > 0 + // => block_height - (start_y - block_start_y) > 0 + // + // Since block_height >= 8 and is a power of 2, it follows that + // block_height is a multiple of 8. start_y - block_start_y is also a + // multiple of 8. Therefore their difference is a multiple of 8. Since + // their difference is > 0, their difference must be >= 8. + // + // We then add an offset of 4 to y so that the loop starts with y = 0 + // and continues if y < 8. + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + // The spec says we should use the following loop condition: + // x < std::min(4, block_start_x + block_width - start_x - 4); + // Similar to the above, we can prove that the loop condition can be + // simplified to x < 4. + // + // We then add an offset of 4 to x so that the loop starts with x = 0 + // and continues if x < 8. + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since gamma and delta have been validated by SetupShear(), one can + // prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + int sum = 0; + for (int k = 0; k < 8; ++k) { + sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x]; + } + sum -= offset_removal; + sum = RightShiftWithRounding(sum, kRoundBitsVertical); + if (is_compound) { + sum += (bitdepth == 8) ? 
0 : kCompoundOffset;
+            dst_row[x] = static_cast<DestType>(sum);
+          } else {
+            dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
+          }
+          sy += gamma;
+        }
+        dst_row += dest_stride;
+        sy4 += delta;
+      }
+    }
+    dst += 8 * dest_stride;
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_Warp
+  dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WarpCompound
+  dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void WarpInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/warp.h b/src/dsp/warp.h
new file mode 100644
index 0000000..7367a9b
--- /dev/null
+++ b/src/dsp/warp.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_WARP_H_
+#define LIBGAV1_SRC_DSP_WARP_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/warp_neon.h"
+
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/warp_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::warp. This function is not thread-safe.
+void WarpInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_WARP_H_
diff --git a/src/dsp/weight_mask.cc b/src/dsp/weight_mask.cc
new file mode 100644
index 0000000..15d6bc6
--- /dev/null
+++ b/src/dsp/weight_mask.cc
@@ -0,0 +1,227 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/weight_mask.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int width, int height, int bitdepth, bool mask_is_inverse>
+void WeightMask_C(const void* prediction_0, const void* prediction_1,
+                  uint8_t* mask, ptrdiff_t mask_stride) {
+  using PredType =
+      typename std::conditional<bitdepth == 8, int16_t, uint16_t>::type;
+  const auto* pred_0 = static_cast<const PredType*>(prediction_0);
+  const auto* pred_1 = static_cast<const PredType*>(prediction_1);
+  static_assert(width >= 8, "");
+  static_assert(height >= 8, "");
+  constexpr int rounding_bits = bitdepth - 8 + ((bitdepth == 12) ? 2 : 4);
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const int difference = RightShiftWithRounding(
+          std::abs(pred_0[x] - pred_1[x]), rounding_bits);
+      const auto mask_value =
+          static_cast<uint8_t>(std::min(DivideBy16(difference) + 38, 64));
+      mask[x] = mask_is_inverse ? 64 - mask_value : mask_value;
+    }
+    pred_0 += width;
+    pred_1 += width;
+    mask += mask_stride;
+  }
+}
+
+#define INIT_WEIGHT_MASK(width, height, bitdepth, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                           \
+      WeightMask_C<width, height, bitdepth, 0>;                     \
+  dsp->weight_mask[w_index][h_index][1] =                           \
+      WeightMask_C<width, height, bitdepth, 1>
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 8, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 8, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 8, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 8, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 8, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 8, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 8, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 8, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 8, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 8, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 8, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 8, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 8, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 8, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 8, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 8, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 8, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+  INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+  INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+  INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+  INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+  INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+  INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+  INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+  INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+  INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+  INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+  INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+  INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+  INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+  INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+  INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+  INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+  INIT_WEIGHT_MASK(8, 8, 10, 0, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+  INIT_WEIGHT_MASK(8, 16, 10, 0, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+  INIT_WEIGHT_MASK(8, 32, 10, 0, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+  INIT_WEIGHT_MASK(16, 8, 10, 1, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+  INIT_WEIGHT_MASK(16, 16, 10, 1, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+  INIT_WEIGHT_MASK(16, 32, 10, 1, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+  INIT_WEIGHT_MASK(16, 64, 10, 1, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+  INIT_WEIGHT_MASK(32, 8, 10, 2, 0);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+  INIT_WEIGHT_MASK(32, 16, 10, 2, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+  INIT_WEIGHT_MASK(32, 32, 10, 2, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+  INIT_WEIGHT_MASK(32, 64, 10, 2, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+  INIT_WEIGHT_MASK(64, 16, 10, 3, 1);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+  INIT_WEIGHT_MASK(64, 32, 10, 3, 2);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+  INIT_WEIGHT_MASK(64, 64, 10, 3, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+  INIT_WEIGHT_MASK(64, 128, 10, 3, 4);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+  INIT_WEIGHT_MASK(128, 64, 10, 4, 3);
+#endif
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+  INIT_WEIGHT_MASK(128, 128, 10, 4, 4);
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif
+
+}  // namespace
+
+void WeightMaskInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/src/dsp/weight_mask.h b/src/dsp/weight_mask.h
new file mode 100644
index 0000000..43bef05
--- /dev/null
+++ b/src/dsp/weight_mask.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_WEIGHT_MASK_H_ +#define LIBGAV1_SRC_DSP_WEIGHT_MASK_H_ + +// Pull in LIBGAV1_DspXXX defines representing the implementation status +// of each function. The resulting value of each can be used by each module to +// determine whether an implementation is needed at compile time. +// IWYU pragma: begin_exports + +// ARM: +#include "src/dsp/arm/weight_mask_neon.h" + +// x86: +// Note includes should be sorted in logical order avx2/avx/sse4, etc. +// The order of includes is important as each tests for a superior version +// before setting the base. +// clang-format off +#include "src/dsp/x86/weight_mask_sse4.h" +// clang-format on + +// IWYU pragma: end_exports + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::weight_mask. This function is not thread-safe. +void WeightMaskInit_C(); + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_SRC_DSP_WEIGHT_MASK_H_ diff --git a/src/dsp/x86/average_blend_sse4.cc b/src/dsp/x86/average_blend_sse4.cc new file mode 100644 index 0000000..8e008d1 --- /dev/null +++ b/src/dsp/x86/average_blend_sse4.cc @@ -0,0 +1,156 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
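+
+// For reference, a scalar sketch of the blend performed by the SSE4 paths
+// below. This is illustrative only (the function name is hypothetical);
+// Clip3() and RightShiftWithRounding() are the helpers from
+// src/utils/common.h used throughout this library, and _mm_packus_epi16
+// provides the equivalent clamp to [0, 255] in the vector code:
+//
+// void AverageBlendScalar(const int16_t* pred_0, const int16_t* pred_1,
+//                         int width, int height, uint8_t* dst,
+//                         ptrdiff_t dest_stride) {
+//   for (int y = 0; y < height; ++y) {
+//     for (int x = 0; x < width; ++x) {
+//       // Sum the two compound predictions and remove the extra precision
+//       // added by the first pass.
+//       dst[x] = static_cast<uint8_t>(Clip3(
+//           RightShiftWithRounding(pred_0[x] + pred_1[x],
+//                                  kInterPostRoundBit + 1),
+//           0, 255));
+//     }
+//     pred_0 += width;
+//     pred_1 += width;
+//     dst += dest_stride;
+//   }
+// }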
+
+#include "src/dsp/average_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
+inline void AverageBlend4Row(const int16_t* prediction_0,
+                             const int16_t* prediction_1, uint8_t* dest) {
+  const __m128i pred_0 = LoadLo8(prediction_0);
+  const __m128i pred_1 = LoadLo8(prediction_1);
+  __m128i res = _mm_add_epi16(pred_0, pred_1);
+  res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+  Store4(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlend8Row(const int16_t* prediction_0,
+                             const int16_t* prediction_1, uint8_t* dest) {
+  const __m128i pred_0 = LoadAligned16(prediction_0);
+  const __m128i pred_1 = LoadAligned16(prediction_1);
+  __m128i res = _mm_add_epi16(pred_0, pred_1);
+  res = RightShiftWithRounding_S16(res, kInterPostRoundBit + 1);
+  StoreLo8(dest, _mm_packus_epi16(res, res));
+}
+
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+                                 const int16_t* prediction_1, const int width,
+                                 uint8_t* dest) {
+  int x = 0;
+  do {
+    const __m128i pred_00 = LoadAligned16(&prediction_0[x]);
+    const __m128i pred_01 = LoadAligned16(&prediction_1[x]);
+    __m128i res0 = _mm_add_epi16(pred_00, pred_01);
+    res0 = RightShiftWithRounding_S16(res0, kInterPostRoundBit + 1);
+    const __m128i pred_10 = LoadAligned16(&prediction_0[x + 8]);
+    const __m128i pred_11 = LoadAligned16(&prediction_1[x + 8]);
+    __m128i res1 = _mm_add_epi16(pred_10, pred_11);
+    res1 = RightShiftWithRounding_S16(res1, kInterPostRoundBit + 1);
+    StoreUnaligned16(dest + x, _mm_packus_epi16(res0, res1));
+    x += 16;
+  } while (x < width);
+}
+
+void AverageBlend_SSE4_1(const void* prediction_0, const void* prediction_1,
+                         const int width, const int height, void* const dest,
+                         const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = height;
+
+  if (width == 4) {
+    do {
+      // TODO(b/150326556): |prediction_[01]| values are packed. It is possible
+      // to load 8 values at a time.
+ AverageBlend4Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlend4Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); + return; + } + + if (width == 8) { + do { + AverageBlend8Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlend8Row(pred_0, pred_1, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); + return; + } + + do { + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + AverageBlendLargeRow(pred_0, pred_1, width, dst); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + + y -= 2; + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(AverageBlend) + dsp->average_blend = AverageBlend_SSE4_1; +#endif +} + +} // namespace + +void AverageBlendInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void AverageBlendInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/average_blend_sse4.h b/src/dsp/x86/average_blend_sse4.h new file mode 100644 index 0000000..937e8e2 --- /dev/null +++ b/src/dsp/x86/average_blend_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::average_blend. This function is not thread-safe. +void AverageBlendInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_AverageBlend +#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_ diff --git a/src/dsp/x86/cdef_sse4.cc b/src/dsp/x86/cdef_sse4.cc new file mode 100644 index 0000000..3211a2d --- /dev/null +++ b/src/dsp/x86/cdef_sse4.cc @@ -0,0 +1,728 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 7 7 7 7
+alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[] = {
+    420, 210, 140, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
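+//
+// A scalar sketch of the same accumulation (the function name is
+// illustrative; |src| is the 8x8 block after widening to int16_t):
+//
+// void AddPartial_D0_Scalar(const int16_t src[8][8], int16_t partial_0[15]) {
+//   memset(partial_0, 0, 15 * sizeof(partial_0[0]));
+//   for (int i = 0; i < 8; ++i) {
+//     for (int j = 0; j < 8; ++j) {
+//       // Row i lands in the output at lane offset i, matching the
+//       // _mm_slli_si128/_mm_srli_si128 pairs below.
+//       partial_0[i + j] += src[i][j];
+//     }
+//   }
+// }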
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16, + __m128i* partial_lo, + __m128i* partial_hi) { + // 00 01 02 03 04 05 06 07 + *partial_lo = v_src_16[0]; + // 00 00 00 00 00 00 00 00 + *partial_hi = _mm_setzero_si128(); + + // 00 10 11 12 13 14 15 16 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2)); + // 17 00 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14)); + + // 00 00 20 21 22 23 24 25 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4)); + // 26 27 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12)); + + // 00 00 00 30 31 32 33 34 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6)); + // 35 36 37 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10)); + + // 00 00 00 00 40 41 42 43 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8)); + // 44 45 46 47 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8)); + + // 00 00 00 00 00 50 51 52 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10)); + // 53 54 55 56 57 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6)); + + // 00 00 00 00 00 00 60 61 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12)); + // 62 63 64 65 66 67 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4)); + + // 00 00 00 00 00 00 00 70 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14)); + // 71 72 73 74 75 76 77 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[1][i + j / 2] += x; +// +// A0 = src[0] + src[1], A1 = src[2] + src[3], ... +// +// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00 +// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00 +// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00 +// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00 +// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00 +// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00 +// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00 +// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00 +// +// partial[3] is the same except the source is reversed. 
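+//
+// A scalar sketch (illustrative name): _mm_hadd_epi16 forms the pair sums
+// A0 = src[i][0] + src[i][1], ..., so each row contributes four values
+// starting at lane offset i:
+//
+// void AddPartial_D1_Scalar(const int16_t src[8][8], int16_t partial_1[15]) {
+//   memset(partial_1, 0, 15 * sizeof(partial_1[0]));
+//   for (int i = 0; i < 8; ++i) {
+//     for (int j = 0; j < 8; ++j) {
+//       partial_1[i + j / 2] += src[i][j];
+//     }
+//   }
+// }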
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16, + __m128i* partial_lo, + __m128i* partial_hi) { + __m128i v_d1_temp[8]; + const __m128i v_zero = _mm_setzero_si128(); + + for (int i = 0; i < 8; ++i) { + v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero); + } + + *partial_lo = *partial_hi = v_zero; + // A0 A1 A2 A3 00 00 00 00 + *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]); + + // 00 B0 B1 B2 B3 00 00 00 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2)); + + // 00 00 C0 C1 C2 C3 00 00 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4)); + // 00 00 00 D0 D1 D2 D3 00 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6)); + // 00 00 00 00 E0 E1 E2 E3 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8)); + + // 00 00 00 00 00 F0 F1 F2 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10)); + // F3 00 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6)); + + // 00 00 00 00 00 00 G0 G1 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12)); + // G2 G3 00 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4)); + + // 00 00 00 00 00 00 00 H0 + *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14)); + // H1 H2 H3 00 00 00 00 00 + *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2)); +} + +// ---------------------------------------------------------------------------- +// partial[7][i / 2 + j] += x; +// +// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 +// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 +// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00 +// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00 +// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00 +// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00 +// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00 +// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00 +// +// partial[5] is the same except the source is reversed. +LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo, + __m128i* partial_hi) { + __m128i v_pair_add[4]; + // Add vertical source pairs. 
+  v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
+  v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
+  v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
+  v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm_setzero_si128();
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+                                      __m128i* partial_lo,
+                                      __m128i* partial_hi) {
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  __m128i v_src[8];
+  for (auto& i : v_src) {
+    i = LoadLo8(src);
+    src += stride;
+  }
+
+  const __m128i v_zero = _mm_setzero_si128();
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+  // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+  // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+  // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+  // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+  // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+  const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
+  const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
+  const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
+  const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
+  const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
+  const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
+  const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
+  const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
+  const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+  const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+  partial_lo[2] =
+      _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+                         _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+  __m128i v_src_16[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
+  }
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+  // 10 11 12 13 14 15
16 17 00 00 00 00 00 00 00 00 + // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00 + // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00 + // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00 + // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00 + // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00 + // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00 + partial_lo[6] = v_src_16[0]; + for (int i = 1; i < 8; ++i) { + partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]); + } + + // partial for direction 0 + AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]); + + // partial for direction 1 + AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]); + + // partial for direction 7 + AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]); + + __m128i v_src_reverse[8]; + const __m128i reverser = + _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e); + for (int i = 0; i < 8; ++i) { + v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser); + } + + // partial for direction 4 + AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]); + + // partial for direction 3 + AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]); + + // partial for direction 5 + AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]); +} + +inline uint32_t SumVector_S32(__m128i a) { + a = _mm_hadd_epi32(a, a); + a = _mm_add_epi32(a, _mm_srli_si128(a, 4)); + return _mm_cvtsi128_si32(a); +} + +// |cost[0]| and |cost[4]| square the input and sum with the corresponding +// element from the other end of the vector: +// |kCdefDivisionTable[]| element: +// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) * +// kCdefDivisionTable[i + 1]; +// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8]; +inline uint32_t Cost0Or4(const __m128i a, const __m128i b, + const __m128i division_table[2]) { + // Reverse and clear upper 2 bytes. + const __m128i reverser = + _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c); + // 14 13 12 11 10 09 08 ZZ + const __m128i b_reversed = _mm_shuffle_epi8(b, reverser); + // 00 14 01 13 02 12 03 11 + const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed); + // 04 10 05 09 06 08 07 ZZ + const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed); + + // Square(partial[0][i]) + Square(partial[0][14 - i]) + const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo); + const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi); + + const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]); + const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]); + return SumVector_S32(_mm_add_epi32(c, d)); +} + +inline uint32_t CostOdd(const __m128i a, const __m128i b, + const __m128i division_table[2]) { + // Reverse and clear upper 10 bytes. + const __m128i reverser = + _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504); + // 10 09 08 ZZ ZZ ZZ ZZ ZZ + const __m128i b_reversed = _mm_shuffle_epi8(b, reverser); + // 00 10 01 09 02 08 03 ZZ + const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed); + // 04 ZZ 05 ZZ 06 ZZ 07 ZZ + const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed); + + // Square(partial[0][i]) + Square(partial[0][10 - i]) + const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo); + const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi); + + const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]); + const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]); + return SumVector_S32(_mm_add_epi32(c, d)); +} + +// Sum of squared elements. 
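+// Scalar equivalent, for reference (the name is illustrative; each of the
+// eight partial sums is at most 8 * 255, so the total cannot overflow
+// uint32_t here):
+//
+// uint32_t SquareSum_S16_Scalar(const int16_t a[8]) {
+//   uint32_t sum = 0;
+//   for (int i = 0; i < 8; ++i) {
+//     sum += static_cast<uint32_t>(a[i] * a[i]);
+//   }
+//   return sum;
+// }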
+inline uint32_t SquareSum_S16(const __m128i a) {
+  const __m128i square = _mm_madd_epi16(a, a);
+  return SumVector_S32(square);
+}
+
+void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
+                          uint8_t* const direction, int* const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+  __m128i partial_lo[8], partial_hi[8];
+
+  AddPartial(src, stride, partial_lo, partial_hi);
+
+  cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
+  cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
+
+  const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
+                                     LoadUnaligned16(kCdefDivisionTable + 4)};
+
+  cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+  cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+  const __m128i division_table_odd[2] = {
+      LoadAligned16(kCdefDivisionTableOddPadded),
+      LoadAligned16(kCdefDivisionTableOddPadded + 4)};
+
+  cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
+  cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
+  cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
+  cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+                          __m128i* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each set. For |direction| == 0 this gives a diagonal line
+  // from top right to bottom left. The first value is y, the second x.
+  // Negative y values move up.
+  //    a       b       c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+  output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+  output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+  output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4
+// to do 2 rows at a time.
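+// Usage sketch: each returned vector packs row r in its low 8 bytes and
+// row r + 1 in its high 8 bytes, so one pass of Constrain() and the tap
+// multiply below handles two 4-pixel rows at once:
+//
+//   __m128i taps[4];
+//   LoadDirection4(src, src_stride, taps, direction);
+//   // taps[0] holds rows r and r + 1 at offset -(y_0 * stride + x_0).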
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+                    __m128i* output, const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+                      src - y_0 * stride + stride - x_0);
+  output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+                      src + y_0 * stride + stride + x_0);
+  output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+                      src - y_1 * stride + stride - x_1);
+  output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+                      src + y_1 * stride + stride + x_1);
+}
+
+inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
+                         const __m128i& damping, const __m128i& threshold) {
+  const __m128i diff = _mm_sub_epi16(pixel, reference);
+  const __m128i abs_diff = _mm_abs_epi16(diff);
+  // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+  //                    0, std::abs(diff))
+  const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
+  // larger than threshold. Subtracting with saturation will return 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const __m128i thresh_minus_shifted_diff =
+      _mm_subs_epu16(threshold, shifted_diff);
+  const __m128i clamp_abs_diff =
+      _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return _mm_sign_epi16(clamp_abs_diff, diff);
+}
+
+inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
+                                    const __m128i& tap, const __m128i& damping,
+                                    const __m128i& threshold) {
+  const __m128i constrained = Constrain(val, pixel, damping, threshold);
+  return _mm_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_SSE4_1(const uint16_t* src, const ptrdiff_t src_stride,
+                       const int height, const int primary_strength,
+                       const int secondary_strength, const int damping,
+                       const int direction, void* dest,
+                       const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+  }
+
+  const __m128i primary_tap_0 =
+      _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
+  const __m128i primary_tap_1 =
+      _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
+  const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
+  const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
+  const __m128i cdef_large_value_mask =
+      _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
+  const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
+  const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
+
+  int y = height;
+  do {
+    __m128i pixel;
+    if (width == 8) {
+      pixel = LoadUnaligned16(src);
+    } else {
+      pixel = LoadHi8(LoadLo8(src), src + src_stride);
+    }
+
+    __m128i min = pixel;
+    __m128i max = pixel;
+    __m128i sum;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      __m128i primary_val[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val, direction);
+      }
+
+      if (clipping_required) {
+        min = _mm_min_epu16(min, primary_val[0]);
+        min = _mm_min_epu16(min, primary_val[1]);
+        min = _mm_min_epu16(min, primary_val[2]);
+        min = _mm_min_epu16(min, primary_val[3]);
+
+        // The source is 16 bits, however, we only really care about the lower
+        // 8 bits. The upper 8 bits contain the "large" flag. After the final
+        // primary max has been calculated, zero out the upper 8 bits. Use this
+        // to find the "16 bit" max.
+        const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
+        const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
+        const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
+        max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
+      }
+
+      sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+                                 primary_damping_shift, primary_threshold);
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
+                                    primary_damping_shift, primary_threshold));
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
+                                    primary_damping_shift, primary_threshold));
+      sum = _mm_add_epi16(
+          sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
+                                    primary_damping_shift, primary_threshold));
+    } else {
+      sum = _mm_setzero_si128();
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val[8]; + if (width == 8) { + LoadDirection(src, src_stride, secondary_val, direction + 2); + LoadDirection(src, src_stride, secondary_val + 4, direction - 2); + } else { + LoadDirection4(src, src_stride, secondary_val, direction + 2); + LoadDirection4(src, src_stride, secondary_val + 4, direction - 2); + } + + if (clipping_required) { + min = _mm_min_epu16(min, secondary_val[0]); + min = _mm_min_epu16(min, secondary_val[1]); + min = _mm_min_epu16(min, secondary_val[2]); + min = _mm_min_epu16(min, secondary_val[3]); + min = _mm_min_epu16(min, secondary_val[4]); + min = _mm_min_epu16(min, secondary_val[5]); + min = _mm_min_epu16(min, secondary_val[6]); + min = _mm_min_epu16(min, secondary_val[7]); + + const __m128i max_s01 = + _mm_max_epu8(secondary_val[0], secondary_val[1]); + const __m128i max_s23 = + _mm_max_epu8(secondary_val[2], secondary_val[3]); + const __m128i max_s45 = + _mm_max_epu8(secondary_val[4], secondary_val[5]); + const __m128i max_s67 = + _mm_max_epu8(secondary_val[6], secondary_val[7]); + const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23), + _mm_max_epu8(max_s45, max_s67)); + max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask)); + } + + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + sum = _mm_add_epi16( + sum, + ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1, + secondary_damping_shift, secondary_threshold)); + } + // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)) + const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15); + // 8 + sum + sum = _mm_add_epi16(sum, _mm_set1_epi16(8)); + // (... - (sum < 0)) >> 4 + sum = _mm_add_epi16(sum, sum_lt_0); + sum = _mm_srai_epi16(sum, 4); + // pixel + ... 
+ sum = _mm_add_epi16(sum, pixel); + if (clipping_required) { + // Clip3 + sum = _mm_min_epi16(sum, max); + sum = _mm_max_epi16(sum, min); + } + + const __m128i result = _mm_packus_epi16(sum, sum); + if (width == 8) { + src += src_stride; + StoreLo8(dst, result); + dst += dst_stride; + --y; + } else { + src += src_stride << 1; + Store4(dst, result); + dst += dst_stride; + Store4(dst, _mm_srli_si128(result, 4)); + dst += dst_stride; + y -= 2; + } + } while (y != 0); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); + dsp->cdef_direction = CdefDirection_SSE4_1; + dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>; + dsp->cdef_filters[0][1] = + CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>; + dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>; + dsp->cdef_filters[1][1] = + CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>; + dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>; +} + +} // namespace +} // namespace low_bitdepth + +void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void CdefInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/cdef_sse4.h b/src/dsp/x86/cdef_sse4.h new file mode 100644 index 0000000..6631eb7 --- /dev/null +++ b/src/dsp/x86/cdef_sse4.h @@ -0,0 +1,45 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not +// thread-safe. +void CdefInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_CdefDirection +#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_CdefFilters +#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_ diff --git a/src/dsp/x86/common_avx2.h b/src/dsp/x86/common_avx2.h new file mode 100644 index 0000000..4ce7de2 --- /dev/null +++ b/src/dsp/x86/common_avx2.h @@ -0,0 +1,138 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+  // For compatibility with older gcc toolchains (< 8) use
+  // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc implementations
+  // are implemented similarly to the following, clang uses a different method
+  // but no differences in assembly have been observed.
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+  dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+  return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m256i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+  if (over_read_in_bytes > 0) {
+    __m128i m = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+      m = _mm_srli_si128(m, 1);
+    }
+    const __m256i mask = (over_read_in_bytes < 16)
+                             ? SetrM128i(_mm_set1_epi8(-1), m)
+                             : SetrM128i(m, _mm_setzero_si128());
+    dst = _mm256_and_si256(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+                              const ptrdiff_t over_read_in_bytes,
+                              __m256i dst[2]) {
+  dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+  dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+                         over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
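// Illustrative usage sketch (not part of the upstream file): a caller passes
// the number of bytes a wide load extends past the initialized region, and
// the wrapper zeroes exactly those bytes under MSan. |row| and |valid_bytes|
// below are hypothetical.
#if 0
const uint8_t* row = GetRow();  // |valid_bytes| of initialized data, < 32.
const int valid_bytes = 20;
const __m256i v =
    LoadUnaligned32Msan(row, /*over_read_in_bytes=*/32 - valid_bytes);
#endif  // 0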
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+  _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+  assert(bits <= 16);
+  const __m256i v_bias_d =
+      _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+  return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_AVX2
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/src/dsp/x86/common_sse4.h b/src/dsp/x86/common_sse4.h
new file mode 100644
index 0000000..c510f8c
--- /dev/null
+++ b/src/dsp/x86/common_sse4.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#if 0
+#include <cinttypes>
+#include <cstdio>
+
+// Quite useful macro for debugging. Left here for convenience.
+inline void PrintReg(const __m128i r, const char* const name, int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  fprintf(stderr, "%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n)
+      fprintf(stderr, "%.16" PRIx64 " ", static_cast<uint64_t>(tmp.i64[n]));
+  }
+  fprintf(stderr, "\n");
+}
+
+inline void PrintReg(const int r, const char* const name) {
+  fprintf(stderr, "%s: %d\n", name, r);
+}
+
+inline void PrintRegX(const int r, const char* const name) {
+  fprintf(stderr, "%s: %.8x\n", name, r);
+}
+
+#define PR(var, N) PrintReg(var, #var, N)
+#define PD(var) PrintReg(var, #var);
+#define PX(var) PrintRegX(var, #var);
+#endif  // 0
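// Illustrative note (not part of the upstream file): flipping the `#if 0`
// above to `#if 1` makes the debug helpers available inside a kernel, e.g.:
//
//   const __m128i sum = _mm_maddubs_epi16(v_src, v_taps);
//   PR(sum, 16);  // dumps the register as eight 16-bit hex lanes
//
// The macros stringize the variable name, so the output is labeled without
// extra bookkeeping.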
+
+namespace libgav1 {
+namespace dsp {
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+  int16_t val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+  uint16_t val1;
+  uint16_t val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support of _mm_loadu_si32 is widespread, use of
+  // _mm_loadu_si32 is banned.
+  int val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support of _mm_loadu_si32 is widespread, use of
+  // _mm_loadu_si32 is banned.
+  int val1, val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+  const __m128 x =
+      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+  return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m128i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    __m128i mask = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+      mask = _mm_srli_si128(mask, 1);
+    }
+    dst = _mm_and_si128(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
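// Illustrative note (not part of the upstream file): Load2/Load4 go through
// memcpy rather than dereferencing a casted pointer because the source may be
// unaligned and may alias other types; a fixed-size memcpy still compiles to a
// single scalar load. A hypothetical direct-cast version would be undefined
// behavior:
#if 0
inline __m128i Load4Bad(const void* src) {
  // UB: unaligned dereference and a strict-aliasing violation.
  return _mm_cvtsi32_si128(*static_cast<const int*>(src));
}
#endif  // 0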
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+  assert(bits <= 16);
+  // Shift out all but the last bit.
+  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+  // Avg with zero will shift by 1 and round.
+  return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+  assert(bits <= 16);
+  const __m128i v_bias_d =
+      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
+inline __m128i MaskHighNBytes(int n) {
+  static constexpr uint8_t kMask[32] = {
+      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  };
+
+  return LoadUnaligned16(kMask + n);
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
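// Illustrative scalar reference for the RightShiftWithRounding_* helpers
// above (not part of the upstream file): every variant computes
// (value + (1 << (bits - 1))) >> bits in the matching signedness and width.
#if 0
inline int RightShiftWithRoundingScalar(int value, int bits) {
  return (value + (1 << (bits - 1))) >> bits;
}
// e.g. RightShiftWithRoundingScalar(13, 2) == (13 + 2) >> 2 == 3.
#endif  // 0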
+ +#include "src/dsp/convolve.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 +#include + +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +constexpr int kHorizontalOffset = 3; + +// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and +// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final +// sum from outranging int16_t. +template +__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { + __m256i sum; + if (filter_index < 2) { + // 6 taps. + const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1 + const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3 + const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5 + sum = _mm256_add_epi16(v_madd_21, v_madd_43); + sum = _mm256_add_epi16(sum, v_madd_65); + } else if (filter_index == 2) { + // 8 taps. + const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0 + const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2 + const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4 + const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6 + const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32); + const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76); + sum = _mm256_add_epi16(v_sum_7654, v_sum_3210); + } else if (filter_index == 3) { + // 2 taps. + sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3 + } else { + // 4 taps. + const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2 + const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4 + sum = _mm256_add_epi16(v_madd_32, v_madd_54); + } + return sum; +} + +template +__m256i SumHorizontalTaps(const __m256i* const src, + const __m256i* const v_tap) { + __m256i v_src[4]; + const __m256i src_long = *src; + const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long); + const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long); + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 + v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 + v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 + v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 + v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 + v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 + } else if (filter_index > 3) { + // 4 taps. 
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+                             const __m256i* const v_tap) {
+  __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm256_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                             const __m128i* const v_tap) {
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+  if (filter_index == 3) {
+    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+    const __m128i v_src_43 = _mm_shuffle_epi8(
+        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+    return v_sum_43;
+  }
+
+  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+  const __m128i v_src_32 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+  const __m128i v_src_54 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504));
+  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+  return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  const __m128i sum =
+      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
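// Illustrative bound (not part of the upstream file) for why the halved
// ("pre-shifted by 1") taps keep _mm_maddubs_epi16 results inside int16_t:
// each madd pair multiplies two uint8_t samples (<= 255) by two int8_t taps.
// Assuming the absolute values of the halved taps in any filter sum to at
// most 128 (an assumption about kHalfSubPixelFilters, stated here only for
// the arithmetic), the worst-case magnitude is bounded well below the int16_t
// limit:
#if 0
static_assert(255 * 128 <= 32767 + 127,  // 32640 < 32767 in fact
              "8-bit samples times halved taps fit in int16_t");
#endif  // 0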
+
+// Filter 2xh sizes.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int /*width*/, const int height,
+                      const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // Horizontal passes only need to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (!is_compound) {
+      int y = 0;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y += 2;
+      } while (y < height - 1);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (filter_index == 3) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const __m256i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  if (width >= 32) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          // placeholder
+        } else {
+          // Load src used to calculate dest8[7:0] and dest8[23:16].
+          const __m256i src_long = LoadUnaligned32(&src[x]);
+          const __m256i result =
+              SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+          // Load src used to calculate dest8[15:8] and dest8[31:24].
+          const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+          const __m256i result2 =
+              SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+          // Combine results and store.
+          StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+        }
+        x += step * 4;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+  } else if (width == 16) {
+    int y = height;
+    do {
+      if (is_2d || is_compound) {
+        // placeholder
+      } else {
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+                                           LoadUnaligned16(&src[src_stride]));
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i src_long2 = SetrM128i(
+            LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+        const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+        StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+        StoreUnaligned16(&dest8[pred_stride],
+                         _mm256_extracti128_si256(packed_result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      if (is_2d || is_compound) {
+        // placeholder
+      } else {
+        const __m128i this_row = LoadUnaligned16(&src[0]);
+        const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(this_row, next_row);
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+  } else {  // width == 4
+    int y = height;
+    do {
+      if (is_2d || is_compound) {
+        // placeholder
+      } else {
+        const __m128i this_row = LoadUnaligned16(&src[0]);
+        const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(this_row, next_row);
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        Store4(&dest8[0], _mm256_castsi256_si128(result));
+        Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m128i* v_tap) {
+  if (num_taps == 8) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+    }
+  } else if (num_taps == 6) {
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
+    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+    }
+  } else if (num_taps == 4) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+    }
+  } else {  // num_taps == 2
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+    }
+  }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m256i* v_tap) {
+  if (num_taps == 8) {
+    v_tap[0] = _mm256_broadcastw_epi16(*filter);                     // k1k0
+    v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+    v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+    v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6));  // k7k6
+    if (is_2d_vertical) {
+      // placeholder
+    }
+  } else if (num_taps == 6) {
+    v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1));  // k2k1
+    v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+    v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5));  // k6k5
+    if (is_2d_vertical) {
+      // placeholder
+    }
+  } else if (num_taps == 4) {
+    v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+    v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+    if (is_2d_vertical) {
+      // placeholder
+    }
+  } else {  // num_taps == 2
+    v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+    if (is_2d_vertical) {
+      // placeholder
+    }
+  }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
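// Illustrative note (not part of the upstream file): |filter_index| selects
// both the filter bank row and the tap count, which is what the dispatch
// above and below encodes. The mapping, as the "// N taps." comments in
// SumOnePassTaps spell out, is:
#if 0
constexpr int kTapsForFilterIndex[6] = {6, 6, 8, 2, 4, 4};
#endif  // 0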
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  assert(filter_id != 0);
+  __m256i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+void ConvolveHorizontal_AVX2(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int horizontal_filter_index,
+                             const int /*vertical_filter_index*/,
+                             const int horizontal_filter_id,
+                             const int /*vertical_filter_id*/, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width > 2) {
+    DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                     horizontal_filter_id, filter_index);
+  } else {
+    // Use non avx2 version for smaller widths.
+    DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width,
+                        height, horizontal_filter_id, filter_index);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/src/dsp/x86/convolve_avx2.h b/src/dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000..6179d98
--- /dev/null
+++ b/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve, see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
new file mode 100644
index 0000000..3a0fff5
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -0,0 +1,2830 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+  __m128i sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm_add_epi16(sum, v_madd_65);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps(const uint8_t* const src,
+                          const __m128i* const v_tap) {
+  __m128i v_src[4];
+  const __m128i src_long = LoadUnaligned16(src);
+  const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
+  const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  return sum;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps(const uint8_t* const src,
+                             const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16(const uint8_t* const src,
+                            const __m128i* const v_tap) {
+  const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                             const __m128i* const v_tap) {
+  const __m128i input0 = LoadLo8(&src[2]);
+  const __m128i input1 = LoadLo8(&src[2 + src_stride]);
+
+  if (filter_index == 3) {
+    // 03 04 04 05 05 06 06 07 ....
+    const __m128i input0_dup =
+        _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
+    // 13 14 14 15 15 16 16 17 ....
+    const __m128i input1_dup =
+        _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
+    const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+    return v_sum_43;
+  }
+
+  // 02 03 03 04 04 05 05 06 06 07 ....
+  const __m128i input0_dup =
+      _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
+  // 12 13 13 14 14 15 15 16 16 17 ....
+  const __m128i input1_dup =
+      _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
+  // 04 05 05 06 06 07 07 08 ...
+  const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
+  // 14 15 15 16 16 17 17 18 ...
+  const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
+  const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
+  const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
+  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+  return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  const __m128i sum =
+      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, int step, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // 4 tap filters are never used when width > 4.
+  if (num_taps != 4 && width > 4) {
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum =
+              HorizontalTaps8To16<filter_index>(&src[x], v_tap);
+          if (is_2d) {
+            StoreAligned16(&dest16[x], v_sum);
+          } else {
+            StoreUnaligned16(&dest16[x], v_sum);
+          }
+        } else {
+          const __m128i result =
+              SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
+          StoreLo8(&dest8[x], result);
+        }
+        x += step;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (++y < height);
+    return;
+  }
+
+  // Horizontal passes only need to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
+  assert(width <= 4);
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (width == 4) {
+      int y = 0;
+      do {
+        if (is_2d || is_compound) {
+          const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
+          StoreLo8(dest16, v_sum);
+        } else {
+          const __m128i result = SimpleHorizontalTaps<filter_index>(src, v_tap);
+          Store4(&dest8[0], result);
+        }
+        src += src_stride;
+        dest8 += pred_stride;
+        dest16 += pred_stride;
+      } while (++y < height);
+      return;
+    }
+
+    if (!is_compound) {
+      int y = 0;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y += 2;
+      } while (y < height - 1);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (filter_index == 3) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m128i* v_tap) {
+  if (num_taps == 8) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+    }
+  } else if (num_taps == 6) {
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
+    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+    }
+  } else if (num_taps == 4) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+    }
+  } else {  // num_taps == 2
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+    }
+  }
+}
+
+template <int num_taps, bool is_compound = false>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+                                const __m128i* const taps) {
+  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m128i madd_lo =
+        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m128i madd_hi =
+        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo,
+                                   kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+                      const ptrdiff_t dst_stride, const int width,
+                      const int height, const __m128i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m128i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned16(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned16(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned16(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned16(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
+    do {
+      srcs[next_row] = LoadAligned16(src_x);
+      src_x += src_stride;
+
+      const __m128i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned16(dst16 + x + y * dst_stride, sum);
+      } else {
+        StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+    if (num_taps >= 6) {
+      srcs[4] = LoadAligned16(src);
+      src += 8;
+      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+      if (num_taps == 8) {
+        srcs[6] = LoadAligned16(src);
+        src += 8;
+        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+      }
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[num_taps] = LoadAligned16(src);
+    src += 8;
+    srcs[num_taps - 1] = _mm_unpacklo_epi64(
+        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      StoreUnaligned16(dst16, sum);
+      dst16 += 4 << 1;
+    } else {
+      const __m128i results = _mm_packus_epi16(sum, sum);
+      Store4(dst8, results);
+      dst8 += dst_stride;
+      Store4(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y += 2;
+  } while (y < height);
+}
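// Illustrative note (not part of the upstream file): because the 2D
// intermediate buffer is laid out with |stride| == |width|, a 4-wide column
// is contiguous in memory, so each aligned 16-byte load above fetches two
// whole rows (eight uint16_t values). The odd-index |srcs| entries are then
// synthesized from neighboring loads with shifts and unpacks instead of
// issuing extra loads.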
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    if (num_taps == 8) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[next_row] = LoadAligned16(src);
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    } else if (num_taps == 4) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    } else if (num_taps == 6) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+    } else if (num_taps == 8) {
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+    }
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const __m128i results = _mm_packus_epi16(sum, sum);
+
+    Store2(dst8, results);
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 2));
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 4));
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 6));
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y += 4;
+  } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, 8, 2, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 8, 1, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 8, 0, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 4, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 8, 5, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 8, 3, is_2d, is_compound>(
+        src, src_stride, dst, dst_stride, width, height, v_tap);
+  }
+}
+
+void Convolve2D_SSE4_1(const void* const reference,
+                       const ptrdiff_t reference_stride,
+                       const int horizontal_filter_index,
+                       const int vertical_filter_index,
+                       const int horizontal_filter_id,
+                       const int vertical_filter_id, const int width,
+                       const int height, void* prediction,
+                       const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(16) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 2) {
+      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                          height, taps);
+    }
+  }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
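// Illustrative sizing check for the 2D path above (not part of the upstream
// file): the horizontal pass produces |width| * (|height| + vertical_taps - 1)
// uint16_t values. With kMaxSuperBlockSizeInPixels == 128 and
// kSubPixelTaps == 8, the worst case is 128 * (128 + 7) entries, which is
// exactly what |intermediate_result| reserves.
#if 0
static_assert(128 * (128 + 8 - 1) == 17280,
              "worst-case intermediate entry count");
#endif  // 0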
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+  __m128i v_src[4];
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+  }
+  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  return sum;
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+                    void* const dst, const ptrdiff_t dst_stride,
+                    const int width, const int height,
+                    const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
+
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m128i srcs[8];
+    srcs[0] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadLo8(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadLo8(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadLo8(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
+    do {
+      srcs[next_row] = LoadLo8(src_x);
+      src_x += src_stride;
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16 + x + y * dst_stride, results);
+      } else {
+        const __m128i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      // 10 11 12 13
+      const __m128i a = Load4(src);
+      // 00 01 02 03 10 11 12 13
+      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+      src += src_stride;
+      // 20 21 22 23
+      srcs[2] = Load4(src);
+      src += src_stride;
+      // 10 11 12 13 20 21 22 23
+      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y += 2;
+    } while (y < height);
srcs[2]; + y += 2; + } while (y < height); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + + int y = 0; + do { + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + y += 2; + } while (y < height); + } else if (num_taps == 6) { + srcs[6] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); + + int y = 0; + do { + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + y += 2; + } while (y < height); + } else if (num_taps == 8) { + srcs[8] = _mm_setzero_si128(); + // 00 01 02 03 + srcs[0] = Load4(src); + src += src_stride; + // 10 11 12 13 + const __m128i a = Load4(src); + // 00 01 02 03 10 11 12 13 + srcs[0] = _mm_unpacklo_epi32(srcs[0], a); + src += src_stride; + // 20 21 22 23 + srcs[2] = Load4(src); + src += src_stride; + // 10 11 12 13 20 21 22 23 + srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); + // 30 31 32 33 + const __m128i b = Load4(src); + // 20 21 22 23 30 31 32 33 + srcs[2] = _mm_unpacklo_epi32(srcs[2], b); + src += src_stride; + // 40 41 42 43 + srcs[4] = Load4(src); + src += src_stride; + // 30 31 32 33 40 41 42 43 + srcs[3] = _mm_unpacklo_epi32(b, 
srcs[4]); + // 50 51 52 53 + const __m128i c = Load4(src); + // 40 41 42 43 50 51 52 53 + srcs[4] = _mm_unpacklo_epi32(srcs[4], c); + src += src_stride; + // 60 61 62 63 + srcs[6] = Load4(src); + src += src_stride; + // 50 51 52 53 60 61 62 63 + srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); + + int y = 0; + do { + // 70 71 72 73 + const __m128i d = Load4(src); + // 60 61 62 63 70 71 72 73 + srcs[6] = _mm_unpacklo_epi32(srcs[6], d); + src += src_stride; + // 80 81 82 83 + srcs[8] = Load4(src); + src += src_stride; + // 70 71 72 73 80 81 82 83 + srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); + + const __m128i sums = SumVerticalTaps(srcs, v_tap); + if (is_compound) { + const __m128i results = Compound1DShift(sums); + StoreUnaligned16(dst16, results); + dst16 += 4 << 1; + } else { + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + Store4(dst8, results); + dst8 += dst_stride; + Store4(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + } + + srcs[0] = srcs[2]; + srcs[1] = srcs[3]; + srcs[2] = srcs[4]; + srcs[3] = srcs[5]; + srcs[4] = srcs[6]; + srcs[5] = srcs[7]; + srcs[6] = srcs[8]; + y += 2; + } while (y < height); + } +} + +template +void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, + void* const dst, const ptrdiff_t dst_stride, + const int height, const __m128i* const v_tap) { + const int num_taps = GetNumTapsInFilter(filter_index); + auto* dst8 = static_cast(dst); + + __m128i srcs[9]; + + if (num_taps == 2) { + srcs[2] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + + int y = 0; + do { + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[2] = Load2<0>(src, srcs[2]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 + const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_2, 2); + // This uses srcs[0]..srcs[1]. 
+ const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[2]; + y += 4; + } while (y < height); + } else if (num_taps == 4) { + srcs[4] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + + int y = 0; + do { + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2<0>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + // This uses srcs[0]..srcs[3]. + const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + if (height == 2) return; + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + y += 4; + } while (y < height); + } else if (num_taps == 6) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4x, 2); + + int y = 0; + do { + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + + // This uses srcs[0]..srcs[5]. 
+ const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[4] = srcs[8]; + y += 4; + } while (y < height); + } else if (num_taps == 8) { + // During the vertical pass the number of taps is restricted when + // |height| <= 4. + assert(height > 4); + srcs[8] = _mm_setzero_si128(); + // 00 01 + srcs[0] = Load2(src); + src += src_stride; + // 00 01 10 11 + srcs[0] = Load2<1>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 + srcs[0] = Load2<2>(src, srcs[0]); + src += src_stride; + // 00 01 10 11 20 21 30 31 + srcs[0] = Load2<3>(src, srcs[0]); + src += src_stride; + // 40 41 + srcs[4] = Load2(src); + src += src_stride; + // 40 41 50 51 + srcs[4] = Load2<1>(src, srcs[4]); + src += src_stride; + // 40 41 50 51 60 61 + srcs[4] = Load2<2>(src, srcs[4]); + src += src_stride; + + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 + const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); + // 10 11 20 21 30 31 40 41 + srcs[1] = _mm_srli_si128(srcs_0_4, 2); + // 20 21 30 31 40 41 50 51 + srcs[2] = _mm_srli_si128(srcs_0_4, 4); + // 30 31 40 41 50 51 60 61 + srcs[3] = _mm_srli_si128(srcs_0_4, 6); + + int y = 0; + do { + // 40 41 50 51 60 61 70 71 + srcs[4] = Load2<3>(src, srcs[4]); + src += src_stride; + // 80 81 + srcs[8] = Load2<0>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 + srcs[8] = Load2<1>(src, srcs[8]); + src += src_stride; + // 80 81 90 91 a0 a1 + srcs[8] = Load2<2>(src, srcs[8]); + src += src_stride; + + // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 + const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); + // 50 51 60 61 70 71 80 81 + srcs[5] = _mm_srli_si128(srcs_4_8, 2); + // 60 61 70 71 80 81 90 91 + srcs[6] = _mm_srli_si128(srcs_4_8, 4); + // 70 71 80 81 90 91 a0 a1 + srcs[7] = _mm_srli_si128(srcs_4_8, 6); + + // This uses srcs[0]..srcs[7]. 
+ const __m128i sums = SumVerticalTaps(srcs, v_tap); + const __m128i results_16 = + RightShiftWithRounding_S16(sums, kFilterBits - 1); + const __m128i results = _mm_packus_epi16(results_16, results_16); + + Store2(dst8, results); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 2)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 4)); + dst8 += dst_stride; + Store2(dst8, _mm_srli_si128(results, 6)); + dst8 += dst_stride; + + srcs[0] = srcs[4]; + srcs[1] = srcs[5]; + srcs[2] = srcs[6]; + srcs[3] = srcs[7]; + srcs[4] = srcs[8]; + y += 4; + } while (y < height); + } +} + +void ConvolveVertical_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, + const int vertical_filter_index, + const int /*horizontal_filter_id*/, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast(prediction); + const ptrdiff_t dest_stride = pred_stride; + assert(vertical_filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<0>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<2>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<3>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps); + if (width == 2) { + FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<4>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } else { + // TODO(slavarnway): Investigate adding |filter_index| == 1 special cases. 
+ // See convolve_neon.cc + SetupTaps<4>(&v_filter, taps); + + if (width == 2) { + FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps); + } else if (width == 4) { + FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps); + } else { + FilterVertical<5>(src, src_stride, dest, dest_stride, width, height, + taps); + } + } +} + +void ConvolveCompoundCopy_SSE4(const void* const reference, + const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, + const int /*vertical_filter_index*/, + const int /*horizontal_filter_id*/, + const int /*vertical_filter_id*/, + const int width, const int height, + void* prediction, const ptrdiff_t pred_stride) { + const auto* src = static_cast(reference); + const ptrdiff_t src_stride = reference_stride; + auto* dest = static_cast(prediction); + constexpr int kRoundBitsVertical = + kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; + if (width >= 16) { + int y = height; + do { + int x = 0; + do { + const __m128i v_src = LoadUnaligned16(&src[x]); + const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src); + const __m128i v_src_ext_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8)); + const __m128i v_dest_lo = + _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical); + const __m128i v_dest_hi = + _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical); + // TODO(slavarnway): Investigate using aligned stores. + StoreUnaligned16(&dest[x], v_dest_lo); + StoreUnaligned16(&dest[x + 8], v_dest_hi); + x += 16; + } while (x < width); + src += src_stride; + dest += pred_stride; + } while (--y != 0); + } else if (width == 8) { + int y = height; + do { + const __m128i v_src = LoadLo8(&src[0]); + const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src); + const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical); + StoreUnaligned16(&dest[0], v_dest); + src += src_stride; + dest += pred_stride; + } while (--y != 0); + } else { /* width == 4 */ + int y = height; + do { + const __m128i v_src0 = Load4(&src[0]); + const __m128i v_src1 = Load4(&src[src_stride]); + const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1); + const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src); + const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical); + StoreLo8(&dest[0], v_dest); + StoreHi8(&dest[pred_stride], v_dest); + src += src_stride * 2; + dest += pred_stride * 2; + y -= 2; + } while (y != 0); + } +} + +void ConvolveCompoundVertical_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int vertical_filter_index, + const int /*horizontal_filter_id*/, const int vertical_filter_id, + const int width, const int height, void* prediction, + const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(vertical_filter_index, height); + const int vertical_taps = GetNumTapsInFilter(filter_index); + const ptrdiff_t src_stride = reference_stride; + const auto* src = static_cast(reference) - + (vertical_taps / 2 - 1) * src_stride; + auto* dest = static_cast(prediction); + assert(vertical_filter_id != 0); + + __m128i taps[4]; + const __m128i v_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]); + + if (filter_index < 2) { // 6 tap. + SetupTaps<6>(&v_filter, taps); + if (width == 4) { + FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 2) { // 8 tap. 
+ SetupTaps<8>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 3) { // 2 tap. + SetupTaps<2>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } else { + SetupTaps<4>(&v_filter, taps); + + if (width == 4) { + FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4, + height, taps); + } else { + FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width, + width, height, taps); + } + } +} + +void ConvolveHorizontal_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int /*vertical_filter_index*/, + const int horizontal_filter_id, + const int /*vertical_filter_id*/, + const int width, const int height, + void* prediction, const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(horizontal_filter_index, width); + // Set |src| to the outermost tap. + const auto* src = static_cast(reference) - kHorizontalOffset; + auto* dest = static_cast(prediction); + + DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height, + horizontal_filter_id, filter_index); +} + +void ConvolveCompoundHorizontal_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int horizontal_filter_index, const int /*vertical_filter_index*/, + const int horizontal_filter_id, const int /*vertical_filter_id*/, + const int width, const int height, void* prediction, + const ptrdiff_t /*pred_stride*/) { + const int filter_index = GetFilterIndex(horizontal_filter_index, width); + const auto* src = static_cast(reference) - kHorizontalOffset; + auto* dest = static_cast(prediction); + + DoHorizontalPass( + src, reference_stride, dest, width, width, height, horizontal_filter_id, + filter_index); +} + +void ConvolveCompound2D_SSE4_1(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int vertical_filter_index, + const int horizontal_filter_id, + const int vertical_filter_id, const int width, + const int height, void* prediction, + const ptrdiff_t /*pred_stride*/) { + // The output of the horizontal filter, i.e. the intermediate_result, is + // guaranteed to fit in int16_t. + alignas(16) uint16_t + intermediate_result[kMaxSuperBlockSizeInPixels * + (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)]; + + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [4, 5]. + // Similarly for height. 
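+  // For example, an 8-tap sharp request keeps its index for an 8-wide block,
+  // but GetFilterIndex remaps it to the 4-tap filter at index 4 when the
+  // block is only 4 pixels wide.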
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height = height + vertical_taps - 1;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
+
+  const ptrdiff_t dest_stride = width;
+  __m128i taps[4];
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  if (vertical_taps == 8) {
+    SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 6) {
+    SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+    if (width == 4) {
+      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  }
+}
+
+// Pre-transposed filters.
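+// Each table row below holds one tap position across all 16 sub-pixel
+// phases, so a single aligned 16-byte load fetches that tap for every
+// possible |filter_id|, and _mm_shuffle_epi8 can later select per-pixel taps
+// by phase in the scaled horizontal pass.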
+template <int filter_index>
+inline void GetHalfSubPixelFilter(__m128i* output) {
+  // Filter 0
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
+      {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+       {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
+       {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  // Filter 1
+  alignas(16) static constexpr int8_t
+      kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
+          {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
+          {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+  // Filter 2
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
+      {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
+       {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+       {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
+       {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+       {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+       {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
+       {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+       {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
+  // Filter 3
+  alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
+  // Filter 4
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
+      {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
+       {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+       {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+       {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
+  // Filter 5
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
+  switch (filter_index) {
+    case 0:
+      output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
+      break;
+    case 1:
+      // The term "mixed" refers to the fact that the outer taps have a mix of
+      // negative and positive values.
+      output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
+      break;
+    case 2:
+      output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
+      output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
+      output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
+      output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
+      output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
+      break;
+    case 3:
+      output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
+      break;
+    case 4:
+      output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
+      output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
+      output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
+      output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
+      break;
+    default:
+      assert(filter_index == 5);
+      output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
+      output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
+      output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
+      output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
+      break;
+  }
+}
+
+// There are many opportunities for overreading in scaled convolve, because
+// the range of starting points for filter windows is anywhere from 0 to 16
+// for 8 destination pixels, and the window sizes range from 2 to 8. To
+// accommodate this range concisely, we use |grade_x| to mean the most steps
+// in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
+// More importantly, |grade_x| answers the question "how many vector loads are
+// needed to cover the source values?"
+// When |grade_x| == 1, the maximum number of source values needed is 8
+// separate starting positions plus 7 more to cover taps, all fitting into 16
+// bytes.
+// When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
+// every 8 |step_x| increments, on top of 8 possible taps. The first load
+// covers the starting sources for each kernel, while the final load covers
+// the taps.
+// Since the offset value of src_x cannot exceed 8 and |num_taps| does not
+// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
+// |step_x|.
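+// Worked example: |step_x| == 1536 (1.5 in Q10) spans (1536 * 7) >> 10 = 10
+// whole source pixels across 8 outputs; with an 8-tap filter the last input
+// index is 10 + 7 = 17 >= 16, so |grade_x| == 2 and a second load feeds
+// _mm_alignr_epi8 to stitch the extra source bytes.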
+template <int num_taps, int grade_x>
+inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+                                 __m128i* const source /*[num_taps >> 1]*/) {
+  const __m128i src_vals = LoadUnaligned16(src);
+  source[0] = _mm_shuffle_epi8(src_vals, src_indices);
+  if (grade_x == 1) {
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
+    }
+    if (num_taps > 4) {
+      source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
+    }
+  } else {
+    assert(grade_x > 1);
+    assert(num_taps != 4);
+    // grade_x > 1 also means width >= 8 && num_taps != 4
+    const __m128i src_vals_ext = LoadLo8(src + 16);
+    if (num_taps > 2) {
+      source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
+                                   src_indices);
+      source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
+                                   src_indices);
+    }
+    if (num_taps > 6) {
+      source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
+                                   src_indices);
+    }
+  }
+}
+
+template <int num_taps>
+inline void PrepareHorizontalTaps(const __m128i subpel_indices,
+                                  const __m128i* filter_taps,
+                                  __m128i* out_taps) {
+  const __m128i scale_index_offsets =
+      _mm_srli_epi16(subpel_indices, kFilterIndexShift);
+  const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
+  const __m128i filter_indices =
+      _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
+                    filter_index_mask);
+  // Line up taps for maddubs_epi16.
+  // The unpack is also assumed to be lighter than shift+alignr.
+  for (int k = 0; k < (num_taps >> 1); ++k) {
+    const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
+    const __m128i taps1 =
+        _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
+    out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
+  }
+}
+
+inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
+  const __m128i src_indices16 =
+      _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
+  const __m128i src_indices = _mm_packus_epi16(src_indices16, src_indices16);
+  return _mm_unpacklo_epi8(src_indices,
+                           _mm_add_epi8(src_indices, _mm_set1_epi8(1)));
+}
+
+template <int grade_x, int filter_index, int num_taps>
+inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
+                                    int width, int subpixel_x, int step_x,
+                                    int intermediate_height,
+                                    int16_t* intermediate) {
+  // Account for the 0-taps that precede the 2 nonzero taps.
+  const int kernel_offset = (8 - num_taps) >> 1;
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  __m128i filter_taps[num_taps];
+  GetHalfSubPixelFilter<filter_index>(filter_taps);
+  const __m128i index_steps =
+      _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
+                      _mm_set1_epi16(static_cast<int16_t>(step_x)));
+
+  __m128i taps[num_taps >> 1];
+  __m128i source[num_taps >> 1];
+  int p = subpixel_x;
+  // Case when width <= 4 is possible.
+  if (filter_index >= 3) {
+    if (filter_index > 3 || width <= 4) {
+      const uint8_t* src_x =
+          &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+      // Only add steps to the 10-bit truncated p to avoid overflow.
+      const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+      const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+      PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+      const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+      int y = intermediate_height;
+      do {
+        // Load and line up source values with the taps. Width 4 means no need
+        // to load extended source.
+        PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
+                                                      source);
+
+        StoreLo8(intermediate,
+                 RightShiftWithRounding_S16(
+                     SumOnePassTaps<filter_index>(source, taps),
+                     kInterRoundBitsHorizontal - 1));
+        src_x += src_stride;
+        intermediate += kIntermediateStride;
+      } while (--y != 0);
+      return;
+    }
+  }
+
+  // |width| >= 8
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const __m128i p_fraction = _mm_set1_epi16(p & 1023);
+    const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
+    PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
+    const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
+
+    int y = intermediate_height;
+    do {
+      // For each x, a lane of src_k[k] contains src_x[k].
+      PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
+
+      // Shift by one less because the taps are halved.
+      StoreAligned16(
+          intermediate_x,
+          RightShiftWithRounding_S16(
+              SumOnePassTaps<filter_index>(source, taps),
+              kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (--y != 0);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+template <int num_taps>
+inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+  // Avoid overreading the filter due to starting at kernel_offset.
+  // The only danger of overread is in the final filter, which has 4 taps.
+  const __m128i filter =
+      _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
+  output[0] = _mm_shuffle_epi32(filter, 0);
+  if (num_taps > 2) {
+    output[1] = _mm_shuffle_epi32(filter, 0x55);
+  }
+  if (num_taps > 4) {
+    output[2] = _mm_shuffle_epi32(filter, 0xAA);
+  }
+  if (num_taps > 6) {
+    output[3] = _mm_shuffle_epi32(filter, 0xFF);
+  }
+}
+
+// Process eight 16 bit inputs and output eight 16 bit values.
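+// Pairs of intermediate rows are interleaved and multiplied against pairs of
+// 16-bit taps with _mm_madd_epi16; the 32-bit sums are then rounded back to
+// 16 bits with the vertical inter-round shift (compound or regular).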
+template <int num_taps, bool is_compound>
+inline __m128i Sum2DVerticalTaps(const __m128i* const src,
+                                 const __m128i* taps) {
+  const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+  __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
+  const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+  __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
+  if (num_taps > 2) {
+    const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
+    const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
+  }
+  if (num_taps > 4) {
+    const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
+    const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
+  }
+  if (num_taps > 6) {
+    const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
+    const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
+  }
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo,
+                                   kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// Bottom half of each src[k] is the source for one filter, and the top half
+// is the source for the other filter, for the next destination row.
+template <int num_taps, bool is_compound>
+__m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
+                             const __m128i* taps_hi) {
+  const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
+  __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
+  const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
+  __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
+  if (num_taps > 2) {
+    const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
+    const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
+  }
+  if (num_taps > 4) {
+    const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
+    const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
+  }
+  if (num_taps > 6) {
+    const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
+    sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
+    const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
+    sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo,
+                                   kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+// |width_class| is 2, 4, or 8, according to the Store function that should be
+// used.
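+// Because |step_y| advances the source phase between output rows, each row
+// may need a different filter phase; the narrow-width path below therefore
+// prepares two independent tap sets (lo/hi) and computes two output rows per
+// iteration.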
+template <int num_taps, int width_class, bool is_compound>
+#if LIBGAV1_MSAN
+__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
+#else
+inline void ConvolveVerticalScale(
+#endif
+    const int16_t* src, const int width, const int subpixel_y,
+    const int filter_index, const int step_y, const int height, void* dest,
+    const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  constexpr int kernel_offset = (8 - num_taps) / 2;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  auto* dest16_y = static_cast<uint16_t*>(dest);
+  auto* dest_y = static_cast<uint8_t*>(dest);
+  __m128i s[num_taps];
+
+  int p = subpixel_y & 1023;
+  int y = height;
+  if (width_class <= 4) {
+    __m128i filter_taps_lo[num_taps >> 1];
+    __m128i filter_taps_hi[num_taps >> 1];
+    do {  // y > 0
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadLo8(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter0 =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadHi8(s[i], src_y + i * src_stride);
+      }
+      filter_id = (p >> 6) & kSubPixelMask;
+      const int8_t* filter1 =
+          kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+      PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
+      p += step_y;
+      src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+
+      const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
+          s, filter_taps_lo, filter_taps_hi);
+      if (is_compound) {
+        assert(width_class > 2);
+        StoreLo8(dest16_y, sums);
+        dest16_y += dest_stride;
+        StoreHi8(dest16_y, sums);
+        dest16_y += dest_stride;
+      } else {
+        const __m128i result = _mm_packus_epi16(sums, sums);
+        if (width_class == 2) {
+          Store2(dest_y, result);
+          dest_y += dest_stride;
+          Store2(dest_y, _mm_srli_si128(result, 4));
+        } else {
+          Store4(dest_y, result);
+          dest_y += dest_stride;
+          Store4(dest_y, _mm_srli_si128(result, 4));
+        }
+        dest_y += dest_stride;
+      }
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  // |width_class| >= 8
+  __m128i filter_taps[num_taps >> 1];
+  do {  // y > 0
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    const int filter_id = (p >> 6) & kSubPixelMask;
+    const int8_t* filter =
+        kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+    PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+    int x = 0;
+    do {  // x < width
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = LoadUnaligned16(src_y + i * src_stride);
+      }
+
+      const __m128i sums =
+          Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
+      if (is_compound) {
+        StoreUnaligned16(dest16_y + x, sums);
+      } else {
+        StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+      }
+      x += 8;
+      src_y += 8;
+    } while (x < width);
+    p += step_y;
+    dest_y += dest_stride;
+    dest16_y += dest_stride;
+  } while (--y != 0);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_SSE4_1(const void* const reference,
+                            const ptrdiff_t reference_stride,
+                            const int horizontal_filter_index,
+                            const int vertical_filter_index,
+                            const int subpixel_x, const int subpixel_y,
+                            const int step_x, const int step_y, const int width,
+                            const int height, void* prediction,
+                            const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  // The output of the horizontal filter, i.e. the intermediate_result, is
+  // guaranteed to fit in int16_t.
+  // TODO(petersonab): Reduce intermediate block stride to width to make
+  // smaller blocks faster.
+  alignas(16) int16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
+  const int intermediate_height =
+      (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+       kScaleSubPixelBits) +
+      num_vert_taps;
+
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  // Similarly for height.
+  int16_t* intermediate = intermediate_result;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+  // step_x*7 is the final base sub-pixel index for the shuffle mask for filter
+  // inputs in each iteration on large blocks. When step_x is large, we need a
+  // second register and alignr in order to gather all filter inputs.
+  // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
+  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+  // (step_x * 7) >> kScaleSubPixelBits < single load limit
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
+  switch (horiz_filter_index) {
+    case 0:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      } else {
+        ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 1:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+
+      } else {
+        ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 2:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      } else {
+        ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 3:
+      if (step_x > grade_x_threshold) {
+        ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      } else {
+        ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
+                                         step_x, intermediate_height,
+                                         intermediate);
+      }
+      break;
+    case 4:
+      assert(width <= 4);
+      ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
+                                       step_x, intermediate_height,
+                                       intermediate);
+      break;
+    default:
+      assert(horiz_filter_index == 5);
+      assert(width <= 4);
+      ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
+                                       step_x, intermediate_height,
+                                       intermediate);
+  }
+
+  // Vertical filter.
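+  // The per-row filter phase is resolved from |step_y| inside
+  // ConvolveVerticalScale, so the switch below only fixes |num_taps| and the
+  // width class at compile time.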
+  intermediate = intermediate_result;
+  switch (vert_filter_index) {
+    case 0:
+    case 1:
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<6, 2, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<6, 4, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<6, 8, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      }
+      break;
+    case 2:
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<8, 2, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<8, 4, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<8, 8, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      }
+      break;
+    case 3:
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<2, 2, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<2, 4, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<2, 8, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      }
+      break;
+    default:
+      assert(vert_filter_index == 4 || vert_filter_index == 5);
+      if (!is_compound && width == 2) {
+        ConvolveVerticalScale<4, 2, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else if (width == 4) {
+        ConvolveVerticalScale<4, 4, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      } else {
+        ConvolveVerticalScale<4, 8, is_compound>(
+            intermediate, width, subpixel_y, vert_filter_index, step_y, height,
+            prediction, pred_stride);
+      }
+  }
+}
+
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+  const __m128i left = LoadUnaligned16(src);
+  const __m128i right = LoadUnaligned16(src + 1);
+  StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+                                     const ptrdiff_t src_stride,
+                                     const int height, uint8_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+  int y = height;
+  do {
+    HalfAddHorizontal(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+                                  pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const __m128i left = LoadLo8(src);
+      const __m128i right = LoadLo8(src + 1);
+      StoreLo8(dest, _mm_avg_epu8(left, right));
+
+      src += reference_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else if (width == 4) {
+    int y = height;
+    do {
+      __m128i left = Load4(src);
+      __m128i right = Load4(src + 1);
+      src += reference_stride;
+      left = _mm_unpacklo_epi32(left, Load4(src));
+      right = _mm_unpacklo_epi32(right, Load4(src + 1));
+      src += reference_stride;
+
+      const __m128i result = _mm_avg_epu8(left, right);
+
+      Store4(dest, result);
+      dest += pred_stride;
+      Store4(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  } else {
+    assert(width == 2);
+    __m128i left = _mm_setzero_si128();
+    __m128i right = _mm_setzero_si128();
+    int y = height;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<1>(src, left);
+      right = Load2<1>(src + 1, right);
+      src += reference_stride;
+
+      const __m128i result = _mm_avg_epu8(left, right);
+
+      Store2(dest, result);
+      dest += pred_stride;
+      Store2(dest, _mm_srli_si128(result, 2));
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+                                   const ptrdiff_t src_stride,
+                                   const int height, uint8_t* dst,
+                                   const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+  __m128i row[8], below[8];
+
+  row[0] = LoadUnaligned16(src);
+  if (width >= 32) {
+    src += 16;
+    row[1] = LoadUnaligned16(src);
+    if (width >= 64) {
+      src += 16;
+      row[2] = LoadUnaligned16(src);
+      src += 16;
+      row[3] = LoadUnaligned16(src);
+      if (width == 128) {
+        src += 16;
+        row[4] = LoadUnaligned16(src);
+        src += 16;
+        row[5] = LoadUnaligned16(src);
+        src += 16;
+        row[6] = LoadUnaligned16(src);
+        src += 16;
+        row[7] = LoadUnaligned16(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    below[0] = LoadUnaligned16(src);
+    if (width >= 32) {
+      src += 16;
+      below[1] = LoadUnaligned16(src);
+      if (width >= 64) {
+        src += 16;
+        below[2] = LoadUnaligned16(src);
+        src += 16;
+        below[3] = LoadUnaligned16(src);
+        if (width == 128) {
+          src += 16;
+          below[4] = LoadUnaligned16(src);
+          src += 16;
+          below[5] = LoadUnaligned16(src);
+          src += 16;
+          below[6] = LoadUnaligned16(src);
+          src += 16;
+          below[7] = LoadUnaligned16(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 32) {
+      dst += 16;
+      StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 64) {
+        dst += 16;
+        StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+        row[2] = below[2];
+        dst += 16;
+        StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+        row[3] = below[3];
+        if (width >= 128) {
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+          row[4] = below[4];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+          row[5] = below[5];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+          row[6] = below[6];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+                                pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 8) {
+    __m128i row, below;
+    row = LoadLo8(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = LoadLo8(src);
+      src += reference_stride;
+
+      StoreLo8(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else if (width == 4) {
+    __m128i row = Load4(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      __m128i below = Load4(src);
+      src += reference_stride;
+
+      Store4(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else {
+    assert(width == 2);
+    __m128i row = Load2(src);
+    __m128i below = _mm_setzero_si128();
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = Load2<0>(src, below);
+      src += reference_stride;
+
+      Store2(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  }
+}
+
+// Load then add two uint8_t vectors. Return the uint16_t vector result.
+inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+  const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+  const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+  return _mm_add_epi16(a, b);
+}
+
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+  const __m128i a = _mm_add_epi16(v0, v1);
+  const __m128i b = _mm_srli_epi16(a, 1);
+  // Use avg here to shift right by 1 with round.
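+  // Averaging with zero computes (((v0 + v1) >> 1) + 1) >> 1, which equals
+  // (v0 + v1 + 2) >> 2, i.e. a round-to-nearest divide by 4 of the four-pixel
+  // sum.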
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128()); + return _mm_packus_epi16(c, c); +} + +template +inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride, + const int height, uint8_t* dst, + const ptrdiff_t dst_stride) { + const ptrdiff_t src_remainder_stride = src_stride - (width - 8); + const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8); + __m128i row[16]; + row[0] = LoadU8AndAddLong(src, src + 1); + if (width >= 16) { + src += 8; + row[1] = LoadU8AndAddLong(src, src + 1); + if (width >= 32) { + src += 8; + row[2] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[3] = LoadU8AndAddLong(src, src + 1); + if (width >= 64) { + src += 8; + row[4] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[5] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[6] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[7] = LoadU8AndAddLong(src, src + 1); + if (width == 128) { + src += 8; + row[8] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[9] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[10] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[11] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[12] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[13] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[14] = LoadU8AndAddLong(src, src + 1); + src += 8; + row[15] = LoadU8AndAddLong(src, src + 1); + } + } + } + } + src += src_remainder_stride; + + int y = height; + do { + const __m128i below_0 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0)); + row[0] = below_0; + if (width >= 16) { + src += 8; + dst += 8; + + const __m128i below_1 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1)); + row[1] = below_1; + if (width >= 32) { + src += 8; + dst += 8; + + const __m128i below_2 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2)); + row[2] = below_2; + src += 8; + dst += 8; + + const __m128i below_3 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3)); + row[3] = below_3; + if (width >= 64) { + src += 8; + dst += 8; + + const __m128i below_4 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4)); + row[4] = below_4; + src += 8; + dst += 8; + + const __m128i below_5 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5)); + row[5] = below_5; + src += 8; + dst += 8; + + const __m128i below_6 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6)); + row[6] = below_6; + src += 8; + dst += 8; + + const __m128i below_7 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7)); + row[7] = below_7; + if (width == 128) { + src += 8; + dst += 8; + + const __m128i below_8 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8)); + row[8] = below_8; + src += 8; + dst += 8; + + const __m128i below_9 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9)); + row[9] = below_9; + src += 8; + dst += 8; + + const __m128i below_10 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10)); + row[10] = below_10; + src += 8; + dst += 8; + + const __m128i below_11 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11)); + row[11] = below_11; + src += 8; + dst += 8; + + const __m128i below_12 = LoadU8AndAddLong(src, src + 
1); + StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12)); + row[12] = below_12; + src += 8; + dst += 8; + + const __m128i below_13 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13)); + row[13] = below_13; + src += 8; + dst += 8; + + const __m128i below_14 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14)); + row[14] = below_14; + src += 8; + dst += 8; + + const __m128i below_15 = LoadU8AndAddLong(src, src + 1); + StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15)); + row[15] = below_15; + } + } + } + } + src += src_remainder_stride; + dst += dst_remainder_stride; + } while (--y != 0); +} + +void ConvolveIntraBlockCopy2D_SSE4_1( + const void* const reference, const ptrdiff_t reference_stride, + const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/, + const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/, + const int width, const int height, void* const prediction, + const ptrdiff_t pred_stride) { + const auto* src = static_cast(reference); + auto* dest = static_cast(prediction); + // Note: allow vertical access to height + 1. Because this function is only + // for u/v plane of intra block copy, such access is guaranteed to be within + // the prediction block. + + if (width == 128) { + IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride); + } else if (width == 64) { + IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride); + } else if (width == 32) { + IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride); + } else if (width == 16) { + IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride); + } else if (width == 8) { + IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride); + } else if (width == 4) { + __m128i left = _mm_cvtepu8_epi16(Load4(src)); + __m128i right = _mm_cvtepu8_epi16(Load4(src + 1)); + src += reference_stride; + + __m128i row = _mm_add_epi16(left, right); + + int y = height; + do { + left = Load4(src); + right = Load4(src + 1); + src += reference_stride; + left = _mm_unpacklo_epi32(left, Load4(src)); + right = _mm_unpacklo_epi32(right, Load4(src + 1)); + src += reference_stride; + + const __m128i below = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + const __m128i result = + AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below); + + Store4(dest, result); + dest += pred_stride; + Store4(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + + row = _mm_srli_si128(below, 8); + y -= 2; + } while (y != 0); + } else { + __m128i left = Load2(src); + __m128i right = Load2(src + 1); + src += reference_stride; + + __m128i row = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + + int y = height; + do { + left = Load2<0>(src, left); + right = Load2<0>(src + 1, right); + src += reference_stride; + left = Load2<2>(src, left); + right = Load2<2>(src + 1, right); + src += reference_stride; + + const __m128i below = + _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right)); + const __m128i result = + AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below); + + Store2(dest, result); + dest += pred_stride; + Store2(dest, _mm_srli_si128(result, 4)); + dest += pred_stride; + + row = _mm_srli_si128(below, 8); + y -= 2; + } while (y != 0); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->convolve[0][0][0][1] = 
ConvolveHorizontal_SSE4_1;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
+  dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
+
+  dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4;
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/convolve_sse4.h b/src/dsp/x86/convolve_sse4.h
new file mode 100644
index 0000000..d6c3155
--- /dev/null
+++ b/src/dsp/x86/convolve_sse4.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
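// As a sketch of how a baseline macro like the ones below is typically
// consumed (the exact DSP_ENABLED_* plumbing lives in src/dsp/dsp.h; the
// condition here only illustrates the convention and is not quoted from it):
//
//   #if LIBGAV1_Dsp8bpp_ConvolveHorizontal == LIBGAV1_CPU_SSE4_1
//   // SSE4.1 is the compile-time baseline for ConvolveHorizontal, so the
//   // plain C implementation need not be registered at runtime.
//   #endif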
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundCopy
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompound2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/src/dsp/x86/distance_weighted_blend_sse4.cc b/src/dsp/x86/distance_weighted_blend_sse4.cc
new file mode 100644
index 0000000..deb57ef
--- /dev/null
+++ b/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -0,0 +1,230 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/distance_weighted_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kInterPostRoundBit = 4;
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+                                       const __m128i& pred1,
+                                       const __m128i& weights) {
+  // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+  const __m128i preds_lo = _mm_unpacklo_epi16(pred0, pred1);
+  const __m128i mult_lo = _mm_madd_epi16(preds_lo, weights);
+  const __m128i result_lo =
+      RightShiftWithRounding_S32(mult_lo, kInterPostRoundBit + 4);
+
+  const __m128i preds_hi = _mm_unpackhi_epi16(pred0, pred1);
+  const __m128i mult_hi = _mm_madd_epi16(preds_hi, weights);
+  const __m128i result_hi =
+      RightShiftWithRounding_S32(mult_hi, kInterPostRoundBit + 4);
+
+  return _mm_packs_epi32(result_lo, result_hi);
+}
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+    const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16));
+
+  for (int y = 0; y < height; y += 4) {
+    // TODO(b/150326556): Use larger loads.
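    // As a scalar model of ComputeWeightedAverage8 above: interleaving the
    // two predictions and using _mm_madd_epi16 makes each 32-bit lane equal
    // to pred_0 * weight_0 + pred_1 * weight_1, which is then rounded,
    // shifted by kInterPostRoundBit + 4 = 8, and saturated to int16_t:
    //   out = Saturate16((p0 * w0 + p1 * w1 + 128) >> 8);
    // (Saturate16 is shorthand for int16_t saturation, not a library call.)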
+ const __m128i src_00 = LoadLo8(pred_0); + const __m128i src_10 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + __m128i src_0 = LoadHi8(src_00, pred_0); + __m128i src_1 = LoadHi8(src_10, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res0 = ComputeWeightedAverage8(src_0, src_1, weights); + + const __m128i src_01 = LoadLo8(pred_0); + const __m128i src_11 = LoadLo8(pred_1); + pred_0 += 4; + pred_1 += 4; + src_0 = LoadHi8(src_01, pred_0); + src_1 = LoadHi8(src_11, pred_1); + pred_0 += 4; + pred_1 += 4; + const __m128i res1 = ComputeWeightedAverage8(src_0, src_1, weights); + + const __m128i result_pixels = _mm_packus_epi16(res0, res1); + Store4(dst, result_pixels); + dst += dest_stride; + const int result_1 = _mm_extract_epi32(result_pixels, 1); + memcpy(dst, &result_1, sizeof(result_1)); + dst += dest_stride; + const int result_2 = _mm_extract_epi32(result_pixels, 2); + memcpy(dst, &result_2, sizeof(result_2)); + dst += dest_stride; + const int result_3 = _mm_extract_epi32(result_pixels, 3); + memcpy(dst, &result_3, sizeof(result_3)); + dst += dest_stride; + } +} + +template +inline void DistanceWeightedBlend8xH_SSE4_1( + const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + + for (int y = 0; y < height; y += 2) { + const __m128i src_00 = LoadAligned16(pred_0); + const __m128i src_10 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res0 = ComputeWeightedAverage8(src_00, src_10, weights); + + const __m128i src_01 = LoadAligned16(pred_0); + const __m128i src_11 = LoadAligned16(pred_1); + pred_0 += 8; + pred_1 += 8; + const __m128i res1 = ComputeWeightedAverage8(src_01, src_11, weights); + + const __m128i result_pixels = _mm_packus_epi16(res0, res1); + StoreLo8(dst, result_pixels); + dst += dest_stride; + StoreHi8(dst, result_pixels); + dst += dest_stride; + } +} + +inline void DistanceWeightedBlendLarge_SSE4_1( + const int16_t* pred_0, const int16_t* pred_1, const uint8_t weight_0, + const uint8_t weight_1, const int width, const int height, void* const dest, + const ptrdiff_t dest_stride) { + auto* dst = static_cast(dest); + const __m128i weights = _mm_set1_epi32(weight_0 | (weight_1 << 16)); + + int y = height; + do { + int x = 0; + do { + const __m128i src_0_lo = LoadAligned16(pred_0 + x); + const __m128i src_1_lo = LoadAligned16(pred_1 + x); + const __m128i res_lo = + ComputeWeightedAverage8(src_0_lo, src_1_lo, weights); + + const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8); + const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8); + const __m128i res_hi = + ComputeWeightedAverage8(src_0_hi, src_1_hi, weights); + + StoreUnaligned16(dst + x, _mm_packus_epi16(res_lo, res_hi)); + x += 16; + } while (x < width); + dst += dest_stride; + pred_0 += width; + pred_1 += width; + } while (--y != 0); +} + +void DistanceWeightedBlend_SSE4_1(const void* prediction_0, + const void* prediction_1, + const uint8_t weight_0, + const uint8_t weight_1, const int width, + const int height, void* const dest, + const ptrdiff_t dest_stride) { + const auto* pred_0 = static_cast(prediction_0); + const auto* pred_1 = static_cast(prediction_1); + if (width == 4) { + if (height == 4) { + DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + } else if (height == 8) { + DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, 
dest_stride); + } else { + assert(height == 16); + DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + } + return; + } + + if (width == 8) { + switch (height) { + case 4: + DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + return; + case 8: + DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + return; + case 16: + DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + return; + default: + assert(height == 32); + DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1, + dest, dest_stride); + + return; + } + } + + DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width, + height, dest, dest_stride); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(DistanceWeightedBlend) + dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1; +#endif +} + +} // namespace + +void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void DistanceWeightedBlendInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/distance_weighted_blend_sse4.h b/src/dsp/x86/distance_weighted_blend_sse4.h new file mode 100644 index 0000000..8646eca --- /dev/null +++ b/src/dsp/x86/distance_weighted_blend_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::distance_weighted_blend. This function is not thread-safe. +void DistanceWeightedBlendInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend +#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_ diff --git a/src/dsp/x86/intra_edge_sse4.cc b/src/dsp/x86/intra_edge_sse4.cc new file mode 100644 index 0000000..4a8658d --- /dev/null +++ b/src/dsp/x86/intra_edge_sse4.cc @@ -0,0 +1,270 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intra_edge.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kKernelTaps = 5;
+constexpr int kKernels[3][kKernelTaps] = {
+    {0, 4, 8, 4, 0}, {0, 5, 6, 5, 0}, {2, 4, 4, 4, 2}};
+constexpr int kMaxEdgeBufferSize = 129;
+
+// This function applies the kernel [0, 4, 8, 4, 0] to 12 values.
+// Assumes |source| has 16 packed byte values. Produces 12 filter outputs,
+// written as overlapping sets of 8 bytes.
+inline void ComputeKernel1Store12(uint8_t* dest, const uint8_t* source) {
+  const __m128i edge_lo = LoadUnaligned16(source);
+  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+  // Samples matched with the '4' tap, expanded to 16-bit.
+  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+  // Samples matched with the '8' tap, expanded to 16-bit.
+  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+
+  // Apply the taps by shifting.
+  const __m128i outers4_lo = _mm_slli_epi16(outers_lo, 2);
+  const __m128i outers4_hi = _mm_slli_epi16(outers_hi, 2);
+  const __m128i centers8_lo = _mm_slli_epi16(centers_lo, 3);
+  const __m128i centers8_hi = _mm_slli_epi16(centers_hi, 3);
+  // Move latter 4x values down to add with first 4x values for each output.
+  const __m128i partial_sums_lo =
+      _mm_add_epi16(outers4_lo, _mm_srli_si128(outers4_lo, 4));
+  const __m128i partial_sums_hi =
+      _mm_add_epi16(outers4_hi, _mm_srli_si128(outers4_hi, 4));
+  // Add the 8x center values in for the final kernel sum for each output.
+  const __m128i sums_lo = RightShiftWithRounding_U16(
+      _mm_add_epi16(partial_sums_lo, centers8_lo), 4);
+  const __m128i sums_hi = RightShiftWithRounding_U16(
+      _mm_add_epi16(partial_sums_hi, centers8_hi), 4);
+
+  const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo);
+  const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi);
+  const __m128i result =
+      _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10);
+  StoreUnaligned16(dest, result);
+}
+
+// This function applies the kernel [0, 5, 6, 5, 0] to 12 values.
+// Assumes |source| has 8 packed byte values, and that the 2 invalid values
+// will be overwritten or safely discarded.
+inline void ComputeKernel2Store12(uint8_t* dest, const uint8_t* source) {
+  const __m128i edge_lo = LoadUnaligned16(source);
+  const __m128i edge_hi = _mm_srli_si128(edge_lo, 6);
+  const __m128i outers_lo = _mm_cvtepu8_epi16(edge_lo);
+  const __m128i centers_lo = _mm_srli_si128(outers_lo, 2);
+  const __m128i outers_hi = _mm_cvtepu8_epi16(edge_hi);
+  const __m128i centers_hi = _mm_srli_si128(outers_hi, 2);
+  // Samples matched with the '5' tap, expanded to 16-bit. Add x + 4x.
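  // (This decomposition is the usual multiplier-free trick: 5x = x + (x << 2)
  // and, below, 6x = (x << 1) + (x << 2), so the kernel needs only shifts and
  // adds instead of 16-bit multiplies.)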
+ const __m128i outers5_lo = + _mm_add_epi16(outers_lo, _mm_slli_epi16(outers_lo, 2)); + const __m128i outers5_hi = + _mm_add_epi16(outers_hi, _mm_slli_epi16(outers_hi, 2)); + // Samples matched with the '6' tap, expanded to 16-bit. Add 2x + 4x. + const __m128i centers6_lo = _mm_add_epi16(_mm_slli_epi16(centers_lo, 1), + _mm_slli_epi16(centers_lo, 2)); + const __m128i centers6_hi = _mm_add_epi16(_mm_slli_epi16(centers_hi, 1), + _mm_slli_epi16(centers_hi, 2)); + // Move latter 5x values down to add with first 5x values for each output. + const __m128i partial_sums_lo = + _mm_add_epi16(outers5_lo, _mm_srli_si128(outers5_lo, 4)); + // Move 6x values down to add for the final kernel sum for each output. + const __m128i sums_lo = RightShiftWithRounding_U16( + _mm_add_epi16(centers6_lo, partial_sums_lo), 4); + // Shift latter 5x values to add with first 5x values for each output. + const __m128i partial_sums_hi = + _mm_add_epi16(outers5_hi, _mm_srli_si128(outers5_hi, 4)); + // Move 6x values down to add for the final kernel sum for each output. + const __m128i sums_hi = RightShiftWithRounding_U16( + _mm_add_epi16(centers6_hi, partial_sums_hi), 4); + // First 6 values are valid outputs. + const __m128i result_lo = _mm_packus_epi16(sums_lo, sums_lo); + const __m128i result_hi = _mm_packus_epi16(sums_hi, sums_hi); + const __m128i result = + _mm_alignr_epi8(result_hi, _mm_slli_si128(result_lo, 10), 10); + StoreUnaligned16(dest, result); +} + +// This function applies the kernel [2, 4, 4, 4, 2] to 8 values. +inline void ComputeKernel3Store8(uint8_t* dest, const uint8_t* source) { + const __m128i edge_lo = LoadUnaligned16(source); + const __m128i edge_hi = _mm_srli_si128(edge_lo, 4); + // Finish |edge_lo| life cycle quickly. + // Multiply for 2x. + const __m128i source2_lo = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_lo), 1); + // Multiply 2x by 2 and align. + const __m128i source4_lo = _mm_srli_si128(_mm_slli_epi16(source2_lo, 1), 2); + // Finish |source2| life cycle quickly. + // Move latter 2x values down to add with first 2x values for each output. + __m128i sum = _mm_add_epi16(source2_lo, _mm_srli_si128(source2_lo, 8)); + // First 4x values already aligned to add with running total. + sum = _mm_add_epi16(sum, source4_lo); + // Move second 4x values down to add with running total. + sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 2)); + // Move third 4x values down to add with running total. + sum = _mm_add_epi16(sum, _mm_srli_si128(source4_lo, 4)); + // Multiply for 2x. + const __m128i source2_hi = _mm_slli_epi16(_mm_cvtepu8_epi16(edge_hi), 1); + // Multiply 2x by 2 and align. + const __m128i source4_hi = _mm_srli_si128(_mm_slli_epi16(source2_hi, 1), 2); + // Move latter 2x values down to add with first 2x values for each output. + __m128i sum_hi = _mm_add_epi16(source2_hi, _mm_srli_si128(source2_hi, 8)); + // First 4x values already aligned to add with running total. + sum_hi = _mm_add_epi16(sum_hi, source4_hi); + // Move second 4x values down to add with running total. + sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 2)); + // Move third 4x values down to add with running total. + sum_hi = _mm_add_epi16(sum_hi, _mm_srli_si128(source4_hi, 4)); + + // Because we have only 8 values here, it is safe to align before packing down + // to 8-bit without losing data. 
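  // As a scalar reference for this strength-3 path, each output pixel is
  //   out[x] = (2 * e[x - 2] + 4 * e[x - 1] + 4 * e[x] + 4 * e[x + 1] +
  //             2 * e[x + 2] + 8) >> 4,
  // matching kKernels[2] = {2, 4, 4, 4, 2} with round-to-nearest.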
+ sum = _mm_alignr_epi8(sum_hi, _mm_slli_si128(sum, 8), 8); + sum = RightShiftWithRounding_U16(sum, 4); + StoreLo8(dest, _mm_packus_epi16(sum, sum)); +} + +void IntraEdgeFilter_SSE4_1(void* buffer, int size, int strength) { + uint8_t edge[kMaxEdgeBufferSize + 4]; + memcpy(edge, buffer, size); + auto* dst_buffer = static_cast(buffer); + + // Only process |size| - 1 elements. Nothing to do in this case. + if (size == 1) return; + + int i = 0; + switch (strength) { + case 1: + // To avoid overwriting, we stop short from the total write size plus the + // initial offset. In this case 12 valid values are written in two blocks + // of 8 bytes each. + for (; i < size - 17; i += 12) { + ComputeKernel1Store12(dst_buffer + i + 1, edge + i); + } + break; + case 2: + // See the comment for case 1. + for (; i < size - 17; i += 12) { + ComputeKernel2Store12(dst_buffer + i + 1, edge + i); + } + break; + default: + assert(strength == 3); + // The first filter input is repeated for taps of value 2 and 4. + dst_buffer[1] = RightShiftWithRounding( + (6 * edge[0] + 4 * edge[1] + 4 * edge[2] + 2 * edge[3]), 4); + // In this case, one block of 8 bytes is written in each iteration, with + // an offset of 2. + for (; i < size - 10; i += 8) { + ComputeKernel3Store8(dst_buffer + i + 2, edge + i); + } + } + const int kernel_index = strength - 1; + for (int final_index = Clip3(i, 1, size - 2); final_index < size; + ++final_index) { + int sum = 0; + for (int j = 0; j < kKernelTaps; ++j) { + const int k = Clip3(final_index + j - 2, 0, size - 1); + sum += kKernels[kernel_index][j] * edge[k]; + } + dst_buffer[final_index] = RightShiftWithRounding(sum, 4); + } +} + +constexpr int kMaxUpsampleSize = 16; + +// Applies the upsampling kernel [-1, 9, 9, -1] to alternating pixels, and +// interleaves the results with the original values. This implementation assumes +// that it is safe to write the maximum number of upsampled pixels (32) to the +// edge buffer, even when |size| is small. 
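// As a scalar reference (with t[] the edge padded by one replicated pixel at
// each end, as built in |temp| below), each interpolated sample is
//   p = Clip255((-t[i] + 9 * t[i + 1] + 9 * t[i + 2] - t[i + 3] + 8) >> 4)
// and the output interleaves these with the original pixels, so the edge
// roughly doubles in length.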
+void IntraEdgeUpsampler_SSE4_1(void* buffer, int size) { + assert(size % 4 == 0 && size <= kMaxUpsampleSize); + auto* const pixel_buffer = static_cast(buffer); + uint8_t temp[kMaxUpsampleSize + 8]; + temp[0] = temp[1] = pixel_buffer[-1]; + memcpy(temp + 2, pixel_buffer, sizeof(temp[0]) * size); + temp[size + 2] = pixel_buffer[size - 1]; + + pixel_buffer[-2] = temp[0]; + const __m128i data = LoadUnaligned16(temp); + const __m128i src_lo = _mm_cvtepu8_epi16(data); + const __m128i src_hi = _mm_unpackhi_epi8(data, _mm_setzero_si128()); + const __m128i src9_hi = _mm_add_epi16(src_hi, _mm_slli_epi16(src_hi, 3)); + const __m128i src9_lo = _mm_add_epi16(src_lo, _mm_slli_epi16(src_lo, 3)); + __m128i sum_lo = _mm_sub_epi16(_mm_alignr_epi8(src9_hi, src9_lo, 2), src_lo); + sum_lo = _mm_add_epi16(sum_lo, _mm_alignr_epi8(src9_hi, src9_lo, 4)); + sum_lo = _mm_sub_epi16(sum_lo, _mm_alignr_epi8(src_hi, src_lo, 6)); + sum_lo = RightShiftWithRounding_S16(sum_lo, 4); + const __m128i result_lo = _mm_unpacklo_epi8(_mm_packus_epi16(sum_lo, sum_lo), + _mm_srli_si128(data, 2)); + StoreUnaligned16(pixel_buffer - 1, result_lo); + if (size > 8) { + const __m128i src_hi_extra = _mm_cvtepu8_epi16(LoadLo8(temp + 16)); + const __m128i src9_hi_extra = + _mm_add_epi16(src_hi_extra, _mm_slli_epi16(src_hi_extra, 3)); + __m128i sum_hi = + _mm_sub_epi16(_mm_alignr_epi8(src9_hi_extra, src9_hi, 2), src_hi); + sum_hi = _mm_add_epi16(sum_hi, _mm_alignr_epi8(src9_hi_extra, src9_hi, 4)); + sum_hi = _mm_sub_epi16(sum_hi, _mm_alignr_epi8(src_hi_extra, src_hi, 6)); + sum_hi = RightShiftWithRounding_S16(sum_hi, 4); + const __m128i result_hi = + _mm_unpacklo_epi8(_mm_packus_epi16(sum_hi, sum_hi), LoadLo8(temp + 10)); + StoreUnaligned16(pixel_buffer + 15, result_hi); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeFilter) + dsp->intra_edge_filter = IntraEdgeFilter_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(IntraEdgeUpsampler) + dsp->intra_edge_upsampler = IntraEdgeUpsampler_SSE4_1; +#endif +} + +} // namespace + +void IntraEdgeInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraEdgeInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intra_edge_sse4.h b/src/dsp/x86/intra_edge_sse4.h new file mode 100644 index 0000000..6ed4d40 --- /dev/null +++ b/src/dsp/x86/intra_edge_sse4.h @@ -0,0 +1,46 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_edge_filter and Dsp::intra_edge_upsampler. This +// function is not thread-safe. 
+void IntraEdgeInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
+#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_IntraEdgeUpsampler
+#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/src/dsp/x86/intrapred_cfl_sse4.cc b/src/dsp/x86/intrapred_cfl_sse4.cc
new file mode 100644
index 0000000..fac1556
--- /dev/null
+++ b/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -0,0 +1,976 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+                                   __m128i alpha_sign, __m128i dc_q0) {
+  __m128i ac_q3 = LoadUnaligned16(input);
+  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+template <int width, int height>
+void CflIntraPredictor_SSE4_1(
+    void* const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i alpha_sign = _mm_set1_epi16(alpha);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  auto* row = reinterpret_cast<const __m128i*>(luma);
+  const int kCflLumaBufferStrideLog2_16i = 5;
+  const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
+  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+  const __m128i dc_val = _mm_set1_epi16(dst[0]);
+  do {
+    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+    if (width < 16) {
+      res = _mm_packus_epi16(res, res);
+      if (width == 4) {
+        Store4(dst, res);
+      } else {
+        StoreLo8(dst, res);
+      }
+    } else {
+      __m128i next =
+          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+      res = _mm_packus_epi16(res, next);
+      StoreUnaligned16(dst, res);
+      if (width == 32) {
+        res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+        next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+        res = _mm_packus_epi16(res, next);
+        StoreUnaligned16(dst + 16, res);
+      }
+    }
+    dst += stride;
+  } while
((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end); +} + +template +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + const int block_height = 1 << block_height_log2; + const int visible_height = max_luma_height; + const auto* src = static_cast(source); + __m128i sum = _mm_setzero_si128(); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i samples; + int y = 0; + do { + samples = Load4(src); + src += stride; + int src_bytes; + memcpy(&src_bytes, src, 4); + samples = _mm_insert_epi32(samples, src_bytes, 1); + src += stride; + samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3); + StoreLo8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + StoreHi8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + + // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for + // 4XH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits. + sum = _mm_add_epi16(sum, samples); + y += 2; + } while (y < visible_height); + + if (!is_inside) { + int y = visible_height; + do { + StoreHi8(luma_ptr, samples); + luma_ptr += kCflLumaBufferStride; + sum = _mm_add_epi16(sum, samples); + ++y; + } while (y < block_height); + } + + __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero); + sum = _mm_cvtepu16_epi32(sum); + sum = _mm_add_epi32(sum, sum_tmp); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + sum, block_height_log2 + 2 /* log2 of width 4 */); + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template +void CflSubsampler444_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 4, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 4; + + if (block_height <= max_luma_height && block_width <= max_luma_width) { + CflSubsampler444_4xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_4xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 5, ""); + const int block_height = 1 << block_height_log2, block_width = 8; + const int visible_height = max_luma_height; + const int invisible_width = inside ? 0 : block_width - max_luma_width; + const int visible_width = max_luma_width; + const __m128i blend_mask = + inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const auto* src = static_cast(source); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + // Since the maximum height is 32, if we split them by parity, each one only + // needs to accumulate 16 rows. 
Just like the calculation done in 4XH, we can + // store them in 16 bits without casting to 32 bits. + __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128(); + __m128i sum; + __m128i samples1; + + int y = 0; + do { + __m128i samples0 = LoadLo8(src); + if (!inside) { + const __m128i border0 = + _mm_set1_epi8(static_cast(src[visible_width - 1])); + samples0 = _mm_blendv_epi8(samples0, border0, blend_mask); + } + src += stride; + samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3); + StoreUnaligned16(luma_ptr, samples0); + luma_ptr += kCflLumaBufferStride; + + sum_even = _mm_add_epi16(sum_even, samples0); + + samples1 = LoadLo8(src); + if (!inside) { + const __m128i border1 = + _mm_set1_epi8(static_cast(src[visible_width - 1])); + samples1 = _mm_blendv_epi8(samples1, border1, blend_mask); + } + src += stride; + samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3); + StoreUnaligned16(luma_ptr, samples1); + luma_ptr += kCflLumaBufferStride; + + sum_odd = _mm_add_epi16(sum_odd, samples1); + y += 2; + } while (y < visible_height); + + if (!inside) { + for (int y = visible_height; y < block_height; y += 2) { + sum_even = _mm_add_epi16(sum_even, samples1); + StoreUnaligned16(luma_ptr, samples1); + luma_ptr += kCflLumaBufferStride; + + sum_odd = _mm_add_epi16(sum_odd, samples1); + StoreUnaligned16(luma_ptr, samples1); + luma_ptr += kCflLumaBufferStride; + } + } + + sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero), + _mm_cvtepu16_epi32(sum_even)); + sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero)); + sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd)); + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + sum, block_height_log2 + 3 /* log2 of width 8 */); + averages = _mm_shuffle_epi8(averages, dup16); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template +void CflSubsampler444_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 8; + + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_8xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_8xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +// This function will only work for block_width 16 and 32. +template +void CflSubsampler444_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, ""); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + const int block_height = 1 << block_height_log2; + const int block_width = 1 << block_width_log2; + + const int visible_height = max_luma_height; + const int visible_width_16 = inside ? 
16 : std::min(16, max_luma_width); + const int invisible_width_16 = 16 - visible_width_16; + const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16); + const int visible_width_32 = inside ? 32 : max_luma_width; + const int invisible_width_32 = 32 - visible_width_32; + const __m128i blend_mask_32 = + MaskHighNBytes(std::min(16, invisible_width_32)); + + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i zero = _mm_setzero_si128(); + const auto* src = static_cast(source); + int16_t* luma_ptr = luma[0]; + __m128i sum = _mm_setzero_si128(); + + __m128i samples0, samples1; + __m128i samples2, samples3; + __m128i inner_sum_lo, inner_sum_hi; + int y = 0; + do { +#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are + // then masked off by blendv, MSAN isn't smart enough to + // understand that. So we switch to a C implementation here. + uint16_t c_arr[16]; + for (int x = 0; x < 16; x++) { + const int x_index = std::min(x, visible_width_16 - 1); + c_arr[x] = src[x_index] << 3; + } + samples0 = LoadUnaligned16(c_arr); + samples1 = LoadUnaligned16(c_arr + 8); + static_cast(blend_mask_16); +#else + __m128i samples01 = LoadUnaligned16(src); + + if (!inside) { + const __m128i border16 = + _mm_set1_epi8(static_cast(src[visible_width_16 - 1])); + samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16); + } + samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3); + samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3); +#endif // LIBGAV1_MSAN + + StoreUnaligned16(luma_ptr, samples0); + StoreUnaligned16(luma_ptr + 8, samples1); + __m128i inner_sum = _mm_add_epi16(samples0, samples1); + + if (block_width == 32) { +#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are + // then masked off by blendv, MSAN isn't smart enough to + // understand that. So we switch to a C implementation here. 
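      // As in the 16-lane path above, this C fallback mirrors the blendv
      // result exactly: each lane becomes src[min(x, visible_width_32 - 1)]
      // << 3, replicating the last visible pixel across the invisible region,
      // but with fully initialized reads that MSAN can track.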
+ uint16_t c_arr[16]; + for (int x = 16; x < 32; x++) { + const int x_index = std::min(x, visible_width_32 - 1); + c_arr[x - 16] = src[x_index] << 3; + } + samples2 = LoadUnaligned16(c_arr); + samples3 = LoadUnaligned16(c_arr + 8); + static_cast(blend_mask_32); +#else + __m128i samples23 = LoadUnaligned16(src + 16); + if (!inside) { + const __m128i border32 = + _mm_set1_epi8(static_cast(src[visible_width_32 - 1])); + samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32); + } + samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3); + samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3); +#endif // LIBGAV1_MSAN + + StoreUnaligned16(luma_ptr + 16, samples2); + StoreUnaligned16(luma_ptr + 24, samples3); + inner_sum = _mm_add_epi16(samples2, inner_sum); + inner_sum = _mm_add_epi16(samples3, inner_sum); + } + + inner_sum_lo = _mm_cvtepu16_epi32(inner_sum); + inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero); + sum = _mm_add_epi32(sum, inner_sum_lo); + sum = _mm_add_epi32(sum, inner_sum_hi); + luma_ptr += kCflLumaBufferStride; + src += stride; + } while (++y < visible_height); + + if (!inside) { + for (int y = visible_height; y < block_height; + luma_ptr += kCflLumaBufferStride, ++y) { + sum = _mm_add_epi32(sum, inner_sum_lo); + StoreUnaligned16(luma_ptr, samples0); + sum = _mm_add_epi32(sum, inner_sum_hi); + StoreUnaligned16(luma_ptr + 8, samples1); + if (block_width == 32) { + StoreUnaligned16(luma_ptr + 16, samples2); + StoreUnaligned16(luma_ptr + 24, samples3); + } + } + } + + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4)); + + __m128i averages = + RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2); + averages = _mm_shuffle_epi8(averages, dup16); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + for (int x = 0; x < block_width; x += 8) { + __m128i samples = LoadUnaligned16(&luma_ptr[x]); + StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages)); + } + } +} + +template +void CflSubsampler444_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + static_assert(block_width_log2 == 4 || block_width_log2 == 5, ""); + static_assert(block_height_log2 <= 5, ""); + assert(max_luma_width >= 4); + assert(max_luma_height >= 4); + + const int block_height = 1 << block_height_log2; + const int block_width = 1 << block_width_log2; + const int horz_inside = block_width <= max_luma_width; + const int vert_inside = block_height <= max_luma_height; + if (horz_inside && vert_inside) { + CflSubsampler444_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler444_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +// Takes in two sums of input row pairs, and completes the computation for two +// output rows. +inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreLo8(luma_ptr, result); + StoreHi8(luma_ptr + kCflLumaBufferStride, result); + return result; +} + +// Takes two halves of a vertically added pair of rows and completes the +// computation for one output row. 
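// In scalar terms, each 4:2:0 CfL luma sample computed by these helpers is
//   luma[y][x] = 2 * (src[2y][2x] + src[2y][2x + 1] +
//                     src[2y + 1][2x] + src[2y + 1][2x + 1]),
// i.e. 8x the average of the 2x2 cluster, matching the << 3 scaling used by
// the 4:4:4 subsamplers: the hadd pairs horizontal neighbors that were
// already summed vertically, and the << 1 supplies the final doubling.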
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0, + const __m128i vertical_sum1, + int16_t* luma_ptr) { + __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1); + result = _mm_slli_epi16(result, 1); + StoreUnaligned16(luma_ptr, result); + return result; +} + +template +void CflSubsampler420_4xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + int16_t* luma_ptr = luma[0]; + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = 0; + do { + // Note that double sampling and converting to 16bit makes a row fill the + // vector. + const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1); + + const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3); + __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr); + luma_ptr += kCflLumaBufferStride << 1; + + const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5); + + const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src)); + src += stride; + const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7); + sum = _mm_add_epi16( + sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr)); + luma_ptr += kCflLumaBufferStride << 1; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y += 4; + } while (y < luma_height); + const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill); + for (; y < block_height; ++y) { + StoreLo8(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_U32( + final_sum, block_height_log2 + 2 /*log2 of width 4*/); + + averages = _mm_shufflelo_epi16(averages, 0); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadLo8(luma_ptr); + StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +// This duplicates the last two 16-bit values in |row|. +inline __m128i LastRowSamples(const __m128i row) { + return _mm_shuffle_epi32(row, 0xFF); +} + +// This duplicates the last 16-bit value in |row|. 
+inline __m128i LastRowResult(const __m128i row) { + const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF); + return _mm_shuffle_epi32(dup_row, 0xFF); +} + +template +inline void CflSubsampler420Impl_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const int block_height = 1 << block_height_log2; + const auto* src = static_cast(source); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + int16_t* luma_ptr = luma[0]; + const int luma_height = std::min(block_height, max_luma_height >> 1); + int y = 0; + + do { + const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row01 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row00); + src += stride; + const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row11 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row10); + src += stride; + const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11); + __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row21 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row20); + src += stride; + const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row31 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row30); + src += stride; + const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30); + const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row41 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row40); + src += stride; + const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row51 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row50); + src += stride; + const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50); + const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row61 = (max_luma_width == 16) + ? _mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row60); + src += stride; + const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src)); + const __m128i samples_row71 = (max_luma_width == 16) + ? 
_mm_cvtepu8_epi16(LoadLo8(src + 8)) + : LastRowSamples(samples_row70); + src += stride; + const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70); + const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71); + sum = _mm_add_epi16( + sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr)); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + y += 4; + } while (y < luma_height); + // Duplicate the final row downward to the end after max_luma_height. + const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill); + const __m128i final_fill_to_sum1 = + _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8)); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1); + for (; y < block_height; ++y) { + StoreUnaligned16(luma_ptr, final_fill); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_height_log2 + 3 /*log2 of width 8*/); + + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages)); + } +} + +template +void CflSubsampler420_8xH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + if (max_luma_width == 8) { + CflSubsampler420Impl_8xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } else { + CflSubsampler420Impl_8xH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + } +} + +template +inline void CflSubsampler420Impl_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int /*max_luma_width*/, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + const auto* src = static_cast(source); + const __m128i zero = _mm_setzero_si128(); + __m128i final_sum = zero; + const int block_height = 1 << block_height_log2; + const int luma_height = std::min(block_height, max_luma_height >> 1); + + int16_t* luma_ptr = luma[0]; + __m128i final_row_result; + // Begin first y section, covering width up to 16. + int y = 0; + do { + const uint8_t* src_next = src + stride; + const __m128i samples_row0_lo = LoadUnaligned16(src); + const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo); + const __m128i samples_row01 = (max_luma_width >= 16) + ? _mm_unpackhi_epi8(samples_row0_lo, zero) + : LastRowSamples(samples_row00); + const __m128i samples_row0_hi = LoadUnaligned16(src + 16); + const __m128i samples_row02 = (max_luma_width >= 24) + ? _mm_cvtepu8_epi16(samples_row0_hi) + : LastRowSamples(samples_row01); + const __m128i samples_row03 = (max_luma_width == 32) + ? _mm_unpackhi_epi8(samples_row0_hi, zero) + : LastRowSamples(samples_row02); + const __m128i samples_row1_lo = LoadUnaligned16(src_next); + const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo); + const __m128i samples_row11 = (max_luma_width >= 16) + ? 
_mm_unpackhi_epi8(samples_row1_lo, zero) + : LastRowSamples(samples_row10); + const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16); + const __m128i samples_row12 = (max_luma_width >= 24) + ? _mm_cvtepu8_epi16(samples_row1_hi) + : LastRowSamples(samples_row11); + const __m128i samples_row13 = (max_luma_width == 32) + ? _mm_unpackhi_epi8(samples_row1_hi, zero) + : LastRowSamples(samples_row12); + const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10); + const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11); + const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12); + const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13); + __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr); + final_row_result = + StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8); + sum = _mm_add_epi16(sum, final_row_result); + final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum)); + final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero)); + src += stride << 1; + luma_ptr += kCflLumaBufferStride; + } while (++y < luma_height); + + // Because max_luma_width is at most 32, any values beyond x=16 will + // necessarily be duplicated. + if (block_width_log2 == 5) { + const __m128i wide_fill = LastRowResult(final_row_result); + // Multiply duplicated value by number of occurrences, height * 4, since + // there are 16 in each row and the value appears in the vector 4 times. + final_sum = _mm_add_epi32( + final_sum, + _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2)); + } + + // Begin second y section. + if (y < block_height) { + const __m128i final_fill0 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride); + const __m128i final_fill1 = + LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8); + const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1); + const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum); + const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero); + const __m128i final_fill_to_sum = + _mm_add_epi32(final_inner_sum0, final_inner_sum1); + + do { + StoreUnaligned16(luma_ptr, final_fill0); + StoreUnaligned16(luma_ptr + 8, final_fill1); + luma_ptr += kCflLumaBufferStride; + + final_sum = _mm_add_epi32(final_sum, final_fill_to_sum); + } while (++y < block_height); + } // End second y section. 
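  // What follows is the zero-mean epilogue. In scalar terms every stored
  // sample gets
  //   luma[y][x] -=
  //       (final_sum + (1 << (block_width_log2 + block_height_log2 - 1))) >>
  //       (block_width_log2 + block_height_log2);
  // so the buffer handed to CflIntraPredictor_SSE4_1 holds zero-mean "AC"
  // luma values, to which the predictor adds back the chroma DC.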
+ + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8)); + final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4)); + + __m128i averages = RightShiftWithRounding_S32( + final_sum, block_width_log2 + block_height_log2); + averages = _mm_shufflelo_epi16(averages, 0); + averages = _mm_shuffle_epi32(averages, 0); + + luma_ptr = luma[0]; + for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) { + const __m128i samples0 = LoadUnaligned16(luma_ptr); + StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages)); + const __m128i samples1 = LoadUnaligned16(luma_ptr + 8); + final_row_result = _mm_sub_epi16(samples1, averages); + StoreUnaligned16(luma_ptr + 8, final_row_result); + } + if (block_width_log2 == 5) { + int16_t* wide_luma_ptr = luma[0] + 16; + const __m128i wide_fill = LastRowResult(final_row_result); + for (int i = 0; i < block_height; + ++i, wide_luma_ptr += kCflLumaBufferStride) { + StoreUnaligned16(wide_luma_ptr, wide_fill); + StoreUnaligned16(wide_luma_ptr + 8, wide_fill); + } + } +} + +template +void CflSubsampler420_WxH_SSE4_1( + int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], + const int max_luma_width, const int max_luma_height, + const void* const source, ptrdiff_t stride) { + switch (max_luma_width) { + case 8: + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + return; + case 16: + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + return; + case 24: + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + return; + default: + assert(max_luma_width == 32); + CflSubsampler420Impl_WxH_SSE4_1( + luma, max_luma_width, max_luma_height, source, stride); + return; + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] = + CflSubsampler420_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] = + CflSubsampler420_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420) + 
dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] = + CflSubsampler420_WxH_SSE4_1<5, 5>; +#endif + +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] = + CflSubsampler444_4xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] = + CflSubsampler444_8xH_SSE4_1<5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 2>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] = + CflSubsampler444_SSE4_1<4, 5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] = + CflSubsampler444_SSE4_1<5, 3>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] = + CflSubsampler444_SSE4_1<5, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444) + dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] = + CflSubsampler444_SSE4_1<5, 5>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize4x16] = + CflIntraPredictor_SSE4_1<4, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x16] = + CflIntraPredictor_SSE4_1<8, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize8x32] = + CflIntraPredictor_SSE4_1<8, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x4] = + CflIntraPredictor_SSE4_1<16, 4>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x8] = + CflIntraPredictor_SSE4_1<16, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x16] = + CflIntraPredictor_SSE4_1<16, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize16x32] = + CflIntraPredictor_SSE4_1<16, 32>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x8] = + CflIntraPredictor_SSE4_1<32, 8>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x16] = + CflIntraPredictor_SSE4_1<32, 16>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor) + dsp->cfl_intra_predictors[kTransformSize32x32] = + CflIntraPredictor_SSE4_1<32, 32>; +#endif +} + +} // namespace +} // namespace low_bitdepth + +void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void IntraPredCflInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_smooth_sse4.cc b/src/dsp/x86/intrapred_smooth_sse4.cc new file mode 100644 index 0000000..e944ea3 --- /dev/null +++ b/src/dsp/x86/intrapred_smooth_sse4.cc @@ -0,0 +1,2662 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
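+
+// Scalar reference sketch for the SMOOTH predictors implemented below
+// (illustrative only, not part of the upstream file; follows AV1 spec
+// section 7.11.2.6 with a weight scale of 256). |w_h| and |w_w| stand for
+// the kSmoothWeights entries selected by the block height and width:
+//
+//   const uint8_t* const w_h = kSmoothWeights + height - 4;
+//   const uint8_t* const w_w = kSmoothWeights + width - 4;
+//   for (int y = 0; y < height; ++y) {
+//     for (int x = 0; x < width; ++x) {
+//       const int sum =
+//           w_h[y] * top[x] + (256 - w_h[y]) * left[height - 1] +
+//           w_w[x] * left[y] + (256 - w_w[x]) * top[width - 1];
+//       pred[y][x] = static_cast<uint8_t>((sum + 256) >> 9);  // round, /512
+//     }
+//   }
+//
+// SMOOTH_V keeps only the first pair of terms and shifts by 8; SMOOTH_H
+// keeps only the second pair and likewise shifts by 8.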
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Note these constants are duplicated from intrapred.cc to allow the compiler
+// to have visibility of the values. This helps reduce loads and in the
+// creation of the inverse weights.
+constexpr uint8_t kSmoothWeights[] = {
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+template <int y_mask>
+inline void WriteSmoothHorizontalSum4(void* const dest, const __m128i& left,
+                                      const __m128i& weights,
+                                      const __m128i& scaled_top_right,
+                                      const __m128i& round) {
+  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
+  const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+template <int y_mask>
+inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
+                                  const __m128i& scaled_bottom_left) {
+  const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
+  const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
+  const __m128i scaled_bottom_left_y =
+      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+  return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
+}
+
+template <int y_mask>
+inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
+                                    const __m128i& weights,
+                                    const __m128i& scaled_bottom_left,
+                                    const __m128i& round) {
+  __m128i pred_sum =
+      SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
+}
+
+// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and |weights| is
+// repeated.
+inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
+                                     const __m128i& weights,
+                                     const __m128i& scaled_corner) {
+  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+  return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+inline void WriteSmoothDirectionalSum8(uint8_t* dest, const __m128i& pixels,
+                                       const __m128i& weights,
+                                       const __m128i& scaled_corner,
+                                       const __m128i& round) {
+  const __m128i pred_sum =
+      SmoothDirectionalSum8(pixels, weights, scaled_corner);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
+  StoreLo8(dest, _mm_packus_epi16(pred, pred));
+}
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+inline void WriteSmoothDirectionalSum16(uint8_t* dest, const __m128i& pixels1,
+                                        const __m128i& pixels2,
+                                        const __m128i& weights1,
+                                        const __m128i& weights2,
+                                        const __m128i& scaled_corner1,
+                                        const __m128i& scaled_corner2,
+                                        const __m128i& round) {
+  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+  StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
+}
+
+template <int y_mask>
+inline void WriteSmoothPredSum4(uint8_t* const dest, const __m128i& top,
+                                const __m128i& left, const __m128i& weights_x,
+                                const __m128i& weights_y,
+                                const __m128i& scaled_bottom_left,
+                                const __m128i& scaled_top_right,
+                                const __m128i& round) {
+  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
+  const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
+  const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
+  const __m128i scaled_bottom_left_y =
+      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
+  const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
+  const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
+  const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);
+
+  // Equivalent to RightShiftWithRounding(pred[x][y], 9).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9); + + const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); + Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8)); +} + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +inline void LoadSmoothPixels4(const uint8_t* above, const uint8_t* left, + const int height, __m128i* pixels) { + if (height == 4) { + pixels[1] = Load4(left); + } else if (height == 8) { + pixels[1] = LoadLo8(left); + } else { + pixels[1] = LoadUnaligned16(left); + } + + const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); + const __m128i top = _mm_cvtepu8_epi16(Load4(above)); + pixels[0] = _mm_unpacklo_epi16(top, bottom_left); + pixels[2] = _mm_set1_epi16(above[3]); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector +inline void LoadSmoothWeights4(const uint8_t* weight_array, const int height, + __m128i* weight_h, __m128i* weight_w) { + const __m128i scale = _mm_set1_epi16(256); + const __m128i x_weights = Load4(weight_array); + weight_h[0] = _mm_cvtepu8_epi16(x_weights); + weight_h[1] = _mm_sub_epi16(scale, weight_h[0]); + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + + if (height == 8) { + const __m128i y_weights = LoadLo8(weight_array + 4); + weight_h[0] = _mm_cvtepu8_epi16(y_weights); + weight_h[1] = _mm_sub_epi16(scale, weight_h[0]); + } else if (height == 16) { + const __m128i zero = _mm_setzero_si128(); + const __m128i y_weights = LoadUnaligned16(weight_array + 12); + weight_h[0] = _mm_cvtepu8_epi16(y_weights); + weight_h[1] = _mm_sub_epi16(scale, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(y_weights, zero); + weight_h[3] = _mm_sub_epi16(scale, weight_h[2]); + } +} + +inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y, + const __m128i* weight_x, uint8_t* dst, + const ptrdiff_t stride, + const bool use_second_half) { + const __m128i round = _mm_set1_epi32(256); + const __m128i mask_increment = _mm_set1_epi16(0x0202); + const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); + const __m128i zero = _mm_setzero_si128(); + const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero) + : _mm_unpacklo_epi8(pixel[1], zero); + __m128i y_select = _mm_set1_epi16(0x0100); + + for (int i = 0; i < 8; ++i) { + const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select); + const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select); + const __m128i interleaved_weights = + _mm_unpacklo_epi16(weight_y, inverted_weight_y); + __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights); + + __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select); + horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]); + __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]); + + sum = _mm_add_epi32(vertical_pred, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 9); + + sum = _mm_shuffle_epi8(sum, cvtepi32_epi8); + Store4(dst, sum); + dst += stride; + + y_select = _mm_add_epi16(y_select, mask_increment); + } +} + +// The interleaving approach has some overhead that causes it to underperform in +// the 4x4 case. 
+void Smooth4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+                      const void* top_row, const void* left_column) {
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  const __m128i scale = _mm_set1_epi32(256);
+  // Fourth short is top_row[3].
+  const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
+  // Fourth short is left_column[3].
+  const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  auto* dst = static_cast<uint8_t*>(dest);
+  // AV1 spec 7.11.2.6 (3) describes the sum:
+  // smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
+  // scaled_bottom[y] This could be a loop, but for the immediate value in the
+  // shuffles.
+  WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
+                         scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
+                            scaled_bottom_left, scaled_top_right, scale);
+}
+
+void Smooth4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                      const void* top_row, const void* left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i weights_x[1];
+  __m128i weights_y[2];
+  LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
+  __m128i pixels[3];
+  LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+}
+
+void Smooth4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                       const void* top_row, const void* left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i weights_x[1];
+  __m128i weights_y[4];
+  LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
+  __m128i pixels[3];
+  LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+inline void LoadSmoothPixels8(const uint8_t* above, const uint8_t* left,
+                              const int height, __m128i* pixels) {
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
+  pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
+  pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);
+
+  pixels[3] = _mm_set1_epi16(above[7]);
+
+  if (height == 4) {
+    pixels[2] = Load4(left);
+  } else if (height == 8) {
+    pixels[2] = LoadLo8(left);
+  } else if (height ==
16) { + pixels[2] = LoadUnaligned16(left); + } else { + pixels[2] = LoadUnaligned16(left); + pixels[4] = pixels[0]; + pixels[5] = pixels[1]; + pixels[6] = LoadUnaligned16(left + 16); + pixels[7] = pixels[3]; + } +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +inline void LoadSmoothWeights8(const uint8_t* weight_array, const int height, + __m128i* weight_w, __m128i* weight_h) { + const int offset = (height < 8) ? 0 : 4; + __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]); + weight_h[0] = _mm_cvtepu8_epi16(loaded_weights); + const __m128i inverter = _mm_set1_epi16(256); + weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]); + + if (height == 4) { + loaded_weights = _mm_srli_si128(loaded_weights, 4); + __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights); + __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x); + weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x); + weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + const __m128i zero = _mm_setzero_si128(); + loaded_weights = LoadUnaligned16(weight_array + 12); + weight_h[0] = _mm_cvtepu8_epi16(loaded_weights); + weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero); + weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]); + } else if (height == 32) { + const __m128i zero = _mm_setzero_si128(); + const __m128i weight_lo = LoadUnaligned16(weight_array + 28); + weight_h[0] = _mm_cvtepu8_epi16(weight_lo); + weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]); + const __m128i weight_hi = LoadUnaligned16(weight_array + 44); + weight_h[4] = _mm_cvtepu8_epi16(weight_hi); + weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]); + } +} + +inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x, + const __m128i* weights_y, const int height, + uint8_t* dst, const ptrdiff_t stride, + const bool use_second_half) { + const __m128i round = _mm_set1_epi32(256); + const __m128i mask_increment = _mm_set1_epi16(0x0202); + const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + const __m128i zero = _mm_setzero_si128(); + const __m128i left = use_second_half ? 
_mm_unpackhi_epi8(pixels[2], zero)
+                                       : _mm_unpacklo_epi8(pixels[2], zero);
+  __m128i y_select = _mm_set1_epi16(0x100);
+
+  for (int i = 0; i < height; ++i) {
+    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
+    const __m128i interleaved_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    const __m128i vertical_sum0 =
+        _mm_madd_epi16(pixels[0], interleaved_weights);
+    const __m128i vertical_sum1 =
+        _mm_madd_epi16(pixels[1], interleaved_weights);
+
+    __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
+    horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
+    const __m128i horizontal_sum0 =
+        _mm_madd_epi16(horizontal_pixels, weights_x[0]);
+    const __m128i horizontal_sum1 =
+        _mm_madd_epi16(horizontal_pixels, weights_x[1]);
+
+    __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
+    sum0 = _mm_add_epi32(sum0, round);
+    sum0 = _mm_srai_epi32(sum0, 9);
+
+    __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
+    sum1 = _mm_add_epi32(sum1, round);
+    sum1 = _mm_srai_epi32(sum1, 9);
+
+    sum0 = _mm_packus_epi16(sum0, sum1);
+    sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
+    StoreLo8(dst, sum0);
+    dst += stride;
+
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
+
+void Smooth8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+                      const void* top_row, const void* left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);
+
+  __m128i weights_x[2], weights_y[2];
+  LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
+}
+
+void Smooth8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                      const void* top_row, const void* left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);
+
+  __m128i weights_x[2], weights_y[2];
+  LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+}
+
+void Smooth8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                       const void* top_row, const void* left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[4];
+  LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);
+
+  __m128i weights_x[2], weights_y[4];
+  LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+}
+
+void Smooth8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+                       const void* top_row, const void* left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  __m128i pixels[8];
+  LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);
+
+  __m128i weights_x[2], weights_y[8];
+  LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
+  dst += stride << 3;
+  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
+                     false);
+  dst += stride << 3;
+  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
+                     true);
+}
+
+template <int width, int height>
+void SmoothWxH(void* const dest, const ptrdiff_t stride,
+               const void* const top_row, const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
+  const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i scale_value = _mm_set1_epi16(256);
+  const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
+  const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
+  const __m128i round = _mm_set1_epi32(256);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < height; ++y) {
+    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+    const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
+    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+    __m128i scaled_bottom_left =
+        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+    const __m128i weight_left_y =
+        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+    for (int x = 0; x < width; x += 8) {
+      const __m128i top_x = LoadLo8(top_ptr + x);
+      const __m128i weights_x = LoadLo8(sm_weights_w + x);
+      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+      const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
+      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+      // Here opposite weights and pixels are multiplied, where the order of
+      // interleaving is indicated in the names.
+      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+      // |scaled_bottom_left| is always scaled by the same weight each row, so
+      // we only derive |scaled_top_right| values here.
+      const __m128i inverted_weights_x =
+          _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
+      const __m128i scaled_top_right =
+          _mm_mullo_epi16(inverted_weights_x, top_right);
+      const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
+      const __m128i scaled_top_right_hi =
+          _mm_unpackhi_epi16(scaled_top_right, zero);
+      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+      // The round value for RightShiftWithRounding was added with
+      // |scaled_bottom_left|.
+      pred_lo = _mm_srli_epi32(pred_lo, 9);
+      pred_hi = _mm_srli_epi32(pred_hi, 9);
+      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+    }
+    dst += stride;
+  }
+}
+
+void SmoothHorizontal4x4_SSE4_1(void* dest, const ptrdiff_t stride,
+                                const void* top_row, const void* left_column) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top[3]);
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+
+  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
+}
+
+void SmoothHorizontal4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                 const void* const top_row,
+                                 const void* const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi32(top[3]);
+  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi32(256);
+  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+  scale = _mm_set1_epi32(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  auto* dst = static_cast<uint8_t*>(dest);
+  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
+  dst += stride;
+  WriteSmoothHorizontalSum4<0x55>(dst, left, weights,
scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); + dst += stride; + + left = _mm_cvtepu8_epi32(Load4(left_ptr + 4)); + WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); + dst += stride; + + left = _mm_cvtepu8_epi32(Load4(left_ptr + 8)); + WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); + dst += stride; + + left = _mm_cvtepu8_epi32(Load4(left_ptr + 12)); + WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale); + dst += stride; + WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale); +} + +void SmoothHorizontal8x4_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[7]); + const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); + const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); + __m128i scale = _mm_set1_epi16(256); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + scale = _mm_set1_epi16(128); + __m128i y_select = _mm_set1_epi32(0x01000100); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + auto* dst = static_cast(dest); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + y_select = _mm_set1_epi32(0x03020302); + left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + y_select = _mm_set1_epi32(0x05040504); + left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + y_select = _mm_set1_epi32(0x07060706); + left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); +} + +void SmoothHorizontal8x8_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[7]); + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); + __m128i scale = _mm_set1_epi16(256); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + scale = _mm_set1_epi16(128); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 
0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } +} + +void SmoothHorizontal8x16_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[7]); + const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); + __m128i scale = _mm_set1_epi16(256); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } +} + +void SmoothHorizontal8x32_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[7]); + const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4)); + __m128i scale = _mm_set1_epi16(256); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale); + dst += stride; + } +} + +void 
SmoothHorizontal16x4_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[15]); + const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); + const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + scale = _mm_set1_epi16(128); + __m128i y_mask = _mm_set1_epi32(0x01000100); + __m128i left_y = _mm_shuffle_epi8(left, y_mask); + auto* dst = static_cast(dest); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + y_mask = _mm_set1_epi32(0x03020302); + left_y = _mm_shuffle_epi8(left, y_mask); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + y_mask = _mm_set1_epi32(0x05040504); + left_y = _mm_shuffle_epi8(left, y_mask); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + y_mask = _mm_set1_epi32(0x07060706); + left_y = _mm_shuffle_epi8(left, y_mask); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); +} + +void SmoothHorizontal16x8_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[15]); + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + scale = _mm_set1_epi16(128); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } +} + +void SmoothHorizontal16x16_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[15]); + const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, 
weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } +} + +void SmoothHorizontal16x32_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[15]); + const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += 
stride; + } +} + +void SmoothHorizontal16x64_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[15]); + const __m128i weights = LoadUnaligned16(kSmoothWeights + 12); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + auto* dst = static_cast(dest); + for (int left_offset = 0; left_offset < 64; left_offset += 8) { + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + dst += stride; + } + } +} + +void SmoothHorizontal32x8_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[31]); + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); + const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + scale = _mm_set1_epi16(128); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } +} + +void SmoothHorizontal32x16_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[31]); + const __m128i left1 = 
_mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); + const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + scale = _mm_set1_epi16(128); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left1, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } + const __m128i left2 = + _mm_cvtepu8_epi16(LoadLo8(static_cast(left_column) + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left2, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } +} + +void SmoothHorizontal32x32_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[31]); + const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); + const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + __m128i left = 
_mm_cvtepu8_epi16(LoadLo8(left_column)); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } + left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } +} + +void SmoothHorizontal32x64_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[31]); + const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28); + const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + auto* dst = static_cast(dest); + for (int left_offset = 0; left_offset < 64; left_offset += 8) { + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset)); + for 
(int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + dst += stride; + } + } +} + +void SmoothHorizontal64x16_SSE4_1(void* const dest, const ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const top = static_cast(top_row); + const __m128i top_right = _mm_set1_epi16(top[63]); + const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60); + const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo); + const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); + const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi); + const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92); + const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108); + const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo); + const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); + const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi); + const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); + const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); + const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); + const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); + const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); + const __m128i scaled_top_right5 = + _mm_mullo_epi16(inverted_weights5, top_right); + const __m128i scaled_top_right6 = + _mm_mullo_epi16(inverted_weights6, top_right); + const __m128i scaled_top_right7 = + _mm_mullo_epi16(inverted_weights7, top_right); + const __m128i scaled_top_right8 = + _mm_mullo_epi16(inverted_weights8, top_right); + scale = _mm_set1_epi16(128); + const auto* const left_ptr = static_cast(left_column); + auto* dst = static_cast(dest); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left1, y_select); + WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, scale); + WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, scale); + WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, scale); + WriteSmoothDirectionalSum16(dst 
+  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+}
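+
+// In scalar terms, every SmoothHorizontal*_SSE4_1 kernel above and below
+// evaluates the same per-pixel blend (a sketch, with w[] denoting the
+// kSmoothWeights entries for this block width):
+//   dst[x] = (w[x] * left[y] + (256 - w[x]) * top_right + 128) >> 8;
+// The (256 - w[x]) * top_right products are the scaled_top_right vectors
+// hoisted out of the loops; only the broadcast of left[y] changes per row.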
+
+void SmoothHorizontal64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                  const void* const top_row,
+                                  const void* const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 =
+      _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 =
+      _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 =
+      _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 =
+      _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+  const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                scaled_top_right1, scaled_top_right2, scale);
+    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                scaled_top_right3, scaled_top_right4, scale);
+    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                scaled_top_right5, scaled_top_right6, scale);
+    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                scaled_top_right7, scaled_top_right8, scale);
+    dst += stride;
+  }
+}
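+
+// A note on the y_mask loops used throughout these kernels: 0x01000100 is a
+// _mm_shuffle_epi8 control that replicates 16-bit lane 0 across the register,
+// and adding 0x02020202 advances every control byte by two, so successive
+// iterations select lanes 1 through 7. Eight passes therefore broadcast each
+// of the eight words in the left (or weight) vector in turn.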
+
+void SmoothHorizontal64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                  const void* const top_row,
+                                  const void* const left_column) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  const __m128i top_right = _mm_set1_epi16(top[63]);
+  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
+  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
+  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_top_right1 =
+      _mm_mullo_epi16(inverted_weights1, top_right);
+  const __m128i scaled_top_right2 =
+      _mm_mullo_epi16(inverted_weights2, top_right);
+  const __m128i scaled_top_right3 =
+      _mm_mullo_epi16(inverted_weights3, top_right);
+  const __m128i scaled_top_right4 =
+      _mm_mullo_epi16(inverted_weights4, top_right);
+  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
+  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
+  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
+  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
+  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+  const __m128i scaled_top_right5 =
+      _mm_mullo_epi16(inverted_weights5, top_right);
+  const __m128i scaled_top_right6 =
+      _mm_mullo_epi16(inverted_weights6, top_right);
+  const __m128i scaled_top_right7 =
+      _mm_mullo_epi16(inverted_weights7, top_right);
+  const __m128i scaled_top_right8 =
+      _mm_mullo_epi16(inverted_weights8, top_right);
+  scale = _mm_set1_epi16(128);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
+                                  scaled_top_right1, scaled_top_right2, scale);
+      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
+                                  scaled_top_right3, scaled_top_right4, scale);
+      WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
+                                  scaled_top_right5, scaled_top_right6, scale);
+      WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
+                                  scaled_top_right7, scaled_top_right8, scale);
+      dst += stride;
+    }
+  }
+}
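+
+// The SmoothVertical*_SSE4_1 kernels below blend each top-row pixel with the
+// bottom-left corner instead, weighted per row. As a scalar sketch, with w[]
+// the kSmoothWeights entries for this block height:
+//   dst[x] = (w[y] * top[x] + (256 - w[y]) * left[height - 1] + 128) >> 8;
+// The 4xH variants pack {top[x], bottom_left} pairs so that a single
+// _mm_madd_epi16 evaluates the whole blend per pixel.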
+
+inline void LoadSmoothVerticalPixels4(const uint8_t* above,
+                                      const uint8_t* left, const int height,
+                                      __m128i* pixels) {
+  __m128i top = Load4(above);
+  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+  top = _mm_cvtepu8_epi16(top);
+  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256-w) counterparts. This is precomputed by the compiler when the weights
+// table is visible to this module. Removing this visibility can cut speed by
+// up to half in both 4xH and 8xH transforms.
+inline void LoadSmoothVerticalWeights4(const uint8_t* weight_array,
+                                       const int height, __m128i* weights) {
+  const __m128i inverter = _mm_set1_epi16(256);
+
+  if (height == 4) {
+    const __m128i weight = Load4(weight_array);
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+  } else if (height == 8) {
+    const __m128i weight = LoadLo8(weight_array + 4);
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+  } else {
+    const __m128i weight = LoadUnaligned16(weight_array + 12);
+    const __m128i zero = _mm_setzero_si128();
+    weights[0] = _mm_cvtepu8_epi16(weight);
+    weights[1] = _mm_sub_epi16(inverter, weights[0]);
+    weights[2] = _mm_unpackhi_epi8(weight, zero);
+    weights[3] = _mm_sub_epi16(inverter, weights[2]);
+  }
+}
+
+inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
+                                   const int height, uint8_t* dst,
+                                   const ptrdiff_t stride) {
+  const __m128i pred_round = _mm_set1_epi32(128);
+  const __m128i mask_increment = _mm_set1_epi16(0x0202);
+  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
+  __m128i y_select = _mm_set1_epi16(0x0100);
+
+  for (int y = 0; y < height; ++y) {
+    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+    const __m128i alternate_weights =
+        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+    // The madd instruction yields four results of the form:
+    // (top_row[x] * weight[y] + corner * inverted_weight[y])
+    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+    sum = _mm_add_epi32(sum, pred_round);
+    sum = _mm_srai_epi32(sum, 8);
+    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+    Store4(dst, sum);
+    dst += stride;
+    y_select = _mm_add_epi16(y_select, mask_increment);
+  }
+}
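+
+// Worked example of the madd layout above, assuming a 4-wide block with
+// top_row = {a, b, c, d} and corner = left[height - 1]: pixel[0] holds the
+// 16-bit lanes {a, corner, b, corner, c, corner, d, corner} and
+// alternate_weights holds {w, 256 - w, ...} for row y, so _mm_madd_epi16
+// returns the four 32-bit sums a*w + corner*(256 - w) through
+// d*w + corner*(256 - w), which are then rounded, shifted and packed.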
+
+void SmoothVertical4x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+                              const void* const top_row,
+                              const void* const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 4, &pixels);
+
+  __m128i weights[2];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
+}
+
+void SmoothVertical4x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                              const void* const top_row,
+                              const void* const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 8, &pixels);
+
+  __m128i weights[2];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+}
+
+void SmoothVertical4x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  const auto* const left = static_cast<const uint8_t*>(left_column);
+  const auto* const above = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i pixels;
+  LoadSmoothVerticalPixels4(above, left, 16, &pixels);
+
+  __m128i weights[4];
+  LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);
+
+  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
+  dst += stride << 3;
+  WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
+}
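+
+// kSmoothWeights packs the tables for every block dimension end to end, so
+// the weights for dimension d start at kSmoothWeights + d - 4: offset 0 for
+// 4, 4 for 8, 12 for 16, 28 for 32 and 60 for 64. That is why the loads in
+// these kernels use those fixed offsets.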
+
+void SmoothVertical8x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+                              const void* const top_row,
+                              const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+  __m128i scaled_bottom_left_y =
+      _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
+}
+
+void SmoothVertical8x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                              const void* const top_row,
+                              const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical8x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical8x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
+                               scale);
+    dst += stride;
+  }
+}
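+
+// Note the two zero-extension idioms used side by side above:
+// _mm_cvtepu8_epi16 widens the low eight bytes of a register to 16 bits,
+// while _mm_unpackhi_epi8 against a zero register widens the high eight, so
+// a single 16-byte weight load feeds two word vectors.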
+
+void SmoothVertical16x4_SSE4_1(void* const dest, const ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
+  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+  __m128i y_select = _mm_set1_epi32(0x01000100);
+  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+  __m128i scaled_bottom_left_y =
+      _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x03020302);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x05040504);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+  dst += stride;
+  y_select = _mm_set1_epi32(0x07060706);
+  weights_y = _mm_shuffle_epi8(weights, y_select);
+  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                              scaled_bottom_left_y, scaled_bottom_left_y,
+                              scale);
+}
+
+void SmoothVertical16x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+  const __m128i scaled_bottom_left_lo =
+      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+  const __m128i scaled_bottom_left_hi =
+      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical16x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const __m128i zero = _mm_setzero_si128();
+
+  const __m128i top = LoadUnaligned16(top_row);
+  const __m128i top_lo = _mm_cvtepu8_epi16(top);
+  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
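+
+// For 64-tall blocks the full weight table no longer fits in registers, so
+// the kernel streams it: each left_offset step loads sixteen row weights,
+// rebuilds the scaled bottom-left terms for those rows, and emits sixteen
+// rows before advancing.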
+
+void SmoothVertical32x8_SSE4_1(void* const dest, const ptrdiff_t stride,
+                               const void* const top_row,
+                               const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
+  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+  const __m128i scaled_bottom_left =
+      _mm_mullo_epi16(inverted_weights, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical32x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical32x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical32x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i top_lo = LoadUnaligned16(top_ptr);
+  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
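+
+// The 32- and 64-wide kernels simply repeat the 16-wide store across the
+// row: one pair of row weights is reused for every 16-pixel strip, and only
+// the destination offset (dst, dst + 16, ...) and the widened top vectors
+// change between the WriteSmoothDirectionalSum16 calls.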
+
+void SmoothVertical64x16_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  scale = _mm_set1_epi16(128);
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical64x32_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
+  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
+  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
+  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
+  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+  __m128i scale = _mm_set1_epi16(256);
+  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+  const __m128i scaled_bottom_left1 =
+      _mm_mullo_epi16(inverted_weights1, bottom_left);
+  const __m128i scaled_bottom_left2 =
+      _mm_mullo_epi16(inverted_weights2, bottom_left);
+  const __m128i scaled_bottom_left3 =
+      _mm_mullo_epi16(inverted_weights3, bottom_left);
+  const __m128i scaled_bottom_left4 =
+      _mm_mullo_epi16(inverted_weights4, bottom_left);
+  scale = _mm_set1_epi16(128);
+
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+    const __m128i y_select = _mm_set1_epi32(y_mask);
+    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+    const __m128i scaled_bottom_left_y =
+        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                scaled_bottom_left_y, scaled_bottom_left_y,
+                                scale);
+    dst += stride;
+  }
+}
+
+void SmoothVertical64x64_SSE4_1(void* const dest, const ptrdiff_t stride,
+                                const void* const top_row,
+                                const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
+  const __m128i top_lolo = LoadUnaligned16(top_ptr);
+  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
+  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
+  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
+  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
+  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
+  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
+  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+  const __m128i scale = _mm_set1_epi16(256);
+  const __m128i round = _mm_set1_epi16(128);
+  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
+  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
+    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+    const __m128i scaled_bottom_left_lo =
+        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+    const __m128i scaled_bottom_left_hi =
+        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+      const __m128i y_select = _mm_set1_epi32(y_mask);
+      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+      const __m128i scaled_bottom_left_y =
+          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
+                                  scaled_bottom_left_y, scaled_bottom_left_y,
+                                  round);
+      dst += stride;
+    }
+  }
+}
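+
+// Init8bpp below wires the kernels above into the 8-bit dispatch table. Each
+// DSP_ENABLED_8BPP_SSE4_1 guard lets a build opt individual kernels out;
+// entries that stay unset keep whatever implementation (typically the C
+// fallback) was registered elsewhere.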
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Smooth4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Smooth4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Smooth4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Smooth8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Smooth8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Smooth8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Smooth8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      SmoothWxH<16, 4>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      SmoothWxH<16, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      SmoothWxH<16, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      SmoothWxH<16, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      SmoothWxH<16, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      SmoothWxH<32, 8>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      SmoothWxH<32, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      SmoothWxH<32, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      SmoothWxH<32, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      SmoothWxH<64, 16>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      SmoothWxH<64, 32>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      SmoothWxH<64, 64>;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical4x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical4x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical4x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical8x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x4_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      SmoothVertical16x32_SSE4_1;
+#endif
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] = + SmoothVertical16x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] = + SmoothVertical32x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] = + SmoothVertical32x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] = + SmoothVertical32x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] = + SmoothVertical32x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] = + SmoothVertical64x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] = + SmoothVertical64x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] = + SmoothVertical64x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal4x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal8x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] = + SmoothHorizontal16x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal) + 
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal16x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x8_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal32x64_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal64x16_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal64x32_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      SmoothHorizontal64x64_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredSmoothInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
new file mode 100644
index 0000000..9938dfe
--- /dev/null
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -0,0 +1,3535 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>  // memcpy
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// Utility Functions
+
+// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
+// Divide by 2^k by right shifting by k, leaving the denominator 2^m + 1,
+// where m = n - k.
+// In the block size cases, n - k is 1 or 2 (the block is proportional to 1x2
+// or 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in
+// the high bits.
+constexpr int kThreeInverse = 0x5556;
+constexpr int kFiveInverse = 0x3334;
+template <int shiftk, int multiplier>
+inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
+  const __m128i interm = _mm_srli_epi32(dividend, shiftk);
+  return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
+}
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1
+
+using DcSumFunc = __m128i (*)(const void* ref);
+using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+                                    const __m128i column);
+// For copying an entire column across a block.
+using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
+                                 const void* column);
+
+// DC intra-predictors for non-square blocks.
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+struct DcPredFuncs_SSE4_1 {
+  DcPredFuncs_SSE4_1() = delete;
+
+  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
+                    const void* left_column);
+  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
+                 const void* left_column);
+};
+
+// Directional intra-predictors for square blocks.
+template <ColumnStoreFunc col_storefn>
+struct DirectionalPredFuncs_SSE4_1 {
+  DirectionalPredFuncs_SSE4_1() = delete;
+
+  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
+                       const void* left_column);
+  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
+                         const void* left_column);
+};
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn,
+                        storefn, shiftk, dc_mult>::
+    DcTop(void* const dest, ptrdiff_t stride, const void* const top_row,
+          const void* /*left_column*/) {
+  const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
+  const __m128i sum = top_sumfn(top_row);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn,
+                        storefn, shiftk, dc_mult>::
+    DcLeft(void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+           const void* const left_column) {
+  const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
+  const __m128i sum = left_sumfn(left_column);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
+  storefn(dest, stride, dc);
+}
+
+template <int width_log2, int height_log2, DcSumFunc top_sumfn,
+          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
+void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn,
+                        storefn, shiftk, dc_mult>::
+    Dc(void* const dest, ptrdiff_t stride, const void* const top_row,
+       const void* const left_column) {
+  const __m128i rounder =
+      _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
+  const __m128i sum_top = top_sumfn(top_row);
+  const __m128i sum_left = left_sumfn(left_column);
+  const __m128i sum = _mm_add_epi32(sum_top, sum_left);
+  if (width_log2 == height_log2) {
+    const __m128i dc =
+        _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
+    storefn(dest, stride, dc);
+  } else {
+    const __m128i dc = DivideByMultiplyShift_U32<shiftk, dc_mult>(
+        _mm_add_epi32(sum, rounder));
+    storefn(dest, stride, dc);
+  }
+}
+
+//------------------------------------------------------------------------------
+// DcPredFuncs_SSE4_1 directional predictors
+
+template <ColumnStoreFunc col_storefn>
+void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
+    void* const dest, ptrdiff_t stride, const void* /*top_row*/,
+    const void* const left_column) {
+  col_storefn(dest, stride, left_column);
+}
+
+}  // namespace
+
+//------------------------------------------------------------------------------
+namespace low_bitdepth {
+namespace {
+
+// |ref| points to 4 bytes containing 4 packed ints.
+inline __m128i DcSum4_SSE4_1(const void* const ref) {
+  const __m128i vals = Load4(ref);
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum8_SSE4_1(const void* const ref) {
+  const __m128i vals = LoadLo8(ref);
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(vals, zero);
+}
+
+inline __m128i DcSum16_SSE4_1(const void* const ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals = LoadUnaligned16(ref);
+  const __m128i partial_sum = _mm_sad_epu8(vals, zero);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum32_SSE4_1(const void* const ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals1 = LoadUnaligned16(ref);
+  const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
+  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+  const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+inline __m128i DcSum64_SSE4_1(const void* const ref) {
+  const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i vals1 = LoadUnaligned16(ref_ptr);
+  const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
+  const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
+  const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
+  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
+  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
+  __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
+  const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
+  partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
+  const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
+  partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
+  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
+}
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    Store4(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  Store4(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreLo8(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreLo8(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+}
+
+template <int height>
+inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    StoreUnaligned16(dst + 16, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+  StoreUnaligned16(dst + 16, dc_dup);
+}
+
+template <int height>
+inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                               const __m128i dc) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreUnaligned16(dst, dc_dup);
+    StoreUnaligned16(dst + 16, dc_dup);
+    StoreUnaligned16(dst + 32, dc_dup);
+    StoreUnaligned16(dst + 48, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreUnaligned16(dst, dc_dup);
+  StoreUnaligned16(dst + 16, dc_dup);
+  StoreUnaligned16(dst + 32, dc_dup);
+  StoreUnaligned16(dst + 48, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant
+// to be copied for width N into dest.
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  Store4(dst, dup32);
+  dst += stride;
+  const int row1 = _mm_extract_epi32(dup32, 1);
+  memcpy(dst, &row1, 4);
+  dst += stride;
+  const int row2 = _mm_extract_epi32(dup32, 2);
+  memcpy(dst, &row2, 4);
+  dst += stride;
+  const int row3 = _mm_extract_epi32(dup32, 3);
+  memcpy(dst, &row3, 4);
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+  auto* dst = static_cast<uint8_t*>(dest);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+  dst += stride;
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+                             const void* const column) {
+  const __m128i col_data = Load4(column);
+  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
+  writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+                             const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data = LoadLo8(column);
+  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lo);
+  dst += stride4;
+  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
+  writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i col_data =
+      _mm_loadu_si128(static_cast<const __m128i*>(column));
+  const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+  const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+  const __m128i col_dup32_lolo =
+      _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lolo);
+  dst += stride4;
+  const __m128i col_dup32_lohi =
+      _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+  writefn(dst, stride, col_dup32_lohi);
+  dst += stride4;
+  const __m128i col_dup32_hilo =
+      _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hilo);
+  dst += stride4;
+  const __m128i col_dup32_hihi =
+      _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+  writefn(dst, stride, col_dup32_hihi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 32; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+    const __m128i col_dup32_lolo =
+        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lolo);
+    dst += stride4;
+    const __m128i col_dup32_lohi =
+        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lohi);
+    dst += stride4;
+    const __m128i col_dup32_hilo =
+        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hilo);
+    dst += stride4;
+    const __m128i col_dup32_hihi =
+        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hihi);
+    dst += stride4;
+  }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 64; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
+    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
+    const __m128i col_dup32_lolo =
+        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lolo);
+    dst += stride4;
+    const __m128i col_dup32_lohi =
+        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
+    writefn(dst, stride, col_dup32_lohi);
+    dst += stride4;
+    const __m128i col_dup32_hilo =
+        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hilo);
+    dst += stride4;
+    const __m128i col_dup32_hihi =
+        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
+    writefn(dst, stride, col_dup32_hihi);
+    dst += stride4;
+  }
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore4xH_SSE4_1<4>, 0, 0>;
+  // shiftk is the smaller of width_log2 and height_log2.
+  // dc_mult corresponds to the ratio of the smaller block size to the larger.
+  using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
+                                  DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
+  using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
+                                   DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
+
+  using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
+  using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
+                                  DcStore8xH_SSE4_1<8>, 0, 0>;
+  using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
+                                   DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
+  using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
+                                   DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
+
+  using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
+                                   DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
+  using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
+                                   DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
+  using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore16xH_SSE4_1<16>, 0, 0>;
+  using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
+  using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
+
+  using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
+                                   DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
+  using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
+  using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore32xH_SSE4_1<32>, 0, 0>;
+  using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
+
+  using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
+                                    DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
+  using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
+                                    DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
+  using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
+                                    DcStore64xH_SSE4_1<64>, 0, 0>;
+};
+
+struct DirDefs {
+  DirDefs() = delete;
+
+  using _4x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+  using _4x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+  using _4x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+  using _8x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+  using _8x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+  using _8x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+  using _8x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+  using _16x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+  using _16x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+  using _16x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+  using _16x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+  using _16x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+  using _32x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+  using _32x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+  using _32x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+  using _32x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+  using _64x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+  using _64x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+  using _64x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
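The DcDefs table above wires each rectangular block to the shiftk/dc_mult pair its comment describes: width + height is 2^n + 2^k, the shift removes the 2^k factor, and the multiply-high approximates division by 3 or 5. The identity is easy to check with a scalar model; the sketch below is illustrative only (it is not part of the patch, and the helper name is ours), mirroring the shift-then-_mm_mulhi_epi16 sequence of DivideByMultiplyShift_U32:

#include <cassert>

// Scalar model of the SIMD divide-by-(2^n + 2^k): shift out 2^k, then
// multiply by a Q16 reciprocal of (2^(n-k) + 1) and keep the high 16 bits.
int DivideByMultiplyShift(int dividend, int shiftk, int multiplier) {
  return ((dividend >> shiftk) * multiplier) >> 16;
}

int main() {
  constexpr int kThreeInverse = 0x5556;  // ceil(2^16 / 3)
  constexpr int kFiveInverse = 0x3334;   // ceil(2^16 / 5)
  for (int sum = 0; sum < (1 << 15); ++sum) {
    // 4x8 block: width + height = 12 = 2^3 + 2^2, so shiftk = 2, divisor 3.
    assert(DivideByMultiplyShift(sum, 2, kThreeInverse) == sum / 12);
    // 4x16 block: width + height = 20 = 2^4 + 2^2, so shiftk = 2, divisor 5.
    assert(DivideByMultiplyShift(sum, 2, kFiveInverse) == sum / 20);
  }
  return 0;
}

The rounded-up Q16 reciprocals give exact floor division over this input range because their overshoot contributes less than one unit per 2^16 of input, and DC sums (at most 120 * 255 plus the rounder) stay well inside it.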
+
+template <int y_mask>
+inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+                            const __m128i& left, const __m128i& top_lefts,
+                            const __m128i& top_dists,
+                            const __m128i& left_dists,
+                            const __m128i& top_left_diffs) {
+  const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
+
+  const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
+  const __m128i top_left_dists =
+      _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
+  not_select_left =
+      _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
+  const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
+
+  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+  // The sequence of 32-bit packed operations was found (see CL via blame) to
+  // outperform 16-bit operations, despite the availability of the packus
+  // function, when tested on a Xeon E7 v3.
+  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+  const __m128i pred = _mm_shuffle_epi8(
+      _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
+  Store4(dst, pred);
+}
+
+// |top_left_diffs| is the only variable whose ints may exceed 8 bits.
+// Otherwise we would be able to do all of these operations as epi8 for a
+// 16-pixel version of this function. Still, since lefts_y is just a vector of
+// duplicates, it could pay off to accommodate top_left_dists for cmpgt, and
+// repack into epi8 for the blends.
+template <int y_mask>
+inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+                            const __m128i& left, const __m128i& top_lefts,
+                            const __m128i& top_dists,
+                            const __m128i& left_dists,
+                            const __m128i& top_left_diffs) {
+  const __m128i select_y = _mm_set1_epi32(y_mask);
+  const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
+
+  const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
+  const __m128i top_left_dists =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
+  not_select_left =
+      _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
+  const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
+
+  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
+
+  const __m128i pred = _mm_packus_epi16(
+      _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+// |top| is an epi8 of length 16
+// |left| is epi8 of unknown length, as y_mask specifies access
+// |top_lefts| is an epi8 of 16 duplicates
+// |top_dists| is an epi8 of unknown length, as y_mask specifies access
+// |left_dists| is an epi8 of length 16
+// |left_dists_lo| is an epi16 of length 8
+// |left_dists_hi| is an epi16 of length 8
+// |top_left_diffs_lo| is an epi16 of length 8
+// |top_left_diffs_hi| is an epi16 of length 8
+// The latter two vectors are epi16 because their values may reach -510.
+// |left_dists| is provided alongside its spread out version because it doesn't
+// change between calls and interacts with both kinds of packing.
+template <int y_mask>
+inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+                             const __m128i& left, const __m128i& top_lefts,
+                             const __m128i& top_dists,
+                             const __m128i& left_dists,
+                             const __m128i& left_dists_lo,
+                             const __m128i& left_dists_hi,
+                             const __m128i& top_left_diffs_lo,
+                             const __m128i& top_left_diffs_hi) {
+  const __m128i select_y = _mm_set1_epi32(y_mask);
+  const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
+  const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
+  const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
+  const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
+
+  const __m128i top_left_dists_lo =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
+  const __m128i top_left_dists_hi =
+      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
+
+  const __m128i left_gt_top_left_lo = _mm_packs_epi16(
+      _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
+  const __m128i left_gt_top_left_hi =
+      _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
+                      /* unused second arg for pack */ left_dists_hi);
+  const __m128i left_gt_top_left = _mm_alignr_epi8(
+      left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
+
+  const __m128i not_select_top_lo =
+      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
+                      /* unused second arg for pack */ top_dists_y16);
+  const __m128i not_select_top_hi =
+      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
+                      /* unused second arg for pack */ top_dists_y16);
+  const __m128i not_select_top = _mm_alignr_epi8(
+      not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
+
+  const __m128i left_leq_top =
+      _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
+  const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
+
+  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
+  // operation is unavailable, so the logic for selecting top, left, or
+  // top_left is inverted.
+  const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
+
+  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
+  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
+  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
+  top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
+  const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
+
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
+}
+
+void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
+                     const void* const top_row,
+                     const void* const left_column) {
+  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+  const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
+                        top_left_diff);
+}
+
+void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
+                     const void* const top_row,
+                     const void* const left_column) {
+  const __m128i left = LoadLo8(left_column);
+  const __m128i left_lo = _mm_cvtepu8_epi32(left);
+  const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
+
+  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
+
+  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
+  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
+  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
+  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
+  const __m128i top_dists_lo =
+      _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
+  const __m128i top_dists_hi =
+      _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
+
+  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
+  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+                     top_left_diff);
+  dst += stride;
+  WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
+                        top_left_diff);
+  dst += stride;
+  WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts,
top_dists_hi, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists, + top_left_diff); +} + +void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const __m128i left_0 = _mm_cvtepu8_epi32(left); + const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4)); + const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8)); + const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12)); + + const __m128i top = _mm_cvtepu8_epi32(Load4(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts)); + const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts)); + const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts)); + const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts)); + const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2); + + auto* dst = static_cast(dest); + WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); + dst += stride; + WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists, + top_left_diff); +} + +void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, const void* const left_column) { + const __m128i left = _mm_cvtepu8_epi16(Load4(left_column)); + const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); + const auto* 
const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts)); + const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); +} + +void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, const void* const left_column) { + const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column)); + const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts)); + const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); + dst += stride; + WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists, + top_left_diff); +} + +void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const __m128i left = LoadUnaligned16(left_column); + const __m128i left_lo = _mm_cvtepu8_epi16(left); + const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8)); + const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row)); + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts)); + const __m128i 
top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts)); + const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts); + const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); + dst += stride; + WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi, + left_dists, top_left_diff); +} + +void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const auto* const left_ptr = static_cast(left_column); + auto* const dst = static_cast(dest); + Paeth8x16_SSE4_1(dst, stride, top_row, left_column); + Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16); +} + +void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const __m128i left = Load4(left_column); + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = _mm_cvtepu8_epi16(top); + const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8)); + + const auto* const top_ptr = static_cast(top_row); + const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]); + const __m128i top_lefts8 = _mm_set1_epi8(static_cast(top_ptr[-1])); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + + const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8), + _mm_subs_epu8(top_lefts8, top)); + const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists); + const __m128i left_dists_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8)); + const 
__m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8), + _mm_subs_epu8(top_lefts8, left)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16); + const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2); + const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists, + left_dists_lo, left_dists_hi, top_left_diff_lo, + top_left_diff_hi); + dst += stride; + WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); +} + +// Inlined for calling with offsets in larger transform sizes, mainly to +// preserve top_left. +inline void WritePaeth16x8(void* const dest, ptrdiff_t stride, + const uint8_t top_left, const __m128i top, + const __m128i left) { + const __m128i top_lo = _mm_cvtepu8_epi16(top); + const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8)); + + const __m128i top_lefts16 = _mm_set1_epi16(top_left); + const __m128i top_lefts8 = _mm_set1_epi8(static_cast(top_left)); + + // Given that the spec defines "base" as top[x] + left[y] - top_left, + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + + const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8), + _mm_subs_epu8(top_lefts8, top)); + const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists); + const __m128i left_dists_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8)); + const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8), + _mm_subs_epu8(top_lefts8, left)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16); + const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2); + const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists, + left_dists_lo, left_dists_hi, top_left_diff_lo, + top_left_diff_hi); + dst += stride; + WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists, + left_dists, 
left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); +} + +void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride, + const void* const top_row, + const void* const left_column) { + const __m128i top = LoadUnaligned16(top_row); + const __m128i left = LoadLo8(left_column); + const auto* const top_ptr = static_cast(top_row); + WritePaeth16x8(static_cast(dest), stride, top_ptr[-1], top, left); +} + +void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left, + const __m128i top, const __m128i left) { + const __m128i top_lo = _mm_cvtepu8_epi16(top); + const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8)); + + const __m128i top_lefts16 = _mm_set1_epi16(top_left); + const __m128i top_lefts8 = _mm_set1_epi8(static_cast(top_left)); + + // Given that the spec defines "base" as top[x] + left[y] - top[-1], + // pLeft = abs(base - left[y]) = abs(top[x] - top[-1]) + // pTop = abs(base - top[x]) = abs(left[y] - top[-1]) + + const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8), + _mm_subs_epu8(top_lefts8, top)); + const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists); + const __m128i left_dists_hi = + _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8)); + const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8), + _mm_subs_epu8(top_lefts8, left)); + + const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16); + const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2); + const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2); + auto* dst = static_cast(dest); + WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists, + left_dists_lo, left_dists_hi, top_left_diff_lo, + top_left_diff_hi); + dst += stride; + WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, left_dists_hi, + top_left_diff_lo, top_left_diff_hi); + dst += stride; + WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists, + left_dists, left_dists_lo, 
left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+  dst += stride;
+  WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
+                               left_dists, left_dists_lo, left_dists_hi,
+                               top_left_diff_lo, top_left_diff_hi);
+}
+
+void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
+}
+
+void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const __m128i left_0 = LoadUnaligned16(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* const dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top, left_0);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
+}
+
+void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const ptrdiff_t stride16 = stride << 4;
+  const __m128i left_0 = LoadUnaligned16(left_column);
+  const __m128i top = LoadUnaligned16(top_row);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top, left_0);
+  dst += stride16;
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  WritePaeth16x16(dst, stride, top_left, top, left_1);
+  dst += stride16;
+  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+  WritePaeth16x16(dst, stride, top_left, top, left_2);
+  dst += stride16;
+  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+  WritePaeth16x16(dst, stride, top_left, top, left_3);
+}
+
+void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
+                      const void* const top_row,
+                      const void* const left_column) {
+  const __m128i left = LoadLo8(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* const dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x8(dst, stride, top_left, top_0, left);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_row);
+  const uint8_t top_left = top_ptr[-1];
+  auto* const dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+}
+
+void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+}
+
+void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+}
+
+void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const __m128i left = LoadUnaligned16(left_column);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
+}
+
+void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+}
+
+void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const left_column) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  const __m128i left_0 = LoadUnaligned16(left_ptr);
+  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
+  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
+  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const __m128i top_0 = LoadUnaligned16(top_ptr);
+  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
+  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
+  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
+  const uint8_t top_left = top_ptr[-1];
+  auto* dst = static_cast<uint8_t*>(dest);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_2);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
+  dst += (stride << 4);
+  WritePaeth16x16(dst, stride, top_left, top_0, left_3);
+  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
+  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
+  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
+}
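+
+// Editorial sketch, not part of the upstream file: a minimal scalar model of
+// the Paeth selection that the WritePaeth* helpers above vectorize. Each
+// output pixel picks whichever of |left|, |top|, and |top_left| is closest
+// to left + top - top_left.
+inline uint8_t PaethScalarSketch(uint8_t left, uint8_t top,
+                                 uint8_t top_left) {
+  const int base = left + top - top_left;
+  const int left_dist = (base > left) ? base - left : left - base;
+  const int top_dist = (base > top) ? base - top : top - base;
+  const int top_left_dist =
+      (base > top_left) ? base - top_left : top_left - base;
+  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
+  return (top_dist <= top_left_dist) ? top : top_left;
+}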
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to
+// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
+// involved in the output. Hence |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+                                    const uint8_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    return;
+  }
+  int y = 0;
+  do {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    dst += stride;
+    memcpy(dst, top + offset + 4, width);
+    dst += stride;
+    memcpy(dst, top + offset + 5, width);
+    dst += stride;
+    memcpy(dst, top + offset + 6, width);
+    dst += stride;
+    memcpy(dst, top + offset + 7, width);
+    dst += stride;
+
+    offset += 8;
+    y += 8;
+  } while (y < height);
+}
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+                                 const uint8_t* const top, const int height,
+                                 const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const int rounding_bits = 5;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+  const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+                                    : _mm_set_epi64x(0, 0x0403030202010100);
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+  // to make the logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  int y = 0;
+  int top_x = xstep;
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i max_shift = _mm_set1_epi8(32);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+    const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadLo8(top + top_base_x);
+    const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+    prod = RightShiftWithRounding_U16(prod, rounding_bits);
+    // Replace pixels from invalid range with top-right corner.
+    prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+    Store4(dst, _mm_packus_epi16(prod, prod));
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
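+
+// Editorial sketch, not part of the upstream file: the interpolation above
+// in scalar form, assuming a nonnegative |top_x| and no bounds clamping. A
+// 5-bit weight |shift| blends two adjacent top pixels, and the rounding
+// shift drops the weighting bits, mirroring the vector math.
+inline uint8_t DirectionalZone1PixelSketch(const uint8_t* const top,
+                                           const int top_x,
+                                           const int upsample_shift) {
+  const int top_base_x = top_x >> (6 - upsample_shift);
+  const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+  const int val =
+      top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+  return static_cast<uint8_t>((val + 16) >> 5);
+}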
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+                                   const uint8_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use memcpy.
+  // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+  // to make the logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) &
+        ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max =
+          _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this
+      // prevents reading out of bounds. If all indices are past max and we
+      // don't need to use the loaded bytes at all, |top_base_x| becomes 0.
+      // |top_base_x| will reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    // Corner-only section of the row.
+    memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const int width, const int height,
+                                    const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled);
+    return;
+  }
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+    int top_x = xstep;
+    int y = 0;
+    do {
+      int top_base_x = top_x >> scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi8(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+        __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+        vals = _mm_maddubs_epi16(vals, shifts);
+        vals = RightShiftWithRounding_U16(vals, rounding_bits);
+        StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (++y < height);
+    return;
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dest, top_row[max_base_x], width);
+        dest += stride;
+      }
+      return;
+    }
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    for (; x < width - 8;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max =
+          _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this
+      // prevents reading out of bounds. If all indices are past max and we
+      // don't need to use the loaded bytes at all, |top_base_x| becomes 0.
+      // |top_base_x| will reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(top_row + top_base_x);
+    } else {
+      const __m128i top_vals = LoadLo8(top_row + top_base_x);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+    StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    dest += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const int width, const int height,
+                                           const int xstep,
+                                           const bool upsampled_top) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+                          upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const int rounding_bits = 5;
+
+  __m128i result_block[4];
+  for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadLo8(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadLo8(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    result_block[x] = _mm_packus_epi16(vals, vals);
+  }
+  const __m128i result = Transpose4x4_U8(result_block);
+  // This is result_row0.
+  Store4(dest, result);
+  dest += stride;
+  const int result_row1 = _mm_extract_epi32(result, 1);
+  memcpy(dest, &result_row1, sizeof(result_row1));
+  dest += stride;
+  const int result_row2 = _mm_extract_epi32(result, 2);
+  memcpy(dest, &result_row2, sizeof(result_row2));
+  dest += stride;
+  const int result_row3 = _mm_extract_epi32(result, 3);
+  memcpy(dest, &result_row3, sizeof(result_row3));
+}
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler =
+      _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const int rounding_bits = 5;
+
+  __m128i result_block[8];
+  for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+  }
+  Transpose8x8_U16(result_block, result_block);
+  for (int y = 0; y < height; ++y) {
+    StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int ystep,
+                                           const bool upsampled) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (width == 4 || height == 4) {
+    const ptrdiff_t stride4 = stride << 2;
+    if (upsampled) {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<true>(
+              dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    } else {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+                                      ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    }
+    return;
+  }
+
+  const ptrdiff_t stride8 = stride << 3;
+  if (upsampled) {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<true, 8>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  } else {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<false, 8>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  }
+}
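+
+// Editorial sketch, not part of the upstream file: zone 3 in scalar form,
+// ignoring upsampling and edge clamping. Each destination column x walks
+// down the left column at a slope of |ystep|/64; the SIMD code above
+// computes 4 or 8 such columns at a time and transposes them into rows.
+inline void DirectionalZone3ScalarSketch(uint8_t* dst, const ptrdiff_t stride,
+                                         const uint8_t* const left,
+                                         const int width, const int height,
+                                         const int ystep) {
+  for (int x = 0; x < width; ++x) {
+    int left_y = (x + 1) * ystep;
+    for (int y = 0; y < height; ++y, left_y += 64) {
+      const int base_y = left_y >> 6;
+      const int shift = (left_y & 0x3F) >> 1;
+      const int val = left[base_y] * (32 - shift) + left[base_y + 1] * shift;
+      dst[y * stride + x] = static_cast<uint8_t>((val + 16) >> 5);
+    }
+  }
+}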
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds) {
+  const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds,
+                                     const __m128i& bounds_selector) {
+  const __m128i max_dest_x_vect =
+      _mm_shuffle_epi8(zone_bounds, bounds_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+                                                 const __m128i& shifts,
+                                                 const __m128i& sampler) {
+  const __m128i src_vals = LoadUnaligned16(source);
+  __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+  vals = _mm_maddubs_epi16(vals, shifts);
+  return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+  const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+  // |left_column| and sampler are both offset by 15 so the indices are always
+  // positive.
+  const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+  for (int y = 0; y < 4; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+    // can work as shuffle indices. Some values may be out of bounds, but
+    // their pred results will be masked over by top prediction.
+    sampler = _mm_add_epi8(sampler, positive_offset);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column + (y << upsample_shift), shifts, sampler);
+    Store4(dst, _mm_packus_epi16(vals, vals));
+  }
+}
+
+// The height at which a load of 16 bytes will not contain enough source
+// pixels from |left_column| to supply an accurate row when computing 8
+// pixels at a time. The values are found by inspection. By coincidence, all
+// angles that satisfy (ystep >> 6) == 2 map to the same value, so it is
+// enough to look up by ystep >> 6. The largest index for this lookup is
+// 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_set1_epi8(1);
+  const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+  for (int y = 0; y < 8; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+    // Offset the relative index because ystep is negative in Zone 2 and
+    // shuffle indices must be nonnegative.
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    sampler = _mm_add_epi8(sampler, denegation);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+    // The specification adds (y << 6) to left_y, which is subject to
+    // upsampling, but this puts sampler indices out of the 0-15 range. It is
+    // equivalent to offset the source address by (y << upsample_shift)
+    // instead.
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+        sampler);
+    StoreLo8(dst, _mm_packus_epi16(vals, vals));
+  }
+}
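+
+// Editorial sketch, not part of the upstream file: the zone 2 source
+// selection in scalar terms, ignoring upsampling. The top-row sample index
+// for (x, y) is (x << 6) - (y + 1) * xstep; once it drops below -1 the
+// pixel must come from the left column instead. The vector code computes
+// both candidates and blends them using the per-row |zone_bounds| index.
+inline bool DirectionalZone2UsesLeftSketch(const int x, const int y,
+                                           const int xstep) {
+  return ((x << 6) - (y + 1) * xstep) < -1;
+}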
+
+// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
+// upsampled_top), for each row. When there are 4 values, they can be
+// duplicated with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+  top_x -= xstep;
+
+  int top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+  DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+  DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+  DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+  DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  __m128i y_selector = _mm_set1_epi32(0x01000100);
+  const __m128i index_increment = _mm_set1_epi32(0x02020202);
+  for (int y = 0; y < height; ++y,
+           y_selector = _mm_add_epi8(y_selector, index_increment),
+           dest += stride) {
+    top_x -= xstep;
+    const int top_base_x = top_x >> scale_bits_x;
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+    DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds,
+                             y_selector);
+  }
+}
+
+// 7.11.2.4 (8) 90 < angle > 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle
+// only blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that
+// permit it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const uint8_t* const left_column,
+                                    const int width, const int height,
+                                    const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride8 = stride << 3;
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute. This assumes minimum |xstep| is 3.
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  // For steep angles, the source pixels from left_column may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  const int max_shuffle_height =
+      std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+  const int xstep8 = xstep << 3;
+  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+  // Accumulate xstep across 8 rows.
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+  const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep8 = ystep << 3;
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
+
+  const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+  int x = 0;
+
+  // This loop treats each set of 4 columns in 3 stages with y-value
+  // boundaries. The first stage, before the first y-loop, covers blocks that
+  // are only computed from the top row. The second stage, comprising two
+  // y-loops, covers blocks that have a mixture of values computed from top
+  // or left. The final stage covers blocks that are only computed from the
+  // left.
+  for (int left_offset = -left_base_increment; x < min_top_only_x;
+       x += 8,
+       xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+       // Watch left_y because it can still get big.
+       left_y = _mm_add_epi16(left_y, increment_left8),
+       left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    DirectionalZone1_4xH(dst_x + 4, stride,
+                         top_row + ((x + 4) << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns, only
+    // need |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep
+    // is high. This means that max_shuffle_height is unbounded and
+    // xstep_bounds will overflow in 16 bits. This is prevented by stopping
+    // the first blending loop at min_left_only_y for such cases, which means
+    // we skip over the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+         top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_8xH<upsampled_top, 8>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Pick up from the last y-value, using the 10% slower but secure method
+    // for left prediction.
+    const auto base_left_y =
+        static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+         top_x -= xstep8) {
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift),
+          base_left_y, -ystep);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      DirectionalZone1Blend_8xH<upsampled_top, 8>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift),
+          base_left_y, -ystep);
+    }
+  }
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+                                      const uint8_t* const top_row,
+                                      const uint8_t* const left_column,
+                                      const int width, const int height,
+                                      const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  // All columns from |min_top_only_x| to the right will only need |top_row|
+  // to compute.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  const int xstep4 = xstep << 2;
+  const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+  __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep4 = ystep << 2;
+  const int left_base_increment4 = ystep4 >> 6;
+  // This is guaranteed to be less than 64, but accumulation may bring it
+  // past 64 for higher x values.
+  const int ystep_remainder4 = ystep4 & 0x3F;
+  const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+  const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which will go into the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+  int x = 0;
+  // Loop over x for columns with a mixture of sources.
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+       xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+       left_y = _mm_add_epi16(left_y, increment_left4),
+       left_offset -= left_base_increment4) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y =
+        std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns, only
+    // need |left_column| to compute. Rounded up to the nearest multiple of
+    // 4.
+    const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    // Loop over y for mixed rows.
+    for (; y < min_left_only_y;
+         y += 4, dst_x += stride4,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+         top_x -= xstep4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+          left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_4x4<upsampled_top>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left-only rows, if any.
+    for (; y < height; y += 4, dst_x += stride4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+    }
+  }
+  // Loop over top-only columns, if any.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int xstep, const int ystep,
+                                           const bool upsampled_top,
+                                           const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the
+  // base.
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16,
+         160);
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
+  if (width == 4 || height == 4) {
+    if (upsampled_left) {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                              width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr,
+                                               left_ptr, width, height, xstep,
+                                               ystep);
+      }
+    } else {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr,
+                                               left_ptr, width, height, xstep,
+                                               ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr,
+                                                left_ptr, width, height,
+                                                xstep, ystep);
+      }
+    }
+    return;
+  }
+  if (upsampled_left) {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                          width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    }
+  } else {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+                                            width, height, xstep, ystep);
+    }
+  }
+}
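+
+// Editorial sketch, not part of the upstream file: one filter-intra output
+// pixel in scalar form. Each pixel of a 4x2 block is a rounded, clipped
+// weighted sum of seven reference pixels (top-left, four top, two left);
+// the eighth tap is zero padding, as noted below.
+inline uint8_t FilterIntraPixelSketch(const int8_t* const taps,
+                                      const uint8_t* const ref) {
+  int sum = 0;
+  for (int i = 0; i < 7; ++i) sum += taps[i] * ref[i];
+  int val = (sum + 8) >> 4;  // RightShiftWithRounding(sum, 4).
+  if (val < 0) val = 0;
+  if (val > 255) val = 255;
+  return static_cast<uint8_t>(val);
+}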
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+
+// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th
+// at zero to preserve the sum.
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+                             const __m128i& pixels, const __m128i& taps_0_1,
+                             const __m128i& taps_2_3, const __m128i& taps_4_5,
+                             const __m128i& taps_6_7) {
+  const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+  const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+  // |output_half| contains 8 partial sums.
+  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+  __m128i output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row0 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* arbitrary pack arg */ output);
+  Store4(dst, output_row0);
+  const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+  const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+  output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row1 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* arbitrary pack arg */ output);
+  Store4(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. This implementation
+// loads TL from the top row for the first block, so it is not taken from
+// the left column.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+                      const uint8_t* const top_ptr,
+                      const uint8_t* const left_ptr, FilterIntraPredictor pred,
+                      const int height) {
+  const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+  __m128i top = Load4(top_ptr - 1);
+  __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+  __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+  left = _mm_slli_si128(left, 5);
+
+  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0],
+  // left[1], left[2], left[3], left[4], left[5], left[6], left[7]
+  pixels = _mm_or_si128(left, pixels);
+
+  // Duplicate first 8 bytes.
+  pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 1.
+  pixels = Load4(dest);
+
+  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2],
+  // left[-1], left[0], left[1], ...
+  pixels = _mm_or_si128(left, pixels);
+
+  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The
+  // last byte is an unused value, which shall be multiplied by 0 when we
+  // apply the filter.
+  constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  dest += stride;  // Move to y = 2.
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows.
+  // Because the common code below this block assumes that the left vector
+  // has the next TL at position 8, the height == 16 path must preserve that
+  // arrangement.
+  if (height == 16) {
+    // This shift allows us to use pixel_order2 twice after shifting by 2
+    // later.
+    left = _mm_slli_si128(left, 1);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-4], left[-3], left[-2], left[-1], left[0], left[1], left[2],
+    // left[3]
+    pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+    constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+    // Insert (relative) left[-1] in front as TL and put left[0] and left[1]
+    // at the end.
+    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 4.
+
+    // First 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Clear all but final pixel in the first 8 of left column.
+    __m128i keep_top_left = _mm_srli_si128(left, 13);
+    dest += stride;  // Move to y = 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+    pixels = _mm_or_si128(left, pixels);
+    left = LoadLo8(left_ptr + 8);
+
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 6.
+
+    // Second 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Position TL value so we can use pixel_order1.
+    keep_top_left = _mm_slli_si128(keep_top_left, 6);
+    dest += stride;  // Move to y = 7.
+    pixels = Load4(dest);
+    left = _mm_slli_si128(left, 7);
+    left = _mm_or_si128(left, keep_top_left);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 8.
+
+    // Third 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 9.
+
+    // Prepare final inputs.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 10.
+
+    // Fourth 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 11.
+  }
+
+  // In both the 8 and 16 case, we assume that the left vector has the next
+  // TL at position 8.
+  if (height > 4) {
+    // Erase prior left pixels by shifting TL to position 0.
+    left = _mm_srli_si128(left, 8);
+    left = _mm_slli_si128(left, 6);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 12 or 4.
+
+    // First of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 13 or 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 14 or 6.
+
+    // Last of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+  }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+                                 const void* const top_row,
+                                 const void* const left_column,
+                                 FilterIntraPredictor pred, const int width,
+                                 const int height) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (width == 4) {
+    Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+    return;
+  }
+
+  // There is one set of 7 taps for each of the 4x2 output pixels.
+  const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
+
+  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15
+  // at the end is an unused value, which shall be multiplied by 0 when we
+  // apply the filter.
+  constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+  // Takes the "left section" and puts it right after p0-p4.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The
+  // last byte is unused as above.
+  constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+  // Shuffles the "top left" from the left section, to the front. Used when
+  // grabbing data from left_column and not top_row.
+  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+  // This first pass takes care of the cases where the top left pixel comes
+  // from top_row.
+  __m128i pixels = LoadLo8(top_ptr - 1);
+  __m128i left = _mm_slli_si128(Load4(left_column), 8);
+  pixels = _mm_or_si128(pixels, left);
+
+  // Two sets of the same pixels to multiply with two sets of taps.
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  left = _mm_srli_si128(left, 1);
+
+  // Load the output row written above; it supplies the top pixels of the
+  // next 4x2 block.
+  pixels = Load4(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the final of the first 8
+  // bytes of |pixels|. This is acceptable because the 8th filter tap is
+  // always a padded 0.
+  pixels = _mm_or_si128(pixels, left);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+  const ptrdiff_t stride2 = stride << 1;
+  const ptrdiff_t stride4 = stride << 2;
+  Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                   taps_4_5, taps_6_7);
+  dst += 4;
+  for (int x = 3; x < width - 4; x += 4) {
+    pixels = Load4(top_ptr + x);
+    pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+    pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    pixels = Load4(dst + stride - 1);
+    pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+    pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+    dst += 4;
+  }
+
+  // Now we handle heights that reference previous blocks rather than
+  // top_row.
+  for (int y = 4; y < height; y += 4) {
+    // Leftmost 4x4 block for this height.
+    dst -= width;
+    dst += stride4;
+
+    // Top Left is not available by offset in these leftmost blocks.
+    pixels = Load4(dst - stride);
+    left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+    left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // The bytes shifted into positions 6 and 7 will be ignored by the
+    // shuffle.
+    left = _mm_srli_si128(left, 2);
+    pixels = Load4(dst + stride);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+
+    dst += 4;
+
+    // Remaining 4x4 blocks for this height.
+    for (int x = 4; x < width; x += 4) {
+      pixels = Load4(dst - stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                       taps_6_7);
+      pixels = Load4(dst + stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                       taps_4_5, taps_6_7);
+      dst += 4;
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
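+// Editorial note, not part of the upstream file: a hedged sketch of how
+// these table entries are consumed elsewhere in the library. Callers fetch
+// the per-bitdepth table once and invoke whichever pointer survived the
+// highest enabled optimization level:
+//   const Dsp* const dsp = GetDspTable(kBitdepth8);
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop](
+//       dst, stride, top_row, left_column);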
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor) + dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1) + dsp->directional_intra_predictor_zone1 = + DirectionalIntraPredictorZone1_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2) + dsp->directional_intra_predictor_zone2 = + DirectionalIntraPredictorZone2_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3) + dsp->directional_intra_predictor_zone3 = + DirectionalIntraPredictorZone3_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] = + DcDefs::_4x4::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] = + DcDefs::_4x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] = + DcDefs::_4x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] = + DcDefs::_8x4::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] = + DcDefs::_8x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] = + DcDefs::_8x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] = + DcDefs::_8x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] = + DcDefs::_16x4::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] = + DcDefs::_16x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] = + DcDefs::_16x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] = + DcDefs::_16x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] = + DcDefs::_16x64::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] = + DcDefs::_32x8::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] = + DcDefs::_32x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] = + DcDefs::_32x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] = + DcDefs::_32x64::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] = + DcDefs::_64x16::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop) + 
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] = + DcDefs::_64x32::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] = + DcDefs::_64x64::DcTop; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] = + DcDefs::_4x4::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] = + DcDefs::_4x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] = + DcDefs::_4x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] = + DcDefs::_8x4::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] = + DcDefs::_8x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] = + DcDefs::_8x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] = + DcDefs::_8x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] = + DcDefs::_16x4::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] = + DcDefs::_16x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] = + DcDefs::_16x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] = + DcDefs::_16x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] = + DcDefs::_16x64::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] = + DcDefs::_32x8::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] = + DcDefs::_32x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] = + DcDefs::_32x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] = + DcDefs::_32x64::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] = + DcDefs::_64x16::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] = + DcDefs::_64x32::DcLeft; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] = + DcDefs::_64x64::DcLeft; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] = + DcDefs::_4x4::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] = + DcDefs::_4x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] = + DcDefs::_4x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] = + DcDefs::_8x4::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] = + DcDefs::_8x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] = + DcDefs::_8x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] = + DcDefs::_8x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] = + DcDefs::_16x4::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] = + DcDefs::_16x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] = + DcDefs::_16x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] = + DcDefs::_16x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] = + DcDefs::_16x64::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] = + DcDefs::_32x8::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] = + DcDefs::_32x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] = + DcDefs::_32x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] = + DcDefs::_32x64::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] = + DcDefs::_64x16::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] = + DcDefs::_64x32::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] = + DcDefs::_64x64::Dc; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] = + Paeth4x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] = + Paeth4x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] = + Paeth4x16_SSE4_1; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] = + Paeth8x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] = + Paeth8x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] = + Paeth8x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] = + Paeth8x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] = + Paeth16x4_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] = + Paeth16x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] = + Paeth16x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] = + Paeth16x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] = + Paeth16x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] = + Paeth32x8_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] = + Paeth32x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] = + Paeth32x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] = + Paeth32x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] = + Paeth64x16_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] = + Paeth64x32_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] = + Paeth64x64_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] = + DirDefs::_4x4::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] = + DirDefs::_4x8::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] = + DirDefs::_4x16::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] = + DirDefs::_8x4::Horizontal; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DirDefs::_8x8::Horizontal; +#endif +#if 
DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
+      DirDefs::_8x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      DirDefs::_8x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
+      DirDefs::_16x4::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      DirDefs::_16x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
+      DirDefs::_16x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
+      DirDefs::_16x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
+      DirDefs::_16x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
+      DirDefs::_32x8::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
+      DirDefs::_32x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
+      DirDefs::_32x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      DirDefs::_32x64::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
+      DirDefs::_64x16::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
+      DirDefs::_64x32::Horizontal;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
+      DirDefs::_64x64::Horizontal;
+#endif
+}  // NOLINT(readability/fn_size)
+// TODO(petersonab): Split Init8bpp function into family-specific files.
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+template <int height>
+inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const __m128i dc) {
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
+  int y = height - 1;
+  auto* dst = static_cast<uint8_t*>(dest);
+  do {
+    StoreLo8(dst, dc_dup);
+    dst += stride;
+  } while (--y != 0);
+  StoreLo8(dst, dc_dup);
+}
+
+// WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
+// identical shorts that need N total copies written into dest. The unpacking
+// works the same as in the 8bpp case, except that each 32-bit unit needs twice
+// as many copies.
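+// Worked example (an explanatory sketch, not upstream text): suppose dup32
+// holds 32-bit lanes {X, Y, Z, W}, each lane one pixel value repeated in both
+// of its 16-bit halves. Then _mm_unpacklo_epi32(dup32, dup32) = {X, X, Y, Y}:
+// the low 64 bits are four uint16 copies of X (a full 4-wide row) and the
+// high 64 bits are four copies of Y, which WriteDuplicate4x4 below stores as
+// two consecutive rows.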
+inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  auto* dst = static_cast<uint8_t*>(dest);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
+  dst += stride;
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
+  dst += stride;
+  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
+}
+
+inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
+                              const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+}
+
+inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+}
+
+inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
+}
+
+inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
+                               const __m128i dup32) {
+  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
+  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
+
+  auto* dst = static_cast<uint8_t*>(dest);
+  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
+  }
+  dst += stride;
+  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
+  }
+  dst += stride;
+  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
+  }
+  dst += stride;
+  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
+  for (int x = 0; x < 128; x += 16) {
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
+  }
+}
+
+using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
+                                    const __m128i dup32);
+
+// ColStoreN<height> copies each of the |height| values in |column| across its
+// corresponding row in dest.
+template <WriteDuplicateFunc writefn>
+inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
+                             const void* const column) {
+  const __m128i col_data = LoadLo8(column);
+  const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
+  writefn(dest, stride, col_dup32);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
+                             const void* const column) {
+  const __m128i col_data = LoadUnaligned16(column);
+  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+  auto* dst = static_cast<uint8_t*>(dest);
+  writefn(dst, stride, col_dup32_lo);
+  const ptrdiff_t stride4 = stride << 2;
+  dst += stride4;
+  writefn(dst, stride, col_dup32_hi);
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 32; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 64; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
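+// (Addressing note, an explanatory aside: |column| is walked in byte offsets,
+// so each y += 16 step consumes eight 16-bit column entries; ColStore32 above
+// reads 64 bytes for its 32 rows and ColStore64 below reads 128 bytes for its
+// 64 rows.)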
+
+template <WriteDuplicateFunc writefn>
+inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
+                              const void* const column) {
+  const ptrdiff_t stride4 = stride << 2;
+  auto* dst = static_cast<uint8_t*>(dest);
+  for (int y = 0; y < 128; y += 16) {
+    const __m128i col_data =
+        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
+    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
+    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
+    writefn(dst, stride, col_dup32_lo);
+    dst += stride4;
+    writefn(dst, stride, col_dup32_hi);
+    dst += stride4;
+  }
+}
+
+// |ref| points to 8 bytes containing 4 packed int16 values.
+inline __m128i DcSum4_SSE4_1(const void* ref) {
+  const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
+  const __m128i ones = _mm_set1_epi16(1);
+
+  // half_sum[31:0] = a1+a2
+  // half_sum[63:32] = a3+a4
+  const __m128i half_sum = _mm_madd_epi16(vals, ones);
+  // Place half_sum[63:32] in shift_sum[31:0].
+  const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
+  // The low 32 bits of the result now hold the full sum a1+a2+a3+a4.
+  return _mm_add_epi32(half_sum, shift_sum);
+}
+
+struct DcDefs {
+  DcDefs() = delete;
+
+  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
+                                  DcStore4xH_SSE4_1<4>, 0, 0>;
+};
+
+struct DirDefs {
+  DirDefs() = delete;
+
+  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
+  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
+  using _4x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
+  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
+  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
+  using _8x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
+  using _8x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
+  using _16x4 =
+      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
+  using _16x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
+  using _16x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
+  using _16x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
+  using _16x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
+  using _32x8 =
+      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
+  using _32x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
+  using _32x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
+  using _32x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
+  using _64x16 =
+      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
+  using _64x32 =
+      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
+  using _64x64 =
+      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
+};
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
+      DcDefs::_4x4::DcTop;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
+      DcDefs::_4x4::DcLeft;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
+      DcDefs::_4x4::Dc;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
+      DirDefs::_4x4::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      DirDefs::_4x8::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      DirDefs::_4x16::Horizontal;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
+      DirDefs::_8x4::Horizontal;
+#endif
+#if
DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] = + DirDefs::_8x8::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] = + DirDefs::_8x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] = + DirDefs::_8x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] = + DirDefs::_16x4::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] = + DirDefs::_16x8::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] = + DirDefs::_16x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] = + DirDefs::_16x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] = + DirDefs::_16x64::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] = + DirDefs::_32x8::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] = + DirDefs::_32x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] = + DirDefs::_32x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] = + DirDefs::_32x64::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] = + DirDefs::_64x16::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] = + DirDefs::_64x32::Horizontal; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal) + dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] = + DirDefs::_64x64::Horizontal; +#endif +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void IntraPredInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void IntraPredInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/intrapred_sse4.h b/src/dsp/x86/intrapred_sse4.h new file mode 100644 index 0000000..7f4fcd7 --- /dev/null +++ b/src/dsp/x86/intrapred_sse4.h @@ -0,0 +1,1060 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
+ * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*, +// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and +// Dsp::filter_intra_predictor, see the defines below for specifics. These +// functions are not thread-safe. +void IntraPredInit_SSE4_1(); +void IntraPredCflInit_SSE4_1(); +void IntraPredSmoothInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor +#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 +#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor +#define 
LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor +#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft +#define 
LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define
LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical 
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal +#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +//------------------------------------------------------------------------------ +// 10bpp + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef 
LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal +#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_ diff --git a/src/dsp/x86/inverse_transform_sse4.cc b/src/dsp/x86/inverse_transform_sse4.cc new file mode 100644 index 0000000..787d706 --- /dev/null +++ b/src/dsp/x86/inverse_transform_sse4.cc @@ -0,0 +1,3086 @@ +// Copyright 2019 The libgav1 Authors + +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/inverse_transform.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include <smmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/array_2d.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +// Include the constants and utility functions inside the anonymous namespace. +#include "src/dsp/inverse_transform.inc"
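The shared .inc pulled in above supplies the fixed-point helpers used throughout this file (Cos128, Sin128, kTransformRowMultiplier). As a hedged scalar model of the convention they follow — the 12-bit AV1 scaling, reconstructed here rather than copied from upstream:

    #include <cmath>
    #include <cstdint>

    // Cos128(a) ~= round(cos(a * pi / 128) * 4096): a 12-bit fixed-point
    // cosine indexed in 128ths of pi. Since sin(x) == cos(x - pi/2) and
    // pi/2 is 64 index units, Sin128 falls out of the same table.
    int16_t Cos128Model(int angle) {
      constexpr double kPi = 3.14159265358979323846;
      return static_cast<int16_t>(
          std::lround(std::cos(angle * kPi / 128) * 4096));
    }
    int16_t Sin128Model(int angle) { return Cos128Model(angle - 64); }

This scaling is also why the code below feeds `cos128 << 3` to _mm_mulhrs_epi16: mulhrs divides the 32-bit product by 2^15 with rounding, so the extra factor of 8 turns that into the intended division by 2^12.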
+#include "src/dsp/inverse_transform.inc" + +template +LIBGAV1_ALWAYS_INLINE void StoreDst(int16_t* dst, int32_t stride, int32_t idx, + const __m128i* s) { + // NOTE: It is expected that the compiler will unroll these loops. + if (store_width == 16) { + for (int i = 0; i < store_count; i += 4) { + StoreUnaligned16(&dst[i * stride + idx], s[i]); + StoreUnaligned16(&dst[(i + 1) * stride + idx], s[i + 1]); + StoreUnaligned16(&dst[(i + 2) * stride + idx], s[i + 2]); + StoreUnaligned16(&dst[(i + 3) * stride + idx], s[i + 3]); + } + } + if (store_width == 8) { + for (int i = 0; i < store_count; i += 4) { + StoreLo8(&dst[i * stride + idx], s[i]); + StoreLo8(&dst[(i + 1) * stride + idx], s[i + 1]); + StoreLo8(&dst[(i + 2) * stride + idx], s[i + 2]); + StoreLo8(&dst[(i + 3) * stride + idx], s[i + 3]); + } + } +} + +template +LIBGAV1_ALWAYS_INLINE void LoadSrc(const int16_t* src, int32_t stride, + int32_t idx, __m128i* x) { + // NOTE: It is expected that the compiler will unroll these loops. + if (load_width == 16) { + for (int i = 0; i < load_count; i += 4) { + x[i] = LoadUnaligned16(&src[i * stride + idx]); + x[i + 1] = LoadUnaligned16(&src[(i + 1) * stride + idx]); + x[i + 2] = LoadUnaligned16(&src[(i + 2) * stride + idx]); + x[i + 3] = LoadUnaligned16(&src[(i + 3) * stride + idx]); + } + } + if (load_width == 8) { + for (int i = 0; i < load_count; i += 4) { + x[i] = LoadLo8(&src[i * stride + idx]); + x[i + 1] = LoadLo8(&src[(i + 1) * stride + idx]); + x[i + 2] = LoadLo8(&src[(i + 2) * stride + idx]); + x[i + 3] = LoadLo8(&src[(i + 3) * stride + idx]); + } + } +} + +// Butterfly rotate 4 values. +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(__m128i* a, __m128i* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const __m128i psin_pcos = _mm_set1_epi32( + static_cast(cos128) | (static_cast(sin128) << 16)); + const __m128i ba = _mm_unpacklo_epi16(*a, *b); + const __m128i ab = _mm_unpacklo_epi16(*b, *a); + const __m128i sign = + _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001); + // -sin cos, -sin cos, -sin cos, -sin cos + const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); + const __m128i x0 = _mm_madd_epi16(ba, msin_pcos); + const __m128i y0 = _mm_madd_epi16(ab, psin_pcos); + const __m128i x1 = RightShiftWithRounding_S32(x0, 12); + const __m128i y1 = RightShiftWithRounding_S32(y0, 12); + const __m128i x = _mm_packs_epi32(x1, x1); + const __m128i y = _mm_packs_epi32(y1, y1); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +// Butterfly rotate 8 values. 
+ +// Butterfly rotate 8 values. +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_8(__m128i* a, __m128i* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const __m128i psin_pcos = _mm_set1_epi32( + static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16)); + const __m128i sign = + _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001); + // -sin cos, -sin cos, -sin cos, -sin cos + const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign); + const __m128i ba = _mm_unpacklo_epi16(*a, *b); + const __m128i ab = _mm_unpacklo_epi16(*b, *a); + const __m128i ba_hi = _mm_unpackhi_epi16(*a, *b); + const __m128i ab_hi = _mm_unpackhi_epi16(*b, *a); + const __m128i x0 = _mm_madd_epi16(ba, msin_pcos); + const __m128i y0 = _mm_madd_epi16(ab, psin_pcos); + const __m128i x0_hi = _mm_madd_epi16(ba_hi, msin_pcos); + const __m128i y0_hi = _mm_madd_epi16(ab_hi, psin_pcos); + const __m128i x1 = RightShiftWithRounding_S32(x0, 12); + const __m128i y1 = RightShiftWithRounding_S32(y0, 12); + const __m128i x1_hi = RightShiftWithRounding_S32(x0_hi, 12); + const __m128i y1_hi = RightShiftWithRounding_S32(y0_hi, 12); + const __m128i x = _mm_packs_epi32(x1, x1_hi); + const __m128i y = _mm_packs_epi32(y1, y1_hi); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(__m128i* a, __m128i* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const __m128i pcos = _mm_set1_epi16(cos128 << 3); + const __m128i psin = _mm_set1_epi16(-(sin128 << 3)); + const __m128i x = _mm_mulhrs_epi16(*b, psin); + const __m128i y = _mm_mulhrs_epi16(*b, pcos); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(__m128i* a, + __m128i* b, + const int angle, + const bool flip) { + const int16_t cos128 = Cos128(angle); + const int16_t sin128 = Sin128(angle); + const __m128i pcos = _mm_set1_epi16(cos128 << 3); + const __m128i psin = _mm_set1_epi16(sin128 << 3); + const __m128i x = _mm_mulhrs_epi16(*a, pcos); + const __m128i y = _mm_mulhrs_epi16(*a, psin); + if (flip) { + *a = y; + *b = x; + } else { + *a = x; + *b = y; + } +} + +LIBGAV1_ALWAYS_INLINE void HadamardRotation(__m128i* a, __m128i* b, bool flip) { + __m128i x, y; + if (flip) { + y = _mm_adds_epi16(*b, *a); + x = _mm_subs_epi16(*b, *a); + } else { + x = _mm_adds_epi16(*a, *b); + y = _mm_subs_epi16(*a, *b); + } + *a = x; + *b = y; +} + +using ButterflyRotationFunc = void (*)(__m128i* a, __m128i* b, int angle, + bool flip); + +LIBGAV1_ALWAYS_INLINE __m128i ShiftResidual(const __m128i residual, + const __m128i v_row_shift_add, + const __m128i v_row_shift) { + const __m128i k7ffd = _mm_set1_epi16(0x7ffd); + // The max row_shift is 2, so int16_t values greater than 0x7ffd may + // overflow. Generate a mask for this case. + const __m128i mask = _mm_cmpgt_epi16(residual, k7ffd); + const __m128i x = _mm_add_epi16(residual, v_row_shift_add); + // Assume int16_t values. + const __m128i a = _mm_sra_epi16(x, v_row_shift); + // Assume uint16_t values. + const __m128i b = _mm_srl_epi16(x, v_row_shift); + // Select the correct shifted value. + return _mm_blendv_epi8(a, b, mask); +} + +//------------------------------------------------------------------------------ +// Discrete Cosine Transforms (DCT).
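The DcOnly paths that follow all hinge on adjusted_tx_height == 1: when only the DC coefficient survives quantization, every stage of the DCT collapses to one angle-32 butterfly against zero, i.e. a multiply by cos(pi/4). A worked scalar sketch (2896 is round(cos(pi/4) * 4096); the same value, as round(4096 / sqrt(2)), is assumed for kTransformRowMultiplier):

    #include <cstdint>

    int16_t DctDcOnlyModel(int16_t dc, bool should_round, int row_shift) {
      // Optional row rounding: multiply by 1 / sqrt(2).
      if (should_round) dc = static_cast<int16_t>((dc * 2896 + 2048) >> 12);
      // The whole DC-only pass is one multiply by cos(pi/4).
      int32_t xy = (dc * 2896 + 2048) >> 12;
      // Row shift with rounding, mirroring the 32-bit widening code below.
      if (row_shift > 0) xy = (xy + (1 << (row_shift - 1))) >> row_shift;
      return static_cast<int16_t>(xy);
    }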
+ +template <int width> +LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); + const __m128i v_src = + (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0); + const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask); + const int16_t cos128 = Cos128(32); + const __m128i xy = _mm_mulhrs_epi16(s0, _mm_set1_epi16(cos128 << 3)); + + // Expand to 32 bits to prevent int16_t overflows during the shift add. + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_cvtepi16_epi32(xy); + const __m128i a1 = _mm_cvtepi16_epi32(_mm_srli_si128(xy, 8)); + const __m128i b = _mm_add_epi32(a, v_row_shift_add); + const __m128i b1 = _mm_add_epi32(a1, v_row_shift_add); + const __m128i c = _mm_sra_epi32(b, v_row_shift); + const __m128i c1 = _mm_sra_epi32(b1, v_row_shift); + const __m128i xy_shifted = _mm_packs_epi32(c, c1); + + if (width == 4) { + StoreLo8(dst, xy_shifted); + } else { + for (int i = 0; i < width; i += 8) { + StoreUnaligned16(dst, xy_shifted); + dst += 8; + } + } + return true; +} + +template <int height> +LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + const int16_t cos128 = Cos128(32); + + // Calculate dc values for first row. + if (width == 4) { + const __m128i v_src = LoadLo8(dst); + const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3)); + StoreLo8(dst, xy); + } else { + int i = 0; + do { + const __m128i v_src = LoadUnaligned16(&dst[i]); + const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3)); + StoreUnaligned16(&dst[i], xy); + i += 8; + } while (i < width); + } + + // Copy first row to the rest of the block. + for (int y = 1; y < height; ++y) { + memcpy(&dst[y * width], dst, width * sizeof(dst[0])); + } + return true; +} + +template <ButterflyRotationFunc butterfly_rotation, bool is_fast_butterfly = false> +LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) { + // stage 12. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true); + ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false); + } else { + butterfly_rotation(&s[0], &s[1], 32, true); + butterfly_rotation(&s[2], &s[3], 48, false); + } + + // stage 17. + HadamardRotation(&s[0], &s[3], false); + HadamardRotation(&s[1], &s[2], false); +}
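The stage numbers in Dct4Stages look discontinuous on their own; they appear to index one global butterfly schedule shared by every DCT size, which is what lets an N-point transform run the stage blocks of all smaller sizes first and then its own. A sketch of that composition (mirroring the call sequences further down; the wrapper name is illustrative):

    // Hypothetical 8-point column pass, composed from the shared stage
    // blocks; the real drivers below instantiate the same pattern.
    inline void Dct8Composition(__m128i* s /* 8 bit-reversed inputs */) {
      Dct4Stages<ButterflyRotation_4>(s);  // stages 12 and 17
      Dct8Stages<ButterflyRotation_4>(s);  // stages 8, 13, 18 and 22
    }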
+ +// Process 4 dct4 rows or columns, depending on the transpose flag. +template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular> +LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[4], x[4]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[8]; + LoadSrc<8, 8>(dst, step, 0, input); + Transpose4x8To8x4_U16(input, x); + } else { + LoadSrc<16, 4>(dst, step, 0, x); + } + } else { + LoadSrc<8, 4>(dst, step, 0, x); + if (transpose) { + Transpose4x4_U16(x, x); + } + } + // stage 1. + // kBitReverseLookup 0, 2, 1, 3 + s[0] = x[0]; + s[1] = x[2]; + s[2] = x[1]; + s[3] = x[3]; + + Dct4Stages<butterfly_rotation>(s); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[8]; + Transpose8x4To4x8_U16(s, output); + StoreDst<8, 8>(dst, step, 0, output); + } else { + StoreDst<16, 4>(dst, step, 0, s); + } + } else { + if (transpose) { + Transpose4x4_U16(s, s); + } + StoreDst<8, 4>(dst, step, 0, s); + } +} + +template <ButterflyRotationFunc butterfly_rotation, bool is_fast_butterfly = false> +LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) { + // stage 8. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false); + ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false); + } else { + butterfly_rotation(&s[4], &s[7], 56, false); + butterfly_rotation(&s[5], &s[6], 24, false); + } + + // stage 13. + HadamardRotation(&s[4], &s[5], false); + HadamardRotation(&s[6], &s[7], true); + + // stage 18. + butterfly_rotation(&s[6], &s[5], 32, true); + + // stage 22. + HadamardRotation(&s[0], &s[7], false); + HadamardRotation(&s[1], &s[6], false); + HadamardRotation(&s[2], &s[5], false); + HadamardRotation(&s[3], &s[4], false); +} + +// Process dct8 rows or columns, depending on the transpose flag. +template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular> +LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[8], x[8]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8_U16(input, x); + } else { + LoadSrc<8, 8>(dst, step, 0, x); + } + } else { + if (transpose) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, 0, input); + Transpose8x8_U16(input, x); + } else { + LoadSrc<16, 8>(dst, step, 0, x); + } + } + + // stage 1. + // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7, + s[0] = x[0]; + s[1] = x[4]; + s[2] = x[2]; + s[3] = x[6]; + s[4] = x[1]; + s[5] = x[5]; + s[6] = x[3]; + s[7] = x[7]; + + Dct4Stages<butterfly_rotation>(s); + Dct8Stages<butterfly_rotation>(s); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[4]; + Transpose4x8To8x4_U16(s, output); + StoreDst<16, 4>(dst, step, 0, output); + } else { + StoreDst<8, 8>(dst, step, 0, s); + } + } else { + if (transpose) { + __m128i output[8]; + Transpose8x8_U16(s, output); + StoreDst<16, 8>(dst, step, 0, output); + } else { + StoreDst<16, 8>(dst, step, 0, s); + } + } +} + +template <ButterflyRotationFunc butterfly_rotation, bool is_fast_butterfly = false> +LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) { + // stage 5. + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false); + ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false); + ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false); + ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false); + } else { + butterfly_rotation(&s[8], &s[15], 60, false); + butterfly_rotation(&s[9], &s[14], 28, false); + butterfly_rotation(&s[10], &s[13], 44, false); + butterfly_rotation(&s[11], &s[12], 12, false); + } + + // stage 9. + HadamardRotation(&s[8], &s[9], false); + HadamardRotation(&s[10], &s[11], true); + HadamardRotation(&s[12], &s[13], false); + HadamardRotation(&s[14], &s[15], true); + + // stage 14. + butterfly_rotation(&s[14], &s[9], 48, true); + butterfly_rotation(&s[13], &s[10], 112, true); + + // stage 19. + HadamardRotation(&s[8], &s[11], false); + HadamardRotation(&s[9], &s[10], false); + HadamardRotation(&s[12], &s[15], true); + HadamardRotation(&s[13], &s[14], true); + + // stage 23. + butterfly_rotation(&s[13], &s[10], 32, true); + butterfly_rotation(&s[12], &s[11], 32, true); + + // stage 26.
+ HadamardRotation(&s[0], &s[15], false); + HadamardRotation(&s[1], &s[14], false); + HadamardRotation(&s[2], &s[13], false); + HadamardRotation(&s[3], &s[12], false); + HadamardRotation(&s[4], &s[11], false); + HadamardRotation(&s[5], &s[10], false); + HadamardRotation(&s[6], &s[9], false); + HadamardRotation(&s[7], &s[8], false); +} + +// Process dct16 rows or columns, depending on the transpose flag. +template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular> +LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[16], x[16]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8_U16(input, x); + LoadSrc<16, 4>(dst, step, 8, input); + Transpose8x4To4x8_U16(input, &x[8]); + } else { + LoadSrc<8, 16>(dst, step, 0, x); + } + } else { + if (transpose) { + for (int idx = 0; idx < 16; idx += 8) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, idx, input); + Transpose8x8_U16(input, &x[idx]); + } + } else { + LoadSrc<16, 16>(dst, step, 0, x); + } + } + + // stage 1 + // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, + s[0] = x[0]; + s[1] = x[8]; + s[2] = x[4]; + s[3] = x[12]; + s[4] = x[2]; + s[5] = x[10]; + s[6] = x[6]; + s[7] = x[14]; + s[8] = x[1]; + s[9] = x[9]; + s[10] = x[5]; + s[11] = x[13]; + s[12] = x[3]; + s[13] = x[11]; + s[14] = x[7]; + s[15] = x[15]; + + Dct4Stages<butterfly_rotation>(s); + Dct8Stages<butterfly_rotation>(s); + Dct16Stages<butterfly_rotation>(s); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[4]; + Transpose4x8To8x4_U16(s, output); + StoreDst<16, 4>(dst, step, 0, output); + Transpose4x8To8x4_U16(&s[8], output); + StoreDst<16, 4>(dst, step, 8, output); + } else { + StoreDst<8, 16>(dst, step, 0, s); + } + } else { + if (transpose) { + for (int idx = 0; idx < 16; idx += 8) { + __m128i output[8]; + Transpose8x8_U16(&s[idx], output); + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 16>(dst, step, 0, s); + } + } +} + +template <ButterflyRotationFunc butterfly_rotation, bool is_fast_butterfly = false> +LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) { + // stage 3 + if (is_fast_butterfly) { + ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false); + ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false); + ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false); + ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false); + ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false); + ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false); + ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false); + ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false); + } else { + butterfly_rotation(&s[16], &s[31], 62, false); + butterfly_rotation(&s[17], &s[30], 30, false); + butterfly_rotation(&s[18], &s[29], 46, false); + butterfly_rotation(&s[19], &s[28], 14, false); + butterfly_rotation(&s[20], &s[27], 54, false); + butterfly_rotation(&s[21], &s[26], 22, false); + butterfly_rotation(&s[22], &s[25], 38, false); + butterfly_rotation(&s[23], &s[24], 6, false); + } + // stage 6. + HadamardRotation(&s[16], &s[17], false); + HadamardRotation(&s[18], &s[19], true); + HadamardRotation(&s[20], &s[21], false); + HadamardRotation(&s[22], &s[23], true); + HadamardRotation(&s[24], &s[25], false); + HadamardRotation(&s[26], &s[27], true); + HadamardRotation(&s[28], &s[29], false); + HadamardRotation(&s[30], &s[31], true); + + // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true); + butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true); + butterfly_rotation(&s[26], &s[21], 24, true); + butterfly_rotation(&s[25], &s[22], 24 + 64, true); + + // stage 15. + HadamardRotation(&s[16], &s[19], false); + HadamardRotation(&s[17], &s[18], false); + HadamardRotation(&s[20], &s[23], true); + HadamardRotation(&s[21], &s[22], true); + HadamardRotation(&s[24], &s[27], false); + HadamardRotation(&s[25], &s[26], false); + HadamardRotation(&s[28], &s[31], true); + HadamardRotation(&s[29], &s[30], true); + + // stage 20. + butterfly_rotation(&s[29], &s[18], 48, true); + butterfly_rotation(&s[28], &s[19], 48, true); + butterfly_rotation(&s[27], &s[20], 48 + 64, true); + butterfly_rotation(&s[26], &s[21], 48 + 64, true); + + // stage 24. + HadamardRotation(&s[16], &s[23], false); + HadamardRotation(&s[17], &s[22], false); + HadamardRotation(&s[18], &s[21], false); + HadamardRotation(&s[19], &s[20], false); + HadamardRotation(&s[24], &s[31], true); + HadamardRotation(&s[25], &s[30], true); + HadamardRotation(&s[26], &s[29], true); + HadamardRotation(&s[27], &s[28], true); + + // stage 27. + butterfly_rotation(&s[27], &s[20], 32, true); + butterfly_rotation(&s[26], &s[21], 32, true); + butterfly_rotation(&s[25], &s[22], 32, true); + butterfly_rotation(&s[24], &s[23], 32, true); + + // stage 29. + HadamardRotation(&s[0], &s[31], false); + HadamardRotation(&s[1], &s[30], false); + HadamardRotation(&s[2], &s[29], false); + HadamardRotation(&s[3], &s[28], false); + HadamardRotation(&s[4], &s[27], false); + HadamardRotation(&s[5], &s[26], false); + HadamardRotation(&s[6], &s[25], false); + HadamardRotation(&s[7], &s[24], false); + HadamardRotation(&s[8], &s[23], false); + HadamardRotation(&s[9], &s[22], false); + HadamardRotation(&s[10], &s[21], false); + HadamardRotation(&s[11], &s[20], false); + HadamardRotation(&s[12], &s[19], false); + HadamardRotation(&s[13], &s[18], false); + HadamardRotation(&s[14], &s[17], false); + HadamardRotation(&s[15], &s[16], false); +} + +// Process dct32 rows or columns, depending on the transpose flag. 
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step, + const bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[32], x[32]; + + if (transpose) { + for (int idx = 0; idx < 32; idx += 8) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, idx, input); + Transpose8x8_U16(input, &x[idx]); + } + } else { + LoadSrc<16, 32>(dst, step, 0, x); + } + + // stage 1 + // kBitReverseLookup + // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, + s[0] = x[0]; + s[1] = x[16]; + s[2] = x[8]; + s[3] = x[24]; + s[4] = x[4]; + s[5] = x[20]; + s[6] = x[12]; + s[7] = x[28]; + s[8] = x[2]; + s[9] = x[18]; + s[10] = x[10]; + s[11] = x[26]; + s[12] = x[6]; + s[13] = x[22]; + s[14] = x[14]; + s[15] = x[30]; + + // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31, + s[16] = x[1]; + s[17] = x[17]; + s[18] = x[9]; + s[19] = x[25]; + s[20] = x[5]; + s[21] = x[21]; + s[22] = x[13]; + s[23] = x[29]; + s[24] = x[3]; + s[25] = x[19]; + s[26] = x[11]; + s[27] = x[27]; + s[28] = x[7]; + s[29] = x[23]; + s[30] = x[15]; + s[31] = x[31]; + + Dct4Stages<ButterflyRotation_8>(s); + Dct8Stages<ButterflyRotation_8>(s); + Dct16Stages<ButterflyRotation_8>(s); + Dct32Stages<ButterflyRotation_8>(s); + + if (transpose) { + for (int idx = 0; idx < 32; idx += 8) { + __m128i output[8]; + Transpose8x8_U16(&s[idx], output); + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 32>(dst, step, 0, s); + } +} + +// Allow the compiler to call this function instead of force inlining. Tests +// show the performance is slightly faster. +void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[64], x[32]; + + if (transpose) { + // The last 32 values of every row are always zero if the |tx_width| is + // 64. + for (int idx = 0; idx < 32; idx += 8) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, idx, input); + Transpose8x8_U16(input, &x[idx]); + } + } else { + // The last 32 values of every column are always zero if the |tx_height| is + // 64. + LoadSrc<16, 32>(dst, step, 0, x); + } + + // stage 1 + // kBitReverseLookup + // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60, + s[0] = x[0]; + s[2] = x[16]; + s[4] = x[8]; + s[6] = x[24]; + s[8] = x[4]; + s[10] = x[20]; + s[12] = x[12]; + s[14] = x[28]; + + // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62, + s[16] = x[2]; + s[18] = x[18]; + s[20] = x[10]; + s[22] = x[26]; + s[24] = x[6]; + s[26] = x[22]; + s[28] = x[14]; + s[30] = x[30]; + + // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61, + s[32] = x[1]; + s[34] = x[17]; + s[36] = x[9]; + s[38] = x[25]; + s[40] = x[5]; + s[42] = x[21]; + s[44] = x[13]; + s[46] = x[29]; + + // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63 + s[48] = x[3]; + s[50] = x[19]; + s[52] = x[11]; + s[54] = x[27]; + s[56] = x[7]; + s[58] = x[23]; + s[60] = x[15]; + s[62] = x[31]; + + Dct4Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s); + Dct8Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s); + Dct16Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s); + Dct32Stages<ButterflyRotation_8, /*is_fast_butterfly=*/true>(s); + + //-- start dct 64 stages + // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false); + ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false); + ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false); + ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false); + ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false); + ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false); + ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false); + ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false); + ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false); + ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false); + ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false); + ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false); + ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false); + ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false); + ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false); + ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false); + + // stage 4. + HadamardRotation(&s[32], &s[33], false); + HadamardRotation(&s[34], &s[35], true); + HadamardRotation(&s[36], &s[37], false); + HadamardRotation(&s[38], &s[39], true); + HadamardRotation(&s[40], &s[41], false); + HadamardRotation(&s[42], &s[43], true); + HadamardRotation(&s[44], &s[45], false); + HadamardRotation(&s[46], &s[47], true); + HadamardRotation(&s[48], &s[49], false); + HadamardRotation(&s[50], &s[51], true); + HadamardRotation(&s[52], &s[53], false); + HadamardRotation(&s[54], &s[55], true); + HadamardRotation(&s[56], &s[57], false); + HadamardRotation(&s[58], &s[59], true); + HadamardRotation(&s[60], &s[61], false); + HadamardRotation(&s[62], &s[63], true); + + // stage 7. + ButterflyRotation_8(&s[62], &s[33], 60 - 0, true); + ButterflyRotation_8(&s[61], &s[34], 60 - 0 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 60 - 32, true); + ButterflyRotation_8(&s[57], &s[38], 60 - 32 + 64, true); + ButterflyRotation_8(&s[54], &s[41], 60 - 16, true); + ButterflyRotation_8(&s[53], &s[42], 60 - 16 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 60 - 48, true); + ButterflyRotation_8(&s[49], &s[46], 60 - 48 + 64, true); + + // stage 11. + HadamardRotation(&s[32], &s[35], false); + HadamardRotation(&s[33], &s[34], false); + HadamardRotation(&s[36], &s[39], true); + HadamardRotation(&s[37], &s[38], true); + HadamardRotation(&s[40], &s[43], false); + HadamardRotation(&s[41], &s[42], false); + HadamardRotation(&s[44], &s[47], true); + HadamardRotation(&s[45], &s[46], true); + HadamardRotation(&s[48], &s[51], false); + HadamardRotation(&s[49], &s[50], false); + HadamardRotation(&s[52], &s[55], true); + HadamardRotation(&s[53], &s[54], true); + HadamardRotation(&s[56], &s[59], false); + HadamardRotation(&s[57], &s[58], false); + HadamardRotation(&s[60], &s[63], true); + HadamardRotation(&s[61], &s[62], true); + + // stage 16. + ButterflyRotation_8(&s[61], &s[34], 56, true); + ButterflyRotation_8(&s[60], &s[35], 56, true); + ButterflyRotation_8(&s[59], &s[36], 56 + 64, true); + ButterflyRotation_8(&s[58], &s[37], 56 + 64, true); + ButterflyRotation_8(&s[53], &s[42], 56 - 32, true); + ButterflyRotation_8(&s[52], &s[43], 56 - 32, true); + ButterflyRotation_8(&s[51], &s[44], 56 - 32 + 64, true); + ButterflyRotation_8(&s[50], &s[45], 56 - 32 + 64, true); + + // stage 21. 
+ HadamardRotation(&s[32], &s[39], false); + HadamardRotation(&s[33], &s[38], false); + HadamardRotation(&s[34], &s[37], false); + HadamardRotation(&s[35], &s[36], false); + HadamardRotation(&s[40], &s[47], true); + HadamardRotation(&s[41], &s[46], true); + HadamardRotation(&s[42], &s[45], true); + HadamardRotation(&s[43], &s[44], true); + HadamardRotation(&s[48], &s[55], false); + HadamardRotation(&s[49], &s[54], false); + HadamardRotation(&s[50], &s[53], false); + HadamardRotation(&s[51], &s[52], false); + HadamardRotation(&s[56], &s[63], true); + HadamardRotation(&s[57], &s[62], true); + HadamardRotation(&s[58], &s[61], true); + HadamardRotation(&s[59], &s[60], true); + + // stage 25. + ButterflyRotation_8(&s[59], &s[36], 48, true); + ButterflyRotation_8(&s[58], &s[37], 48, true); + ButterflyRotation_8(&s[57], &s[38], 48, true); + ButterflyRotation_8(&s[56], &s[39], 48, true); + ButterflyRotation_8(&s[55], &s[40], 112, true); + ButterflyRotation_8(&s[54], &s[41], 112, true); + ButterflyRotation_8(&s[53], &s[42], 112, true); + ButterflyRotation_8(&s[52], &s[43], 112, true); + + // stage 28. + HadamardRotation(&s[32], &s[47], false); + HadamardRotation(&s[33], &s[46], false); + HadamardRotation(&s[34], &s[45], false); + HadamardRotation(&s[35], &s[44], false); + HadamardRotation(&s[36], &s[43], false); + HadamardRotation(&s[37], &s[42], false); + HadamardRotation(&s[38], &s[41], false); + HadamardRotation(&s[39], &s[40], false); + HadamardRotation(&s[48], &s[63], true); + HadamardRotation(&s[49], &s[62], true); + HadamardRotation(&s[50], &s[61], true); + HadamardRotation(&s[51], &s[60], true); + HadamardRotation(&s[52], &s[59], true); + HadamardRotation(&s[53], &s[58], true); + HadamardRotation(&s[54], &s[57], true); + HadamardRotation(&s[55], &s[56], true); + + // stage 30. + ButterflyRotation_8(&s[55], &s[40], 32, true); + ButterflyRotation_8(&s[54], &s[41], 32, true); + ButterflyRotation_8(&s[53], &s[42], 32, true); + ButterflyRotation_8(&s[52], &s[43], 32, true); + ButterflyRotation_8(&s[51], &s[44], 32, true); + ButterflyRotation_8(&s[50], &s[45], 32, true); + ButterflyRotation_8(&s[49], &s[46], 32, true); + ButterflyRotation_8(&s[48], &s[47], 32, true); + + // stage 31. + for (int i = 0; i < 32; i += 4) { + HadamardRotation(&s[i], &s[63 - i], false); + HadamardRotation(&s[i + 1], &s[63 - i - 1], false); + HadamardRotation(&s[i + 2], &s[63 - i - 2], false); + HadamardRotation(&s[i + 3], &s[63 - i - 3], false); + } + //-- end dct 64 stages + + if (transpose) { + for (int idx = 0; idx < 64; idx += 8) { + __m128i output[8]; + Transpose8x8_U16(&s[idx], output); + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 64>(dst, step, 0, s); + } +} + +//------------------------------------------------------------------------------ +// Asymmetric Discrete Sine Transforms (ADST). 
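The ADST helpers below are AV1's sine-based transforms. For Adst4, the four kAdst4Multiplier constants are evidently round(4096 * (2 * sqrt(2) / 3) * sin((i + 1) * pi / 9)) = {1321, 2482, 3344, 3803} (inferred from kAdst4DcOnlyMultiplier further down; treat the derivation as an assumption). The exact identity sin(4 pi/9) == sin(pi/9) + sin(2 pi/9) is what allows the fourth output to be formed as s0 + s1. A quick self-check:

    #include <cmath>
    #include <cstdio>

    int main() {
      constexpr double kPi = 3.14159265358979323846;
      const double scale = 4096 * 2 * std::sqrt(2.0) / 3;
      // Expect 1321 2482 3344 3803, with 3803 == 1321 + 2482.
      for (int i = 1; i <= 4; ++i) {
        std::printf("%ld ", std::lround(scale * std::sin(i * kPi / 9)));
      }
      std::printf("\n");
      return 0;
    }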
+ +template <bool stage_is_rectangular> +LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[8], x[4]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[8]; + LoadSrc<8, 8>(dst, step, 0, input); + Transpose4x8To8x4_U16(input, x); + } else { + LoadSrc<16, 4>(dst, step, 0, x); + } + } else { + LoadSrc<8, 4>(dst, step, 0, x); + if (transpose) { + Transpose4x4_U16(x, x); + } + } + + const __m128i kAdst4Multiplier_1 = _mm_set1_epi16(kAdst4Multiplier[1]); + const __m128i kAdst4Multiplier_2 = _mm_set1_epi16(kAdst4Multiplier[2]); + const __m128i kAdst4Multiplier_3 = _mm_set1_epi16(kAdst4Multiplier[3]); + const __m128i kAdst4Multiplier_m0_1 = + _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[1]) | + (static_cast<uint32_t>(-kAdst4Multiplier[0]) << 16)); + const __m128i kAdst4Multiplier_3_0 = + _mm_set1_epi32(static_cast<uint16_t>(kAdst4Multiplier[0]) | + (static_cast<uint32_t>(kAdst4Multiplier[3]) << 16)); + + // stage 1. + const __m128i x3_x0 = _mm_unpacklo_epi16(x[0], x[3]); + const __m128i x2_x0 = _mm_unpacklo_epi16(x[0], x[2]); + const __m128i zero_x1 = _mm_cvtepu16_epi32(x[1]); + const __m128i zero_x2 = _mm_cvtepu16_epi32(x[2]); + const __m128i zero_x3 = _mm_cvtepu16_epi32(x[3]); + + s[5] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_1); + s[6] = _mm_madd_epi16(zero_x3, kAdst4Multiplier_3); + + // stage 2. + // ((src[0] - src[2]) + src[3]) * kAdst4Multiplier[2] + const __m128i k2_x3_x0 = _mm_madd_epi16(x3_x0, kAdst4Multiplier_2); + const __m128i k2_zero_x2 = _mm_madd_epi16(zero_x2, kAdst4Multiplier_2); + const __m128i b7 = _mm_sub_epi32(k2_x3_x0, k2_zero_x2); + + // stage 3. + s[0] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_3_0); + s[1] = _mm_madd_epi16(x2_x0, kAdst4Multiplier_m0_1); + s[2] = b7; + s[3] = _mm_madd_epi16(zero_x1, kAdst4Multiplier_2); + + // stage 4. + s[0] = _mm_add_epi32(s[0], s[5]); + s[1] = _mm_sub_epi32(s[1], s[6]); + + // stages 5 and 6. + x[0] = _mm_add_epi32(s[0], s[3]); + x[1] = _mm_add_epi32(s[1], s[3]); + x[2] = _mm_add_epi32(s[0], s[1]); + x[3] = _mm_sub_epi32(x[2], s[3]); + + x[0] = RightShiftWithRounding_S32(x[0], 12); + x[1] = RightShiftWithRounding_S32(x[1], 12); + x[2] = RightShiftWithRounding_S32(s[2], 12); + x[3] = RightShiftWithRounding_S32(x[3], 12); + + x[0] = _mm_packs_epi32(x[0], x[1]); + x[2] = _mm_packs_epi32(x[2], x[3]); + x[1] = _mm_srli_si128(x[0], 8); + x[3] = _mm_srli_si128(x[2], 8); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[8]; + Transpose8x4To4x8_U16(x, output); + StoreDst<8, 8>(dst, step, 0, output); + } else { + StoreDst<16, 4>(dst, step, 0, x); + } + } else { + if (transpose) { + Transpose4x4_U16(x, x); + } + StoreDst<8, 4>(dst, step, 0, x); + } +}
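With only the DC coefficient set, the whole stage network above reduces to four independent products, which the interleaved table below packs so a single _mm_madd_epi16 yields all four at once (the trailing (2482, 1321) pair forms kAdst4Multiplier[3] = 3803 as the sum 2482 + 1321). A scalar equivalent, as a sketch:

    #include <cstdint>

    void Adst4DcOnlyModel(int16_t dc, int16_t out[4]) {
      const int k[4] = {1321, 2482, 3344, 3803};  // assumed kAdst4Multiplier
      for (int i = 0; i < 4; ++i) {
        out[i] = static_cast<int16_t>((dc * k[i] + 2048) >> 12);
      }
    }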
+ +constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0, + 3344, 0, 2482, 1321}; + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + const __m128i v_src = + _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0); + const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + const __m128i s0 = _mm_blendv_epi8(v_src, v_src_round, v_mask); + const __m128i v_kAdst4DcOnlyMultipliers = + LoadUnaligned16(kAdst4DcOnlyMultiplier); + // s0*k0 s0*k1 s0*k2 s0*k1 + // + + // s0*0 s0*0 s0*0 s0*k0 + const __m128i x3 = _mm_madd_epi16(s0, v_kAdst4DcOnlyMultipliers); + const __m128i dst_0 = RightShiftWithRounding_S32(x3, 12); + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(dst_0, v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + const __m128i c = _mm_packs_epi32(b, b); + StoreLo8(dst, c); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + int i = 0; + do { + const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i])); + const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]); + const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]); + const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]); + const __m128i s0 = _mm_mullo_epi32(kAdst4Multiplier_0, v_src); + const __m128i s1 = _mm_mullo_epi32(kAdst4Multiplier_1, v_src); + const __m128i s2 = _mm_mullo_epi32(kAdst4Multiplier_2, v_src); + const __m128i x0 = s0; + const __m128i x1 = s1; + const __m128i x2 = s2; + const __m128i x3 = _mm_add_epi32(s0, s1); + const __m128i dst_0 = RightShiftWithRounding_S32(x0, 12); + const __m128i dst_1 = RightShiftWithRounding_S32(x1, 12); + const __m128i dst_2 = RightShiftWithRounding_S32(x2, 12); + const __m128i dst_3 = RightShiftWithRounding_S32(x3, 12); + const __m128i dst_0_1 = _mm_packs_epi32(dst_0, dst_1); + const __m128i dst_2_3 = _mm_packs_epi32(dst_2, dst_3); + StoreLo8(&dst[i], dst_0_1); + StoreHi8(&dst[i + width * 1], dst_0_1); + StoreLo8(&dst[i + width * 2], dst_2_3); + StoreHi8(&dst[i + width * 3], dst_2_3); + i += 4; + } while (i < width); + + return true; +}
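Every *DcOnlyColumn helper has the same shape: after the row pass only row 0 is non-zero, so each column transform is just its first basis column scaled by dst[x], fanned out over the block. A generic scalar sketch of the pattern (illustrative only, not an upstream API):

    #include <cstdint>

    // basis(y, dc) = y-th output of the column transform for the input
    // (dc, 0, 0, ...). The SIMD versions evaluate four columns per pass.
    template <typename BasisFn>
    void DcOnlyColumnModel(int16_t* dst, int width, int height, BasisFn basis) {
      for (int x = 0; x < width; ++x) {
        const int16_t dc = dst[x];
        for (int y = 0; y < height; ++y) {
          dst[y * width + x] = basis(y, dc);
        }
      }
    }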
+ +template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular> +LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[8], x[8]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8_U16(input, x); + } else { + LoadSrc<8, 8>(dst, step, 0, x); + } + } else { + if (transpose) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, 0, input); + Transpose8x8_U16(input, x); + } else { + LoadSrc<16, 8>(dst, step, 0, x); + } + } + + // stage 1. + s[0] = x[7]; + s[1] = x[0]; + s[2] = x[5]; + s[3] = x[2]; + s[4] = x[3]; + s[5] = x[4]; + s[6] = x[1]; + s[7] = x[6]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 60 - 0, true); + butterfly_rotation(&s[2], &s[3], 60 - 16, true); + butterfly_rotation(&s[4], &s[5], 60 - 32, true); + butterfly_rotation(&s[6], &s[7], 60 - 48, true); + + // stage 3. + HadamardRotation(&s[0], &s[4], false); + HadamardRotation(&s[1], &s[5], false); + HadamardRotation(&s[2], &s[6], false); + HadamardRotation(&s[3], &s[7], false); + + // stage 4. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[2], false); + HadamardRotation(&s[4], &s[6], false); + HadamardRotation(&s[1], &s[3], false); + HadamardRotation(&s[5], &s[7], false); + + // stage 6. + butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + + // stage 7. + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[4]); + x[2] = s[6]; + x[3] = _mm_subs_epi16(v_zero, s[2]); + x[4] = s[3]; + x[5] = _mm_subs_epi16(v_zero, s[7]); + x[6] = s[5]; + x[7] = _mm_subs_epi16(v_zero, s[1]); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[4]; + Transpose4x8To8x4_U16(x, output); + StoreDst<16, 4>(dst, step, 0, output); + } else { + StoreDst<8, 8>(dst, step, 0, x); + } + } else { + if (transpose) { + __m128i output[8]; + Transpose8x8_U16(x, output); + StoreDst<16, 8>(dst, step, 0, output); + } else { + StoreDst<16, 8>(dst, step, 0, x); + } + } +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + __m128i s[8]; + + const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); + const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + // stage 1. + s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask); + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + __m128i x[8]; + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[4]); + x[2] = s[6]; + x[3] = _mm_subs_epi16(v_zero, s[2]); + x[4] = s[3]; + x[5] = _mm_subs_epi16(v_zero, s[7]); + x[6] = s[5]; + x[7] = _mm_subs_epi16(v_zero, s[1]); + + const __m128i x1_x0 = _mm_unpacklo_epi16(x[0], x[1]); + const __m128i x3_x2 = _mm_unpacklo_epi16(x[2], x[3]); + const __m128i x5_x4 = _mm_unpacklo_epi16(x[4], x[5]); + const __m128i x7_x6 = _mm_unpacklo_epi16(x[6], x[7]); + const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2); + const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6); + + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add); + const __m128i a1 = _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + const __m128i b1 = _mm_sra_epi32(a1, v_row_shift); + StoreUnaligned16(dst, _mm_packs_epi32(b, b1)); + + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + __m128i s[8]; + + int i = 0; + do { + const __m128i v_src = LoadLo8(dst); + // stage 1. + s[1] = v_src; + + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true); + + // stage 3. + s[4] = s[0]; + s[5] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[4], &s[5], 48, true); + + // stage 5. + s[2] = s[0]; + s[3] = s[1]; + s[6] = s[4]; + s[7] = s[5]; + + // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + + // stage 7. + __m128i x[8]; + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[4]); + x[2] = s[6]; + x[3] = _mm_subs_epi16(v_zero, s[2]); + x[4] = s[3]; + x[5] = _mm_subs_epi16(v_zero, s[7]); + x[6] = s[5]; + x[7] = _mm_subs_epi16(v_zero, s[1]); + + for (int j = 0; j < 8; ++j) { + StoreLo8(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular> +LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step, + bool transpose) { + auto* const dst = static_cast<int16_t*>(dest); + __m128i s[16], x[16]; + + if (stage_is_rectangular) { + if (transpose) { + __m128i input[4]; + LoadSrc<16, 4>(dst, step, 0, input); + Transpose8x4To4x8_U16(input, x); + LoadSrc<16, 4>(dst, step, 8, input); + Transpose8x4To4x8_U16(input, &x[8]); + } else { + LoadSrc<8, 16>(dst, step, 0, x); + } + } else { + if (transpose) { + for (int idx = 0; idx < 16; idx += 8) { + __m128i input[8]; + LoadSrc<16, 8>(dst, step, idx, input); + Transpose8x8_U16(input, &x[idx]); + } + } else { + LoadSrc<16, 16>(dst, step, 0, x); + } + } + + // stage 1. + s[0] = x[15]; + s[1] = x[0]; + s[2] = x[13]; + s[3] = x[2]; + s[4] = x[11]; + s[5] = x[4]; + s[6] = x[9]; + s[7] = x[6]; + s[8] = x[7]; + s[9] = x[8]; + s[10] = x[5]; + s[11] = x[10]; + s[12] = x[3]; + s[13] = x[12]; + s[14] = x[1]; + s[15] = x[14]; + + // stage 2. + butterfly_rotation(&s[0], &s[1], 62 - 0, true); + butterfly_rotation(&s[2], &s[3], 62 - 8, true); + butterfly_rotation(&s[4], &s[5], 62 - 16, true); + butterfly_rotation(&s[6], &s[7], 62 - 24, true); + butterfly_rotation(&s[8], &s[9], 62 - 32, true); + butterfly_rotation(&s[10], &s[11], 62 - 40, true); + butterfly_rotation(&s[12], &s[13], 62 - 48, true); + butterfly_rotation(&s[14], &s[15], 62 - 56, true); + + // stage 3. + HadamardRotation(&s[0], &s[8], false); + HadamardRotation(&s[1], &s[9], false); + HadamardRotation(&s[2], &s[10], false); + HadamardRotation(&s[3], &s[11], false); + HadamardRotation(&s[4], &s[12], false); + HadamardRotation(&s[5], &s[13], false); + HadamardRotation(&s[6], &s[14], false); + HadamardRotation(&s[7], &s[15], false); + + // stage 4. + butterfly_rotation(&s[8], &s[9], 56 - 0, true); + butterfly_rotation(&s[13], &s[12], 8 + 0, true); + butterfly_rotation(&s[10], &s[11], 56 - 32, true); + butterfly_rotation(&s[15], &s[14], 8 + 32, true); + + // stage 5. + HadamardRotation(&s[0], &s[4], false); + HadamardRotation(&s[8], &s[12], false); + HadamardRotation(&s[1], &s[5], false); + HadamardRotation(&s[9], &s[13], false); + HadamardRotation(&s[2], &s[6], false); + HadamardRotation(&s[10], &s[14], false); + HadamardRotation(&s[3], &s[7], false); + HadamardRotation(&s[11], &s[15], false); + + // stage 6. + butterfly_rotation(&s[4], &s[5], 48 - 0, true); + butterfly_rotation(&s[12], &s[13], 48 - 0, true); + butterfly_rotation(&s[7], &s[6], 48 - 32, true); + butterfly_rotation(&s[15], &s[14], 48 - 32, true); + + // stage 7. + HadamardRotation(&s[0], &s[2], false); + HadamardRotation(&s[4], &s[6], false); + HadamardRotation(&s[8], &s[10], false); + HadamardRotation(&s[12], &s[14], false); + HadamardRotation(&s[1], &s[3], false); + HadamardRotation(&s[5], &s[7], false); + HadamardRotation(&s[9], &s[11], false); + HadamardRotation(&s[13], &s[15], false); + + // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true); + butterfly_rotation(&s[6], &s[7], 32, true); + butterfly_rotation(&s[10], &s[11], 32, true); + butterfly_rotation(&s[14], &s[15], 32, true); + + // stage 9. + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[8]); + x[2] = s[12]; + x[3] = _mm_subs_epi16(v_zero, s[4]); + x[4] = s[6]; + x[5] = _mm_subs_epi16(v_zero, s[14]); + x[6] = s[10]; + x[7] = _mm_subs_epi16(v_zero, s[2]); + x[8] = s[3]; + x[9] = _mm_subs_epi16(v_zero, s[11]); + x[10] = s[15]; + x[11] = _mm_subs_epi16(v_zero, s[7]); + x[12] = s[5]; + x[13] = _mm_subs_epi16(v_zero, s[13]); + x[14] = s[9]; + x[15] = _mm_subs_epi16(v_zero, s[1]); + + if (stage_is_rectangular) { + if (transpose) { + __m128i output[4]; + Transpose4x8To8x4_U16(x, output); + StoreDst<16, 4>(dst, step, 0, output); + Transpose4x8To8x4_U16(&x[8], output); + StoreDst<16, 4>(dst, step, 8, output); + } else { + StoreDst<8, 16>(dst, step, 0, x); + } + } else { + if (transpose) { + for (int idx = 0; idx < 16; idx += 8) { + __m128i output[8]; + Transpose8x8_U16(&x[idx], output); + StoreDst<16, 8>(dst, step, idx, output); + } + } else { + StoreDst<16, 16>(dst, step, 0, x); + } + } +} + +LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(__m128i* s, __m128i* x) { + // stage 2. + ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true); + + // stage 3. + s[8] = s[0]; + s[9] = s[1]; + + // stage 4. + ButterflyRotation_4(&s[8], &s[9], 56, true); + + // stage 5. + s[4] = s[0]; + s[12] = s[8]; + s[5] = s[1]; + s[13] = s[9]; + + // stage 6. + ButterflyRotation_4(&s[4], &s[5], 48, true); + ButterflyRotation_4(&s[12], &s[13], 48, true); + + // stage 7. + s[2] = s[0]; + s[6] = s[4]; + s[10] = s[8]; + s[14] = s[12]; + s[3] = s[1]; + s[7] = s[5]; + s[11] = s[9]; + s[15] = s[13]; + + // stage 8. + ButterflyRotation_4(&s[2], &s[3], 32, true); + ButterflyRotation_4(&s[6], &s[7], 32, true); + ButterflyRotation_4(&s[10], &s[11], 32, true); + ButterflyRotation_4(&s[14], &s[15], 32, true); + + // stage 9. + const __m128i v_zero = _mm_setzero_si128(); + x[0] = s[0]; + x[1] = _mm_subs_epi16(v_zero, s[8]); + x[2] = s[12]; + x[3] = _mm_subs_epi16(v_zero, s[4]); + x[4] = s[6]; + x[5] = _mm_subs_epi16(v_zero, s[14]); + x[6] = s[10]; + x[7] = _mm_subs_epi16(v_zero, s[2]); + x[8] = s[3]; + x[9] = _mm_subs_epi16(v_zero, s[11]); + x[10] = s[15]; + x[11] = _mm_subs_epi16(v_zero, s[7]); + x[12] = s[5]; + x[13] = _mm_subs_epi16(v_zero, s[13]); + x[14] = s[9]; + x[15] = _mm_subs_epi16(v_zero, s[1]); +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height, + bool should_round, int row_shift) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + __m128i s[16]; + __m128i x[16]; + + const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0); + const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + // stage 1.
s[1] = _mm_blendv_epi8(v_src, v_src_round, v_mask); + + Adst16DcOnlyInternal(s, x); + + for (int i = 0; i < 2; ++i) { + const __m128i x1_x0 = _mm_unpacklo_epi16(x[0 + i * 8], x[1 + i * 8]); + const __m128i x3_x2 = _mm_unpacklo_epi16(x[2 + i * 8], x[3 + i * 8]); + const __m128i x5_x4 = _mm_unpacklo_epi16(x[4 + i * 8], x[5 + i * 8]); + const __m128i x7_x6 = _mm_unpacklo_epi16(x[6 + i * 8], x[7 + i * 8]); + const __m128i x3_x0 = _mm_unpacklo_epi32(x1_x0, x3_x2); + const __m128i x7_x4 = _mm_unpacklo_epi32(x5_x4, x7_x6); + + const __m128i v_row_shift_add = _mm_set1_epi32(row_shift); + const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add); + const __m128i a = _mm_add_epi32(_mm_cvtepi16_epi32(x3_x0), v_row_shift_add); + const __m128i a1 = + _mm_add_epi32(_mm_cvtepi16_epi32(x7_x4), v_row_shift_add); + const __m128i b = _mm_sra_epi32(a, v_row_shift); + const __m128i b1 = _mm_sra_epi32(a1, v_row_shift); + StoreUnaligned16(&dst[i * 8], _mm_packs_epi32(b, b1)); + } + return true; +} + +LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, + int adjusted_tx_height, + int width) { + if (adjusted_tx_height > 1) return false; + + auto* dst = static_cast<int16_t*>(dest); + int i = 0; + do { + __m128i s[16]; + __m128i x[16]; + const __m128i v_src = LoadUnaligned16(dst); + // stage 1. + s[1] = v_src; + + Adst16DcOnlyInternal(s, x); + + for (int j = 0; j < 16; ++j) { + StoreLo8(&dst[j * width], x[j]); + } + i += 4; + dst += 4; + } while (i < width); + + return true; +} + +//------------------------------------------------------------------------------ +// Identity Transforms.
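The identity "transforms" only rescale. Identity4 multiplies by sqrt(2) = 5793/4096, but 5793 << 3 would overflow the int16_t operand that _mm_mulhrs_epi16 requires, so the factor is split as 1 + kIdentity4MultiplierFraction/4096 with the fraction evidently 5793 - 4096 = 1697 (constant values inferred from the arithmetic below, so treat them as assumptions). Scalar model:

    #include <cstdint>

    int16_t Identity4Model(int16_t src) {
      // mulhrs(src, 1697 << 3) == round(src * 1697 / 4096) ~= src * (sqrt(2) - 1).
      const int16_t fraction = static_cast<int16_t>((src * 1697 + 2048) >> 12);
      return static_cast<int16_t>(src + fraction);  // ~= src * sqrt(2)
    }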
0 : 1; + const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11); + const __m128i v_multiplier_one = + _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001); + const __m128i v_src_round_lo = _mm_unpacklo_epi16(v_dual_round, v_src); + const __m128i a = _mm_madd_epi16(v_src_round_lo, v_multiplier_one); + const __m128i b = _mm_srai_epi32(a, 12 + shift); + dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0); + return true; +} + +LIBGAV1_ALWAYS_INLINE void Identity4ColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int16_t* source) { + const int stride = frame.columns(); + uint8_t* dst = frame[start_y] + start_x; + + const __m128i v_multiplier_fraction = + _mm_set1_epi16(static_cast(kIdentity4MultiplierFraction << 3)); + const __m128i v_eight = _mm_set1_epi16(8); + + if (tx_width == 4) { + int i = 0; + do { + const __m128i v_src = LoadLo8(&source[i * tx_width]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction); + const __m128i frame_data = Load4(dst); + const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + Store4(dst, _mm_packus_epi16(d, d)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_src = LoadUnaligned16(&source[row + j]); + const __m128i v_src_mult = + _mm_mulhrs_epi16(v_src, v_multiplier_fraction); + const __m128i frame_data = LoadLo8(dst + j); + const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_src); + const __m128i a = _mm_adds_epi16(v_dst_i, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_cvtepu8_epi16(frame_data); + const __m128i d = _mm_adds_epi16(c, b); + StoreLo8(dst + j, _mm_packus_epi16(d, d)); + j += 8; + } while (j < tx_width); + dst += stride; + } while (++i < tx_height); + } +} + +LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame( + Array2DView frame, const int start_x, const int start_y, + const int tx_width, const int tx_height, const int16_t* source) { + const int stride = frame.columns(); + uint8_t* dst = frame[start_y] + start_x; + + const __m128i v_multiplier_fraction = + _mm_set1_epi16(static_cast(kIdentity4MultiplierFraction << 3)); + const __m128i v_eight = _mm_set1_epi16(8); + const __m128i v_kTransformRowMultiplier = + _mm_set1_epi16(kTransformRowMultiplier << 3); + + if (tx_width == 4) { + int i = 0; + do { + const __m128i v_src = LoadLo8(&source[i * tx_width]); + const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction); + const __m128i frame_data = Load4(dst); + const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src); + const __m128i v_src_mult2 = + _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction); + const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data); + const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row); + const __m128i a = _mm_adds_epi16(v_dst_col, v_eight); + const __m128i b = _mm_srai_epi16(a, 4); + const __m128i c = _mm_adds_epi16(frame_data16, b); + Store4(dst, _mm_packus_epi16(c, c)); + dst += stride; + } while (++i < tx_height); + } else { + int i = 0; + do { + const int row = i * tx_width; + int j = 0; + do { + const __m128i v_src = LoadUnaligned16(&source[row + j]); + const __m128i v_src_round = + _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier); + const 
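Each of the column-store helpers in this file ends the same way: the residual gets the final rounding shift of the 2-D inverse transform, (x + 8) >> 4, is added to the frame pixel, and _mm_packus_epi16 clamps the sum to [0, 255]. A scalar model of one pixel of that write-back (illustrative only, not part of the file):

uint8_t AddResidualToPixel(uint8_t pixel, int16_t residual) {
  // Final rounding shift of the transform, then add and clamp as packus does.
  const int sum = pixel + ((residual + 8) >> 4);
  return static_cast<uint8_t>(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}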
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int16_t* source) {
+  const int stride = frame.columns();
+  uint8_t* dst = frame[start_y] + start_x;
+
+  const __m128i v_multiplier_fraction =
+      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 3));
+  const __m128i v_eight = _mm_set1_epi16(8);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadLo8(&source[i * tx_width]);
+      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier_fraction);
+      const __m128i frame_data = Load4(dst);
+      const __m128i v_dst_row = _mm_adds_epi16(v_src_mult, v_src);
+      const __m128i v_src_mult2 =
+          _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+      const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+      const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+      const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_adds_epi16(frame_data16, b);
+      Store4(dst, _mm_packus_epi16(c, c));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_src_round =
+            _mm_mulhrs_epi16(v_src, v_kTransformRowMultiplier);
+        const __m128i v_dst_row = _mm_adds_epi16(v_src_round, v_src_round);
+        const __m128i v_src_mult2 =
+            _mm_mulhrs_epi16(v_dst_row, v_multiplier_fraction);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i frame_data16 = _mm_cvtepu8_epi16(frame_data);
+        const __m128i v_dst_col = _mm_adds_epi16(v_src_mult2, v_dst_row);
+        const __m128i a = _mm_adds_epi16(v_dst_col, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_adds_epi16(frame_data16, b);
+        StoreLo8(dst + j, _mm_packus_epi16(c, c));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // ((A * 2) + 2) >> 2) to ((A + 1) >> 1).
+  const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
+  for (int h = 0; h < 4; ++h) {
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
+    StoreUnaligned16(&dst[h * step], v_src_mult);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  for (int h = 0; h < 4; ++h) {
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+    // saturating add here is ok.
+    const __m128i a = _mm_adds_epi16(v_src, v_src);
+    StoreUnaligned16(&dst[h * step], a);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round =
+      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+  const __m128i v_src =
+      _mm_cvtepi16_epi32(_mm_blendv_epi8(v_src0, v_src_round, v_mask));
+  const __m128i v_srcx2 = _mm_add_epi32(v_src, v_src);
+  const __m128i v_row_shift_add = _mm_set1_epi32(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu32_epi64(v_row_shift_add);
+  const __m128i a = _mm_add_epi32(v_srcx2, v_row_shift_add);
+  const __m128i b = _mm_sra_epi32(a, v_row_shift);
+  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8ColumnStoreToFrame_SSE4_1(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int16_t* source) {
+  const int stride = frame.columns();
+  uint8_t* dst = frame[start_y] + start_x;
+  const __m128i v_eight = _mm_set1_epi16(8);
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      const __m128i v_src = LoadLo8(&source[row]);
+      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+      const __m128i frame_data = Load4(dst);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d = _mm_adds_epi16(c, b);
+        StoreLo8(dst + j, _mm_packus_epi16(d, d));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+                                                int shift) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+  const __m128i v_multiplier_one =
+      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+
+  for (int h = 0; h < 4; ++h) {
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
+    const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
+    const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
+    const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
+    const __m128i v_src2_round1 = _mm_unpackhi_epi16(v_dual_round, v_src2);
+    const __m128i madd0 = _mm_madd_epi16(v_src_round0, v_multiplier_one);
+    const __m128i madd1 = _mm_madd_epi16(v_src_round1, v_multiplier_one);
+    const __m128i madd20 = _mm_madd_epi16(v_src2_round0, v_multiplier_one);
+    const __m128i madd21 = _mm_madd_epi16(v_src2_round1, v_multiplier_one);
+    const __m128i shift0 = _mm_sra_epi32(madd0, v_shift);
+    const __m128i shift1 = _mm_sra_epi32(madd1, v_shift);
+    const __m128i shift20 = _mm_sra_epi32(madd20, v_shift);
+    const __m128i shift21 = _mm_sra_epi32(madd21, v_shift);
+    StoreUnaligned16(&dst[h * step], _mm_packs_epi32(shift0, shift1));
+    StoreUnaligned16(&dst[h * step + 8], _mm_packs_epi32(shift20, shift21));
+  }
+}
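Identity4_SSE4_1 and Identity16Row_SSE4_1 share a trick worth spelling out: interleaving a per-lane round constant with the source via _mm_unpacklo_epi16 and multiplying by ((multiplier << 16) | 0x0001) lets a single _mm_madd_epi16 compute src * multiplier + round at 32-bit precision. The round constant (1 + (shift << 1)) << 11 folds together the Q12 rounding bias (1 << 11) and the extra row-shift bias (shift << 12). A scalar sketch of one lane (illustrative only):

int16_t IdentityMulAndShift(int16_t src, int16_t multiplier, int shift) {
  // madd of the word pair (dual_round, src) with the pair (1, multiplier):
  const int32_t dual_round = (1 + (shift << 1)) << 11;  // (1 << 11) + (shift << 12)
  const int32_t sum = dual_round * 1 + static_cast<int32_t>(src) * multiplier;
  return static_cast<int16_t>(sum >> (12 + shift));  // rounded Q12 product, row-shifted
}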
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+                                            bool should_round, int shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src_round0 =
+      _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+  const __m128i v_src = _mm_blendv_epi8(v_src0, v_src_round0, v_mask);
+  const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
+  const __m128i v_multiplier_one =
+      _mm_set1_epi32((kIdentity16Multiplier << 16) | 0x0001);
+  const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
+  const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
+  const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
+  const __m128i b = _mm_sra_epi32(a, v_shift);
+  dst[0] = _mm_extract_epi16(_mm_packs_epi32(b, b), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16ColumnStoreToFrame_SSE4_1(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int16_t* source) {
+  const int stride = frame.columns();
+  uint8_t* dst = frame[start_y] + start_x;
+  const __m128i v_eight = _mm_set1_epi16(8);
+  const __m128i v_multiplier =
+      _mm_set1_epi16(static_cast<int16_t>(kIdentity4MultiplierFraction << 4));
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const __m128i v_src = LoadLo8(&source[i * tx_width]);
+      const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+      const __m128i frame_data = Load4(dst);
+      const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+      const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        const __m128i v_src = LoadUnaligned16(&source[row + j]);
+        const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_multiplier);
+        const __m128i frame_data = LoadLo8(dst + j);
+        const __m128i v_srcx2 = _mm_adds_epi16(v_src, v_src);
+        const __m128i v_dst_i = _mm_adds_epi16(v_src_mult, v_srcx2);
+        const __m128i a = _mm_adds_epi16(v_dst_i, v_eight);
+        const __m128i b = _mm_srai_epi16(a, 4);
+        const __m128i c = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d = _mm_adds_epi16(c, b);
+        StoreLo8(dst + j, _mm_packus_epi16(d, d));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
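The "simplified from" comments in the identity rows are plain bit arithmetic, but they are easy to misread. For any 16-bit A, ((A * 2) + 2) >> 2 equals (A + 1) >> 1 (the identity8 / tx_height == 32 case, folded into the mulhrs by 1 << 14 above), and ((A * 4) + 1) >> 1 equals A * 2 (the identity32 / tx_height == 16 case below). A throwaway exhaustive check, assuming <cassert> (illustrative only):

void CheckIdentityRowShortcuts() {
  for (int a = -32768; a < 32768; ++a) {
    assert(((a * 2 + 2) >> 2) == ((a + 1) >> 1));  // identity8, tx_height 32
    assert(((a * 4 + 1) >> 1) == a * 2);           // identity32, tx_height 16
  }
}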
+
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
+                                                  const int32_t step) {
+  auto* const dst = static_cast<int16_t*>(dest);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // ((A * 4) + 1) >> 1) to (A * 2).
+  for (int h = 0; h < 4; ++h) {
+    for (int i = 0; i < 32; i += 8) {
+      const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
+      // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
+      // saturating add here is ok.
+      const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
+      StoreUnaligned16(&dst[h * step + i], v_dst_i);
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int16_t*>(dest);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
+
+  // When combining the identity32 multiplier with the row shift, the
+  // calculation for tx_height equal to 16 can be simplified from
+  // ((A * 4) + 1) >> 1) to (A * 2).
+  const __m128i v_dst_0 = _mm_adds_epi16(v_src, v_src);
+  dst[0] = _mm_extract_epi16(v_dst_0, 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity32ColumnStoreToFrame(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int16_t* source) {
+  const int stride = frame.columns();
+  uint8_t* dst = frame[start_y] + start_x;
+  const __m128i v_two = _mm_set1_epi16(2);
+
+  int i = 0;
+  do {
+    const int row = i * tx_width;
+    int j = 0;
+    do {
+      const __m128i v_dst_i = LoadUnaligned16(&source[row + j]);
+      const __m128i frame_data = LoadLo8(dst + j);
+      const __m128i a = _mm_adds_epi16(v_dst_i, v_two);
+      const __m128i b = _mm_srai_epi16(a, 2);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      StoreLo8(dst + j, _mm_packus_epi16(d, d));
+      j += 8;
+    } while (j < tx_width);
+    dst += stride;
+  } while (++i < tx_height);
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
+                                       const int start_x, const int start_y,
+                                       const void* source,
+                                       const int adjusted_tx_height) {
+  const auto* const src = static_cast<const int16_t*>(source);
+  __m128i s[4], x[4];
+
+  if (adjusted_tx_height == 1) {
+    // Special case: only src[0] is nonzero.
+    //   src[0]  0   0   0
+    //        0  0   0   0
+    //        0  0   0   0
+    //        0  0   0   0
+    //
+    // After the row and column transforms are applied, we have:
+    //        f  h   h   h
+    //        g  i   i   i
+    //        g  i   i   i
+    //        g  i   i   i
+    // where f, g, h, i are computed as follows.
+    int16_t f = (src[0] >> 2) - (src[0] >> 3);
+    const int16_t g = f >> 1;
+    f = f - (f >> 1);
+    const int16_t h = (src[0] >> 3) - (src[0] >> 4);
+    const int16_t i = (src[0] >> 4);
+    s[0] = _mm_set1_epi16(h);
+    s[0] = _mm_insert_epi16(s[0], f, 0);
+    s[1] = _mm_set1_epi16(i);
+    s[1] = _mm_insert_epi16(s[1], g, 0);
+    s[2] = s[3] = s[1];
+  } else {
+    x[0] = LoadLo8(&src[0 * 4]);
+    x[2] = LoadLo8(&src[1 * 4]);
+    x[3] = LoadLo8(&src[2 * 4]);
+    x[1] = LoadLo8(&src[3 * 4]);
+
+    // Row transforms.
+    Transpose4x4_U16(x, x);
+    s[0] = _mm_srai_epi16(x[0], 2);
+    s[2] = _mm_srai_epi16(x[1], 2);
+    s[3] = _mm_srai_epi16(x[2], 2);
+    s[1] = _mm_srai_epi16(x[3], 2);
+    s[0] = _mm_add_epi16(s[0], s[2]);
+    s[3] = _mm_sub_epi16(s[3], s[1]);
+    __m128i e = _mm_sub_epi16(s[0], s[3]);
+    e = _mm_srai_epi16(e, 1);
+    s[1] = _mm_sub_epi16(e, s[1]);
+    s[2] = _mm_sub_epi16(e, s[2]);
+    s[0] = _mm_sub_epi16(s[0], s[1]);
+    s[3] = _mm_add_epi16(s[3], s[2]);
+    Transpose4x4_U16(s, s);
+
+    // Column transforms.
+    s[0] = _mm_add_epi16(s[0], s[2]);
+    s[3] = _mm_sub_epi16(s[3], s[1]);
+    e = _mm_sub_epi16(s[0], s[3]);
+    e = _mm_srai_epi16(e, 1);
+    s[1] = _mm_sub_epi16(e, s[1]);
+    s[2] = _mm_sub_epi16(e, s[2]);
+    s[0] = _mm_sub_epi16(s[0], s[1]);
+    s[3] = _mm_add_epi16(s[3], s[2]);
+  }
+
+  // Store to frame.
+  const int stride = frame.columns();
+  uint8_t* dst = frame[start_y] + start_x;
+  for (int row = 0; row < 4; ++row) {
+    const __m128i frame_data = Load4(dst);
+    const __m128i a = _mm_cvtepu8_epi16(frame_data);
+    // Saturate to prevent overflowing int16_t
+    const __m128i b = _mm_adds_epi16(a, s[row]);
+    Store4(dst, _mm_packus_epi16(b, b));
+    dst += stride;
+  }
+}
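Wht4_SSE4_1 runs the same lifting network twice, once across rows (between the two transposes) and once down columns. The scalar shape of one 4-point pass, mirroring the vector statements above one for one on the already-permuted s[0..3] (a sketch for reference only):

void Wht4Lifting(int16_t s[4]) {
  s[0] = static_cast<int16_t>(s[0] + s[2]);
  s[3] = static_cast<int16_t>(s[3] - s[1]);
  const int16_t e = static_cast<int16_t>((s[0] - s[3]) >> 1);
  s[1] = static_cast<int16_t>(e - s[1]);
  s[2] = static_cast<int16_t>(e - s[2]);
  s[0] = static_cast<int16_t>(s[0] - s[1]);
  s[3] = static_cast<int16_t>(s[3] + s[2]);
}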
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint8_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int16_t* source,
+    TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const __m128i v_eight = _mm_set1_epi16(8);
+  const int stride = frame.columns();
+  uint8_t* dst = frame[start_y] + start_x;
+  if (tx_width == 4) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const __m128i residual = LoadLo8(&source[row]);
+      const __m128i frame_data = Load4(dst);
+      // Saturate to prevent overflowing int16_t
+      const __m128i a = _mm_adds_epi16(residual, v_eight);
+      const __m128i b = _mm_srai_epi16(a, 4);
+      const __m128i c = _mm_cvtepu8_epi16(frame_data);
+      const __m128i d = _mm_adds_epi16(c, b);
+      Store4(dst, _mm_packus_epi16(d, d));
+      dst += stride;
+    }
+  } else if (tx_width == 8) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 8 : i * 8;
+      const __m128i residual = LoadUnaligned16(&source[row]);
+      const __m128i frame_data = LoadLo8(dst);
+      // Saturate to prevent overflowing int16_t
+      const __m128i b = _mm_adds_epi16(residual, v_eight);
+      const __m128i c = _mm_srai_epi16(b, 4);
+      const __m128i d = _mm_cvtepu8_epi16(frame_data);
+      const __m128i e = _mm_adds_epi16(d, c);
+      StoreLo8(dst, _mm_packus_epi16(e, e));
+      dst += stride;
+    }
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const __m128i residual = LoadUnaligned16(&source[row + j]);
+        const __m128i residual_hi = LoadUnaligned16(&source[row + j + 8]);
+        const __m128i frame_data = LoadUnaligned16(frame[y] + x);
+        const __m128i b = _mm_adds_epi16(residual, v_eight);
+        const __m128i b_hi = _mm_adds_epi16(residual_hi, v_eight);
+        const __m128i c = _mm_srai_epi16(b, 4);
+        const __m128i c_hi = _mm_srai_epi16(b_hi, 4);
+        const __m128i d = _mm_cvtepu8_epi16(frame_data);
+        const __m128i d_hi = _mm_cvtepu8_epi16(_mm_srli_si128(frame_data, 8));
+        const __m128i e = _mm_adds_epi16(d, c);
+        const __m128i e_hi = _mm_adds_epi16(d_hi, c_hi);
+        StoreUnaligned16(frame[y] + x, _mm_packus_epi16(e, e_hi));
+        j += 16;
+      } while (j < tx_width);
+    }
+  }
+}
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int16_t* source, int tx_width) {
+  const __m128i word_reverse_8 =
+      _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      // read 16 shorts
+      const __m128i v3210 = LoadUnaligned16(&source[i]);
+      const __m128i v7654 = LoadUnaligned16(&source[i + 8]);
+      const __m128i v0123 = _mm_shuffle_epi8(v3210, word_reverse_8);
+      const __m128i v4567 = _mm_shuffle_epi8(v7654, word_reverse_8);
+      StoreUnaligned16(&source[i], v4567);
+      StoreUnaligned16(&source[i + 8], v0123);
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_shuffle_epi8(a, word_reverse_8);
+      StoreUnaligned16(&source[i], b);
+    }
+  } else {
+    const __m128i dual_word_reverse_4 =
+        _mm_set_epi32(0x09080b0a, 0x0d0c0f0e, 0x01000302, 0x05040706);
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_shuffle_epi8(a, dual_word_reverse_4);
+      StoreUnaligned16(&source[i], b);
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int16_t* source, int num_rows) {
+  const __m128i v_kTransformRowMultiplier =
+      _mm_set1_epi16(kTransformRowMultiplier << 3);
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const __m128i a = LoadUnaligned16(&source[i]);
+      const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+      StoreUnaligned16(&source[i], b);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      // The last 32 values of every row are always zero if the |tx_width| is
+      // 64.
+      const int non_zero_width = (tx_width < 64) ? tx_width : 32;
+      int j = 0;
+      do {
+        const __m128i a = LoadUnaligned16(&source[i * tx_width + j]);
+        const __m128i b = _mm_mulhrs_epi16(a, v_kTransformRowMultiplier);
+        StoreUnaligned16(&source[i * tx_width + j], b);
+        j += 8;
+      } while (j < non_zero_width);
+    } while (++i < num_rows);
+  }
+}
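FlipColumns implements a per-row reversal of 16-bit coefficients with byte shuffles: word_reverse_8 maps the byte pairs (14,15)...(0,1) into positions 0..15 so the eight words of a register come out in reverse order, and dual_word_reverse_4 reverses two independent 4-word rows held in one register. The scalar equivalent of one row (illustrative only):

void FlipRowScalar(int16_t* row, int tx_width) {
  for (int i = 0; i < tx_width / 2; ++i) {
    const int16_t tmp = row[i];
    row[i] = row[tx_width - 1 - i];
    row[tx_width - 1 - i] = tmp;
  }
}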
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int16_t* source, int num_rows,
+                                    int row_shift) {
+  const __m128i v_row_shift_add = _mm_set1_epi16(row_shift);
+  const __m128i v_row_shift = _mm_cvtepu16_epi64(v_row_shift_add);
+  if (tx_width == 4) {
+    // Process two rows per iteration.
+    int i = 0;
+    do {
+      const __m128i residual = LoadUnaligned16(&source[i]);
+      const __m128i shifted_residual =
+          ShiftResidual(residual, v_row_shift_add, v_row_shift);
+      StoreUnaligned16(&source[i], shifted_residual);
+      i += 8;
+    } while (i < tx_width * num_rows);
+  } else {
+    int i = 0;
+    do {
+      for (int j = 0; j < tx_width; j += 8) {
+        const __m128i residual = LoadUnaligned16(&source[i * tx_width + j]);
+        const __m128i shifted_residual =
+            ShiftResidual(residual, v_row_shift_add, v_row_shift);
+        StoreUnaligned16(&source[i * tx_width + j], shifted_residual);
+      }
+    } while (++i < num_rows);
+  }
+}
+
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct4 rows in parallel.
+    Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d dct4 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+                                             /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct4 columns in parallel.
+      Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
+                                              /*transpose=*/false);
+    } else {
+      // Process 8 1d dct4 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
+                                               /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
+}
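Every transform pair below follows the Dct4 pattern just shown: the kRow hook runs the DC-only shortcut, optional rounding, the 1-d row kernels and the row shift in place in src_buffer; the kColumn hook then flips columns if needed, runs the 1-d column kernels and adds the result into the frame. A hedged sketch of how a caller is expected to drive one 2-d inverse transform through these hooks (the real driver lives in the decoder, outside this file):

void RunInverseTransform2d(TransformType tx_type, TransformSize tx_size,
                           int adjusted_tx_height, int16_t* src_buffer,
                           int start_x, int start_y, void* frame) {
  // Row pass: in-place on the residual buffer; the frame is not touched.
  Dct4TransformLoopRow_SSE4_1(tx_type, tx_size, adjusted_tx_height, src_buffer,
                              0, 0, nullptr);
  // Column pass: consumes the row results and writes into the frame.
  Dct4TransformLoopColumn_SSE4_1(tx_type, tx_size, adjusted_tx_height,
                                 src_buffer, start_x, start_y, frame);
}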
+
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct8 rows in parallel.
+    Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                              /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct8 columns in parallel.
+      Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d dct8 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
+}
+
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct16 rows in parallel.
+    Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d dct16 rows in parallel per iteration.
+      Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always non zero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d dct16 columns in parallel.
+      Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      int i = 0;
+      do {
+        // Process 8 1d dct16 columns in parallel per iteration.
+        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                 /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
+}
+
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct32 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always non zero here.
+  RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct32 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+      i += 8;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
+}
+
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct64 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always non zero here.
+  RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct64 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
+      i += 8;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
+}
+
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+    i += 4;
+  } while (i < adjusted_tx_height);
+
+  if (row_shift != 0) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
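A detail of RowShift (and the DcOnly helpers) that looks like a bug but is not: they add row_shift itself before shifting right by row_shift. That is a correctly rounded shift only because the row shifts used here are 0, 1 or 2, for which shift == 1 << (shift - 1) (and 0 is a no-op). A scalar statement of what one coefficient goes through (sketch; ShiftResidual is the add-then-arithmetic-shift helper defined earlier in this file):

int16_t ShiftRowCoefficient(int16_t x, int row_shift) {
  // row_shift is 0, 1 or 2 here, so adding row_shift equals adding the usual
  // round-half bias 1 << (row_shift - 1); for larger shifts the two differ.
  return static_cast<int16_t>((x + row_shift) >> row_shift);
}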
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = 0;
+    do {
+      Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
+      i += 4;
+    } while (i < tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                   tx_width, 4, src, tx_type);
+}
+
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst8 rows in parallel.
+    Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d adst8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst8 columns in parallel.
+      Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      // Process 8 1d adst8 columns in parallel per iteration.
+      int i = 0;
+      do {
+        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                 /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                   tx_width, 8, src, tx_type);
+}
+
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                   TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int /*start_x*/, int /*start_y*/,
+                                   void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst16 rows in parallel.
+    Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d adst16 rows in parallel per iteration.
+      Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                                /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always non zero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int start_x, int start_y,
+                                      void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    if (tx_width == 4) {
+      // Process 4 1d adst16 columns in parallel.
+      Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
+    } else {
+      int i = 0;
+      do {
+        // Process 8 1d adst16 columns in parallel per iteration.
+        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
+                                                  /*transpose=*/false);
+        i += 8;
+      } while (i < tx_width);
+    }
+  }
+  StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                   tx_width, 16, src, tx_type);
+}
+
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+  if (tx_height < 16) {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  } else {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* src_buffer, int start_x,
+                                         int start_y, void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                              adjusted_tx_height, src);
+}
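The identity4 scale implemented by the loops above is sqrt(2): the row kernel computes x + mulhrs(x, kIdentity4MultiplierFraction << 3), where the fraction constant appears to encode sqrt(2) - 1 in Q12 (the constant is defined with the other transform constants in this library). A scalar model under that assumption (illustrative only):

int16_t Identity4Scale(int16_t x) {
  // x * sqrt(2) == x + x * (sqrt(2) - 1); the fraction is multiplied in Q12
  // with rounding, matching the mulhrs-by-(constant << 3) idiom used above.
  const int32_t frac = static_cast<int32_t>(x) * kIdentity4MultiplierFraction;
  return static_cast<int16_t>(x + ((frac + (1 << 11)) >> 12));
}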
+
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from ((A * 2) + 1) >> 1) to A.
+  if ((tx_height & 0x18) != 0) {
+    return;
+  }
+  if (tx_height == 32) {
+    int i = 0;
+    do {
+      Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+      i += 4;
+    } while (i < adjusted_tx_height);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = 0;
+  do {
+    Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* src_buffer, int start_x,
+                                         int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                     adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = 0;
+  do {
+    Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+                         kTransformRowShift[tx_size]);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* src_buffer, int start_x,
+                                          int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                      adjusted_tx_height, src);
+}
+
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
+  const int tx_height = kTransformHeight[tx_size];
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from ((A * 4) + 2) >> 2) to A.
+  if ((tx_height & 0x28) != 0) {
+    return;
+  }
+
+  // Process kTransformSize32x16. The src is always rounded before the
+  // identity transform and shifted by 1 afterwards.
+  auto* src = static_cast<int16_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = 0;
+  do {
+    Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* src_buffer, int start_x,
+                                          int start_y, void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                               adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+                                 int /*adjusted_tx_height*/,
+                                 void* /*src_buffer*/, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+  // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Do both row and column transforms in the column-transform pass.
+  // Process 4 1d wht4 rows and columns in parallel.
+  const auto* src = static_cast<const int16_t*>(src_buffer);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
+}
+
+//------------------------------------------------------------------------------
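InitAll below fills the three-dimensional inverse_transforms table (1-d transform kind, 1-d size, row/column) with the SSE4.1 hooks, and Init8bpp registers either all of them or only the per-size subset the build enables. A hedged sketch of how a consumer-side lookup resolves and invokes one entry, assuming the function-pointer type used for dsp->inverse_transforms in src/dsp/dsp.h is named InverseTransformAddFunc and that the caller has already mapped tx_type/tx_size to the 1-d indices (both assumptions, for illustration only):

void InvokeRowTransform(const Dsp& dsp, TransformType tx_type,
                        TransformSize tx_size, int adjusted_tx_height,
                        int16_t* src_buffer) {
  // k1DTransformDct / k1DTransformSize4 stand in for indices derived from
  // tx_type and tx_size by the caller.
  const InverseTransformAddFunc row_fn =
      dsp.inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow];
  row_fn(tx_type, tx_size, adjusted_tx_height, src_buffer, 0, 0, nullptr);
}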
+
+template <typename Residual, typename Pixel>
+void InitAll(Dsp* const dsp) {
+  // Maximum transform size for Dct is 64.
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+      Dct4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      Dct4TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+      Dct8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      Dct8TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+      Dct16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      Dct16TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+      Dct32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      Dct32TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+      Dct64TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      Dct64TransformLoopColumn_SSE4_1;
+
+  // Maximum transform size for Adst is 16.
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+      Adst4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      Adst4TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+      Adst8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      Adst8TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+      Adst16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      Adst16TransformLoopColumn_SSE4_1;
+
+  // Maximum transform size for Identity transform is 32.
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+      Identity4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      Identity4TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+      Identity8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      Identity8TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+      Identity16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      Identity16TransformLoopColumn_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+      Identity32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      Identity32TransformLoopColumn_SSE4_1;
+
+  // Maximum transform size for Wht is 4.
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+      Wht4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      Wht4TransformLoopColumn_SSE4_1;
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  InitAll<int16_t, uint8_t>(dsp);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+      Dct4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      Dct4TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+      Dct8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      Dct8TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+      Dct16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      Dct16TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+      Dct32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      Dct32TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+      Dct64TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      Dct64TransformLoopColumn_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] = + Adst4TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] = + Adst4TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst) + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] = + Adst8TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] = + Adst8TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst) + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] = + Adst16TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] = + Adst16TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity) + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] = + Identity4TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] = + Identity4TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity) + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] = + Identity8TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] = + Identity8TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity) + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] = + Identity16TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] = + Identity16TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity) + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] = + Identity32TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] = + Identity32TransformLoopColumn_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht) + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] = + Wht4TransformLoopRow_SSE4_1; + dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] = + Wht4TransformLoopColumn_SSE4_1; +#endif +#endif +} + +} // namespace +} // namespace low_bitdepth + +void InverseTransformInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void InverseTransformInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/inverse_transform_sse4.h b/src/dsp/x86/inverse_transform_sse4.h new file mode 100644 index 0000000..106084b --- /dev/null +++ b/src/dsp/x86/inverse_transform_sse4.h @@ -0,0 +1,89 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::inverse_transforms, see the defines below for specifics. +// This function is not thread-safe. +void InverseTransformInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct +#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct +#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct +#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct +#define LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst +#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst +#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity +#define LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity +#define LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity +#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht +#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 +#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_ diff --git a/src/dsp/x86/loop_filter_sse4.cc b/src/dsp/x86/loop_filter_sse4.cc new file mode 100644 index 0000000..d67b450 --- /dev/null +++ b/src/dsp/x86/loop_filter_sse4.cc @@ -0,0 +1,2256 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/dsp/loop_filter.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline __m128i FilterAdd2Sub2(const __m128i& total, const __m128i& a1, + const __m128i& a2, const __m128i& s1, + const __m128i& s2) { + __m128i x = _mm_add_epi16(a1, total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(s1, s2)), a2); + return x; +} + +} // namespace + +namespace low_bitdepth { +namespace { + +inline __m128i AbsDiff(const __m128i& a, const __m128i& b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0, + const __m128i& outer_thresh) { + const __m128i fe = _mm_set1_epi8(static_cast(0xfe)); + // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; + const __m128i abs_pmq = AbsDiff(p1p0, q1q0); + const __m128i a = _mm_adds_epu8(abs_pmq, abs_pmq); + const __m128i b = _mm_srli_epi16(_mm_and_si128(abs_pmq, fe), 1); + const __m128i c = _mm_adds_epu8(a, _mm_srli_si128(b, 4)); + return _mm_subs_epu8(c, outer_thresh); +} + +inline __m128i Hev(const __m128i& qp1, const __m128i& qp0, + const __m128i& hev_thresh) { + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = + _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)); + const __m128i hev_mask0 = _mm_cvtepu8_epi16(max_pq); + const __m128i hev_mask1 = _mm_cmpgt_epi16(hev_mask0, hev_thresh); + const __m128i hev_mask = _mm_packs_epi16(hev_mask1, hev_mask1); + return hev_mask; +} + +inline __m128i AddShift3(const __m128i& a, const __m128i& b) { + const __m128i c = _mm_adds_epi8(a, b); + const __m128i d = _mm_unpacklo_epi8(c, c); + const __m128i e = _mm_srai_epi16(d, 11); /* >> 3 */ + return _mm_packs_epi16(e, e); +} + +inline __m128i AddShift1(const __m128i& a, const __m128i& b) { + const __m128i c = _mm_adds_epi8(a, b); + const __m128i d = _mm_unpacklo_epi8(c, c); + const __m128i e = _mm_srai_epi16(d, 9); /* >> 1 */ + return _mm_packs_epi16(e, e); +} + +//------------------------------------------------------------------------------ +// 4-tap filters + +inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0, + const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i inner_mask = _mm_subs_epu8( + _mm_max_epu8(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 4)), inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi8(a, zero); + return b; +} + +inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1, + __m128i* oqp0, const __m128i& mask, const __m128i& hev) { + const __m128i t80 = _mm_set1_epi8(static_cast(0x80)); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1); + const __m128i qps1qps0 = _mm_xor_si128(qp1qp0, t80); + const __m128i ps1qs0 = _mm_shuffle_epi32(qps1qps0, 0x09); + const __m128i qs1ps0 = _mm_shuffle_epi32(qps1qps0, 0x0c); + const __m128i _hev = _mm_unpacklo_epi32(hev, hev); + const __m128i x = _mm_subs_epi8(ps1qs0, qs1ps0); + __m128i a = _mm_and_si128(_mm_srli_si128(x, 4), _hev); + + a = _mm_adds_epi8(a, x); + a = _mm_adds_epi8(a, x); + a = 
_mm_adds_epi8(a, x); + a = _mm_and_si128(a, mask); + a = _mm_unpacklo_epi32(a, a); + + const __m128i t4t3 = _mm_set_epi32(0x0, 0x0, 0x04040404, 0x03030303); + const __m128i a1a2 = AddShift3(a, t4t3); + const __m128i a1a1 = _mm_shuffle_epi32(a1a2, 0x55); + const __m128i a3a3 = _mm_andnot_si128(_hev, AddShift1(a1a1, t1)); + // -1 -1 -1 -1 1 1 1 1 -1 -1 -1 -1 1 1 1 1 + const __m128i adjust_sign_for_add = + _mm_unpacklo_epi32(t1, _mm_cmpeq_epi8(t1, t1)); + + const __m128i a3a3a1a2 = _mm_unpacklo_epi64(a1a2, a3a3); + const __m128i ma3a3ma1a2 = _mm_sign_epi8(a3a3a1a2, adjust_sign_for_add); + + const __m128i b = _mm_adds_epi8(qps1qps0, ma3a3ma1a2); + const __m128i c = _mm_xor_si128(b, t80); + + *oqp0 = c; + *oqp1 = _mm_srli_si128(c, 8); +} + +void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh), 0); + + const __m128i p1 = Load4(dst - 2 * stride); + const __m128i p0 = Load4(dst - 1 * stride); + const __m128i q0 = Load4(dst + 0 * stride); + const __m128i q1 = Load4(dst + 1 * stride); + const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); + const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); + const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + Store4(dst - 2 * stride, oqp1); + Store4(dst - 1 * stride, oqp0); + Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); + Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); +} + +inline void Transpose4x4(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, __m128i* d0, + __m128i* d1, __m128i* d2, __m128i* d3) { + // input + // x0 00 01 02 03 xx xx xx xx xx xx xx xx xx xx xx xx + // x1 10 11 12 13 xx xx xx xx xx xx xx xx xx xx xx xx + // x2 20 21 22 23 xx xx xx xx xx xx xx xx xx xx xx xx + // x3 30 31 32 33 xx xx xx xx xx xx xx xx xx xx xx xx + // output + // d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + const __m128i w0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + const __m128i w1 = _mm_unpacklo_epi8(x2, x3); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + *d0 = _mm_unpacklo_epi16(w0, w1); + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(*d0, 4); + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, 8); + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, 12); +} + +void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + 
_mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + __m128i x0 = Load4(dst - 2 + 0 * stride); + __m128i x1 = Load4(dst - 2 + 1 * stride); + __m128i x2 = Load4(dst - 2 + 2 * stride); + __m128i x3 = Load4(dst - 2 + 3 * stride); + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + const __m128i w0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + const __m128i w1 = _mm_unpacklo_epi8(x2, x3); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + const __m128i d0 = _mm_unpacklo_epi16(w0, w1); + const __m128i qp1 = _mm_shuffle_epi32(d0, 0xc); + const __m128i qp0 = _mm_srli_si128(d0, 4); + const __m128i q1q0 = _mm_srli_si128(d0, 8); + const __m128i p1p0 = _mm_shuffle_epi32(d0, 0x1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i p1 = oqp1; + const __m128i p0 = oqp0; + const __m128i q0 = _mm_srli_si128(oqp0, 4); + const __m128i q1 = _mm_srli_si128(oqp1, 4); + + Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3); + + Store4(dst - 2 + 0 * stride, x0); + Store4(dst - 2 + 1 * stride, x1); + Store4(dst - 2 + 2 * stride, x2); + Store4(dst - 2 + 3 * stride, x3); +} + +//------------------------------------------------------------------------------ +// 5-tap (chroma) filters + +inline __m128i NeedsFilter6(const __m128i& q1q0, const __m128i& p1p0, + const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); + const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0); + const __m128i inner_mask = _mm_subs_epu8( + _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi8(a, zero); + return b; +} + +inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, const __m128i& flat_thresh) { + const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0); + const __m128i flat_mask = _mm_subs_epu8( + _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_cmpeq_epi8(flat_mask, zero); + return a; +} + +inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0, + __m128i* oqp1, __m128i* oqp0) { + const __m128i four = _mm_set1_epi16(4); + const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2); + const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1); + const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0); + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f6_lo = + _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo)); + + f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo); + + f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p2 * 3 + p1 * 2 + p0 * 2 + q0 
+ // q2 * 3 + q1 * 2 + q0 * 2 + p0 + *oqp1 = _mm_srli_epi16(f6_lo, 3); + *oqp1 = _mm_packus_epi16(*oqp1, *oqp1); + + // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo); + *oqp0 = _mm_srli_epi16(f6_lo, 3); + *oqp0 = _mm_packus_epi16(*oqp0, *oqp0); +} + +void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_flat_thresh = _mm_set1_epi8(1); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + const __m128i p2 = Load4(dst - 3 * stride); + const __m128i p1 = Load4(dst - 2 * stride); + const __m128i p0 = Load4(dst - 1 * stride); + const __m128i q0 = Load4(dst + 0 * stride); + const __m128i q1 = Load4(dst + 1 * stride); + const __m128i q2 = Load4(dst + 2 * stride); + const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); + const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); + const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask = + _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + __m128i oqp1_f6; + __m128i oqp0_f6; + + Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); + + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); + } + + Store4(dst - 2 * stride, oqp1); + Store4(dst - 1 * stride, oqp0); + Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); + Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); +} + +inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, __m128i* d0, + __m128i* d1, __m128i* d2, __m128i* d3, + __m128i* d4, __m128i* d5, __m128i* d6, + __m128i* d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + const __m128i w0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + const __m128i w1 = _mm_unpacklo_epi8(x2, x3); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + const __m128i ww0 = _mm_unpacklo_epi16(w0, w1); + // 04 14 24 34 05 15 25
35 06 16 26 36 07 17 27 37 + const __m128i ww1 = _mm_unpackhi_epi16(w0, w1); + + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d0 = ww0; + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, 4); + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, 8); + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, 12); + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d4 = ww1; + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, 4); + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, 8); + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, 12); +} + +void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_flat_thresh = _mm_set1_epi8(1); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + __m128i x0 = LoadLo8(dst - 3 + 0 * stride); + __m128i x1 = LoadLo8(dst - 3 + 1 * stride); + __m128i x2 = LoadLo8(dst - 3 + 2 * stride); + __m128i x3 = LoadLo8(dst - 3 + 3 * stride); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i z0, z1; // not used + + Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1); + + const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); + const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); + const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter6(q1q0, p1p0, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask = + _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + __m128i oqp1_f6; + __m128i oqp0_f6; + + Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); + + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); + } + + p1 = oqp1; + p0 = oqp0; + q0 = _mm_srli_si128(oqp0, 4); + q1 = _mm_srli_si128(oqp1, 4); + + Transpose4x4(p1, p0, q0, q1, &x0, &x1, &x2, &x3); + + Store4(dst - 2 + 0 * stride, x0); + Store4(dst - 2 + 1 * stride, x1); + Store4(dst - 2 + 2 * stride, x2); + Store4(dst - 2 + 3 * stride, x3); +} + +//------------------------------------------------------------------------------ +// 7-tap filters + +inline __m128i NeedsFilter8(const __m128i& q1q0, const __m128i& p1p0, + const __m128i& qp3, const __m128i& qp2, + const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); + const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq_a = _mm_max_epu8(abs_qp2mqp1, abs_qp1mqp0); + const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2); + const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq2);
+ const __m128i inner_mask = _mm_subs_epu8( + _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi8(a, zero); + return b; +} + +inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2, + const __m128i& qp1, const __m128i& qp0, + const __m128i& flat_thresh) { + const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq_a = _mm_max_epu8(abs_pq2mpq0, abs_qp1mqp0); + const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0); + const __m128i max_pq = _mm_max_epu8(max_pq_a, abs_pq3mpq0); + const __m128i flat_mask = _mm_subs_epu8( + _mm_max_epu8(max_pq, _mm_srli_si128(max_pq, 4)), flat_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_cmpeq_epi8(flat_mask, zero); + return a; +} + +inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, __m128i* oqp2, __m128i* oqp1, + __m128i* oqp0) { + const __m128i four = _mm_set1_epi16(4); + const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3); + const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2); + const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1); + const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0); + const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f8_lo = + _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo)); + + f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo); + + f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0 + *oqp2 = _mm_srli_epi16(f8_lo, 3); + *oqp2 = _mm_packus_epi16(*oqp2, *oqp2); + + // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1 + f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo); + *oqp1 = _mm_srli_epi16(f8_lo, 3); + *oqp1 = _mm_packus_epi16(*oqp1, *oqp1); + + // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2 + f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo); + *oqp0 = _mm_srli_epi16(f8_lo, 3); + *oqp0 = _mm_packus_epi16(*oqp0, *oqp0); +} + +void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_flat_thresh = _mm_set1_epi8(1); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + const __m128i p3 = Load4(dst - 4 * stride); + const __m128i p2 = Load4(dst - 3 * stride); + const __m128i p1 = Load4(dst - 2 * stride); + const __m128i p0 = Load4(dst - 1 * stride); + const __m128i q0 = Load4(dst + 0 * stride); + const __m128i q1 = Load4(dst + 1 * stride); + const __m128i q2 = Load4(dst + 2 * stride); + const __m128i q3 = Load4(dst + 3 * stride); + + const __m128i qp3 = _mm_unpacklo_epi32(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); + const __m128i q1q0 =
_mm_unpacklo_epi32(q0, q1); + const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, + v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask = + _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + Store4(dst - 3 * stride, oqp2_f8); + Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4)); + } + + Store4(dst - 2 * stride, oqp1); + Store4(dst - 1 * stride, oqp0); + Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); + Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); +} + +inline void Transpose8x8To8x4(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, + const __m128i& x4, const __m128i& x5, + const __m128i& x6, const __m128i& x7, __m128i* d0, + __m128i* d1, __m128i* d2, __m128i* d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + const __m128i w0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + const __m128i w1 = _mm_unpacklo_epi8(x2, x3); + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + const __m128i w2 = _mm_unpacklo_epi8(x4, x5); + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i w3 = _mm_unpacklo_epi8(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + const __m128i w4 = _mm_unpacklo_epi16(w0, w1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + const __m128i w5 = _mm_unpacklo_epi16(w2, w3); + + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d0 = _mm_unpacklo_epi32(w4, w5); + *d1 = _mm_srli_si128(*d0, 8); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d2 = _mm_unpackhi_epi32(w4, w5); + *d3 = _mm_srli_si128(*d2, 8); +} + +void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_flat_thresh = _mm_set1_epi8(1); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + __m128i x0 = LoadLo8(dst - 4 + 0 * stride); + __m128i x1 = LoadLo8(dst - 4 + 1 * stride); + __m128i x2 = LoadLo8(dst - 4 + 2 * stride); + __m128i x3 = LoadLo8(dst - 4 + 3 * stride); + + __m128i p3, p2, p1, p0, q0, q1,
q2, q3; + Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + + const __m128i qp3 = _mm_unpacklo_epi32(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); + const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); + const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, + v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask = + _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + p2 = oqp2_f8; + q2 = _mm_srli_si128(oqp2_f8, 4); + } + + p1 = oqp1; + p0 = oqp0; + q0 = _mm_srli_si128(oqp0, 4); + q1 = _mm_srli_si128(oqp1, 4); + + Transpose8x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3); + + StoreLo8(dst - 4 + 0 * stride, x0); + StoreLo8(dst - 4 + 1 * stride, x1); + StoreLo8(dst - 4 + 2 * stride, x2); + StoreLo8(dst - 4 + 3 * stride, x3); +} + +//------------------------------------------------------------------------------ +// 13-tap filters + +inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4, + const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, __m128i* oqp5, __m128i* oqp4, + __m128i* oqp3, __m128i* oqp2, __m128i* oqp1, + __m128i* oqp0) { + const __m128i eight = _mm_set1_epi16(8); + const __m128i qp6_lo = _mm_cvtepu8_epi16(qp6); + const __m128i qp5_lo = _mm_cvtepu8_epi16(qp5); + const __m128i qp4_lo = _mm_cvtepu8_epi16(qp4); + const __m128i qp3_lo = _mm_cvtepu8_epi16(qp3); + const __m128i qp2_lo = _mm_cvtepu8_epi16(qp2); + const __m128i qp1_lo = _mm_cvtepu8_epi16(qp1); + const __m128i qp0_lo = _mm_cvtepu8_epi16(qp0); + const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e); + const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e); + const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e); + const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f14_lo = + _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo), + _mm_add_epi16(qp5_lo, qp4_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo), + _mm_add_epi16(qp3_lo, qp2_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0 + *oqp5 = _mm_srli_epi16(f14_lo, 4); + *oqp5 = _mm_packus_epi16(*oqp5, *oqp5); + + // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1 + f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo); + *oqp4 = _mm_srli_epi16(f14_lo, 4); + *oqp4 = _mm_packus_epi16(*oqp4, *oqp4); + + // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 
+ q1 + q2 + // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2 + f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo); + *oqp3 = _mm_srli_epi16(f14_lo, 4); + *oqp3 = _mm_packus_epi16(*oqp3, *oqp3); + + // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3 + f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo); + *oqp2 = _mm_srli_epi16(f14_lo, 4); + *oqp2 = _mm_packus_epi16(*oqp2, *oqp2); + + // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4 + f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo); + *oqp1 = _mm_srli_epi16(f14_lo, 4); + *oqp1 = _mm_packus_epi16(*oqp1, *oqp1); + + // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5 + f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo); + *oqp0 = _mm_srli_epi16(f14_lo, 4); + *oqp0 = _mm_packus_epi16(*oqp0, *oqp0); +} + +void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_flat_thresh = _mm_set1_epi8(1); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + const __m128i p3 = Load4(dst - 4 * stride); + const __m128i p2 = Load4(dst - 3 * stride); + const __m128i p1 = Load4(dst - 2 * stride); + const __m128i p0 = Load4(dst - 1 * stride); + const __m128i q0 = Load4(dst + 0 * stride); + const __m128i q1 = Load4(dst + 1 * stride); + const __m128i q2 = Load4(dst + 2 * stride); + const __m128i q3 = Load4(dst + 3 * stride); + + const __m128i qp3 = _mm_unpacklo_epi32(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi32(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi32(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi32(p0, q0); + const __m128i q1q0 = _mm_unpacklo_epi32(q0, q1); + const __m128i p1p0 = _mm_unpacklo_epi32(p0, p1); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, + v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask = + _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + const __m128i p6 = Load4(dst - 7 * stride); + const __m128i p5 = Load4(dst - 6 * stride); + const __m128i p4 = Load4(dst - 5 * stride); + const __m128i q4 = Load4(dst + 4 * stride); + const __m128i q5 = Load4(dst + 5 * stride); + const __m128i q6 = Load4(dst + 6 * stride); + const __m128i qp6 = _mm_unpacklo_epi32(p6, q6); + const __m128i qp5 = _mm_unpacklo_epi32(p5, q5); + const __m128i qp4 = _mm_unpacklo_epi32(p4, q4); + + const __m128i v_isflatouter4_mask = + IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); + const __m128i v_flat4_mask = + _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0); + + __m128i oqp2_f8; + __m128i
oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + if (_mm_test_all_zeros(v_flat4_mask, + _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { + __m128i oqp5_f14; + __m128i oqp4_f14; + __m128i oqp3_f14; + __m128i oqp2_f14; + __m128i oqp1_f14; + __m128i oqp0_f14; + + Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, + &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); + + oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); + oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); + oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); + oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); + + Store4(dst - 6 * stride, oqp5_f14); + Store4(dst - 5 * stride, oqp4_f14); + Store4(dst - 4 * stride, oqp3_f14); + Store4(dst + 3 * stride, _mm_srli_si128(oqp3_f14, 4)); + Store4(dst + 4 * stride, _mm_srli_si128(oqp4_f14, 4)); + Store4(dst + 5 * stride, _mm_srli_si128(oqp5_f14, 4)); + } + + Store4(dst - 3 * stride, oqp2_f8); + Store4(dst + 2 * stride, _mm_srli_si128(oqp2_f8, 4)); + } + + Store4(dst - 2 * stride, oqp1); + Store4(dst - 1 * stride, oqp0); + Store4(dst + 0 * stride, _mm_srli_si128(oqp0, 4)); + Store4(dst + 1 * stride, _mm_srli_si128(oqp1, 4)); +} + +// Each of the 8x4 blocks of input data (p7-p0 and q0-q7) is transposed to 4x8, +// then unpacked to the correct qp register. (qp7 - qp0) +// +// p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 +// +// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f +// 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f +// 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f +// 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f + +inline void DualTranspose8x4To4x8(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, + __m128i* q0p0, __m128i* q1p1, __m128i* q2p2, + __m128i* q3p3, __m128i* q4p4, __m128i* q5p5, + __m128i* q6p6, __m128i* q7p7) { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + const __m128i w0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + const __m128i w1 = _mm_unpacklo_epi8(x2, x3); + // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f + const __m128i w2 = _mm_unpackhi_epi8(x0, x1); + // 28 38 29 39 2a 3a 2b 3b 2c 3c 2d 3d 2e 3e 2f 3f + const __m128i w3 = _mm_unpackhi_epi8(x2, x3); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + const __m128i ww0 = _mm_unpacklo_epi16(w0, w1); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + const __m128i ww1 = _mm_unpackhi_epi16(w0, w1); + // 08 18 28 38 09 19 29 39 0a 1a 2a 3a 0b 1b 2b 3b + const __m128i ww2 = _mm_unpacklo_epi16(w2, w3); + // 0c 1c 2c 3c 0d 1d 2d 3d 0e 1e 2e 3e 0f 1f 2f 3f + const __m128i ww3 = _mm_unpackhi_epi16(w2, w3); + // 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx + *q7p7 = _mm_unpacklo_epi32(ww0, _mm_srli_si128(ww3, 12)); + // 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32(_mm_slli_si128(ww0, 4), ww3); + // 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx + *q5p5 = _mm_unpackhi_epi32(ww0, _mm_slli_si128(ww3, 4)); + // 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx + *q4p4 = _mm_unpacklo_epi32(_mm_srli_si128(ww0, 12), ww3); + // 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32(ww1, _mm_srli_si128(ww2, 12)); + // 05 15 25 35 0a 1a 2a
3a xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32(_mm_slli_si128(ww1, 4), ww2); + // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32(ww1, _mm_slli_si128(ww2, 4)); + // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32(_mm_srli_si128(ww1, 12), ww2); +} + +inline void DualTranspose4x8To8x4(const __m128i& qp7, const __m128i& qp6, + const __m128i& qp5, const __m128i& qp4, + const __m128i& qp3, const __m128i& qp2, + const __m128i& qp1, const __m128i& qp0, + __m128i* x0, __m128i* x1, __m128i* x2, + __m128i* x3) { + // qp7: 00 10 20 30 0f 1f 2f 3f xx xx xx xx xx xx xx xx + // qp6: 01 11 21 31 0e 1e 2e 3e xx xx xx xx xx xx xx xx + // qp5: 02 12 22 32 0d 1d 2d 3d xx xx xx xx xx xx xx xx + // qp4: 03 13 23 33 0c 1c 2c 3c xx xx xx xx xx xx xx xx + // qp3: 04 14 24 34 0b 1b 2b 3b xx xx xx xx xx xx xx xx + // qp2: 05 15 25 35 0a 1a 2a 3a xx xx xx xx xx xx xx xx + // qp1: 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + // qp0: 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx + + // 00 01 10 11 20 21 30 31 0f 0e 1f 1e 2f 2e 3f 3e + const __m128i w0 = _mm_unpacklo_epi8(qp7, qp6); + // 02 03 12 13 22 23 32 33 xx xx xx xx xx xx xx xx + const __m128i w1 = _mm_unpacklo_epi8(qp5, qp4); + // 04 05 14 15 24 25 34 35 xx xx xx xx xx xx xx xx + const __m128i w2 = _mm_unpacklo_epi8(qp3, qp2); + // 06 07 16 17 26 27 36 37 xx xx xx xx xx xx xx xx + const __m128i w3 = _mm_unpacklo_epi8(qp1, qp0); + // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 + const __m128i w4 = _mm_unpacklo_epi16(w0, w1); + // 04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37 + const __m128i w5 = _mm_unpacklo_epi16(w2, w3); + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + const __m128i d0 = _mm_unpacklo_epi32(w4, w5); + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + const __m128i d2 = _mm_unpackhi_epi32(w4, w5); + // xx xx xx xx xx xx xx xx 08 09 18 19 28 29 38 39 + const __m128i w10 = _mm_unpacklo_epi8(qp0, qp1); + // xx xx xx xx xx xx xx xx 0a 0b 1a 1b 2a 2b 3a 3b + const __m128i w11 = _mm_unpacklo_epi8(qp2, qp3); + // xx xx xx xx xx xx xx xx 0c 0d 1c 1d 2c 2d 3c 3d + const __m128i w12 = _mm_unpacklo_epi8(qp4, qp5); + // xx xx xx xx xx xx xx xx 0e 0f 1e 1f 2e 2f 3e 3f + const __m128i w13 = _mm_unpacklo_epi8(qp6, qp7); + // 08 09 0a 0b 18 19 1a 1b 28 29 2a 2b 38 39 3a 3b + const __m128i w14 = _mm_unpackhi_epi16(w10, w11); + // 0c 0d 0e 0f 1c 1d 1e 1f 2c 2d 2e 2f 3c 3d 3e 3f + const __m128i w15 = _mm_unpackhi_epi16(w12, w13); + // 08 09 0a 0b 0c 0d 0e 0f 18 19 1a 1b 1c 1d 1e 1f + const __m128i d1 = _mm_unpacklo_epi32(w14, w15); + // 28 29 2a 2b 2c 2d 2e 2f 38 39 3a 3b 3c 3d 3e 3f + const __m128i d3 = _mm_unpackhi_epi32(w14, w15); + + // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 + // + // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f + *x0 = _mm_unpacklo_epi64(d0, d1); + // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f + *x1 = _mm_unpackhi_epi64(d0, d1); + // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f + *x2 = _mm_unpacklo_epi64(d2, d3); + // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f + *x3 = _mm_unpackhi_epi64(d2, d3); +} + +void Vertical14(void* dest, ptrdiff_t stride, + int outer_thresh, int inner_thresh, int hev_thresh) { + auto* const dst = static_cast<uint8_t*>(dest); + const __m128i zero = _mm_setzero_si128(); + const __m128i v_flat_thresh = _mm_set1_epi8(1); + const __m128i v_outer_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(outer_thresh), zero); + const __m128i v_inner_thresh = + _mm_shuffle_epi8(_mm_cvtsi32_si128(inner_thresh), zero); + const __m128i
v_hev_thresh0 = + _mm_shuffle_epi8(_mm_cvtsi32_si128(hev_thresh), zero); + const __m128i v_hev_thresh = _mm_unpacklo_epi8(v_hev_thresh0, zero); + + __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride); + __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride); + __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride); + __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride); + + __m128i qp7, qp6, qp5, qp4, qp3, qp2, qp1, qp0; + + DualTranspose8x4To4x8(x0, x1, x2, x3, &qp0, &qp1, &qp2, &qp3, &qp4, &qp5, + &qp6, &qp7); + + const __m128i qp1qp0 = _mm_unpacklo_epi64(qp0, qp1); + const __m128i q1q0 = _mm_shuffle_epi32(qp1qp0, 0x0d); + const __m128i p1p0 = _mm_shuffle_epi32(qp1qp0, 0x08); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = NeedsFilter8(q1q0, p1p0, qp3, qp2, qp1, qp0, + v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask = + _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) { + const __m128i v_isflatouter4_mask = + IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); + const __m128i v_flat4_mask = + _mm_shuffle_epi32(_mm_and_si128(v_mask, v_isflatouter4_mask), 0); + + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + if (_mm_test_all_zeros(v_flat4_mask, + _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) { + __m128i oqp5_f14; + __m128i oqp4_f14; + __m128i oqp3_f14; + __m128i oqp2_f14; + __m128i oqp1_f14; + __m128i oqp0_f14; + + Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, + &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); + + oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); + oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); + oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); + oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); + qp3 = oqp3_f14; + qp4 = oqp4_f14; + qp5 = oqp5_f14; + } + qp2 = oqp2_f8; + } + + DualTranspose4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, &x2, + &x3); + + StoreUnaligned16(dst - 8 + 0 * stride, x0); + StoreUnaligned16(dst - 8 + 1 * stride, x1); + StoreUnaligned16(dst - 8 + 2 * stride, x2); + StoreUnaligned16(dst - 8 + 3 * stride, x3); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = Horizontal4; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = Horizontal6; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = Horizontal8; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Horizontal14; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical) +
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = Vertical14; +#endif +} +} // namespace +} // namespace low_bitdepth + +//------------------------------------------------------------------------------ +namespace high_bitdepth { +namespace { + +#if LIBGAV1_MAX_BITDEPTH >= 10 + +template <int bitdepth> +struct LoopFilterFuncs_SSE4_1 { + LoopFilterFuncs_SSE4_1() = delete; + + static constexpr int kThreshShift = bitdepth - 8; + + static void Vertical4(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Horizontal4(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Vertical6(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Horizontal6(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Vertical8(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Horizontal8(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Vertical14(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); + static void Horizontal14(void* dest, ptrdiff_t stride, int outer_thresh, + int inner_thresh, int hev_thresh); +}; + +inline __m128i Clamp(const __m128i& min, const __m128i& max, + const __m128i& val) { + const __m128i a = _mm_min_epi16(val, max); + const __m128i b = _mm_max_epi16(a, min); + return b; +} + +inline __m128i AddShift3(const __m128i& a, const __m128i& b, + const __m128i& vmin, const __m128i& vmax) { + const __m128i c = _mm_adds_epi16(a, b); + const __m128i d = Clamp(vmin, vmax, c); + const __m128i e = _mm_srai_epi16(d, 3); /* >> 3 */ + return e; +} + +inline __m128i AddShift1(const __m128i& a, const __m128i& b) { + const __m128i c = _mm_adds_epi16(a, b); + const __m128i e = _mm_srai_epi16(c, 1); /* >> 1 */ + return e; +} + +inline __m128i AbsDiff(const __m128i& a, const __m128i& b) { + return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); +} + +inline __m128i Hev(const __m128i& qp1, const __m128i& qp0, + const __m128i& hev_thresh) { + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = + _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8)); + const __m128i hev_mask = _mm_cmpgt_epi16(max_pq, hev_thresh); + return hev_mask; +} + +inline __m128i CheckOuterThreshF4(const __m128i& q1q0, const __m128i& p1p0, + const __m128i& outer_thresh) { + // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; + const __m128i abs_pmq = AbsDiff(p1p0, q1q0); + const __m128i a = _mm_adds_epu16(abs_pmq, abs_pmq); + const __m128i b = _mm_srli_epi16(abs_pmq, 1); + const __m128i c = _mm_adds_epu16(a, _mm_srli_si128(b, 8)); + return _mm_subs_epu16(c, outer_thresh); +} + +inline __m128i NeedsFilter4(const __m128i& q1q0, const __m128i& p1p0, + const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF4(q1q0, p1p0, outer_thresh); + const __m128i
abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_abs_qp1mqp = + _mm_max_epu16(abs_qp1mqp0, _mm_srli_si128(abs_qp1mqp0, 8)); + const __m128i inner_mask = _mm_subs_epu16(max_abs_qp1mqp, inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi16(a, zero); + return b; +} + +inline void Filter4(const __m128i& qp1, const __m128i& qp0, __m128i* oqp1, + __m128i* oqp0, const __m128i& mask, const __m128i& hev, + int bitdepth) { + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t80 = _mm_set1_epi16(static_cast<int16_t>(1 << (bitdepth - 1))); + const __m128i t1 = _mm_set1_epi16(0x1); + const __m128i vmin = _mm_subs_epi16(_mm_setzero_si128(), t80); + const __m128i vmax = _mm_subs_epi16(t80, t1); + const __m128i ps1 = _mm_subs_epi16(qp1, t80); + const __m128i ps0 = _mm_subs_epi16(qp0, t80); + const __m128i qs0 = _mm_srli_si128(ps0, 8); + const __m128i qs1 = _mm_srli_si128(ps1, 8); + + __m128i a = _mm_subs_epi16(ps1, qs1); + a = _mm_and_si128(Clamp(vmin, vmax, a), hev); + + const __m128i x = _mm_subs_epi16(qs0, ps0); + a = _mm_adds_epi16(a, x); + a = _mm_adds_epi16(a, x); + a = _mm_adds_epi16(a, x); + a = _mm_and_si128(Clamp(vmin, vmax, a), mask); + + const __m128i a1 = AddShift3(a, t4, vmin, vmax); + const __m128i a2 = AddShift3(a, t3, vmin, vmax); + const __m128i a3 = _mm_andnot_si128(hev, AddShift1(a1, t1)); + + const __m128i ops1 = _mm_adds_epi16(ps1, a3); + const __m128i ops0 = _mm_adds_epi16(ps0, a2); + const __m128i oqs0 = _mm_subs_epi16(qs0, a1); + const __m128i oqs1 = _mm_subs_epi16(qs1, a3); + + __m128i oqps1 = _mm_unpacklo_epi64(ops1, oqs1); + __m128i oqps0 = _mm_unpacklo_epi64(ops0, oqs0); + + oqps1 = Clamp(vmin, vmax, oqps1); + oqps0 = Clamp(vmin, vmax, oqps0); + + *oqp1 = _mm_adds_epi16(oqps1, t80); + *oqp0 = _mm_adds_epi16(oqps0, t80); +} + +template <int bitdepth> +void LoopFilterFuncs_SSE4_1<bitdepth>::Horizontal4(void* dest, + ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<uint16_t*>(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + const __m128i p1 = LoadLo8(dst - 2 * stride); + const __m128i p0 = LoadLo8(dst - 1 * stride); + const __m128i qp0 = LoadHi8(p0, dst + 0 * stride); + const __m128i qp1 = LoadHi8(p1, dst + 1 * stride); + const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1); + const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1); + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + StoreLo8(dst - 2 * stride, oqp1); + StoreLo8(dst - 1 * stride, oqp0); + StoreHi8(dst + 0 * stride, oqp0); + StoreHi8(dst + 1 * stride, oqp1); +} + +template <int bitdepth> +void LoopFilterFuncs_SSE4_1<bitdepth>::Vertical4(void* dest, ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<uint16_t*>(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = +
_mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + const __m128i x0 = LoadLo8(dst - 2 + 0 * stride); + const __m128i x1 = LoadLo8(dst - 2 + 1 * stride); + const __m128i x2 = LoadLo8(dst - 2 + 2 * stride); + const __m128i x3 = LoadLo8(dst - 2 + 3 * stride); + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpacklo_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 01 11 21 31 p0p1 + const __m128i a = _mm_unpacklo_epi32(w0, w1); + const __m128i p1p0 = _mm_shuffle_epi32(a, 0x4e); + // 02 12 22 32 03 13 23 33 q1q0 + const __m128i q1q0 = _mm_unpackhi_epi32(w0, w1); + const __m128i qp1 = _mm_unpackhi_epi64(p1p0, q1q0); + const __m128i qp0 = _mm_unpacklo_epi64(p1p0, q1q0); + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter4(q1q0, p1p0, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + // 00 10 01 11 02 12 03 13 + const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0); + // 20 30 21 31 22 32 23 33 + const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1); + // 00 10 20 30 01 11 21 31 + const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3); + + StoreLo8(dst - 2 + 0 * stride, op0p1); + StoreHi8(dst - 2 + 1 * stride, op0p1); + StoreLo8(dst - 2 + 2 * stride, oq1q0); + StoreHi8(dst - 2 + 3 * stride, oq1q0); +} + +//------------------------------------------------------------------------------ +// 5-tap (chroma) filters + +inline __m128i CheckOuterThreshF6(const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh) { + // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh; + const __m128i q1q0 = _mm_unpackhi_epi64(qp0, qp1); + const __m128i p1p0 = _mm_unpacklo_epi64(qp0, qp1); + return CheckOuterThreshF4(q1q0, p1p0, outer_thresh); +} + +inline __m128i NeedsFilter6(const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh); + const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0); + const __m128i inner_mask = _mm_subs_epu16( + _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi16(a, zero); + return b; +} + +inline __m128i IsFlat3(const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, const __m128i& flat_thresh) { + const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0); + const __m128i flat_mask = _mm_subs_epu16( + _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_cmpeq_epi16(flat_mask, zero); + return a; +} + +inline void Filter6(const __m128i& qp2, const __m128i& qp1, const __m128i& qp0, + __m128i* oqp1, __m128i* oqp0) { + const __m128i four = _mm_set1_epi16(4); + const __m128i qp2_lo = qp2; + const __m128i qp1_lo = qp1; + const __m128i qp0_lo = qp0; + const __m128i pq1_lo = 
_mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f6_lo; + f6_lo = + _mm_add_epi16(_mm_add_epi16(qp2_lo, four), _mm_add_epi16(qp2_lo, qp2_lo)); + + f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp1_lo), qp1_lo); + + f6_lo = _mm_add_epi16(_mm_add_epi16(f6_lo, qp0_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p2 * 3 + p1 * 2 + p0 * 2 + q0 + // q2 * 3 + q1 * 2 + q0 * 2 + p0 + *oqp1 = _mm_srli_epi16(f6_lo, 3); + + // p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + // q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + f6_lo = FilterAdd2Sub2(f6_lo, pq0_lo, pq1_lo, qp2_lo, qp2_lo); + *oqp0 = _mm_srli_epi16(f6_lo, 3); +} + +template +void LoopFilterFuncs_SSE4_1::Horizontal6(void* dest, + ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + const __m128i p2 = LoadLo8(dst - 3 * stride); + const __m128i p1 = LoadLo8(dst - 2 * stride); + const __m128i p0 = LoadLo8(dst - 1 * stride); + const __m128i q0 = LoadLo8(dst + 0 * stride); + const __m128i q1 = LoadLo8(dst + 1 * stride); + const __m128i q2 = LoadLo8(dst + 2 * stride); + + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + __m128i oqp1_f6; + __m128i oqp0_f6; + + Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); + + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); + } + + StoreLo8(dst - 2 * stride, oqp1); + StoreLo8(dst - 1 * stride, oqp0); + StoreHi8(dst + 0 * stride, oqp0); + StoreHi8(dst + 1 * stride, oqp1); +} + +inline void Transpose8x4To4x8(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, __m128i* d0, + __m128i* d1, __m128i* d2, __m128i* d3, + __m128i* d4, __m128i* d5, __m128i* d6, + __m128i* d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpacklo_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpacklo_epi16(x2, x3); + // 04 14 05 15 06 16 07 17 + const __m128i w2 = _mm_unpackhi_epi16(x0, x1); + // 24 34 25 35 26 36 27 37 + const __m128i w3 = _mm_unpackhi_epi16(x2, x3); + + // 00 10 20 
30 01 11 21 31 + const __m128i ww0 = _mm_unpacklo_epi32(w0, w1); + // 04 14 24 34 05 15 25 35 + const __m128i ww1 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i ww2 = _mm_unpackhi_epi32(w0, w1); + // 06 16 26 36 07 17 27 37 + const __m128i ww3 = _mm_unpackhi_epi32(w2, w3); + + // 00 10 20 30 xx xx xx xx + *d0 = ww0; + // 01 11 21 31 xx xx xx xx + *d1 = _mm_srli_si128(ww0, 8); + // 02 12 22 32 xx xx xx xx + *d2 = ww2; + // 03 13 23 33 xx xx xx xx + *d3 = _mm_srli_si128(ww2, 8); + // 04 14 24 34 xx xx xx xx + *d4 = ww1; + // 05 15 25 35 xx xx xx xx + *d5 = _mm_srli_si128(ww1, 8); + // 06 16 26 36 xx xx xx xx + *d6 = ww3; + // 07 17 27 37 xx xx xx xx + *d7 = _mm_srli_si128(ww3, 8); +} + +template +void LoopFilterFuncs_SSE4_1::Vertical6(void* dest, ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + __m128i x0 = LoadUnaligned16(dst - 3 + 0 * stride); + __m128i x1 = LoadUnaligned16(dst - 3 + 1 * stride); + __m128i x2 = LoadUnaligned16(dst - 3 + 2 * stride); + __m128i x3 = LoadUnaligned16(dst - 3 + 3 * stride); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i z0, z1; // not used + + Transpose8x4To4x8(x0, x1, x2, x3, &p2, &p1, &p0, &q0, &q1, &q2, &z0, &z1); + + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter6(qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat3_mask = IsFlat3(qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + __m128i oqp1_f6; + __m128i oqp0_f6; + + Filter6(qp2, qp1, qp0, &oqp1_f6, &oqp0_f6); + + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f6, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f6, v_mask); + } + + // 00 10 01 11 02 12 03 13 + const __m128i w2 = _mm_unpacklo_epi16(oqp1, oqp0); + // 20 30 21 31 22 32 23 33 + const __m128i w3 = _mm_unpackhi_epi16(oqp0, oqp1); + // 00 10 20 30 01 11 21 31 + const __m128i op0p1 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i oq1q0 = _mm_unpackhi_epi32(w2, w3); + + StoreLo8(dst - 2 + 0 * stride, op0p1); + StoreHi8(dst - 2 + 1 * stride, op0p1); + StoreLo8(dst - 2 + 2 * stride, oq1q0); + StoreHi8(dst - 2 + 3 * stride, oq1q0); +} + +//------------------------------------------------------------------------------ +// 7-tap filters +inline __m128i NeedsFilter8(const __m128i& qp3, const __m128i& qp2, + const __m128i& qp1, const __m128i& qp0, + const __m128i& outer_thresh, + const __m128i& inner_thresh) { + const __m128i outer_mask = CheckOuterThreshF6(qp1, qp0, outer_thresh); + const __m128i abs_qp2mqp1 = AbsDiff(qp2, qp1); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, 
qp0); + const __m128i max_pq_a = _mm_max_epu16(abs_qp2mqp1, abs_qp1mqp0); + const __m128i abs_pq3mpq2 = AbsDiff(qp3, qp2); + const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq2); + const __m128i inner_mask = _mm_subs_epu16( + _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), inner_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_or_si128(outer_mask, inner_mask); + const __m128i b = _mm_cmpeq_epi16(a, zero); + return b; +} + +inline __m128i IsFlat4(const __m128i& qp3, const __m128i& qp2, + const __m128i& qp1, const __m128i& qp0, + const __m128i& flat_thresh) { + const __m128i abs_pq2mpq0 = AbsDiff(qp2, qp0); + const __m128i abs_qp1mqp0 = AbsDiff(qp1, qp0); + const __m128i max_pq_a = _mm_max_epu16(abs_pq2mpq0, abs_qp1mqp0); + const __m128i abs_pq3mpq0 = AbsDiff(qp3, qp0); + const __m128i max_pq = _mm_max_epu16(max_pq_a, abs_pq3mpq0); + const __m128i flat_mask = _mm_subs_epu16( + _mm_max_epu16(max_pq, _mm_srli_si128(max_pq, 8)), flat_thresh); + // ~mask + const __m128i zero = _mm_setzero_si128(); + const __m128i a = _mm_cmpeq_epi16(flat_mask, zero); + return a; +} + +inline void Filter8(const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, __m128i* oqp2, __m128i* oqp1, + __m128i* oqp0) { + const __m128i four = _mm_set1_epi16(4); + const __m128i qp3_lo = qp3; + const __m128i qp2_lo = qp2; + const __m128i qp1_lo = qp1; + const __m128i qp0_lo = qp0; + const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f8_lo = + _mm_add_epi16(_mm_add_epi16(qp3_lo, four), _mm_add_epi16(qp3_lo, qp3_lo)); + + f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp2_lo), qp2_lo); + + f8_lo = _mm_add_epi16(_mm_add_epi16(f8_lo, qp1_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + // q3 + q3 + q3 + 2 * q2 + q1 + q0 + p0 + *oqp2 = _mm_srli_epi16(f8_lo, 3); + + // p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + // q3 + q3 + q2 + 2 * q1 + q0 + p0 + p1 + f8_lo = FilterAdd2Sub2(f8_lo, qp1_lo, pq1_lo, qp3_lo, qp2_lo); + *oqp1 = _mm_srli_epi16(f8_lo, 3); + + // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + // q3 + q2 + q1 + 2 * q0 + p0 + p1 + p2 + f8_lo = FilterAdd2Sub2(f8_lo, qp0_lo, pq2_lo, qp3_lo, qp1_lo); + *oqp0 = _mm_srli_epi16(f8_lo, 3); +} + +template +void LoopFilterFuncs_SSE4_1::Horizontal8(void* dest, + ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + const __m128i p3 = LoadLo8(dst - 4 * stride); + const __m128i p2 = LoadLo8(dst - 3 * stride); + const __m128i p1 = LoadLo8(dst - 2 * stride); + const __m128i p0 = LoadLo8(dst - 1 * stride); + const __m128i q0 = LoadLo8(dst + 0 * stride); + const __m128i q1 = LoadLo8(dst + 1 * stride); + const __m128i q2 = LoadLo8(dst + 2 * stride); + const __m128i q3 = LoadLo8(dst + 3 * stride); + const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const 
__m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + StoreLo8(dst - 3 * stride, oqp2_f8); + StoreHi8(dst + 2 * stride, oqp2_f8); + } + + StoreLo8(dst - 2 * stride, oqp1); + StoreLo8(dst - 1 * stride, oqp0); + StoreHi8(dst + 0 * stride, oqp0); + StoreHi8(dst + 1 * stride, oqp1); +} + +inline void TransposeLower4x8To8x4(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, + const __m128i& x4, const __m128i& x5, + const __m128i& x6, const __m128i& x7, + __m128i* d0, __m128i* d1, __m128i* d2, + __m128i* d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 + // d1 01 11 21 31 41 51 61 71 + // d2 02 12 22 32 42 52 62 72 + // d3 03 13 23 33 43 53 63 73 + + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpacklo_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpacklo_epi16(x2, x3); + // 40 50 41 51 42 52 43 53 + const __m128i w2 = _mm_unpacklo_epi16(x4, x5); + // 60 70 61 71 62 72 63 73 + const __m128i w3 = _mm_unpacklo_epi16(x6, x7); + + // 00 10 20 30 01 11 21 31 + const __m128i w4 = _mm_unpacklo_epi32(w0, w1); + // 40 50 60 70 41 51 61 71 + const __m128i w5 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i w6 = _mm_unpackhi_epi32(w0, w1); + // 42 52 62 72 43 53 63 73 + const __m128i w7 = _mm_unpackhi_epi32(w2, w3); + + // 00 10 20 30 40 50 60 70 + *d0 = _mm_unpacklo_epi64(w4, w5); + // 01 11 21 31 41 51 61 71 + *d1 = _mm_unpackhi_epi64(w4, w5); + // 02 12 22 32 42 52 62 72 + *d2 = _mm_unpacklo_epi64(w6, w7); + // 03 13 23 33 43 53 63 73 + *d3 = _mm_unpackhi_epi64(w6, w7); +} + +template <int bitdepth, typename Pixel> +void LoopFilterFuncs_SSE4_1<bitdepth, Pixel>::Vertical8(void* dest, ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<Pixel*>(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + __m128i x0 = LoadUnaligned16(dst - 4 + 0 * stride); + __m128i x1 = LoadUnaligned16(dst - 4 + 1 * stride); + __m128i x2 = LoadUnaligned16(dst - 4 + 2 * stride); + __m128i x3 = LoadUnaligned16(dst - 4 + 3 * stride); + + __m128i p3, p2, p1, p0, q0, q1, q2, 
q3; + Transpose8x4To4x8(x0, x1, x2, x3, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + + const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + p2 = oqp2_f8; + q2 = _mm_srli_si128(oqp2_f8, 8); + } + + p1 = oqp1; + p0 = oqp0; + q0 = _mm_srli_si128(oqp0, 8); + q1 = _mm_srli_si128(oqp1, 8); + + TransposeLower4x8To8x4(p3, p2, p1, p0, q0, q1, q2, q3, &x0, &x1, &x2, &x3); + + StoreUnaligned16(dst - 4 + 0 * stride, x0); + StoreUnaligned16(dst - 4 + 1 * stride, x1); + StoreUnaligned16(dst - 4 + 2 * stride, x2); + StoreUnaligned16(dst - 4 + 3 * stride, x3); +} + +//------------------------------------------------------------------------------ +// 13-tap filters + +inline void Filter14(const __m128i& qp6, const __m128i& qp5, const __m128i& qp4, + const __m128i& qp3, const __m128i& qp2, const __m128i& qp1, + const __m128i& qp0, __m128i* oqp5, __m128i* oqp4, + __m128i* oqp3, __m128i* oqp2, __m128i* oqp1, + __m128i* oqp0) { + const __m128i eight = _mm_set1_epi16(8); + const __m128i qp6_lo = qp6; + const __m128i qp5_lo = qp5; + const __m128i qp4_lo = qp4; + const __m128i qp3_lo = qp3; + const __m128i qp2_lo = qp2; + const __m128i qp1_lo = qp1; + const __m128i qp0_lo = qp0; + const __m128i pq5_lo = _mm_shuffle_epi32(qp5_lo, 0x4e); + const __m128i pq4_lo = _mm_shuffle_epi32(qp4_lo, 0x4e); + const __m128i pq3_lo = _mm_shuffle_epi32(qp3_lo, 0x4e); + const __m128i pq2_lo = _mm_shuffle_epi32(qp2_lo, 0x4e); + const __m128i pq1_lo = _mm_shuffle_epi32(qp1_lo, 0x4e); + const __m128i pq0_lo = _mm_shuffle_epi32(qp0_lo, 0x4e); + + __m128i f14_lo = + _mm_add_epi16(eight, _mm_sub_epi16(_mm_slli_epi16(qp6_lo, 3), qp6_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp5_lo), + _mm_add_epi16(qp5_lo, qp4_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp4_lo), + _mm_add_epi16(qp3_lo, qp2_lo)); + + f14_lo = _mm_add_epi16(_mm_add_epi16(f14_lo, qp1_lo), + _mm_add_epi16(qp0_lo, pq0_lo)); + + // p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + // q6 * 7 + q5 * 2 + q4 * 2 + q3 + q2 + q1 + q0 + p0 + *oqp5 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + // q6 * 5 + q5 * 2 + q4 * 2 + q3 * 2 + q2 + q1 + q0 + p0 + p1 + f14_lo = FilterAdd2Sub2(f14_lo, qp3_lo, pq1_lo, qp6_lo, qp6_lo); + *oqp4 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + // q6 * 4 + q5 + q4 * 2 + q3 * 2 + q2 * 2 + q1 + q0 + p0 + p1 + p2 + f14_lo = FilterAdd2Sub2(f14_lo, qp2_lo, pq2_lo, qp6_lo, qp5_lo); + *oqp3 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + 
q0 + q1 + q2 + q3 + // q6 * 3 + q5 + q4 + q3 * 2 + q2 * 2 + q1 * 2 + q0 + p0 + p1 + p2 + p3 + f14_lo = FilterAdd2Sub2(f14_lo, qp1_lo, pq3_lo, qp6_lo, qp4_lo); + *oqp2 = _mm_srli_epi16(f14_lo, 4); + + // p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + // q6 * 2 + q5 + q4 + q3 + q2 * 2 + q1 * 2 + q0 * 2 + p0 + p1 + p2 + p3 + p4 + f14_lo = FilterAdd2Sub2(f14_lo, qp0_lo, pq4_lo, qp6_lo, qp3_lo); + *oqp1 = _mm_srli_epi16(f14_lo, 4); + + // p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + // q6 + q5 + q4 + q3 + q2 + q1 * 2 + q0 * 2 + p0 * 2 + p1 + p2 + p3 + p4 + p5 + f14_lo = FilterAdd2Sub2(f14_lo, pq0_lo, pq5_lo, qp6_lo, qp2_lo); + *oqp0 = _mm_srli_epi16(f14_lo, 4); +} + +template <int bitdepth, typename Pixel> +void LoopFilterFuncs_SSE4_1<bitdepth, Pixel>::Horizontal14(void* dest, + ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<Pixel*>(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + const __m128i p3 = LoadLo8(dst - 4 * stride); + const __m128i p2 = LoadLo8(dst - 3 * stride); + const __m128i p1 = LoadLo8(dst - 2 * stride); + const __m128i p0 = LoadLo8(dst - 1 * stride); + const __m128i q0 = LoadLo8(dst + 0 * stride); + const __m128i q1 = LoadLo8(dst + 1 * stride); + const __m128i q2 = LoadLo8(dst + 2 * stride); + const __m128i q3 = LoadLo8(dst + 3 * stride); + const __m128i qp3 = _mm_unpacklo_epi64(p3, q3); + const __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + const __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + const __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + const __m128i p6 = LoadLo8(dst - 7 * stride); + const __m128i p5 = LoadLo8(dst - 6 * stride); + const __m128i p4 = LoadLo8(dst - 5 * stride); + const __m128i q4 = LoadLo8(dst + 4 * stride); + const __m128i q5 = LoadLo8(dst + 5 * stride); + const __m128i q6 = LoadLo8(dst + 6 * stride); + const __m128i qp6 = _mm_unpacklo_epi64(p6, q6); + const __m128i qp5 = _mm_unpacklo_epi64(p5, q5); + const __m128i qp4 = _mm_unpacklo_epi64(p4, q4); + + const __m128i v_isflatouter4_mask = + IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); + const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); + const __m128i v_flat4_mask = + _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo); + + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + if (_mm_test_all_zeros(v_flat4_mask, + _mm_cmpeq_epi16(v_flat4_mask, 
v_flat4_mask)) == 0) { + __m128i oqp5_f14; + __m128i oqp4_f14; + __m128i oqp3_f14; + __m128i oqp2_f14; + __m128i oqp1_f14; + __m128i oqp0_f14; + + Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, + &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); + + oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); + oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); + oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); + oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); + + StoreLo8(dst - 6 * stride, oqp5_f14); + StoreLo8(dst - 5 * stride, oqp4_f14); + StoreLo8(dst - 4 * stride, oqp3_f14); + + StoreHi8(dst + 3 * stride, oqp3_f14); + StoreHi8(dst + 4 * stride, oqp4_f14); + StoreHi8(dst + 5 * stride, oqp5_f14); + } + + StoreLo8(dst - 3 * stride, oqp2_f8); + StoreHi8(dst + 2 * stride, oqp2_f8); + } + + StoreLo8(dst - 2 * stride, oqp1); + StoreLo8(dst - 1 * stride, oqp0); + StoreHi8(dst + 0 * stride, oqp0); + StoreHi8(dst + 1 * stride, oqp1); +} + +inline void TransposeUpper4x8To8x4(const __m128i& x0, const __m128i& x1, + const __m128i& x2, const __m128i& x3, + const __m128i& x4, const __m128i& x5, + const __m128i& x6, const __m128i& x7, + __m128i* d0, __m128i* d1, __m128i* d2, + __m128i* d3) { + // input + // x0 xx xx xx xx 00 01 02 03 + // x1 xx xx xx xx 10 11 12 13 + // x2 xx xx xx xx 20 21 22 23 + // x3 xx xx xx xx 30 31 32 33 + // x4 xx xx xx xx 40 41 42 43 + // x5 xx xx xx xx 50 51 52 53 + // x6 xx xx xx xx 60 61 62 63 + // x7 xx xx xx xx 70 71 72 73 + // output + // d0 00 10 20 30 40 50 60 70 + // d1 01 11 21 31 41 51 61 71 + // d2 02 12 22 32 42 52 62 72 + // d3 03 13 23 33 43 53 63 73 + + // 00 10 01 11 02 12 03 13 + const __m128i w0 = _mm_unpackhi_epi16(x0, x1); + // 20 30 21 31 22 32 23 33 + const __m128i w1 = _mm_unpackhi_epi16(x2, x3); + // 40 50 41 51 42 52 43 53 + const __m128i w2 = _mm_unpackhi_epi16(x4, x5); + // 60 70 61 71 62 72 63 73 + const __m128i w3 = _mm_unpackhi_epi16(x6, x7); + + // 00 10 20 30 01 11 21 31 + const __m128i w4 = _mm_unpacklo_epi32(w0, w1); + // 40 50 60 70 41 51 61 71 + const __m128i w5 = _mm_unpacklo_epi32(w2, w3); + // 02 12 22 32 03 13 23 33 + const __m128i w6 = _mm_unpackhi_epi32(w0, w1); + // 42 52 62 72 43 53 63 73 + const __m128i w7 = _mm_unpackhi_epi32(w2, w3); + + // 00 10 20 30 40 50 60 70 + *d0 = _mm_unpacklo_epi64(w4, w5); + // 01 11 21 31 41 51 61 71 + *d1 = _mm_unpackhi_epi64(w4, w5); + // 02 12 22 32 42 52 62 72 + *d2 = _mm_unpacklo_epi64(w6, w7); + // 03 13 23 33 43 53 63 73 + *d3 = _mm_unpackhi_epi64(w6, w7); +} + +template <int bitdepth, typename Pixel> +void LoopFilterFuncs_SSE4_1<bitdepth, Pixel>::Vertical14(void* dest, ptrdiff_t stride8, + int outer_thresh, + int inner_thresh, + int hev_thresh) { + auto* const dst = static_cast<Pixel*>(dest); + const ptrdiff_t stride = stride8 / 2; + const __m128i v_flat_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(1 << kThreshShift), 0); + const __m128i v_outer_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(outer_thresh << kThreshShift), 0); + const __m128i v_inner_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(inner_thresh << kThreshShift), 0); + const __m128i v_hev_thresh = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(hev_thresh << kThreshShift), 0); + + // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7 + // + // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f + // 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f + // 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f + // 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 
3e 3f + + __m128i x0 = LoadUnaligned16(dst - 8 + 0 * stride); + __m128i x1 = LoadUnaligned16(dst - 8 + 1 * stride); + __m128i x2 = LoadUnaligned16(dst - 8 + 2 * stride); + __m128i x3 = LoadUnaligned16(dst - 8 + 3 * stride); + + __m128i p7, p6, p5, p4, p3, p2, p1, p0; + __m128i q7, q6, q5, q4, q3, q2, q1, q0; + + Transpose8x4To4x8(x0, x1, x2, x3, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0); + + x0 = LoadUnaligned16(dst - 8 + 8 + 0 * stride); + x1 = LoadUnaligned16(dst - 8 + 8 + 1 * stride); + x2 = LoadUnaligned16(dst - 8 + 8 + 2 * stride); + x3 = LoadUnaligned16(dst - 8 + 8 + 3 * stride); + + Transpose8x4To4x8(x0, x1, x2, x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + + __m128i qp7 = _mm_unpacklo_epi64(p7, q7); + __m128i qp6 = _mm_unpacklo_epi64(p6, q6); + __m128i qp5 = _mm_unpacklo_epi64(p5, q5); + __m128i qp4 = _mm_unpacklo_epi64(p4, q4); + __m128i qp3 = _mm_unpacklo_epi64(p3, q3); + __m128i qp2 = _mm_unpacklo_epi64(p2, q2); + __m128i qp1 = _mm_unpacklo_epi64(p1, q1); + __m128i qp0 = _mm_unpacklo_epi64(p0, q0); + + const __m128i v_hev_mask = Hev(qp1, qp0, v_hev_thresh); + const __m128i v_needs_mask = + NeedsFilter8(qp3, qp2, qp1, qp0, v_outer_thresh, v_inner_thresh); + + __m128i oqp1; + __m128i oqp0; + + Filter4(qp1, qp0, &oqp1, &oqp0, v_needs_mask, v_hev_mask, bitdepth); + + const __m128i v_isflat4_mask = IsFlat4(qp3, qp2, qp1, qp0, v_flat_thresh); + const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask); + const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo); + + if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) { + const __m128i v_isflatouter4_mask = + IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh); + const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask); + const __m128i v_flat4_mask = + _mm_unpacklo_epi64(v_flat4_mask_lo, v_flat4_mask_lo); + + __m128i oqp2_f8; + __m128i oqp1_f8; + __m128i oqp0_f8; + + Filter8(qp3, qp2, qp1, qp0, &oqp2_f8, &oqp1_f8, &oqp0_f8); + + oqp2_f8 = _mm_blendv_epi8(qp2, oqp2_f8, v_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask); + + if (_mm_test_all_zeros(v_flat4_mask, + _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) { + __m128i oqp5_f14; + __m128i oqp4_f14; + __m128i oqp3_f14; + __m128i oqp2_f14; + __m128i oqp1_f14; + __m128i oqp0_f14; + + Filter14(qp6, qp5, qp4, qp3, qp2, qp1, qp0, &oqp5_f14, &oqp4_f14, + &oqp3_f14, &oqp2_f14, &oqp1_f14, &oqp0_f14); + + oqp5_f14 = _mm_blendv_epi8(qp5, oqp5_f14, v_flat4_mask); + oqp4_f14 = _mm_blendv_epi8(qp4, oqp4_f14, v_flat4_mask); + oqp3_f14 = _mm_blendv_epi8(qp3, oqp3_f14, v_flat4_mask); + oqp2_f8 = _mm_blendv_epi8(oqp2_f8, oqp2_f14, v_flat4_mask); + oqp1 = _mm_blendv_epi8(oqp1, oqp1_f14, v_flat4_mask); + oqp0 = _mm_blendv_epi8(oqp0, oqp0_f14, v_flat4_mask); + qp3 = oqp3_f14; + qp4 = oqp4_f14; + qp5 = oqp5_f14; + } + qp2 = oqp2_f8; + } + + TransposeLower4x8To8x4(qp7, qp6, qp5, qp4, qp3, qp2, oqp1, oqp0, &x0, &x1, + &x2, &x3); + + StoreUnaligned16(dst - 8 + 0 * stride, x0); + StoreUnaligned16(dst - 8 + 1 * stride, x1); + StoreUnaligned16(dst - 8 + 2 * stride, x2); + StoreUnaligned16(dst - 8 + 3 * stride, x3); + + TransposeUpper4x8To8x4(oqp0, oqp1, qp2, qp3, qp4, qp5, qp6, qp7, &x0, &x1, + &x2, &x3); + + StoreUnaligned16(dst - 8 + 8 + 0 * stride, x0); + StoreUnaligned16(dst - 8 + 8 + 1 * stride, x1); + StoreUnaligned16(dst - 8 + 8 + 2 * stride, x2); + StoreUnaligned16(dst - 8 + 8 + 3 * stride, x3); +} + +using Defs10bpp = LoopFilterFuncs_SSE4_1<kBitdepth10, uint16_t>; + +void Init10bpp() { + Dsp* const dsp = 
dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal4; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal6; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal8; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeHorizontal) + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] = + Defs10bpp::Horizontal14; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize4_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = + Defs10bpp::Vertical4; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize6_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = + Defs10bpp::Vertical6; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize8_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = + Defs10bpp::Vertical8; +#endif +#if DSP_ENABLED_10BPP_SSE4_1(LoopFilterSize14_LoopFilterTypeVertical) + dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] = + Defs10bpp::Vertical14; +#endif +} +#endif +} // namespace +} // namespace high_bitdepth + +void LoopFilterInit_SSE4_1() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void LoopFilterInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/loop_filter_sse4.h b/src/dsp/x86/loop_filter_sse4.h new file mode 100644 index 0000000..4795d8b --- /dev/null +++ b/src/dsp/x86/loop_filter_sse4.h @@ -0,0 +1,119 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_filters, see the defines below for specifics. This +// function is not thread-safe. +void LoopFilterInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
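(Editorial aside: the `#ifndef` cascade that follows is easier to see with a toy model. The sketch below is illustrative only, with invented DEMO_* names: a header for a higher optimization level, included first, claims a function's baseline macro, and the guards below only assign the SSE4 flag when nothing better has already done so.)

    #include <cstdio>

    #define DEMO_CPU_SSE4_1 (1 << 4)
    #define DEMO_CPU_AVX2 (1 << 5)

    // An AVX2 header included before this one would have claimed the symbol:
    // #define DEMO_LoopFilterSize4 DEMO_CPU_AVX2
    #ifndef DEMO_LoopFilterSize4
    #define DEMO_LoopFilterSize4 DEMO_CPU_SSE4_1  // SSE4 becomes the baseline.
    #endif

    int main() {
      // Init code registers an implementation only when the baseline macro
      // still names it, mirroring the DSP_ENABLED_10BPP_SSE4_1() checks above.
      if (DEMO_LoopFilterSize4 == DEMO_CPU_SSE4_1) {
        std::printf("SSE4_1 loop filter selected\n");
      }
      return 0;
    }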
+#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize6_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize8_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical +#define LIBGAV1_Dsp8bpp_LoopFilterSize14_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal +#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeHorizontal \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize4_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize6_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize8_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical +#define LIBGAV1_Dsp10bpp_LoopFilterSize14_LoopFilterTypeVertical \ + LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_ diff --git a/src/dsp/x86/loop_restoration_10bit_avx2.cc b/src/dsp/x86/loop_restoration_10bit_avx2.cc new file mode 100644 index 0000000..702bdea --- /dev/null +++ b/src/dsp/x86/loop_restoration_10bit_avx2.cc @@ -0,0 +1,592 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_restoration.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10 +#include <immintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline void WienerHorizontalClip(const __m256i s[2], + int16_t* const wiener_buffer) { + constexpr int offset = + 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); + constexpr int limit = (offset << 2) - 1; + const __m256i offsets = _mm256_set1_epi16(-offset); + const __m256i limits = _mm256_set1_epi16(limit - offset); + const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1)); + const __m256i sum0 = _mm256_add_epi32(s[0], round); + const __m256i sum1 = _mm256_add_epi32(s[1], round); + const __m256i rounded_sum0 = + _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal); + const __m256i rounded_sum1 = + _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal); + const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1); + const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets); + const __m256i d1 = _mm256_min_epi16(d0, limits); + StoreAligned32(wiener_buffer, d1); +} + +inline void WienerHorizontalTap7Kernel(const __m256i s[7], + const __m256i filter[2], + int16_t* const wiener_buffer) { + const __m256i s06 = _mm256_add_epi16(s[0], s[6]); + const __m256i s15 = _mm256_add_epi16(s[1], s[5]); + const __m256i s24 = _mm256_add_epi16(s[2], s[4]); + const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15); + const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15); + const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]); + const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]); + __m256i madds[4]; + madds[0] = _mm256_madd_epi16(ss0, filter[0]); + madds[1] = _mm256_madd_epi16(ss1, filter[0]); + madds[2] = _mm256_madd_epi16(ss2, filter[1]); + madds[3] = _mm256_madd_epi16(ss3, filter[1]); + madds[0] = _mm256_add_epi32(madds[0], madds[2]); + madds[1] = _mm256_add_epi32(madds[1], madds[3]); + WienerHorizontalClip(madds, wiener_buffer); +} + +inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter, + int16_t* const wiener_buffer) { + const __m256i s04 = _mm256_add_epi16(s[0], s[4]); + const __m256i s13 = _mm256_add_epi16(s[1], s[3]); + const __m256i s2d = _mm256_add_epi16(s[2], s[2]); + const __m256i s0m = _mm256_sub_epi16(s04, s2d); + const __m256i s1m = _mm256_sub_epi16(s13, s2d); + const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m); + const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m); + __m256i madds[2]; + madds[0] = _mm256_madd_epi16(ss0, filter); + madds[1] = _mm256_madd_epi16(ss1, filter); + const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256()); + const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256()); + const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7); + const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7); + madds[0] = _mm256_add_epi32(madds[0], s2x128_lo); + madds[1] = _mm256_add_epi32(madds[1], s2x128_hi); + WienerHorizontalClip(madds, wiener_buffer); +} + +inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter, + int16_t* const wiener_buffer) { + const __m256i s02 = _mm256_add_epi16(s[0], 
s[2]); + const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]); + const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]); + __m256i madds[2]; + madds[0] = _mm256_madd_epi16(ss0, filter); + madds[1] = _mm256_madd_epi16(ss1, filter); + WienerHorizontalClip(madds, wiener_buffer); +} + +inline void WienerHorizontalTap7(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i* const coefficients, + int16_t** const wiener_buffer) { + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0); + filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m256i s[7]; + s[0] = LoadUnaligned32(src + x + 0); + s[1] = LoadUnaligned32(src + x + 1); + s[2] = LoadUnaligned32(src + x + 2); + s[3] = LoadUnaligned32(src + x + 3); + s[4] = LoadUnaligned32(src + x + 4); + s[5] = LoadUnaligned32(src + x + 5); + s[6] = LoadUnaligned32(src + x + 6); + WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap5(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i* const coefficients, + int16_t** const wiener_buffer) { + const __m256i filter = + _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302)); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m256i s[5]; + s[0] = LoadUnaligned32(src + x + 0); + s[1] = LoadUnaligned32(src + x + 1); + s[2] = LoadUnaligned32(src + x + 2); + s[3] = LoadUnaligned32(src + x + 3); + s[4] = LoadUnaligned32(src + x + 4); + WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i* const coefficients, + int16_t** const wiener_buffer) { + const auto filter = _mm256_shuffle_epi32(*coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m256i s[3]; + s[0] = LoadUnaligned32(src + x + 0); + s[1] = LoadUnaligned32(src + x + 1); + s[2] = LoadUnaligned32(src + x + 2); + WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m256i s0 = LoadUnaligned32(src + x); + const __m256i d0 = _mm256_slli_epi16(s0, 4); + StoreAligned32(*wiener_buffer + x, d0); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) { + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]); + const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]); + const __m256i madd01 = _mm256_add_epi32(madd0, madd1); + const __m256i madd23 = _mm256_add_epi32(madd2, madd3); + const __m256i sum = _mm256_add_epi32(madd01, madd23); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) { + const __m256i 
madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]); + const __m256i madd01 = _mm256_add_epi32(madd0, madd1); + const __m256i sum = _mm256_add_epi32(madd01, madd2); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) { + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i sum = _mm256_add_epi32(madd0, madd1); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVerticalClip(const __m256i s[2]) { + const __m256i d = _mm256_packus_epi32(s[0], s[1]); + return _mm256_min_epu16(d, _mm256_set1_epi16(1023)); +} + +inline __m256i WienerVerticalFilter7(const __m256i a[7], + const __m256i filter[2]) { + const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m256i b[4], c[2]; + b[0] = _mm256_unpacklo_epi16(a[0], a[1]); + b[1] = _mm256_unpacklo_epi16(a[2], a[3]); + b[2] = _mm256_unpacklo_epi16(a[4], a[5]); + b[3] = _mm256_unpacklo_epi16(a[6], round); + c[0] = WienerVertical7(b, filter); + b[0] = _mm256_unpackhi_epi16(a[0], a[1]); + b[1] = _mm256_unpackhi_epi16(a[2], a[3]); + b[2] = _mm256_unpackhi_epi16(a[4], a[5]); + b[3] = _mm256_unpackhi_epi16(a[6], round); + c[1] = WienerVertical7(b, filter); + return WienerVerticalClip(c); +} + +inline __m256i WienerVerticalFilter5(const __m256i a[5], + const __m256i filter[3]) { + const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m256i b[3], c[2]; + b[0] = _mm256_unpacklo_epi16(a[0], a[1]); + b[1] = _mm256_unpacklo_epi16(a[2], a[3]); + b[2] = _mm256_unpacklo_epi16(a[4], round); + c[0] = WienerVertical5(b, filter); + b[0] = _mm256_unpackhi_epi16(a[0], a[1]); + b[1] = _mm256_unpackhi_epi16(a[2], a[3]); + b[2] = _mm256_unpackhi_epi16(a[4], round); + c[1] = WienerVertical5(b, filter); + return WienerVerticalClip(c); +} + +inline __m256i WienerVerticalFilter3(const __m256i a[3], + const __m256i filter[2]) { + const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m256i b[2], c[2]; + b[0] = _mm256_unpacklo_epi16(a[0], a[1]); + b[1] = _mm256_unpacklo_epi16(a[2], round); + c[0] = WienerVertical3(b, filter); + b[0] = _mm256_unpackhi_epi16(a[0], a[1]); + b[1] = _mm256_unpackhi_epi16(a[2], round); + c[1] = WienerVertical3(b, filter); + return WienerVerticalClip(c); +} + +inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i a[7]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride); + a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride); + a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride); + return WienerVerticalFilter7(a, filter); +} + +inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[3], __m256i a[5]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned32(wiener_buffer + 4 * 
wiener_stride); + return WienerVerticalFilter5(a, filter); +} + +inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i a[3]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + return WienerVerticalFilter3(a, filter); +} + +inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i d[2]) { + __m256i a[8]; + d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride); + d[1] = WienerVerticalFilter7(a + 1, filter); +} + +inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[3], __m256i d[2]) { + __m256i a[6]; + d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride); + d[1] = WienerVerticalFilter5(a + 1, filter); +} + +inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i d[2]) { + __m256i a[4]; + d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + d[1] = WienerVerticalFilter3(a + 1, filter); +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[4], uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients)); + __m256i filter[4]; + filter[0] = _mm256_shuffle_epi32(c, 0x0); + filter[1] = _mm256_shuffle_epi32(c, 0x55); + filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504)); + filter[3] = + _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2]; + WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d); + StoreUnaligned32(dst + x, d[0]); + StoreUnaligned32(dst + dst_stride + x, d[1]); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[7]; + const __m256i d = + WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a); + StoreUnaligned32(dst + x, d); + x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[3], uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients)); + __m256i filter[3]; + filter[0] = _mm256_shuffle_epi32(c, 0x0); + filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504)); + filter[2] = + _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2]; + WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d); + StoreUnaligned32(dst + x, d[0]); + StoreUnaligned32(dst + dst_stride + x, d[1]); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[5]; + const __m256i d = + WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a); + StoreUnaligned32(dst + x, d); + x += 16; + } while (x < 
width); + } +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[2], uint16_t* dst, + const ptrdiff_t dst_stride) { + __m256i filter[2]; + filter[0] = + _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients)); + filter[1] = + _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2]; + WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]); + StoreUnaligned32(dst + x, d[0][0]); + StoreUnaligned32(dst + dst_stride + x, d[0][1]); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[3]; + const __m256i d = + WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a); + StoreUnaligned32(dst + x, d); + x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint16_t* const dst) { + const __m256i a = LoadAligned32(wiener_buffer); + const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8)); + const __m256i c = _mm256_srai_epi16(b, 4); + const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256()); + const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023)); + StoreUnaligned32(dst, e); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint16_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + x += 16; + } while (x < width); + } +} + +void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, + const void* const source, const void* const top_border, + const void* const bottom_border, const ptrdiff_t stride, + const int width, const int height, + RestorationBuffer* const restoration_buffer, + void* const dest) { + const int16_t* const number_leading_zero_coefficients = + restoration_info.wiener_info.number_leading_zero_coefficients; + const int number_rows_to_skip = std::max( + static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]), + 1); + const ptrdiff_t wiener_stride = Align(width, 16); + int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer; + // The values are saturated to 13 bits before storing. + int16_t* wiener_buffer_horizontal = + wiener_buffer_vertical + number_rows_to_skip * wiener_stride; + + // horizontal filtering. + // Over-reads up to 15 - |kRestorationHorizontalBorder| values. 
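// Editorial note (not part of the patch): a worked example of the row
// bookkeeping below, assuming kWienerFilterTaps == 7. With no leading zero
// vertical coefficients, number_rows_to_skip == 1, so height_horizontal =
// height + 7 - 1 - 2 * 1 = height + 4 and height_extra == 2: two context rows
// are filtered above and below the unit for the 7-tap vertical pass. With
// three leading zeros (an effective 1-tap vertical filter),
// number_rows_to_skip == 3, giving height_horizontal == height and
// height_extra == 0, so no border rows are produced at all.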
+ const int height_horizontal = + height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; + const int height_extra = (height_horizontal - height) >> 1; + assert(height_extra <= 2); + const auto* const src = static_cast<const uint16_t*>(source); + const auto* const top = static_cast<const uint16_t*>(top_border); + const auto* const bottom = static_cast<const uint16_t*>(bottom_border); + const __m128i c = + LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); + const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c); + if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { + WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, + wiener_stride, height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + &coefficients_horizontal, &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, + wiener_stride, height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + &coefficients_horizontal, &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + // The maximum over-reads happen here. + WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, + wiener_stride, height_extra, &coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + &coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + &coefficients_horizontal, &wiener_buffer_horizontal); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); + WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, + wiener_stride, height_extra, + &wiener_buffer_horizontal); + WienerHorizontalTap1(src, stride, wiener_stride, height, + &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, + &wiener_buffer_horizontal); + } + + // vertical filtering. + // Over-writes up to 15 values. + const int16_t* const filter_vertical = + restoration_info.wiener_info.filter[WienerInfo::kVertical]; + auto* dst = static_cast<uint16_t*>(dest); + if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. 
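// Editorial note (not part of the patch): in effect, the two memcpy calls
// below replicate whole wiener_stride-wide rows: the first copies the last
// horizontally filtered row into the row below it, and the second copies the
// first filtered row into the row above it, giving the 7-tap vertical filter
// one row of context past each edge without a separate border pass.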
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical + 1, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical + 2, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); +#if DSP_ENABLED_10BPP_AVX2(WienerFilter) + dsp->loop_restorations[0] = WienerFilter_AVX2; +#endif +} + +} // namespace + +void LoopRestorationInit10bpp_AVX2() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10) +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit10bpp_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/x86/loop_restoration_10bit_sse4.cc b/src/dsp/x86/loop_restoration_10bit_sse4.cc new file mode 100644 index 0000000..0598435 --- /dev/null +++ b/src/dsp/x86/loop_restoration_10bit_sse4.cc @@ -0,0 +1,551 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
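(Editorial aside: both WienerFilter implementations dispatch the vertical pass on the number of leading zero coefficients; the symmetric 7-tap kernel with k leading zeros degenerates to a (7 - 2k)-tap kernel whose support starts k rows further into |wiener_buffer|, which is why the calls above pass filter_vertical + k and an offset buffer pointer. A rough scalar model of one output sample follows; the names are invented, and fixed-point rounding and clipping are omitted for brevity.)

    #include <cstddef>
    #include <cstdint>

    // Sketch only; the real kernels add a rounding constant, shift, and clip.
    int32_t WienerVerticalSample(const int16_t* column, ptrdiff_t stride,
                                 const int16_t filter[7], int leading_zeros) {
      int32_t sum = 0;
      // Only taps [leading_zeros, 6 - leading_zeros] are nonzero; |column|
      // already points leading_zeros rows into the buffer.
      for (int t = leading_zeros; t <= 6 - leading_zeros; ++t) {
        sum += filter[t] * column[(t - leading_zeros) * stride];
      }
      return sum;
    }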
+ +#include "src/dsp/loop_restoration.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10 +#include + +#include +#include +#include +#include +#include + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace { + +inline void WienerHorizontalClip(const __m128i s[2], + int16_t* const wiener_buffer) { + constexpr int offset = + 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); + constexpr int limit = (offset << 2) - 1; + const __m128i offsets = _mm_set1_epi16(-offset); + const __m128i limits = _mm_set1_epi16(limit - offset); + const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1)); + const __m128i sum0 = _mm_add_epi32(s[0], round); + const __m128i sum1 = _mm_add_epi32(s[1], round); + const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal); + const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal); + const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1); + const __m128i d0 = _mm_max_epi16(rounded_sum, offsets); + const __m128i d1 = _mm_min_epi16(d0, limits); + StoreAligned16(wiener_buffer, d1); +} + +inline void WienerHorizontalTap7(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m128i coefficients, + int16_t** const wiener_buffer) { + __m128i filter[2]; + filter[0] = _mm_shuffle_epi32(coefficients, 0x0); + filter[1] = _mm_shuffle_epi32(coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[7], madds[4]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + s[3] = LoadUnaligned16(src + x + 3); + s[4] = LoadUnaligned16(src + x + 4); + s[5] = LoadUnaligned16(src + x + 5); + s[6] = LoadUnaligned16(src + x + 6); + const __m128i s06 = _mm_add_epi16(s[0], s[6]); + const __m128i s15 = _mm_add_epi16(s[1], s[5]); + const __m128i s24 = _mm_add_epi16(s[2], s[4]); + const __m128i ss0 = _mm_unpacklo_epi16(s06, s15); + const __m128i ss1 = _mm_unpackhi_epi16(s06, s15); + const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]); + const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]); + madds[0] = _mm_madd_epi16(ss0, filter[0]); + madds[1] = _mm_madd_epi16(ss1, filter[0]); + madds[2] = _mm_madd_epi16(ss2, filter[1]); + madds[3] = _mm_madd_epi16(ss3, filter[1]); + madds[0] = _mm_add_epi32(madds[0], madds[2]); + madds[1] = _mm_add_epi32(madds[1], madds[3]); + WienerHorizontalClip(madds, *wiener_buffer + x); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap5(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m128i coefficients, + int16_t** const wiener_buffer) { + const __m128i filter = + _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302)); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[5], madds[2]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + s[3] = LoadUnaligned16(src + x + 3); + s[4] = LoadUnaligned16(src + x + 4); + const __m128i s04 = _mm_add_epi16(s[0], s[4]); + const __m128i s13 = _mm_add_epi16(s[1], s[3]); + const __m128i s2d = _mm_add_epi16(s[2], s[2]); + const __m128i s0m = _mm_sub_epi16(s04, s2d); + const 
__m128i s1m = _mm_sub_epi16(s13, s2d); + const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m); + const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m); + madds[0] = _mm_madd_epi16(ss0, filter); + madds[1] = _mm_madd_epi16(ss1, filter); + const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128()); + const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128()); + const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7); + const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7); + madds[0] = _mm_add_epi32(madds[0], s2x128_lo); + madds[1] = _mm_add_epi32(madds[1], s2x128_hi); + WienerHorizontalClip(madds, *wiener_buffer + x); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m128i coefficients, + int16_t** const wiener_buffer) { + const auto filter = _mm_shuffle_epi32(coefficients, 0x55); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[3], madds[2]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + const __m128i s02 = _mm_add_epi16(s[0], s[2]); + const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]); + const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]); + madds[0] = _mm_madd_epi16(ss0, filter); + madds[1] = _mm_madd_epi16(ss1, filter); + WienerHorizontalClip(madds, *wiener_buffer + x); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint16_t* src, + const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m128i s = LoadUnaligned16(src + x); + const __m128i d = _mm_slli_epi16(s, 4); + StoreAligned16(*wiener_buffer + x, d); + x += 8; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]); + const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]); + const __m128i madd01 = _mm_add_epi32(madd0, madd1); + const __m128i madd23 = _mm_add_epi32(madd2, madd3); + const __m128i sum = _mm_add_epi32(madd01, madd23); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]); + const __m128i madd01 = _mm_add_epi32(madd0, madd1); + const __m128i sum = _mm_add_epi32(madd01, madd2); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i sum = _mm_add_epi32(madd0, madd1); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVerticalClip(const __m128i s[2]) { + const __m128i d = _mm_packus_epi32(s[0], s[1]); + return _mm_min_epu16(d, _mm_set1_epi16(1023)); +} + +inline __m128i WienerVerticalFilter7(const __m128i a[7], + const __m128i filter[2]) { + const __m128i round = _mm_set1_epi16(1 << 
(kInterRoundBitsVertical - 1)); + __m128i b[4], c[2]; + b[0] = _mm_unpacklo_epi16(a[0], a[1]); + b[1] = _mm_unpacklo_epi16(a[2], a[3]); + b[2] = _mm_unpacklo_epi16(a[4], a[5]); + b[3] = _mm_unpacklo_epi16(a[6], round); + c[0] = WienerVertical7(b, filter); + b[0] = _mm_unpackhi_epi16(a[0], a[1]); + b[1] = _mm_unpackhi_epi16(a[2], a[3]); + b[2] = _mm_unpackhi_epi16(a[4], a[5]); + b[3] = _mm_unpackhi_epi16(a[6], round); + c[1] = WienerVertical7(b, filter); + return WienerVerticalClip(c); +} + +inline __m128i WienerVerticalFilter5(const __m128i a[5], + const __m128i filter[3]) { + const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m128i b[3], c[2]; + b[0] = _mm_unpacklo_epi16(a[0], a[1]); + b[1] = _mm_unpacklo_epi16(a[2], a[3]); + b[2] = _mm_unpacklo_epi16(a[4], round); + c[0] = WienerVertical5(b, filter); + b[0] = _mm_unpackhi_epi16(a[0], a[1]); + b[1] = _mm_unpackhi_epi16(a[2], a[3]); + b[2] = _mm_unpackhi_epi16(a[4], round); + c[1] = WienerVertical5(b, filter); + return WienerVerticalClip(c); +} + +inline __m128i WienerVerticalFilter3(const __m128i a[3], + const __m128i filter[2]) { + const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m128i b[2], c[2]; + b[0] = _mm_unpacklo_epi16(a[0], a[1]); + b[1] = _mm_unpacklo_epi16(a[2], round); + c[0] = WienerVertical3(b, filter); + b[0] = _mm_unpackhi_epi16(a[0], a[1]); + b[1] = _mm_unpackhi_epi16(a[2], round); + c[1] = WienerVertical3(b, filter); + return WienerVerticalClip(c); +} + +inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i a[7]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride); + a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride); + a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride); + return WienerVerticalFilter7(a, filter); +} + +inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[3], __m128i a[5]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride); + return WienerVerticalFilter5(a, filter); +} + +inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i a[3]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + return WienerVerticalFilter3(a, filter); +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[4], uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m128i c = LoadLo8(coefficients); + __m128i filter[4]; + filter[0] = _mm_shuffle_epi32(c, 0x0); + filter[1] = _mm_shuffle_epi32(c, 0x55); + filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504)); + filter[3] = + _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i a[8], d[2]; + d[0] = 
WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a); + a[7] = LoadAligned16(wiener_buffer + x + 7 * width); + d[1] = WienerVerticalFilter7(a + 1, filter); + StoreAligned16(dst + x, d[0]); + StoreAligned16(dst + dst_stride + x, d[1]); + x += 8; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[7]; + const __m128i d = + WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a); + StoreAligned16(dst + x, d); + x += 8; + } while (x < width); + } +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[3], uint16_t* dst, + const ptrdiff_t dst_stride) { + const __m128i c = LoadLo8(coefficients); + __m128i filter[3]; + filter[0] = _mm_shuffle_epi32(c, 0x0); + filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504)); + filter[2] = + _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i a[6], d[2]; + d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a); + a[5] = LoadAligned16(wiener_buffer + x + 5 * width); + d[1] = WienerVerticalFilter5(a + 1, filter); + StoreAligned16(dst + x, d[0]); + StoreAligned16(dst + dst_stride + x, d[1]); + x += 8; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[5]; + const __m128i d = + WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a); + StoreAligned16(dst + x, d); + x += 8; + } while (x < width); + } +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[2], uint16_t* dst, + const ptrdiff_t dst_stride) { + __m128i filter[2]; + filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients)); + filter[1] = + _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i a[4], d[2]; + d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a); + a[3] = LoadAligned16(wiener_buffer + x + 3 * width); + d[1] = WienerVerticalFilter3(a + 1, filter); + StoreAligned16(dst + x, d[0]); + StoreAligned16(dst + dst_stride + x, d[1]); + x += 8; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[3]; + const __m128i d = + WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a); + StoreAligned16(dst + x, d); + x += 8; + } while (x < width); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint16_t* const dst) { + const __m128i a = LoadAligned16(wiener_buffer); + const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8)); + const __m128i c = _mm_srai_epi16(b, 4); + const __m128i d = _mm_max_epi16(c, _mm_setzero_si128()); + const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023)); + StoreAligned16(dst, e); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint16_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x); + x += 8; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; +
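+ // The 1-tap vertical pass handles the case where only the center
+ // coefficient is nonzero: the horizontal pass left |wiener_buffer| scaled
+ // by 16, so (x + 8) >> 4 in WienerVerticalTap1Kernel() above is a rounded
+ // division by 16, clamped to the 10-bit pixel range [0, 1023].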
do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + x += 8; + } while (x < width); + } +} + +void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, + const void* const source, const void* const top_border, + const void* const bottom_border, + const ptrdiff_t stride, const int width, + const int height, + RestorationBuffer* const restoration_buffer, + void* const dest) { + const int16_t* const number_leading_zero_coefficients = + restoration_info.wiener_info.number_leading_zero_coefficients; + const int number_rows_to_skip = std::max( + static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]), + 1); + const ptrdiff_t wiener_stride = Align(width, 16); + int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer; + // The values are saturated to 13 bits before storing. + int16_t* wiener_buffer_horizontal = + wiener_buffer_vertical + number_rows_to_skip * wiener_stride; + + // horizontal filtering. + // Over-reads up to 15 - |kRestorationHorizontalBorder| values. + const int height_horizontal = + height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; + const int height_extra = (height_horizontal - height) >> 1; + assert(height_extra <= 2); + const auto* const src = static_cast<const uint16_t*>(source); + const auto* const top = static_cast<const uint16_t*>(top_border); + const auto* const bottom = static_cast<const uint16_t*>(bottom_border); + const __m128i coefficients_horizontal = + LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); + if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { + WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, + wiener_stride, height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, + wiener_stride, height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + // The maximum over-reads happen here. + WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, + wiener_stride, height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); + WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, + wiener_stride, height_extra, + &wiener_buffer_horizontal); + WienerHorizontalTap1(src, stride, wiener_stride, height, + &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, + &wiener_buffer_horizontal); + } + + // vertical filtering. + // Over-writes up to 15 values.
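+ // The vertical tap count is chosen from the leading zero count below:
+ // 0 zeros -> 7 taps, 1 -> 5 taps, 2 -> 3 taps, 3 -> 1 tap. Each dropped
+ // tap pair starts the filter one row deeper into |wiener_buffer| and
+ // trims one coefficient from each end of |filter_vertical|.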
+ const int16_t* const filter_vertical = + restoration_info.wiener_info.filter[WienerInfo::kVertical]; + auto* dst = static_cast<uint16_t*>(dest); + if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly. + memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical + 1, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical + 2, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + static_cast<void>(dsp); +#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter) + dsp->loop_restorations[0] = WienerFilter_SSE4_1; +#else + static_cast<void>(WienerFilter_SSE4_1); +#endif +} + +} // namespace + +void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10) +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit10bpp_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10 diff --git a/src/dsp/x86/loop_restoration_avx2.cc b/src/dsp/x86/loop_restoration_avx2.cc new file mode 100644 index 0000000..7ae7c90 --- /dev/null +++ b/src/dsp/x86/loop_restoration_avx2.cc @@ -0,0 +1,2902 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
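For orientation before the AVX2 implementation: both this file and the SSE4.1 file above vectorize the same scalar two-pass Wiener arithmetic. The sketch below is illustrative only; the function names are ours, it omits the 13-bit intermediate clip and the 128-offset trick used later in this file, and it assumes the 8 bpp rounding constants kInterRoundBitsHorizontal == 3 and kInterRoundBitsVertical == 11.

// One horizontal output: 7 neighboring pixels in a row, filter taps summing
// to 128. The result is scaled by 128 >> 3 = 16 relative to the input pixel.
inline int WienerHorizontalScalar(const uint8_t s[7], const int16_t f[7]) {
  int sum = 0;
  for (int k = 0; k < 7; ++k) sum += f[k] * s[k];
  return (sum + (1 << 2)) >> 3;  // kInterRoundBitsHorizontal
}

// One vertical output: 7 vertically adjacent intermediates from
// |wiener_buffer|; the >> 11 removes the remaining 16 * 128 scaling.
inline uint8_t WienerVerticalScalar(const int16_t b[7], const int16_t f[7]) {
  int sum = 0;
  for (int k = 0; k < 7; ++k) sum += f[k] * b[k];
  const int d = (sum + (1 << 10)) >> 11;  // kInterRoundBitsVertical
  return static_cast<uint8_t>(d < 0 ? 0 : (d > 255 ? 255 : d));
}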
+ +#include "src/dsp/loop_restoration.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 +#include <immintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128, + int16_t* const wiener_buffer) { + constexpr int offset = + 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); + constexpr int limit = + (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1; + const __m256i offsets = _mm256_set1_epi16(-offset); + const __m256i limits = _mm256_set1_epi16(limit - offset); + const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1)); + // The sum range here is [-128 * 255, 90 * 255]. + const __m256i madd = _mm256_add_epi16(s[0], s[1]); + const __m256i sum = _mm256_add_epi16(madd, round); + const __m256i rounded_sum0 = + _mm256_srai_epi16(sum, kInterRoundBitsHorizontal); + // Add back scaled down offset correction. + const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128); + const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets); + const __m256i d1 = _mm256_min_epi16(d0, limits); + StoreAligned32(wiener_buffer, d1); +} + +// Using _mm256_alignr_epi8() is about 8% faster than loading all and unpacking, +// because the compiler generates redundant code when loading all and unpacking. +inline void WienerHorizontalTap7Kernel(const __m256i s[2], + const __m256i filter[4], + int16_t* const wiener_buffer) { + const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1); + const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5); + const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9); + const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13); + __m256i madds[4]; + madds[0] = _mm256_maddubs_epi16(s01, filter[0]); + madds[1] = _mm256_maddubs_epi16(s23, filter[1]); + madds[2] = _mm256_maddubs_epi16(s45, filter[2]); + madds[3] = _mm256_maddubs_epi16(s67, filter[3]); + madds[0] = _mm256_add_epi16(madds[0], madds[2]); + madds[1] = _mm256_add_epi16(madds[1], madds[3]); + const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8), + 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap5Kernel(const __m256i s[2], + const __m256i filter[3], + int16_t* const wiener_buffer) { + const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1); + const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5); + const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9); + __m256i madds[3]; + madds[0] = _mm256_maddubs_epi16(s01, filter[0]); + madds[1] = _mm256_maddubs_epi16(s23, filter[1]); + madds[2] = _mm256_maddubs_epi16(s45, filter[2]); + madds[0] = _mm256_add_epi16(madds[0], madds[2]); + const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8), + kInterRoundBitsHorizontal + 1); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap3Kernel(const __m256i s[2], + const __m256i filter[2], + int16_t* const wiener_buffer) { + const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1); + const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5); + __m256i madds[2]; + madds[0] = _mm256_maddubs_epi16(s01, filter[0]); + madds[1] = _mm256_maddubs_epi16(s23, filter[1]); + const __m256i s_3x128 =
_mm256_slli_epi16(_mm256_srli_epi16(s01, 8), + 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i coefficients, + int16_t** const wiener_buffer) { + __m256i filter[4]; + filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100)); + filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); + filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102)); + filter[3] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8000)); + for (int y = height; y != 0; --y) { + __m256i s = LoadUnaligned32(src); + __m256i ss[4]; + ss[0] = _mm256_unpacklo_epi8(s, s); + ptrdiff_t x = 0; + do { + ss[1] = _mm256_unpackhi_epi8(s, s); + s = LoadUnaligned32(src + x + 32); + ss[3] = _mm256_unpacklo_epi8(s, s); + ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21); + WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0); + WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16); + ss[0] = ss[3]; + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i coefficients, + int16_t** const wiener_buffer) { + __m256i filter[3]; + filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201)); + filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203)); + filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8001)); + for (int y = height; y != 0; --y) { + __m256i s = LoadUnaligned32(src); + __m256i ss[4]; + ss[0] = _mm256_unpacklo_epi8(s, s); + ptrdiff_t x = 0; + do { + ss[1] = _mm256_unpackhi_epi8(s, s); + s = LoadUnaligned32(src + x + 32); + ss[3] = _mm256_unpacklo_epi8(s, s); + ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21); + WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0); + WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16); + ss[0] = ss[3]; + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const __m256i coefficients, + int16_t** const wiener_buffer) { + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302)); + filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x8002)); + for (int y = height; y != 0; --y) { + __m256i s = LoadUnaligned32(src); + __m256i ss[4]; + ss[0] = _mm256_unpacklo_epi8(s, s); + ptrdiff_t x = 0; + do { + ss[1] = _mm256_unpackhi_epi8(s, s); + s = LoadUnaligned32(src + x + 32); + ss[3] = _mm256_unpacklo_epi8(s, s); + ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21); + WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0); + WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16); + ss[0] = ss[3]; + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m256i s = LoadUnaligned32(src + x); + const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256()); + const __m256i s1 = 
_mm256_unpackhi_epi8(s, _mm256_setzero_si256()); + __m256i d[2]; + d[0] = _mm256_slli_epi16(s0, 4); + d[1] = _mm256_slli_epi16(s1, 4); + StoreAligned64(*wiener_buffer + x, d); + x += 32; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) { + const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i sum0 = _mm256_add_epi32(round, madd0); + const __m256i sum1 = _mm256_add_epi32(sum0, madd1); + return _mm256_srai_epi32(sum1, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) { + const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]); + const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]); + const __m256i sum = _mm256_add_epi32(madd0, madd1); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVertical3(const __m256i a, const __m256i filter) { + const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m256i madd = _mm256_madd_epi16(a, filter); + const __m256i sum = _mm256_add_epi32(round, madd); + return _mm256_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m256i WienerVerticalFilter7(const __m256i a[7], + const __m256i filter[2]) { + __m256i b[2]; + const __m256i a06 = _mm256_add_epi16(a[0], a[6]); + const __m256i a15 = _mm256_add_epi16(a[1], a[5]); + const __m256i a24 = _mm256_add_epi16(a[2], a[4]); + b[0] = _mm256_unpacklo_epi16(a06, a15); + b[1] = _mm256_unpacklo_epi16(a24, a[3]); + const __m256i sum0 = WienerVertical7(b, filter); + b[0] = _mm256_unpackhi_epi16(a06, a15); + b[1] = _mm256_unpackhi_epi16(a24, a[3]); + const __m256i sum1 = WienerVertical7(b, filter); + return _mm256_packs_epi32(sum0, sum1); +} + +inline __m256i WienerVerticalFilter5(const __m256i a[5], + const __m256i filter[2]) { + const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m256i b[2]; + const __m256i a04 = _mm256_add_epi16(a[0], a[4]); + const __m256i a13 = _mm256_add_epi16(a[1], a[3]); + b[0] = _mm256_unpacklo_epi16(a04, a13); + b[1] = _mm256_unpacklo_epi16(a[2], round); + const __m256i sum0 = WienerVertical5(b, filter); + b[0] = _mm256_unpackhi_epi16(a04, a13); + b[1] = _mm256_unpackhi_epi16(a[2], round); + const __m256i sum1 = WienerVertical5(b, filter); + return _mm256_packs_epi32(sum0, sum1); +} + +inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) { + __m256i b; + const __m256i a02 = _mm256_add_epi16(a[0], a[2]); + b = _mm256_unpacklo_epi16(a02, a[1]); + const __m256i sum0 = WienerVertical3(b, filter); + b = _mm256_unpackhi_epi16(a02, a[1]); + const __m256i sum1 = WienerVertical3(b, filter); + return _mm256_packs_epi32(sum0, sum1); +} + +inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i a[7]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride); + a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride); + a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride); + return WienerVerticalFilter7(a, filter); +} + +inline __m256i 
WienerVerticalTap5Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i a[5]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride); + return WienerVerticalFilter5(a, filter); +} + +inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter, __m256i a[3]) { + a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride); + return WienerVerticalFilter3(a, filter); +} + +inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i d[2]) { + __m256i a[8]; + d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride); + d[1] = WienerVerticalFilter7(a + 1, filter); +} + +inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter[2], __m256i d[2]) { + __m256i a[6]; + d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride); + d[1] = WienerVerticalFilter5(a + 1, filter); +} + +inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m256i filter, __m256i d[2]) { + __m256i a[4]; + d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride); + d[1] = WienerVerticalFilter3(a + 1, filter); +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients)); + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi32(c, 0x0); + filter[1] = _mm256_shuffle_epi32(c, 0x55); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2]; + WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0])); + StoreUnaligned32(dst + dst_stride + x, + _mm256_packus_epi16(d[0][1], d[1][1])); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[7]; + const __m256i d0 = + WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a); + const __m256i d1 = + WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + x += 32; + } while (x < width); + } +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[3], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients)); + __m256i filter[2]; + filter[0] = _mm256_shuffle_epi32(c, 0); + filter[1] = + _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2];
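+ // Each loop iteration below covers 32 output columns; each Kernel2() call
+ // produces two adjacent output rows from six input rows, reusing the four
+ // middle rows between the two 5-tap windows.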
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0])); + StoreUnaligned32(dst + dst_stride + x, + _mm256_packus_epi16(d[0][1], d[1][1])); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[5]; + const __m256i d0 = + WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a); + const __m256i d1 = + WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + x += 32; + } while (x < width); + } +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[2], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m256i filter = + _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients)); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m256i d[2][2]; + WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0])); + StoreUnaligned32(dst + dst_stride + x, + _mm256_packus_epi16(d[0][1], d[1][1])); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m256i a[3]; + const __m256i d0 = + WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a); + const __m256i d1 = + WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a); + StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1)); + x += 32; + } while (x < width); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint8_t* const dst) { + const __m256i a0 = LoadAligned32(wiener_buffer + 0); + const __m256i a1 = LoadAligned32(wiener_buffer + 16); + const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8)); + const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8)); + const __m256i c0 = _mm256_srai_epi16(b0, 4); + const __m256i c1 = _mm256_srai_epi16(b1, 4); + const __m256i d = _mm256_packus_epi16(c0, c1); + StoreUnaligned32(dst, d); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint8_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x); + x += 32; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + x += 32; + } while (x < width); + } +} + +void WienerFilter_AVX2(const RestorationUnitInfo& restoration_info, + const void* const source, const void* const top_border, + const void* const bottom_border, const ptrdiff_t stride, + const int width, const int height, + RestorationBuffer* const restoration_buffer, + void* const dest) { + const int16_t* const number_leading_zero_coefficients = + restoration_info.wiener_info.number_leading_zero_coefficients; + const int number_rows_to_skip = std::max( + static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]), + 1); + const ptrdiff_t wiener_stride = Align(width, 32); + int16_t*
const wiener_buffer_vertical = restoration_buffer->wiener_buffer; + // The values are saturated to 13 bits before storing. + int16_t* wiener_buffer_horizontal = + wiener_buffer_vertical + number_rows_to_skip * wiener_stride; + + // horizontal filtering. + // Over-reads up to 15 - |kRestorationHorizontalBorder| values. + const int height_horizontal = + height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; + const int height_extra = (height_horizontal - height) >> 1; + assert(height_extra <= 2); + const auto* const src = static_cast<const uint8_t*>(source); + const auto* const top = static_cast<const uint8_t*>(top_border); + const auto* const bottom = static_cast<const uint8_t*>(bottom_border); + const __m128i c = + LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]); + // In order to keep the horizontal pass intermediate values within 16 bits we + // offset |filter[3]| by 128. The 128 offset will be added back in the loop. + __m128i c_horizontal = + _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0)); + c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal); + const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal); + if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { + WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, + wiener_stride, height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, + wiener_stride, height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + // The maximum over-reads happen here. + WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, + wiener_stride, height_extra, coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + coefficients_horizontal, &wiener_buffer_horizontal); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); + WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, + wiener_stride, height_extra, + &wiener_buffer_horizontal); + WienerHorizontalTap1(src, stride, wiener_stride, height, + &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, + &wiener_buffer_horizontal); + } + + // vertical filtering. + // Over-writes up to 15 values. + const int16_t* const filter_vertical = + restoration_info.wiener_info.filter[WienerInfo::kVertical]; + auto* dst = static_cast<uint8_t*>(dest); + if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of its above row, we can duplicate + // the top and bottom row of |wiener_buffer| accordingly.
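+ // With zero leading vertical coefficients the horizontal pass above wrote
+ // only height + 4 rows (number_rows_to_skip == 1), but the 7-tap vertical
+ // pass reads height + 6, so the two memcpy() calls below synthesize the
+ // missing first and last rows by replication.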
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical + 1, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical + 2, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +//------------------------------------------------------------------------------ +// SGR + +constexpr int kSumOffset = 24; + +// SIMD overreads the number of bytes in SIMD registers - (width % 16) - 2 * +// padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. The number of +// bytes in SIMD registers is 16 for SSE4.1 and 32 for AVX2. +constexpr int kOverreadInBytesPass1_128 = 10; +constexpr int kOverreadInBytesPass2_128 = 12; +constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16; +constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m256i dst[2]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); +} + +inline void LoadAligned32x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m256i dst[3]) { + dst[0] = LoadAligned32(src[0] + x); + dst[1] = LoadAligned32(src[1] + x); + dst[2] = LoadAligned32(src[2] + x); +} + +inline void LoadAligned32x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3]) { + dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border)); + dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border)); + dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m256i dst[2][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, 
dst[1]); +} + +inline void LoadAligned64x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[2][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m256i dst[3][2]) { + LoadAligned64(src[0] + x, dst[0]); + LoadAligned64(src[1] + x, dst[1]); + LoadAligned64(src[2] + x, dst[2]); +} + +inline void LoadAligned64x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m256i dst[3][2]) { + LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]); + LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]); + LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. + +inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(s0, s1); +} + +inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(s0, s1); +} + +inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(s0, s1); +} + +inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(s0, s1); +} + +inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(s0, s1); +} + +inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + 
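The VaddlXX()/VaddwXX() helpers here emulate Arm NEON's vaddl/vaddw widening adds with an unpack-against-zero followed by an ordinary add, for the reason given in the comment above about _mm_cvtepu8_epi16(). As a reference, a scalar model of the 128-bit VaddlLo8() (illustrative only; the 256-bit variants apply the same operation independently within each 128-bit lane):

// Zero-extend the low 8 bytes of each input to 16 bits and add pairwise.
inline void VaddlLo8Scalar(const uint8_t a[16], const uint8_t b[16],
                           uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<uint16_t>(a[i]) + static_cast<uint16_t>(b[i]);
  }
}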
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256()); + return _mm256_add_epi16(src0, s1); +} + +inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(src0, s1); +} + +inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(src0, s1); +} + +inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(src0, s1); +} + +inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) { + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_add_epi32(src0, s1); +} + +// Using VgetLane16() can save a sign extension instruction. +template <int n> +inline int VgetLane16(__m256i src) { + return _mm256_extract_epi16(src, n); +} + +template <int n> +inline int VgetLane8(__m256i src) { + return _mm256_extract_epi8(src, n); +} + +inline __m256i VmullNLo8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m256i VmullNHi8(const __m256i src0, const int src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullLo16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m256i VmullHi16(const __m256i src0, const __m256i src1) { + const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256()); + const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256()); + return _mm256_madd_epi16(s0, s1); +} + +inline __m256i VrshrS32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline __m256i VrshrU32(const __m256i src0, const int src1) { + const __m256i sum = + _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1))); + return _mm256_srli_epi32(sum, src1); +} + +inline __m128i SquareLo8(const __m128i src) { + const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + return _mm_mullo_epi16(s, s); +} + +inline __m256i SquareLo8(const __m256i src) { + const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256()); + return _mm256_mullo_epi16(s, s); +} + +inline __m128i SquareHi8(const __m128i src) { + const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128()); + return _mm_mullo_epi16(s, s);
+} + +inline __m256i SquareHi8(const __m256i src) { + const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256()); + return _mm256_mullo_epi16(s, s); +} + +inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) { + dst[0] = src; + dst[1] = _mm_srli_si128(src, 1); + dst[2] = _mm_srli_si128(src, 2); +} + +inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) { + dst[0] = _mm256_alignr_epi8(src[1], src[0], 0); + dst[1] = _mm256_alignr_epi8(src[1], src[0], 1); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 2); +} + +inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm256_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm256_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) { + dst[0] = src; + dst[1] = _mm_srli_si128(src, 1); + dst[2] = _mm_srli_si128(src, 2); + dst[3] = _mm_srli_si128(src, 3); + dst[4] = _mm_srli_si128(src, 4); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm256_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm256_alignr_epi8(src[1], src[0], 8); +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m256i Sum3_16(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi16(src0, src1); + return _mm256_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m256i Sum3_16(const __m256i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline __m256i Sum3_32(const __m256i src0, const __m256i src1, + const __m256i src2) { + const __m256i sum = _mm256_add_epi32(src0, src1); + return _mm256_add_epi32(sum, src2); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m128i Sum3WLo16(const __m128i src[3]) { + const __m128i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WLo16(const __m256i src[3]) { + const __m256i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m256i Sum3WHi16(const __m256i src[3]) { + const __m256i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum3WLo32(const __m128i src[3]) { + const __m128i sum = VaddlLo16(src[0], src[1]); + return VaddwLo16(sum, src[2]); +} + +inline __m256i Sum3WLo32(const __m256i src[3]) { + const __m256i sum = VaddlLo16(src[0], src[1]); + return VaddwLo16(sum, src[2]); +} + +inline __m128i Sum3WHi32(const __m128i src[3]) { + const __m128i sum = VaddlHi16(src[0], src[1]); + 
return VaddwHi16(sum, src[2]); +} + +inline __m256i Sum3WHi32(const __m256i src[3]) { + const __m256i sum = VaddlHi16(src[0], src[1]); + return VaddwHi16(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m256i Sum5_16(const __m256i src[5]) { + const __m256i sum01 = _mm256_add_epi16(src[0], src[1]); + const __m256i sum23 = _mm256_add_epi16(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return _mm256_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1, + const __m256i* const src2, const __m256i* const src3, + const __m256i* const src4) { + const __m256i sum01 = _mm256_add_epi32(*src0, *src1); + const __m256i sum23 = _mm256_add_epi32(*src2, *src3); + const __m256i sum = _mm256_add_epi32(sum01, sum23); + return _mm256_add_epi32(sum, *src4); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum5WLo16(const __m128i src[5]) { + const __m128i sum01 = VaddlLo8(src[0], src[1]); + const __m128i sum23 = VaddlLo8(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return VaddwLo8(sum, src[4]); +} + +inline __m256i Sum5WLo16(const __m256i src[5]) { + const __m256i sum01 = VaddlLo8(src[0], src[1]); + const __m256i sum23 = VaddlLo8(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return VaddwLo8(sum, src[4]); +} + +inline __m256i Sum5WHi16(const __m256i src[5]) { + const __m256i sum01 = VaddlHi8(src[0], src[1]); + const __m256i sum23 = VaddlHi8(src[2], src[3]); + const __m256i sum = _mm256_add_epi16(sum01, sum23); + return VaddwHi8(sum, src[4]); +} + +inline __m128i Sum3Horizontal(const __m128i src) { + __m128i s[3]; + Prepare3Lo8(src, s); + return Sum3WLo16(s); +} + +inline void Sum3Horizontal(const uint8_t* const src, + const ptrdiff_t over_read_in_bytes, __m256i dst[2]) { + __m256i s[3]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[3]; + Prepare3_16(src, s); + dst[0] = Sum3WLo32(s); + dst[1] = Sum3WHi32(s); +} + +inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) { + __m256i s[3]; + Prepare3_16(src, s); + dst[0] = Sum3WLo32(s); + dst[1] = Sum3WHi32(s); +} + +inline __m128i Sum5Horizontal(const __m128i src) { + __m128i s[5]; + Prepare5Lo8(src, s); + return Sum5WLo16(s); +} + +inline void 
Sum5Horizontal(const uint8_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const dst0, __m256i* const dst1) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum01_lo = VaddlLo16(s[0], s[1]); + const __m128i sum23_lo = VaddlLo16(s[2], s[3]); + const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo); + dst[0] = VaddwLo16(sum0123_lo, s[4]); + const __m128i sum01_hi = VaddlHi16(s[0], s[1]); + const __m128i sum23_hi = VaddlHi16(s[2], s[3]); + const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi); + dst[1] = VaddwHi16(sum0123_hi, s[4]); +} + +inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) { + __m256i s[5]; + Prepare5_16(src, s); + const __m256i sum01_lo = VaddlLo16(s[0], s[1]); + const __m256i sum23_lo = VaddlLo16(s[2], s[3]); + const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo); + dst[0] = VaddwLo16(sum0123_lo, s[4]); + const __m256i sum01_hi = VaddlHi16(s[0], s[1]); + const __m256i sum23_hi = VaddlHi16(s[2], s[3]); + const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi); + dst[1] = VaddwHi16(sum0123_hi, s[4]); +} + +void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = VaddlLo16(src[0], src[4]); + *row_sq3 = Sum3WLo32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = VaddlLo16(src[0], src[4]); + *row_sq3 = Sum3WLo32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = VaddlHi16(src[0], src[4]); + *row_sq3 = Sum3WHi32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3, + __m256i* const row_sq5) { + const __m256i sum04 = VaddlHi16(src[0], src[4]); + *row_sq3 = Sum3WHi32(src + 1); + *row_sq5 = _mm256_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalLo(const __m128i src, __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5Lo8(src, s); + const __m128i sum04 = VaddlLo8(s[0], s[4]); + *row3 = Sum3WLo16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +inline void SumHorizontal(const uint8_t* const src, + const ptrdiff_t over_read_in_bytes, + __m256i* const row3_0, __m256i* const row3_1, + __m256i* const row5_0, __m256i* const row5_1) { + __m256i s[5]; + s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0); + s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1); + s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2); + s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3); + s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4); + const __m256i sum04_lo = VaddlLo8(s[0], s[4]); + const __m256i sum04_hi = VaddlHi8(s[0], s[4]); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0); + *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1); +} + +inline void SumHorizontal(const __m128i 
src[2], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_16(src, s); + SumHorizontalLo(s, row_sq3_0, row_sq5_0); + SumHorizontalHi(s, row_sq3_1, row_sq5_1); +} + +inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0, + __m256i* const row_sq3_1, __m256i* const row_sq5_0, + __m256i* const row_sq5_1) { + __m256i s[5]; + Prepare5_16(src, s); + SumHorizontalLo(s, row_sq3_0, row_sq5_0); + SumHorizontalHi(s, row_sq3_1, row_sq5_1); +} + +inline __m256i Sum343Lo(const __m256i ma3[3]) { + const __m256i sum = Sum3WLo16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m256i Sum343Hi(const __m256i ma3[3]) { + const __m256i sum = Sum3WHi16(ma3); + const __m256i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m256i Sum343WLo(const __m256i src[3]) { + const __m256i sum = Sum3WLo32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return VaddwLo16(sum3, src[1]); +} + +inline __m256i Sum343WHi(const __m256i src[3]) { + const __m256i sum = Sum3WHi32(src); + const __m256i sum3 = Sum3_32(sum, sum, sum); + return VaddwHi16(sum3, src[1]); +} + +inline void Sum343W(const __m256i src[2], __m256i dst[2]) { + __m256i s[3]; + Prepare3_16(src, s); + dst[0] = Sum343WLo(s); + dst[1] = Sum343WHi(s); +} + +inline __m256i Sum565Lo(const __m256i src[3]) { + const __m256i sum = Sum3WLo16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwLo8(sum5, src[1]); +} + +inline __m256i Sum565Hi(const __m256i src[3]) { + const __m256i sum = Sum3WHi16(src); + const __m256i sum4 = _mm256_slli_epi16(sum, 2); + const __m256i sum5 = _mm256_add_epi16(sum4, sum); + return VaddwHi8(sum5, src[1]); +} + +inline __m256i Sum565WLo(const __m256i src[3]) { + const __m256i sum = Sum3WLo32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return VaddwLo16(sum5, src[1]); +} + +inline __m256i Sum565WHi(const __m256i src[3]) { + const __m256i sum = Sum3WHi32(src); + const __m256i sum4 = _mm256_slli_epi32(sum, 2); + const __m256i sum5 = _mm256_add_epi32(sum4, sum); + return VaddwHi16(sum5, src[1]); +} + +inline void Sum565W(const __m256i src[2], __m256i dst[2]) { + __m256i s[3]; + Prepare3_16(src, s); + dst[0] = Sum565WLo(s); + dst[1] = Sum565WHi(s); +} + +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + int y = 2; + do { + const __m128i s0 = + LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width); + __m128i sq_128[2]; + __m256i sq[3]; + __m128i s3, s5, sq3[2], sq5[2]; + sq_128[0] = SquareLo8(s0); + sq_128[1] = SquareHi8(s0); + SumHorizontalLo(s0, &s3, &s5); + StoreAligned16(sum3, s3); + StoreAligned16(sum5, s5); + SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]); + StoreAligned32U32(square_sum3, sq3); + StoreAligned32U32(square_sum5, sq5); + src += 8; + sum3 += 8; + sum5 += 8; + square_sum3 += 8; + square_sum5 += 8; + sq[0] = SetrM128i(sq_128[1], sq_128[1]); + ptrdiff_t x = sum_width; + do { + __m256i row3[2], row5[2], row_sq3[2], row_sq5[2]; + const __m256i s = LoadUnaligned32Msan( + src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width); + sq[1] = SquareLo8(s); + sq[2] = SquareHi8(s); + sq[0] = 
_mm256_permute2x128_si256(sq[0], sq[2], 0x21); + SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width, + &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned64(sum3, row3); + StoreAligned64(sum5, row5); + SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]); + StoreAligned64(square_sum3 + 0, row_sq3); + StoreAligned64(square_sum5 + 0, row_sq5); + SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]); + StoreAligned64(square_sum3 + 16, row_sq3); + StoreAligned64(square_sum5 + 16, row_sq5); + sq[0] = sq[2]; + src += 32; + sum3 += 32; + sum5 += 32; + square_sum3 += 32; + square_sum5 += 32; + x -= 32; + } while (x != 0); + src += src_stride - sum_width - 8; + sum3 += sum_stride - sum_width - 8; + sum5 += sum_stride - sum_width - 8; + square_sum3 += sum_stride - sum_width - 8; + square_sum5 += sum_stride - sum_width - 8; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + int kOverreadInBytes_128, kOverreadInBytes_256; + if (size == 3) { + kOverreadInBytes_128 = kOverreadInBytesPass2_128; + kOverreadInBytes_256 = kOverreadInBytesPass2_256; + } else { + kOverreadInBytes_128 = kOverreadInBytesPass1_128; + kOverreadInBytes_256 = kOverreadInBytesPass1_256; + } + int y = 2; + do { + const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width); + __m128i ss, sq_128[2], sqs[2]; + __m256i sq[3]; + sq_128[0] = SquareLo8(s); + sq_128[1] = SquareHi8(s); + if (size == 3) { + ss = Sum3Horizontal(s); + Sum3WHorizontal(sq_128, sqs); + } else { + ss = Sum5Horizontal(s); + Sum5WHorizontal(sq_128, sqs); + } + StoreAligned16(sums, ss); + StoreAligned32U32(square_sums, sqs); + src += 8; + sums += 8; + square_sums += 8; + sq[0] = SetrM128i(sq_128[1], sq_128[1]); + ptrdiff_t x = sum_width; + do { + __m256i row[2], row_sq[4]; + const __m256i s = LoadUnaligned32Msan( + src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width); + sq[1] = SquareLo8(s); + sq[2] = SquareHi8(s); + sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21); + if (size == 3) { + Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width, + row); + Sum3WHorizontal(sq + 0, row_sq + 0); + Sum3WHorizontal(sq + 1, row_sq + 2); + } else { + Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width, + &row[0], &row[1]); + Sum5WHorizontal(sq + 0, row_sq + 0); + Sum5WHorizontal(sq + 1, row_sq + 2); + } + StoreAligned64(sums, row); + StoreAligned64(square_sums + 0, row_sq + 0); + StoreAligned64(square_sums + 16, row_sq + 2); + sq[0] = sq[2]; + src += 32; + sums += 32; + square_sums += 32; + x -= 32; + } while (x != 0); + src += src_stride - sum_width - 8; + sums += sum_stride - sum_width - 8; + square_sums += sum_stride - sum_width - 8; + } while (--y != 0); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit.
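+ // For reference: a * 9 = a + (a << 3) and a * 25 = a + (a << 3) + (a << 4),
+ // which is what the two shift-add statements below compute.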
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m128i CalculateB(const __m128i sum, const __m128i ma) {
+  static_assert(n == 9 || n == 25, "");
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+template <int n>
+inline __m256i CalculateB(const __m256i sum, const __m256i ma) {
+  static_assert(n == 9 || n == 25, "");
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // |temp| is not actually stored and loaded. The compiler processes it in a
+  // 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b = CalculateB<n>(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
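+// For example, index 18 compares greater than 15, so its mask byte has the
+// most significant bit set and _mm256_shuffle_epi8() produces 0 for that
+// lane; once the caller subtracts 16, the lane holds 2 and selects entry 18
+// from the table row covering indices [16, 31].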
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+  return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // Elements whose indices are less than 48 get their values from the table;
+  // the lookups leave all other elements 0.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, since their values change
+  // only rarely as the index increases, we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+  ma[2] = _mm256_permute4x64_epi64(mas, 0x93);     // 32-39 8-15 16-23 24-31
+  ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc);  // 0-7 8-15 16-23 24-31
+  ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
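+  // Zero-extend the 8-bit |ma| values to 16 bits before forming
+  // b = ma * sum * one_over_n.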
+  const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+  const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+  b[0] = CalculateB<n>(sum[0], maq0);
+  b[1] = CalculateB<n>(sum[1], maq1);
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+                         __m256i sum_b343[2], __m256i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m256i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
+  sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+  StoreAligned64(b444 + x, sum_b444);
+  sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[0][3] = Sum5Horizontal(s[0][0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal(s[1][0]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+    const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+    __m256i b[3]) {
+  const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+  const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+  sq[0][1] = SquareLo8(s0);
+  sq[0][2] = SquareHi8(s0);
+  sq[1][1] = SquareLo8(s1);
+  sq[1][2] = SquareHi8(s1);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+  Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  Sum5WHorizontal(sq[0] + 1, sq5[3]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  Sum5WHorizontal(sq[1] + 1, sq5[4]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[5], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  s5[3] = s5[4] = Sum5Horizontal(s);
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+  const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+  sq[1] = SquareLo8(s);
+  sq[2] = SquareHi8(s);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  Sum5WHorizontal(sq + 1, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s3[3], sq3[3][2];
+  sq[1] = SquareHi8(s);
+  s3[2] = Sum3Horizontal(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3WHorizontal(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+    __m256i ma[3], __m256i b[3]) {
+  const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s3[4], sq3[3][2], sum[2], index[2];
+  sq[1] = SquareLo8(s);
+  sq[2] = SquareHi8(s);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+  StoreAligned64(sum3[2] + x, s3 + 2);
+  Sum3WHorizontal(sq + 0, sq3[2]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  LoadAligned32x2U16(sum3, x, s3);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Sum3WHorizontal(sq + 1, sq3[2]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate<9>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+    __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0]);
+  sq[1][1] = SquareHi8(s[1]);
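+  // Both passes are active here, so one horizontal pass produces the 3-tap
+  // and 5-tap row sums together.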
+  SumHorizontalLo(s[0], &s3[2], &s5[3]);
+  SumHorizontalLo(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  // Note: in the SSE4_1 version, CalculateIntermediate() is called to replace
+  // the slow LookupIntermediate() when calculating 16 intermediate data
+  // points. However, for AVX2 the compiler generates even slower code, so we
+  // keep using CalculateIntermediate3().
+  CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+  CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+    __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+  const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+  __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sq3t[4][2], sq5t[5][2],
+      sum_3[2][2], index_3[2][2], sum_5[2], index_5[2];
+  sq[0][1] = SquareLo8(s0);
+  sq[0][2] = SquareHi8(s0);
+  sq[1][1] = SquareLo8(s1);
+  sq[1][2] = SquareHi8(s1);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                &s5[1][3]);
+  SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+                &s5[1][4]);
+  StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+  StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  StoreAligned64(square_sum3[3] + x, sq3[3]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+                        &index_3[1][0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  SumHorizontal(sq[0] + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+  SumHorizontal(sq[1] + 1, &sq3t[3][0], &sq3t[3][1], &sq5t[4][0], &sq5t[4][1]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3t[2]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5t[3]);
+  StoreAligned64(square_sum3[3] + x + 16, sq3t[3]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5t[4]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+  CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[0][1], &index_3[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3t + 1, scales[1], &sum_3[1][1],
+                        &index_3[1][1]);
+  CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+  CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+  CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+  b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+  b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+  b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  SumHorizontalLo(s, &s3[2], &s5[3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+    __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s3[2][3], s5[2][5], sq3[4][2], sq3t[4][2], sq5[5][2], sq5t[5][2],
+      sum_3[2], index_3[2], sum_5[2], index_5[2];
+  sq[1] = SquareLo8(s0);
+  sq[2] = SquareHi8(s0);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                &s5[1][3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  SumHorizontal(sq + 1, &sq3t[2][0], &sq3t[2][1], &sq5t[3][0], &sq5t[3][1]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3t);
+  CalculateSumAndIndex3(s3[1], sq3t, scales[1], &sum_3[1], &index_3[1]);
+  CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
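+  // The image ends here, so the bottom row of the 5-row window is a copy of
+  // the row above it.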
+  s5[1][4] = s5[1][3];
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5t);
+  sq5t[4][0] = sq5t[3][0];
+  sq5t[4][1] = sq5t[3][1];
+  CalculateSumAndIndex5(s5[1], sq5t, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+  b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+  b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i ma0, b0, s[2][3], sq_128[2][2];
+  __m256i mas[3], sq[2][3], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0][0]);
+  sq_128[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[2], b[4];
+    BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+                         x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+                         x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned64(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64(b565, b + 0);
+    StoreAligned64(b565 + 16, b + 2);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  __m128i ma0, sq_128[2], b0;
+  __m256i mas[3], sq[3], bs[3];
+  const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+  sq_128[0] = SquareLo8(s);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma3[3];
+    BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+                         x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    Prepare3_8(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+      ma444 += 32;
+      b444 += 32;
+    } else {
+      __m256i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned64(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64(b343 + 0, b + 0);
+      StoreAligned64(b343 + 16, b + 2);
+    }
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma343 += 32;
+    b343 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+    uint32_t* const b444[2], uint32_t* b565) {
+  __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+  __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0]);
+  sq_128[1][0] = SquareLo8(s[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, &b5_0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+  ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[2], b[4], ma3x[3], ma5x[3];
+    BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+                        x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64(ma343[0] + x, ma);
+    Sum343W(b3[0], b);
+    StoreAligned64(b343[0] + x, b);
+    Sum565W(b5, b);
+    StoreAligned64(b565, b);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444[0], b343[1],
+                   b444[0]);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64(ma565, ma);
+    Sum343W(b3[0] + 1, b);
+    StoreAligned64(b343[0] + x + 16, b);
+    Sum565W(b5 + 1, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src, __m256i ma[2],
+                                            __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src, __m256i ma[3],
+                                            __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
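+// SelfGuidedFinal() adds the rounded, weighted filter sum v to the source:
+// dst = src + ((v + (1 << (kSgrProjRestoreBits + kSgrProjPrecisionBits - 1)))
+//              >> (kSgrProjRestoreBits + kSgrProjPrecisionBits)).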
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i ma0, b0, s[2][3], sq_128[2][2];
+  __m256i mas[3], sq[2][3], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0][0]);
+  sq_128[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], ma3[3], b[2][2][2];
+    BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+                         x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+                         x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    ma[1] = Sum565Lo(ma3);
+    ma[2] = Sum565Hi(ma3);
+    StoreAligned64(ma565[1] + x, ma + 1);
+    Sum565W(bs + 0, b[0][1]);
+    Sum565W(bs + 1, b[1][1]);
+    StoreAligned64(b565[1] + x + 0, b[0][1]);
+    StoreAligned64(b565[1] + x + 16, b[1][1]);
+    const __m256i sr0 = LoadUnaligned32(src + x);
+    const __m256i sr1 = LoadUnaligned32(src + stride + x);
+    const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+    const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+    const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+    const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+    const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+    ma[1] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[1][0]);
+    const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+    const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+    const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+    const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+    StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
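+    // Rotate the sliding-window state: the rightmost vectors of this batch
+    // become the leftmost vectors of the next 32 pixels.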
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  __m128i ma0, b0, sq_128[2];
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+                                &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], ma5[3], b[2][2];
+    BoxFilterPreProcess5LastRow(
+        src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    ma[2] = Sum565Hi(ma5);
+    Sum565W(bs + 0, b[1]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565);
+    LoadAligned64(b565 + 0, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma565 + 16);
+    LoadAligned64(b565 + 16, b[0]);
+    Sum565W(bs + 1, b[1]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+  __m128i ma0, b0, sq_128[2];
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[4], b[4][2], ma3[3];
+    BoxFilterPreProcess3(src0 + x + 8,
+                         x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+                         sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma343[0] + x);
+    ma[1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[0]);
+    LoadAligned64(b444[0] + x, b[1]);
+    const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma343[0] + x + 16);
+    ma[2] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1]);
+    LoadAligned64(b444[0] + x + 16, b[2]);
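+    // Filter the high 16 pixels of the batch with the ma/b entries shifted
+    // by one slot (ma + 1, b + 1).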
+    const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+  __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0]);
+  sq_128[1][0] = SquareLo8(s[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, &b5_0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+  ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+                        x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x[0]);
+    Prepare3_8(ma3[1], ma3x[1]);
+    Prepare3_8(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    ma[0][2] = Sum565Hi(ma5x);
+    mat[0][1] = ma[0][2];
+    StoreAligned64(ma565[1] + x, ma[0] + 1);
+    Sum565W(b5, b[0][1]);
+    StoreAligned64(b565[1] + x, b[0][1]);
+    const __m256i sr0 = LoadUnaligned32(src + x);
+    const __m256i sr1 = LoadUnaligned32(src + stride + x);
+    const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+    ma[0][0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned32(ma343[0] + x);
+    ma[1][1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[1][0]);
+    LoadAligned64(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned32(ma343[1] + x);
+    LoadAligned64(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Sum565W(b5 + 1, b[0][1]);
+    StoreAligned64(b565[1] + x + 16, b[0][1]);
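+    // Repeat the accumulation and filtering for the high halves of the two
+    // rows.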
+    Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+    mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+    mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1][0]);
+    LoadAligned64(b444[0] + x + 16, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+    const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+    LoadAligned64(b343[1] + x + 16, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+    const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+    StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343[4], uint16_t* const ma444[3],
+    uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+  __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  ma3[0] = SetrM128i(ma3_0, ma3_0);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0] = SetrM128i(b3_0, b3_0);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+    BoxFilterPreProcessLastRow(src0 + x + 8,
+                               x + 8 + kOverreadInBytesPass1_256 - width,
+                               sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8(ma3, ma3x);
+    Prepare3_8(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565W(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343W(b3, b[2]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned32(ma343[0] + x);
+    ma[1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[0]);
+    LoadAligned64(b444[0] + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    mat[1] = Sum565Hi(ma5x);
+    Sum565W(b5 + 1, b[1]);
+    mat[2] = Sum343Hi(ma3x);
+    Sum343W(b3 + 1, b[2]);
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    mat[0] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+    mat[0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[0]);
+    LoadAligned64(b444[0] + x + 16, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+    const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    ma3[0] = ma3[2];
+    ma5[0] = ma5[2];
+    b3[0] = b3[2];
+    b5[0] = b5[2];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const uint8_t* const top_border, const uint8_t* bottom_border,
+    const ptrdiff_t stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, stride, width, sum_stride, temp_stride, sum3[0], sum5[1],
+         square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+                         b444, b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2(sum3);
+    Circulate4PointersBy2(square_sum3);
+    Circulate5PointersBy2(sum5);
+    Circulate5PointersBy2(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2(ma343);
+    Circulate4PointersBy2(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2(sum3);
+  Circulate4PointersBy2(square_sum3);
+  Circulate5PointersBy2(sum5);
+  Circulate5PointersBy2(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2(sum3);
+      Circulate4PointersBy2(square_sum3);
+      Circulate5PointersBy2(sum5);
+      Circulate5PointersBy2(square_sum5);
+      Circulate4PointersBy2(ma343);
+      Circulate4PointersBy2(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+                     w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+                     ma565, b343, b444, b565, dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src,
+                                  const uint8_t* const top_border,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t stride, const int width,
+                                  const int height, SgrBuffer* const sgr_buffer,
+                                  uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, stride, width, sum_stride, temp_stride, sum5[1],
+            square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2(sum5);
+    Circulate5PointersBy2(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2(sum5);
+  Circulate5PointersBy2(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2(sum5);
+      Circulate5PointersBy2(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+                          w0, sum5, square_sum5, ma565[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src,
+                                  const uint8_t* const top_border,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t stride, const int width,
+                                  const int height, SgrBuffer* const sgr_buffer,
+                                  uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
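+  // Pass 2 rotates ring buffers of three rows of box sums, three rows of 343
+  // intermediates and two rows of 444 intermediates; the pointers set up
+  // below are circulated as the filter walks down the rows.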
+ uint16_t *sum3[3], *ma343[3], *ma444[2]; + uint32_t *square_sum3[3], *b343[3], *b444[2]; + sum3[0] = sgr_buffer->sum3 + kSumOffset; + square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset; + ma343[0] = sgr_buffer->ma343; + b343[0] = sgr_buffer->b343; + for (int i = 1; i <= 2; ++i) { + sum3[i] = sum3[i - 1] + sum_stride; + square_sum3[i] = square_sum3[i - 1] + sum_stride; + ma343[i] = ma343[i - 1] + temp_stride; + b343[i] = b343[i - 1] + temp_stride; + } + ma444[0] = sgr_buffer->ma444; + ma444[1] = ma444[0] + temp_stride; + b444[0] = sgr_buffer->b444; + b444[1] = b444[0] + temp_stride; + assert(scale != 0); + BoxSum<3>(top_border, stride, width, sum_stride, temp_stride, sum3[0], + square_sum3[0]); + BoxSumFilterPreProcess3(src, width, scale, sum3, square_sum3, + sum_width, ma343[0], nullptr, b343[0], + nullptr); + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + const uint8_t* s; + if (height > 1) { + s = src + stride; + } else { + s = bottom_border; + bottom_border += stride; + } + BoxSumFilterPreProcess3(s, width, scale, sum3, square_sum3, sum_width, + ma343[1], ma444[0], b343[1], b444[0]); + + for (int y = height - 2; y > 0; --y) { + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } + + int y = std::min(height, 2); + src += 2; + do { + Circulate3PointersBy1<uint16_t>(sum3); + Circulate3PointersBy1<uint32_t>(square_sum3); + BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3, + square_sum3, ma343, ma444, b343, b444, dst); + src += stride; + dst += stride; + bottom_border += stride; + Circulate3PointersBy1<uint16_t>(ma343); + Circulate3PointersBy1<uint32_t>(b343); + std::swap(ma444[0], ma444[1]); + std::swap(b444[0], b444[1]); + } while (--y != 0); +} + +// If |width| is not a multiple of 8, up to 7 more pixels are written to |dest| +// at the end of each row. It is safe to overwrite the output as it will not be +// part of the visible frame. +void SelfGuidedFilter_AVX2( + const RestorationUnitInfo& restoration_info, const void* const source, + const void* const top_border, const void* const bottom_border, + const ptrdiff_t stride, const int width, const int height, + RestorationBuffer* const restoration_buffer, void* const dest) { + const int index = restoration_info.sgr_proj_info.index; + const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0 + const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0 + const auto* const src = static_cast<const uint8_t*>(source); + const auto* top = static_cast<const uint8_t*>(top_border); + const auto* bottom = static_cast<const uint8_t*>(bottom_border); + auto* const dst = static_cast<uint8_t*>(dest); + SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer; + if (radius_pass_1 == 0) { + // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the + // following assertion.
+ assert(radius_pass_0 != 0); + BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3, + stride, width, height, sgr_buffer, dst); + } else if (radius_pass_0 == 0) { + BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2, + stride, width, height, sgr_buffer, dst); + } else { + BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride, + width, height, sgr_buffer, dst); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_AVX2(WienerFilter) + dsp->loop_restorations[0] = WienerFilter_AVX2; +#endif +#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter) + dsp->loop_restorations[1] = SelfGuidedFilter_AVX2; +#endif +} + +} // namespace +} // namespace low_bitdepth + +void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void LoopRestorationInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 diff --git a/src/dsp/x86/loop_restoration_avx2.h b/src/dsp/x86/loop_restoration_avx2.h new file mode 100644 index 0000000..d80227c --- /dev/null +++ b/src/dsp/x86/loop_restoration_avx2.h @@ -0,0 +1,52 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ +#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::loop_restorations, see the defines below for specifics. +// These functions are not thread-safe. +void LoopRestorationInit_AVX2(); +void LoopRestorationInit10bpp_AVX2(); + +} // namespace dsp +} // namespace libgav1 + +// If avx2 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the avx2 implementation should be used. +#if LIBGAV1_TARGETING_AVX2 + +#ifndef LIBGAV1_Dsp8bpp_WienerFilter +#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp10bpp_WienerFilter +#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2 +#endif + +#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter +#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2 +#endif + +#endif // LIBGAV1_TARGETING_AVX2 + +#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_ diff --git a/src/dsp/x86/loop_restoration_sse4.cc b/src/dsp/x86/loop_restoration_sse4.cc new file mode 100644 index 0000000..24f5ad2 --- /dev/null +++ b/src/dsp/x86/loop_restoration_sse4.cc @@ -0,0 +1,2549 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/loop_restoration.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include <smmintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128, + int16_t* const wiener_buffer) { + constexpr int offset = + 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1); + constexpr int limit = + (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1; + const __m128i offsets = _mm_set1_epi16(-offset); + const __m128i limits = _mm_set1_epi16(limit - offset); + // The sum range here is [-128 * 255 + 4, 90 * 255 + 4]. + const __m128i sum = _mm_add_epi16(s[0], s[1]); + const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal); + // Add back scaled down offset correction. + const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128); + const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets); + const __m128i d1 = _mm_min_epi16(d0, limits); + StoreAligned16(wiener_buffer, d1); +} + +inline void WienerHorizontalTap7Kernel(const __m128i s[4], + const __m128i filter[4], + int16_t* const wiener_buffer) { + __m128i madds[4]; + madds[0] = _mm_maddubs_epi16(s[0], filter[0]); + madds[1] = _mm_maddubs_epi16(s[1], filter[1]); + madds[2] = _mm_maddubs_epi16(s[2], filter[2]); + madds[3] = _mm_maddubs_epi16(s[3], filter[3]); + madds[0] = _mm_add_epi16(madds[0], madds[2]); + madds[1] = _mm_add_epi16(madds[1], madds[3]); + const __m128i s_3x128 = + _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap5Kernel(const __m128i s[5], + const __m128i filter[3], + int16_t* const wiener_buffer) { + __m128i madds[3]; + madds[0] = _mm_maddubs_epi16(s[0], filter[0]); + madds[1] = _mm_maddubs_epi16(s[1], filter[1]); + madds[2] = _mm_maddubs_epi16(s[2], filter[2]); + madds[0] = _mm_add_epi16(madds[0], madds[2]); + const __m128i s_3x128 = + _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +inline void WienerHorizontalTap3Kernel(const __m128i s[2], + const __m128i filter[2], + int16_t* const wiener_buffer) { + __m128i madds[2]; + madds[0] = _mm_maddubs_epi16(s[0], filter[0]); + madds[1] = _mm_maddubs_epi16(s[1], filter[1]); + const __m128i s_3x128 = + _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal); + WienerHorizontalClip(madds, s_3x128, wiener_buffer); +} + +// Loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
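// [Editor's sketch, not part of the upstream patch] The alternative the
// comment above rejects, for context. Both snippets below produce the byte
// window starting at src + 1 that the tap kernels consume; the variable names
// are illustrative only, while LoadUnaligned16() and _mm_alignr_epi8() are the
// same helpers/intrinsics used throughout this file.
//
//   // (a) One unaligned load per tap offset, as the functions below do:
//   const __m128i s1 = LoadUnaligned16(src + 1);
//
//   // (b) Two loads plus byte-wise concatenation via _mm_alignr_epi8():
//   const __m128i lo = LoadUnaligned16(src);
//   const __m128i hi = LoadUnaligned16(src + 16);
//   const __m128i s1_alt = _mm_alignr_epi8(hi, lo, 1);  // bytes 1..16 of {lo,hi}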
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int coefficient0, + const __m128i coefficients, + int16_t** const wiener_buffer) { + const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1)); + __m128i filter[4]; + filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200)); + filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604)); + filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204)); + filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0)); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[7], ss[4]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + s[3] = LoadUnaligned16(src + x + 3); + s[4] = LoadUnaligned16(src + x + 4); + s[5] = LoadUnaligned16(src + x + 5); + s[6] = LoadUnaligned16(src + x + 6); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], round); + WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0); + ss[0] = _mm_unpackhi_epi8(s[0], s[1]); + ss[1] = _mm_unpackhi_epi8(s[2], s[3]); + ss[2] = _mm_unpackhi_epi8(s[4], s[5]); + ss[3] = _mm_unpackhi_epi8(s[6], round); + WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int coefficient1, + const __m128i coefficients, + int16_t** const wiener_buffer) { + const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1)); + __m128i filter[3]; + filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402)); + filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406)); + filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1)); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[5], ss[3]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + s[3] = LoadUnaligned16(src + x + 3); + s[4] = LoadUnaligned16(src + x + 4); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], round); + WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0); + ss[0] = _mm_unpackhi_epi8(s[0], s[1]); + ss[1] = _mm_unpackhi_epi8(s[2], s[3]); + ss[2] = _mm_unpackhi_epi8(s[4], round); + WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + const int coefficient2, + const __m128i coefficients, + int16_t** const wiener_buffer) { + const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1)); + __m128i filter[2]; + filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604)); + filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2)); + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + __m128i s[3], ss[2]; + s[0] = LoadUnaligned16(src + x + 0); + s[1] = LoadUnaligned16(src + x + 1); + s[2] = LoadUnaligned16(src + x + 2); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], round); + WienerHorizontalTap3Kernel(ss,
filter, *wiener_buffer + x + 0); + ss[0] = _mm_unpackhi_epi8(s[0], s[1]); + ss[1] = _mm_unpackhi_epi8(s[2], round); + WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const int height, + int16_t** const wiener_buffer) { + for (int y = height; y != 0; --y) { + ptrdiff_t x = 0; + do { + const __m128i s = LoadUnaligned16(src + x); + const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128()); + const __m128i d0 = _mm_slli_epi16(s0, 4); + const __m128i d1 = _mm_slli_epi16(s1, 4); + StoreAligned16(*wiener_buffer + x + 0, d0); + StoreAligned16(*wiener_buffer + x + 8, d1); + x += 16; + } while (x < width); + src += src_stride; + *wiener_buffer += width; + } +} + +inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) { + const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i sum0 = _mm_add_epi32(round, madd0); + const __m128i sum1 = _mm_add_epi32(sum0, madd1); + return _mm_srai_epi32(sum1, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) { + const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]); + const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]); + const __m128i sum = _mm_add_epi32(madd0, madd1); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVertical3(const __m128i a, const __m128i filter) { + const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1)); + const __m128i madd = _mm_madd_epi16(a, filter); + const __m128i sum = _mm_add_epi32(round, madd); + return _mm_srai_epi32(sum, kInterRoundBitsVertical); +} + +inline __m128i WienerVerticalFilter7(const __m128i a[7], + const __m128i filter[2]) { + __m128i b[2]; + const __m128i a06 = _mm_add_epi16(a[0], a[6]); + const __m128i a15 = _mm_add_epi16(a[1], a[5]); + const __m128i a24 = _mm_add_epi16(a[2], a[4]); + b[0] = _mm_unpacklo_epi16(a06, a15); + b[1] = _mm_unpacklo_epi16(a24, a[3]); + const __m128i sum0 = WienerVertical7(b, filter); + b[0] = _mm_unpackhi_epi16(a06, a15); + b[1] = _mm_unpackhi_epi16(a24, a[3]); + const __m128i sum1 = WienerVertical7(b, filter); + return _mm_packs_epi32(sum0, sum1); +} + +inline __m128i WienerVerticalFilter5(const __m128i a[5], + const __m128i filter[2]) { + const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1)); + __m128i b[2]; + const __m128i a04 = _mm_add_epi16(a[0], a[4]); + const __m128i a13 = _mm_add_epi16(a[1], a[3]); + b[0] = _mm_unpacklo_epi16(a04, a13); + b[1] = _mm_unpacklo_epi16(a[2], round); + const __m128i sum0 = WienerVertical5(b, filter); + b[0] = _mm_unpackhi_epi16(a04, a13); + b[1] = _mm_unpackhi_epi16(a[2], round); + const __m128i sum1 = WienerVertical5(b, filter); + return _mm_packs_epi32(sum0, sum1); +} + +inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) { + __m128i b; + const __m128i a02 = _mm_add_epi16(a[0], a[2]); + b = _mm_unpacklo_epi16(a02, a[1]); + const __m128i sum0 = WienerVertical3(b, filter); + b = _mm_unpackhi_epi16(a02, a[1]); + const __m128i sum1 = WienerVertical3(b, filter); + return _mm_packs_epi32(sum0, sum1); +} + +inline __m128i WienerVerticalTap7Kernel(const int16_t* 
wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i a[7]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride); + a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride); + a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride); + return WienerVerticalFilter7(a, filter); +} + +inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i a[5]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride); + return WienerVerticalFilter5(a, filter); +} + +inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter, __m128i a[3]) { + a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride); + a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride); + a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride); + return WienerVerticalFilter3(a, filter); +} + +inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i d[2]) { + __m128i a[8]; + d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a); + a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride); + d[1] = WienerVerticalFilter7(a + 1, filter); +} + +inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter[2], __m128i d[2]) { + __m128i a[6]; + d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a); + a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride); + d[1] = WienerVerticalFilter5(a + 1, filter); +} + +inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer, + const ptrdiff_t wiener_stride, + const __m128i filter, __m128i d[2]) { + __m128i a[4]; + d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a); + a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride); + d[1] = WienerVerticalFilter3(a + 1, filter); +} + +inline void WienerVerticalTap7(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[4], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m128i c = LoadLo8(coefficients); + __m128i filter[2]; + filter[0] = _mm_shuffle_epi32(c, 0x0); + filter[1] = _mm_shuffle_epi32(c, 0x55); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i d[2][2]; + WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]); + StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0])); + StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1])); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[7]; + const __m128i d0 = + WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a); + const __m128i d1 = + WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + 
x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap5(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[3], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m128i c = Load4(coefficients); + __m128i filter[2]; + filter[0] = _mm_shuffle_epi32(c, 0); + filter[1] = + _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2])); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i d[2][2]; + WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]); + StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0])); + StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1])); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[5]; + const __m128i d0 = + WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a); + const __m128i d1 = + WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap3(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + const int16_t coefficients[2], uint8_t* dst, + const ptrdiff_t dst_stride) { + const __m128i filter = + _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients)); + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + __m128i d[2][2]; + WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]); + WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]); + StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0])); + StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1])); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + __m128i a[3]; + const __m128i d0 = + WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a); + const __m128i d1 = + WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + x += 16; + } while (x < width); + } +} + +inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer, + uint8_t* const dst) { + const __m128i a0 = LoadAligned16(wiener_buffer + 0); + const __m128i a1 = LoadAligned16(wiener_buffer + 8); + const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8)); + const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8)); + const __m128i c0 = _mm_srai_epi16(b0, 4); + const __m128i c1 = _mm_srai_epi16(b1, 4); + const __m128i d = _mm_packus_epi16(c0, c1); + StoreAligned16(dst, d); +} + +inline void WienerVerticalTap1(const int16_t* wiener_buffer, + const ptrdiff_t width, const int height, + uint8_t* dst, const ptrdiff_t dst_stride) { + for (int y = height >> 1; y > 0; --y) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x); + x += 16; + } while (x < width); + dst += 2 * dst_stride; + wiener_buffer += 2 * width; + } + + if ((height & 1) != 0) { + ptrdiff_t x = 0; + do { + WienerVerticalTap1Kernel(wiener_buffer + x, dst + x); + x += 16; + } while (x < width); + } +} + +void WienerFilter_SSE4_1(const RestorationUnitInfo& restoration_info, + const void* const source, const void* const top_border, + const void* const bottom_border, + const
ptrdiff_t stride, const int width, + const int height, + RestorationBuffer* const restoration_buffer, + void* const dest) { + const int16_t* const number_leading_zero_coefficients = + restoration_info.wiener_info.number_leading_zero_coefficients; + const int number_rows_to_skip = std::max( + static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]), + 1); + const ptrdiff_t wiener_stride = Align(width, 16); + int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer; + // The values are saturated to 13 bits before storing. + int16_t* wiener_buffer_horizontal = + wiener_buffer_vertical + number_rows_to_skip * wiener_stride; + + // horizontal filtering. + // Over-reads up to 15 - |kRestorationHorizontalBorder| values. + const int height_horizontal = + height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip; + const int height_extra = (height_horizontal - height) >> 1; + assert(height_extra <= 2); + const auto* const src = static_cast<const uint8_t*>(source); + const auto* const top = static_cast<const uint8_t*>(top_border); + const auto* const bottom = static_cast<const uint8_t*>(bottom_border); + const int16_t* const filter_horizontal = + restoration_info.wiener_info.filter[WienerInfo::kHorizontal]; + const __m128i c = LoadLo8(filter_horizontal); + // In order to keep the horizontal pass intermediate values within 16 bits we + // offset |filter[3]| by 128. The 128 offset will be added back in the loop. + const __m128i coefficients_horizontal = + _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0)); + if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) { + WienerHorizontalTap7(top + (2 - height_extra) * stride - 3, stride, + wiener_stride, height_extra, filter_horizontal[0], + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap7(src - 3, stride, wiener_stride, height, + filter_horizontal[0], coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap7(bottom - 3, stride, wiener_stride, height_extra, + filter_horizontal[0], coefficients_horizontal, + &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) { + WienerHorizontalTap5(top + (2 - height_extra) * stride - 2, stride, + wiener_stride, height_extra, filter_horizontal[1], + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap5(src - 2, stride, wiener_stride, height, + filter_horizontal[1], coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap5(bottom - 2, stride, wiener_stride, height_extra, + filter_horizontal[1], coefficients_horizontal, + &wiener_buffer_horizontal); + } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) { + // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * stride - 1, stride, + wiener_stride, height_extra, filter_horizontal[2], + coefficients_horizontal, &wiener_buffer_horizontal); + WienerHorizontalTap3(src - 1, stride, wiener_stride, height, + filter_horizontal[2], coefficients_horizontal, + &wiener_buffer_horizontal); + WienerHorizontalTap3(bottom - 1, stride, wiener_stride, height_extra, + filter_horizontal[2], coefficients_horizontal, + &wiener_buffer_horizontal); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3); + WienerHorizontalTap1(top + (2 - height_extra) * stride, stride, + wiener_stride, height_extra, + &wiener_buffer_horizontal); + WienerHorizontalTap1(src, stride, wiener_stride, height, + &wiener_buffer_horizontal); + WienerHorizontalTap1(bottom, stride, wiener_stride, height_extra, + &wiener_buffer_horizontal); + } + + // vertical filtering. + // Over-writes up to 15 values. + const int16_t* const filter_vertical = + restoration_info.wiener_info.filter[WienerInfo::kVertical]; + auto* dst = static_cast<uint8_t*>(dest); + if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) { + // Because the top row of |source| is a duplicate of the second row, and the + // bottom row of |source| is a duplicate of the row above it, we can + // duplicate the top and bottom rows of |wiener_buffer| accordingly. + memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride, + sizeof(*wiener_buffer_horizontal) * wiener_stride); + memcpy(restoration_buffer->wiener_buffer, + restoration_buffer->wiener_buffer + wiener_stride, + sizeof(*restoration_buffer->wiener_buffer) * wiener_stride); + WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height, + filter_vertical, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) { + WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride, + height, filter_vertical + 1, dst, stride); + } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) { + WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride, + wiener_stride, height, filter_vertical + 2, dst, stride); + } else { + assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3); + WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride, + wiener_stride, height, dst, stride); + } +} + +//------------------------------------------------------------------------------ +// SGR + +// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for +// Pass 1 and 2 for Pass 2.
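// [Editor's note] A worked instance of the formula above, assuming the
// horizontal loops consume 16 pixels per iteration as they do in this file:
// in the worst case width % 16 == 0, so Pass 1 overreads
// 16 - 0 - 2 * 3 = 10 bytes and Pass 2 overreads 16 - 0 - 2 * 2 = 12 bytes,
// which is exactly what the two constants below encode.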
+constexpr int kOverreadInBytesPass1 = 10; +constexpr int kOverreadInBytesPass2 = 12; + +inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x, + __m128i dst[2]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); +} + +inline void LoadAligned16x2U16Msan(const uint16_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x, + __m128i dst[3]) { + dst[0] = LoadAligned16(src[0] + x); + dst[1] = LoadAligned16(src[1] + x); + dst[2] = LoadAligned16(src[2] + x); +} + +inline void LoadAligned16x3U16Msan(const uint16_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3]) { + dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border)); + dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border)); + dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border)); +} + +inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) { + dst[0] = LoadAligned16(src + 0); + dst[1] = LoadAligned16(src + 4); +} + +inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x, + const ptrdiff_t border, __m128i dst[2]) { + dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border)); + dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border)); +} + +inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x, + __m128i dst[2][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); +} + +inline void LoadAligned32x2U32Msan(const uint32_t* const src[2], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[2][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); +} + +inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x, + __m128i dst[3][2]) { + LoadAligned32U32(src[0] + x, dst[0]); + LoadAligned32U32(src[1] + x, dst[1]); + LoadAligned32U32(src[2] + x, dst[2]); +} + +inline void LoadAligned32x3U32Msan(const uint32_t* const src[3], + const ptrdiff_t x, const ptrdiff_t border, + __m128i dst[3][2]) { + LoadAligned32U32Msan(src[0], x, border, dst[0]); + LoadAligned32U32Msan(src[1], x, border, dst[1]); + LoadAligned32U32Msan(src[2], x, border, dst[2]); +} + +inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 8, src[1]); +} + +inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) { + StoreAligned16(dst + 0, src[0]); + StoreAligned16(dst + 4, src[1]); +} + +inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) { + StoreAligned32U32(dst + 0, src + 0); + StoreAligned32U32(dst + 8, src + 2); +} + +// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following +// functions. Some compilers may generate super inefficient code and the whole +// decoder could be 15% slower. 
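// [Editor's sketch, not from the patch] The unpack idiom the helpers below
// rely on instead of the conversion intrinsics. For the low eight lanes it is
// equivalent to _mm_cvtepu8_epi16(); WidenLo8 is a hypothetical name used
// only for illustration.
//
//   inline __m128i WidenLo8(const __m128i v) {
//     // Interleaving with zero bytes zero-extends u8 -> u16 in lanes 0..7.
//     return _mm_unpacklo_epi8(v, _mm_setzero_si128());
//   }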
+ +inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(s0, s1); +} + +inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(s0, s1); +} + +inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(s0, s1); +} + +inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128()); + return _mm_add_epi16(src0, s1); +} + +inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(src0, s1); +} + +inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) { + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_add_epi32(src0, s1); +} + +inline __m128i VmullNLo8(const __m128i src0, const int src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + return _mm_madd_epi16(s0, _mm_set1_epi32(src1)); +} + +inline __m128i VmullNHi8(const __m128i src0, const int src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + return _mm_madd_epi16(s0, _mm_set1_epi32(src1)); +} + +inline __m128i VmullLo16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m128i VmullHi16(const __m128i src0, const __m128i src1) { + const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128()); + const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128()); + return _mm_madd_epi16(s0, s1); +} + +inline __m128i VrshrS32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srai_epi32(sum, src1); +} + +inline __m128i VrshrU32(const __m128i src0, const int src1) { + const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1))); + return _mm_srli_epi32(sum, src1); +} + +inline __m128i SquareLo8(const __m128i src) { + const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128()); + return _mm_mullo_epi16(s, s); +} + +inline __m128i SquareHi8(const __m128i src) { + const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128()); + return _mm_mullo_epi16(s, s); +} + +inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) { + dst[0] = src; + dst[1] = _mm_srli_si128(src, 1); + dst[2] = _mm_srli_si128(src, 2); +} + +template <int offset> +inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) { + dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0); + dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1); + dst[2] = _mm_alignr_epi8(src[1],
src[0], offset + 2); +} + +inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) { + dst[0] = src[0]; + dst[1] = _mm_alignr_epi8(src[1], src[0], 2); + dst[2] = _mm_alignr_epi8(src[1], src[0], 4); +} + +inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) { + dst[0] = src; + dst[1] = _mm_srli_si128(src, 1); + dst[2] = _mm_srli_si128(src, 2); + dst[3] = _mm_srli_si128(src, 3); + dst[4] = _mm_srli_si128(src, 4); +} + +template <int offset> +inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) { + dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0); + dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1); + dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2); + dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3); + dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4); +} + +inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) { + Prepare3_16(src, dst); + dst[3] = _mm_alignr_epi8(src[1], src[0], 6); + dst[4] = _mm_alignr_epi8(src[1], src[0], 8); +} + +inline __m128i Sum3_16(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi16(src0, src1); + return _mm_add_epi16(sum, src2); +} + +inline __m128i Sum3_16(const __m128i src[3]) { + return Sum3_16(src[0], src[1], src[2]); +} + +inline __m128i Sum3_32(const __m128i src0, const __m128i src1, + const __m128i src2) { + const __m128i sum = _mm_add_epi32(src0, src1); + return _mm_add_epi32(sum, src2); +} + +inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) { + dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]); + dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]); +} + +inline __m128i Sum3WLo16(const __m128i src[3]) { + const __m128i sum = VaddlLo8(src[0], src[1]); + return VaddwLo8(sum, src[2]); +} + +inline __m128i Sum3WHi16(const __m128i src[3]) { + const __m128i sum = VaddlHi8(src[0], src[1]); + return VaddwHi8(sum, src[2]); +} + +inline __m128i Sum3WLo32(const __m128i src[3]) { + const __m128i sum = VaddlLo16(src[0], src[1]); + return VaddwLo16(sum, src[2]); +} + +inline __m128i Sum3WHi32(const __m128i src[3]) { + const __m128i sum = VaddlHi16(src[0], src[1]); + return VaddwHi16(sum, src[2]); +} + +inline __m128i Sum5_16(const __m128i src[5]) { + const __m128i sum01 = _mm_add_epi16(src[0], src[1]); + const __m128i sum23 = _mm_add_epi16(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return _mm_add_epi16(sum, src[4]); +} + +inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1, + const __m128i* const src2, const __m128i* const src3, + const __m128i* const src4) { + const __m128i sum01 = _mm_add_epi32(*src0, *src1); + const __m128i sum23 = _mm_add_epi32(*src2, *src3); + const __m128i sum = _mm_add_epi32(sum01, sum23); + return _mm_add_epi32(sum, *src4); +} + +inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) { + dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]); + dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]); +} + +inline __m128i Sum5WLo16(const __m128i src[5]) { + const __m128i sum01 = VaddlLo8(src[0], src[1]); + const __m128i sum23 = VaddlLo8(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return VaddwLo8(sum, src[4]); +} + +inline __m128i Sum5WHi16(const __m128i src[5]) { + const __m128i sum01 = VaddlHi8(src[0], src[1]); + const __m128i sum23 = VaddlHi8(src[2], src[3]); + const __m128i sum = _mm_add_epi16(sum01, sum23); + return VaddwHi8(sum, src[4]); +} + +inline __m128i Sum3Horizontal(const __m128i src) { + __m128i s[3]; + Prepare3Lo8(src,
s); + return Sum3WLo16(s); +} + +template <int offset> +inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[3]; + Prepare3_8<offset>(src, s); + dst[0] = Sum3WLo16(s); + dst[1] = Sum3WHi16(s); +} + +inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[3]; + Prepare3_16(src, s); + dst[0] = Sum3WLo32(s); + dst[1] = Sum3WHi32(s); +} + +inline __m128i Sum5Horizontal(const __m128i src) { + __m128i s[5]; + Prepare5Lo8(src, s); + return Sum5WLo16(s); +} + +template <int offset> +inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0, + __m128i* const dst1) { + __m128i s[5]; + Prepare5_8<offset>(src, s); + *dst0 = Sum5WLo16(s); + *dst1 = Sum5WHi16(s); +} + +inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) { + __m128i s[5]; + Prepare5_16(src, s); + const __m128i sum01_lo = VaddlLo16(s[0], s[1]); + const __m128i sum23_lo = VaddlLo16(s[2], s[3]); + const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo); + dst[0] = VaddwLo16(sum0123_lo, s[4]); + const __m128i sum01_hi = VaddlHi16(s[0], s[1]); + const __m128i sum23_hi = VaddlHi16(s[2], s[3]); + const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi); + dst[1] = VaddwHi16(sum0123_hi, s[4]); +} + +void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = VaddlLo16(src[0], src[4]); + *row_sq3 = Sum3WLo32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3, + __m128i* const row_sq5) { + const __m128i sum04 = VaddlHi16(src[0], src[4]); + *row_sq3 = Sum3WHi32(src + 1); + *row_sq5 = _mm_add_epi32(sum04, *row_sq3); +} + +void SumHorizontalLo(const __m128i src, __m128i* const row3, + __m128i* const row5) { + __m128i s[5]; + Prepare5Lo8(src, s); + const __m128i sum04 = VaddlLo8(s[0], s[4]); + *row3 = Sum3WLo16(s + 1); + *row5 = _mm_add_epi16(sum04, *row3); +} + +template <int offset> +void SumHorizontal(const __m128i src[2], __m128i* const row3_0, + __m128i* const row3_1, __m128i* const row5_0, + __m128i* const row5_1) { + __m128i s[5]; + Prepare5_8<offset>(src, s); + const __m128i sum04_lo = VaddlLo8(s[0], s[4]); + const __m128i sum04_hi = VaddlHi8(s[0], s[4]); + *row3_0 = Sum3WLo16(s + 1); + *row3_1 = Sum3WHi16(s + 1); + *row5_0 = _mm_add_epi16(sum04_lo, *row3_0); + *row5_1 = _mm_add_epi16(sum04_hi, *row3_1); +} + +inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0, + __m128i* const row_sq3_1, __m128i* const row_sq5_0, + __m128i* const row_sq5_1) { + __m128i s[5]; + Prepare5_16(src, s); + SumHorizontalLo(s, row_sq3_0, row_sq5_0); + SumHorizontalHi(s, row_sq3_1, row_sq5_1); +} + +inline __m128i Sum343Lo(const __m128i ma3[3]) { + const __m128i sum = Sum3WLo16(ma3); + const __m128i sum3 = Sum3_16(sum, sum, sum); + return VaddwLo8(sum3, ma3[1]); +} + +inline __m128i Sum343Hi(const __m128i ma3[3]) { + const __m128i sum = Sum3WHi16(ma3); + const __m128i sum3 = Sum3_16(sum, sum, sum); + return VaddwHi8(sum3, ma3[1]); +} + +inline __m128i Sum343WLo(const __m128i src[3]) { + const __m128i sum = Sum3WLo32(src); + const __m128i sum3 = Sum3_32(sum, sum, sum); + return VaddwLo16(sum3, src[1]); +} + +inline __m128i Sum343WHi(const __m128i src[3]) { + const __m128i sum = Sum3WHi32(src); + const __m128i sum3 = Sum3_32(sum, sum, sum); + return VaddwHi16(sum3, src[1]); +} + +inline void Sum343W(const __m128i src[2], __m128i dst[2]) { + __m128i s[3]; + Prepare3_16(src, s); + dst[0] = Sum343WLo(s); + dst[1] = Sum343WHi(s); +} + +inline __m128i Sum565Lo(const __m128i src[3]) { + const __m128i
sum = Sum3WLo16(src); + const __m128i sum4 = _mm_slli_epi16(sum, 2); + const __m128i sum5 = _mm_add_epi16(sum4, sum); + return VaddwLo8(sum5, src[1]); +} + +inline __m128i Sum565Hi(const __m128i src[3]) { + const __m128i sum = Sum3WHi16(src); + const __m128i sum4 = _mm_slli_epi16(sum, 2); + const __m128i sum5 = _mm_add_epi16(sum4, sum); + return VaddwHi8(sum5, src[1]); +} + +inline __m128i Sum565WLo(const __m128i src[3]) { + const __m128i sum = Sum3WLo32(src); + const __m128i sum4 = _mm_slli_epi32(sum, 2); + const __m128i sum5 = _mm_add_epi32(sum4, sum); + return VaddwLo16(sum5, src[1]); +} + +inline __m128i Sum565WHi(const __m128i src[3]) { + const __m128i sum = Sum3WHi32(src); + const __m128i sum4 = _mm_slli_epi32(sum, 2); + const __m128i sum5 = _mm_add_epi32(sum4, sum); + return VaddwHi16(sum5, src[1]); +} + +inline void Sum565W(const __m128i src[2], __m128i dst[2]) { + __m128i s[3]; + Prepare3_16(src, s); + dst[0] = Sum565WLo(s); + dst[1] = Sum565WHi(s); +} + +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5, + uint32_t* square_sum3, uint32_t* square_sum5) { + int y = 2; + do { + __m128i s[2], sq[3]; + s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width); + sq[0] = SquareLo8(s[0]); + ptrdiff_t x = sum_width; + do { + __m128i row3[2], row5[2], row_sq3[2], row_sq5[2]; + x -= 16; + src += 16; + s[1] = LoadUnaligned16Msan(src, + sum_width - x + kOverreadInBytesPass1 - width); + sq[1] = SquareHi8(s[0]); + sq[2] = SquareLo8(s[1]); + SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]); + StoreAligned32U16(sum3, row3); + StoreAligned32U16(sum5, row5); + SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]); + StoreAligned32U32(square_sum3 + 0, row_sq3); + StoreAligned32U32(square_sum5 + 0, row_sq5); + SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]); + StoreAligned32U32(square_sum3 + 8, row_sq3); + StoreAligned32U32(square_sum5 + 8, row_sq5); + s[0] = s[1]; + sq[0] = sq[2]; + sum3 += 16; + sum5 += 16; + square_sum3 += 16; + square_sum5 += 16; + } while (x != 0); + src += src_stride - sum_width; + sum3 += sum_stride - sum_width; + sum5 += sum_stride - sum_width; + square_sum3 += sum_stride - sum_width; + square_sum5 += sum_stride - sum_width; + } while (--y != 0); +} + +template <int size> +inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride, + const ptrdiff_t width, const ptrdiff_t sum_stride, + const ptrdiff_t sum_width, uint16_t* sums, + uint32_t* square_sums) { + static_assert(size == 3 || size == 5, ""); + constexpr int kOverreadInBytes = + (size == 5) ?
kOverreadInBytesPass1 : kOverreadInBytesPass2; + int y = 2; + do { + __m128i s[2], sq[3]; + s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width); + sq[0] = SquareLo8(s[0]); + ptrdiff_t x = sum_width; + do { + __m128i row[2], row_sq[4]; + x -= 16; + src += 16; + s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width); + sq[1] = SquareHi8(s[0]); + sq[2] = SquareLo8(s[1]); + if (size == 3) { + Sum3Horizontal<0>(s, row); + Sum3WHorizontal(sq + 0, row_sq + 0); + Sum3WHorizontal(sq + 1, row_sq + 2); + } else { + Sum5Horizontal<0>(s, &row[0], &row[1]); + Sum5WHorizontal(sq + 0, row_sq + 0); + Sum5WHorizontal(sq + 1, row_sq + 2); + } + StoreAligned32U16(sums, row); + StoreAligned64U32(square_sums, row_sq); + s[0] = s[1]; + sq[0] = sq[2]; + sums += 16; + square_sums += 16; + } while (x != 0); + src += src_stride - sum_width; + sums += sum_stride - sum_width; + square_sums += sum_stride - sum_width; + } while (--y != 0); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq, + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + // a = |sum_sq| + // d = |sum| + // p = (a * n < d * d) ? 0 : a * n - d * d; + const __m128i dxd = _mm_madd_epi16(sum, sum); + // _mm_mullo_epi32() has high latency. Using shifts and additions instead. + // Some compilers could do this for us but we make this explicit. + // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n)); + __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3)); + if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4)); + const __m128i sub = _mm_sub_epi32(axn, dxd); + const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128()); + const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale)); + return VrshrU32(pxs, kSgrProjScaleBits); +} + +template <int n> +inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2], + const uint32_t scale) { + static_assert(n == 9 || n == 25, ""); + const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128()); + const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128()); + const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale); + const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale); + return _mm_packus_epi32(z0, z1); +} + +template <int n> +inline __m128i CalculateB(const __m128i sum, const __m128i ma) { + static_assert(n == 9 || n == 25, ""); + constexpr uint32_t one_over_n = + ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n; + const __m128i m0 = VmullLo16(ma, sum); + const __m128i m1 = VmullHi16(ma, sum); + const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n)); + const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n)); + const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits); + const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits); + return _mm_packus_epi32(b_lo, b_hi); +} + +inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum5_16(s5); + Sum5_32(sq5, sum_sq); + *index = CalculateMa<25>(*sum, sum_sq, scale); +} + +inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const sum, + __m128i* const index) { + __m128i sum_sq[2]; + *sum = Sum3_16(s3); + Sum3_32(sq3, sum_sq); + *index = CalculateMa<9>(*sum, sum_sq, scale); +} + +template <int n, int offset> +inline void LookupIntermediate(const __m128i sum, const __m128i index, + __m128i* const ma, __m128i* const b) { + static_assert(n == 9 || n == 25, ""); + static_assert(offset == 0
|| offset == 8, ""); + const __m128i idx = _mm_packus_epi16(index, index); + // The data is not actually stored and reloaded: the compiler keeps |temp| in + // a 64-bit general-purpose register. This is faster than using + // _mm_extract_epi8(). + uint8_t temp[8]; + StoreLo8(temp, idx); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6); + *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7); + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + __m128i maq; + if (offset == 0) { + maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + } else { + maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + } + *b = CalculateB<n>(sum, maq); +} + +// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b +// to get value 0 as the shuffle result. The most significant bit 1 comes +// either from the comparison instruction, or from the sign bit of the index. +inline __m128i ShuffleIndex(const __m128i table, const __m128i index) { + __m128i mask; + mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15)); + mask = _mm_or_si128(mask, index); + return _mm_shuffle_epi8(table, mask); +} + +inline __m128i AdjustValue(const __m128i value, const __m128i index, + const int threshold) { + const __m128i thresholds = _mm_set1_epi8(threshold - 128); + const __m128i offset = _mm_cmpgt_epi8(index, thresholds); + return _mm_add_epi8(value, offset); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i* const ma, __m128i* const b0, + __m128i* const b1) { + // Use table lookup to read elements whose indices are less than 48. + const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16); + const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16); + const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16); + const __m128i indices = _mm_packus_epi16(index[0], index[1]); + __m128i idx; + // Clip idx to 127 to apply signed comparison instructions. + idx = _mm_min_epu8(indices, _mm_set1_epi8(127)); + // All elements whose indices are less than 48 are set to 0. + // Get shuffle results for indices in range [0, 15]. + *ma = ShuffleIndex(c0, idx); + // Get shuffle results for indices in range [16, 31]. + // Subtract 16 to utilize the sign bit of the index. + idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res1 = ShuffleIndex(c1, idx); + // Use OR instruction to combine shuffle results together. + *ma = _mm_or_si128(*ma, res1); + // Get shuffle results for indices in range [32, 47]. + // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16)); + const __m128i res2 = ShuffleIndex(c2, idx); + *ma = _mm_or_si128(*ma, res2); + + // For elements whose indices are larger than 47, since their values seldom + // change as the index increases, we use comparison and arithmetic operations + // to calculate them. + // Add -128 to apply signed comparison instructions. + idx = _mm_add_epi8(indices, _mm_set1_epi8(-128)); + // Elements whose indices are larger than 47 (so far with value 0) are set + // to 5. + *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5)); + *ma = AdjustValue(*ma, idx, 55); // 55 is the last index whose value is 5. + *ma = AdjustValue(*ma, idx, 72); // 72 is the last index whose value is 4. + *ma = AdjustValue(*ma, idx, 101); // 101 is the last index whose value is 3. + *ma = AdjustValue(*ma, idx, 169); // 169 is the last index whose value is 2. + *ma = AdjustValue(*ma, idx, 254); // 254 is the last index whose value is 1. + + // b = ma * b * one_over_n + // |ma| = [0, 255] + // |sum| is a box sum with radius 1 or 2. + // For the first pass radius is 2. Maximum value is 5x5x255 = 6375. + // For the second pass radius is 1. Maximum value is 3x3x255 = 2295. + // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n + // When radius is 2 |n| is 25. |one_over_n| is 164. + // When radius is 1 |n| is 9. |one_over_n| is 455. + // |kSgrProjReciprocalBits| is 12. + // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits). + // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits). + const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128()); + *b0 = CalculateB<9>(sum[0], maq0); + const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128()); + *b1 = CalculateB<9>(sum[1], maq1); +} + +inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2], + __m128i ma[2], __m128i b[2]) { + __m128i mas; + CalculateIntermediate(sum, index, &mas, &b[0], &b[1]); + ma[0] = _mm_unpacklo_epi64(ma[0], mas); + ma[1] = _mm_srli_si128(mas, 8); +} + +// Note: Calling CalculateIntermediate() in place of the slow +// LookupIntermediate() when calculating 16 intermediate data points has been +// tried; however, the compiler generates even slower code.
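// [Editor's worked example, not part of the patch] The threshold chain above
// works because _mm_cmpgt_epi8() writes 0xFF (i.e. -1) to each lane where the
// offset index exceeds the offset threshold, so every crossed threshold
// decrements the value by one. For index 100: start at 5; 100 > 55 gives 4;
// 100 > 72 gives 3; 100 > 101 fails, as do the later thresholds; so ma = 3,
// consistent with "101 is the last index whose value is 3" above.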
+template <int offset> +inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2], + const uint32_t scale, __m128i* const ma, + __m128i* const b) { + static_assert(offset == 0 || offset == 8, ""); + __m128i sum, index; + CalculateSumAndIndex5(s5, sq5, scale, &sum, &index); + LookupIntermediate<25, offset>(sum, index, ma, b); +} + +inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2], + const uint32_t scale, __m128i* const ma, + __m128i* const b) { + __m128i sum, index; + CalculateSumAndIndex3(s3, sq3, scale, &sum, &index); + LookupIntermediate<9, 0>(sum, index, ma, b); +} + +inline void Store343_444(const __m128i b3[2], const ptrdiff_t x, + __m128i sum_b343[2], __m128i sum_b444[2], + uint32_t* const b343, uint32_t* const b444) { + __m128i b[3], sum_b111[2]; + Prepare3_16(b3, b); + sum_b111[0] = Sum3WLo32(b); + sum_b111[1] = Sum3WHi32(b); + sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2); + sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2); + StoreAligned32U32(b444 + x, sum_b444); + sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]); + sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]); + sum_b343[0] = VaddwLo16(sum_b343[0], b[1]); + sum_b343[1] = VaddwHi16(sum_b343[1], b[1]); + StoreAligned32U32(b343 + x, sum_b343); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i* const sum_ma444, __m128i sum_b343[2], + __m128i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m128i sum_ma111 = Sum3WLo16(ma3); + *sum_ma444 = _mm_slli_epi16(sum_ma111, 2); + StoreAligned16(ma444 + x, *sum_ma444); + const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwLo8(sum333, ma3[1]); + StoreAligned16(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i* const sum_ma444, __m128i sum_b343[2], + __m128i sum_b444[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + const __m128i sum_ma111 = Sum3WHi16(ma3); + *sum_ma444 = _mm_slli_epi16(sum_ma111, 2); + StoreAligned16(ma444 + x, *sum_ma444); + const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111); + *sum_ma343 = VaddwHi8(sum333, ma3[1]); + StoreAligned16(ma343 + x, *sum_ma343); + Store343_444(b3, x, sum_b343, sum_b444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, __m128i* const sum_ma343, + __m128i sum_b343[2], uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma444, sum_b444[2]; + Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343, + ma444, b343, b444); +} + +inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343,
ma444, b343, b444); +} + +inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2], + const ptrdiff_t x, uint16_t* const ma343, + uint16_t* const ma444, uint32_t* const b343, + uint32_t* const b444) { + __m128i sum_ma343, sum_b343[2]; + Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo( + const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma, + __m128i* const b) { + __m128i s5[2][5], sq5[5][2]; + sq[0][1] = SquareHi8(s[0][0]); + sq[1][1] = SquareHi8(s[1][0]); + s5[0][3] = Sum5Horizontal(s[0][0]); + StoreAligned16(sum5[3], s5[0][3]); + s5[0][4] = Sum5Horizontal(s[1][0]); + StoreAligned16(sum5[4], s5[0][4]); + Sum5WHorizontal(sq[0], sq5[3]); + StoreAligned32U32(square_sum5[3], sq5[3]); + Sum5WHorizontal(sq[1], sq5[4]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x3U16(sum5, 0, s5[0]); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5( + const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, uint16_t* const sum5[5], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2], + __m128i b[3]) { + __m128i s5[2][5], sq5[5][2]; + sq[0][2] = SquareLo8(s[0][1]); + sq[1][2] = SquareLo8(s[1][1]); + Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + Sum5WHorizontal(sq[0] + 1, sq5[3]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + Sum5WHorizontal(sq[1] + 1, sq5[4]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]); + + sq[0][3] = SquareHi8(s[0][1]); + sq[1][3] = SquareHi8(s[1][1]); + Sum5WHorizontal(sq[0] + 2, sq5[3]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + Sum5WHorizontal(sq[1] + 2, sq5[4]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo( + const __m128i s, const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma, + __m128i* const b) { + __m128i s5[5], sq5[5][2]; + sq[1] = SquareHi8(s); + s5[3] = s5[4] = Sum5Horizontal(s); + Sum5WHorizontal(sq, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateIntermediate5<0>(s5, sq5, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow( + const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x, + const uint32_t scale, const uint16_t* const sum5[5], + const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2], + __m128i b[3]) { + __m128i s5[2][5], sq5[5][2]; + sq[2] = SquareLo8(s[1]); + Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]); + s5[0][4] = s5[0][3]; + s5[1][4] = s5[1][3]; + Sum5WHorizontal(sq + 1, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16(sum5, x, s5[0]); + 
LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]); + + sq[3] = SquareHi8(s[1]); + Sum5WHorizontal(sq + 2, sq5[3]); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo( + const __m128i s, const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma, + __m128i* const b) { + __m128i s3[3], sq3[3][2]; + sq[1] = SquareHi8(s); + s3[2] = Sum3Horizontal(s); + StoreAligned16(sum3[2], s3[2]); + Sum3WHorizontal(sq, sq3[2]); + StoreAligned32U32(square_sum3[2], sq3[2]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scale, ma, b); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3( + const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width, + const uint32_t scale, uint16_t* const sum3[3], + uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2], + __m128i b[3]) { + __m128i s3[4], sq3[3][2], sum[2], index[2]; + sq[2] = SquareLo8(s[1]); + Sum3Horizontal<8>(s, s3 + 2); + StoreAligned32U16(sum3[2] + x, s3 + 2); + Sum3WHorizontal(sq + 1, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]); + LoadAligned16x2U16(sum3, x, s3); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]); + + sq[3] = SquareHi8(s[1]); + Sum3WHorizontal(sq + 2, sq3[2]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]); + CalculateIntermediate(sum, index, ma, b + 1); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo( + const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4], + uint16_t* const sum5[5], uint32_t* const square_sum3[4], + uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2], + __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) { + __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2]; + sq[0][1] = SquareHi8(s[0][0]); + sq[1][1] = SquareHi8(s[1][0]); + SumHorizontalLo(s[0][0], &s3[2], &s5[3]); + SumHorizontalLo(s[1][0], &s3[3], &s5[4]); + StoreAligned16(sum3[2], s3[2]); + StoreAligned16(sum3[3], s3[3]); + StoreAligned16(sum5[3], s5[3]); + StoreAligned16(sum5[4], s5[4]); + SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2], sq3[2]); + StoreAligned32U32(square_sum5[3], sq5[3]); + SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3], sq3[3]); + StoreAligned32U32(square_sum5[4], sq5[4]); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + LoadAligned16x3U16(sum5, 0, s5); + LoadAligned32x3U32(square_sum5, 0, sq5); + CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]); + CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]); + CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]); + ma3[1][0] = _mm_srli_si128(ma3[0][0], 8); + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess( + const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2], + uint16_t* const sum3[4], uint16_t* const 
sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2], + __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) { + __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2]; + SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]); + StoreAligned16(sum3[2] + x + 0, s3[0][2]); + StoreAligned16(sum3[2] + x + 8, s3[1][2]); + StoreAligned16(sum5[3] + x + 0, s5[0][3]); + StoreAligned16(sum5[3] + x + 8, s5[1][3]); + SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]); + StoreAligned16(sum3[3] + x + 0, s3[0][3]); + StoreAligned16(sum3[3] + x + 8, s3[1][3]); + StoreAligned16(sum5[4] + x + 0, s5[0][4]); + StoreAligned16(sum5[4] + x + 8, s5[1][4]); + sq[0][2] = SquareLo8(s[0][1]); + sq[1][2] = SquareLo8(s[1][1]); + SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x, sq3[2]); + StoreAligned32U32(square_sum5[3] + x, sq5[3]); + SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x, sq3[3]); + StoreAligned32U32(square_sum5[4] + x, sq5[4]); + LoadAligned16x2U16(sum3, x, s3[0]); + LoadAligned32x2U32(square_sum3, x, sq3); + CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]); + CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0], + &index[1][0]); + LoadAligned16x3U16(sum5, x, s5[0]); + LoadAligned32x3U32(square_sum5, x, sq5); + CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]); + + sq[0][3] = SquareHi8(s[0][1]); + sq[1][3] = SquareHi8(s[1][1]); + SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]); + StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]); + SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]); + StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]); + StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]); + LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]); + LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3); + CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]); + CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1], + &index[1][1]); + CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1); + CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1); + LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]); + LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5); + CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo( + const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4], + const uint16_t* const sum5[5], const uint32_t* const square_sum3[4], + const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3, + __m128i* const ma5, __m128i* const b3, __m128i* const b5) { + __m128i s3[3], s5[5], sq3[3][2], sq5[5][2]; + sq[1] = SquareHi8(s); + SumHorizontalLo(s, &s3[2], &s5[3]); + SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]); + LoadAligned16x3U16(sum5, 0, s5); + s5[4] = s5[3]; + LoadAligned32x3U32(square_sum5, 0, sq5); + sq5[4][0] = sq5[3][0]; + sq5[4][1] = sq5[3][1]; + CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5); + LoadAligned16x2U16(sum3, 0, s3); + LoadAligned32x2U32(square_sum3, 0, sq3); + CalculateIntermediate3(s3, sq3, scales[1], ma3, b3); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow( + const __m128i s[2], const ptrdiff_t 
sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+    __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+  sq[2] = SquareLo8(s[1]);
+  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  sq[3] = SquareHi8(s[1]);
+  SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma5[3], ma[2], b[4];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    s[1] = LoadUnaligned16Msan(src + x + 16,
+                               x + 16 + kOverreadInBytesPass2 - width);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      __m128i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[2], uint16_t* ma565, uint32_t* const b343[4],
+    uint32_t* const b444[2], uint32_t* b565) {
+  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], b[4], ma3x[3], ma5x[3];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343W(b3[0] + 0, b + 0);
+    Sum343W(b3[0] + 1, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565W(b5 + 0, b + 0);
+    Sum565W(b5 + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444[0], b343[1],
+                   b444[0]);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m128i v = _mm_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+                                       const __m128i b[2]) {
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
+  const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src, __m128i ma[2],
+                                            __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+  __m128i b_sum[2];
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src, __m128i ma[3],
+                                            __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
+  __m128i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+  const __m128i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+  return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
+  __m128i v[2];
+  const __m128i w0_w2 =
+      _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+  const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m128i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma3[3], b[2][2], sr[2], p[2];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma3);
+    ma[1] = Sum565Lo(ma3);
+    StoreAligned16(ma565[1] + x, ma[1]);
+    Sum565W(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    sr[0] = LoadAligned16(src + x);
+    sr[1] = LoadAligned16(src + stride + x);
+    const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma3);
+    StoreAligned16(ma565[1] + x + 8, ma[1]);
+    Sum565W(bs + 1, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] =
CalculateFilteredOutputPass1(sr0_hi, ma, b); + p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0); + StoreAligned16(dst + x, _mm_packus_epi16(d00, d01)); + const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0); + StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; + } while (x < width); +} + +inline void BoxFilterPass1LastRow( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565, + uint32_t* b565, uint8_t* const dst) { + __m128i s[2], mas[2], sq[4], bs[3]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); + sq[0] = SquareLo8(s[0]); + BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0], + &bs[0]); + + int x = 0; + do { + __m128i ma[2], ma5[3], b[2][2]; + s[1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5, + sq, mas, bs); + Prepare3_8<0>(mas, ma5); + ma[1] = Sum565Lo(ma5); + Sum565W(bs, b[1]); + ma[0] = LoadAligned16(ma565); + LoadAligned32U32(b565, b[0]); + const __m128i sr = LoadAligned16(src + x); + const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); + __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0); + + ma[1] = Sum565Hi(ma5); + Sum565W(bs + 1, b[1]); + ma[0] = LoadAligned16(ma565 + 8); + LoadAligned32U32(b565 + 8, b[0]); + const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); + p = CalculateFilteredOutputPass1(sr_hi, ma, b); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + ma565 += 16; + b565 += 16; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0, + uint16_t* const sum3[3], uint32_t* const square_sum3[3], + uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3], + uint32_t* const b444[2], uint8_t* const dst) { + __m128i s[2], mas[2], sq[4], bs[3]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width); + sq[0] = SquareLo8(s[0]); + BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]); + + int x = 0; + do { + s[1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass2 - width); + BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas, + bs); + __m128i ma[3], b[3][2], ma3[3]; + Prepare3_8<0>(mas, ma3); + Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2], + b444[1]); + const __m128i sr = LoadAligned16(src + x); + const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma343[0] + x); + ma[1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[0]); + LoadAligned32U32(b444[0] + x, b[1]); + const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b); + + Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1], + b343[2], b444[1]); + const __m128i sr_hi = _mm_unpackhi_epi8(sr, 
_mm_setzero_si128()); + ma[0] = LoadAligned16(ma343[0] + x + 8); + ma[1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[0]); + LoadAligned32U32(b444[0] + x + 8, b[1]); + const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b); + const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0); + const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0); + StoreAligned16(dst + x, _mm_packus_epi16(d0, d1)); + s[0] = s[1]; + sq[1] = sq[3]; + mas[0] = mas[1]; + bs[0] = bs[2]; + x += 16; + } while (x < width); +} + +LIBGAV1_ALWAYS_INLINE void BoxFilter( + const uint8_t* const src, const uint8_t* const src0, + const uint8_t* const src1, const ptrdiff_t stride, const int width, + const uint16_t scales[2], const int16_t w0, const int16_t w2, + uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + const ptrdiff_t sum_width, uint16_t* const ma343[4], + uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4], + uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) { + __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3]; + s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); + s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width); + sq[0][0] = SquareLo8(s[0][0]); + sq[1][0] = SquareLo8(s[1][0]); + BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, + ma3, b3, &ma5[0], &b5[0]); + + int x = 0; + do { + __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3]; + s[0][1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + s[1][1] = LoadUnaligned16Msan(src1 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5, + sum_width, sq, ma3, b3, ma5, b5); + Prepare3_8<0>(ma3[0], ma3x[0]); + Prepare3_8<0>(ma3[1], ma3x[1]); + Prepare3_8<0>(ma5, ma5x); + Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1], + ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2], + b343[3], b444[2]); + ma[0][1] = Sum565Lo(ma5x); + StoreAligned16(ma565[1] + x, ma[0][1]); + Sum565W(b5, b[0][1]); + StoreAligned32U32(b565[1] + x, b[0][1]); + const __m128i sr0 = LoadAligned16(src + x); + const __m128i sr1 = LoadAligned16(src + stride + x); + const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128()); + const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128()); + ma[0][0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x); + ma[1][1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[1][0]); + LoadAligned32U32(b444[0] + x, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]); + const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2); + ma[2][0] = LoadAligned16(ma343[1] + x); + LoadAligned32U32(b343[1] + x, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]); + const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2); + + Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2], + b[2][1], ma343[2], ma444[1], b343[2], b444[1]); + Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3], + ma444[2], b343[3], b444[2]); + ma[0][1] = Sum565Hi(ma5x); + 
StoreAligned16(ma565[1] + x + 8, ma[0][1]); + Sum565W(b5 + 1, b[0][1]); + StoreAligned32U32(b565[1] + x + 8, b[0][1]); + const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128()); + const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128()); + ma[0][0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0][0]); + p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]); + p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]); + ma[1][0] = LoadAligned16(ma343[0] + x + 8); + ma[1][1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, b[1][0]); + LoadAligned32U32(b444[0] + x + 8, b[1][1]); + p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]); + const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2); + StoreAligned16(dst + x, _mm_packus_epi16(d00, d01)); + ma[2][0] = LoadAligned16(ma343[1] + x + 8); + LoadAligned32U32(b343[1] + x + 8, b[2][0]); + p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]); + const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2); + StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11)); + s[0][0] = s[0][1]; + s[1][0] = s[1][1]; + sq[0][1] = sq[0][3]; + sq[1][1] = sq[1][3]; + ma3[0][0] = ma3[0][1]; + ma3[1][0] = ma3[1][1]; + ma5[0] = ma5[1]; + b3[0][0] = b3[0][2]; + b3[1][0] = b3[1][2]; + b5[0] = b5[2]; + x += 16; + } while (x < width); +} + +inline void BoxFilterLastRow( + const uint8_t* const src, const uint8_t* const src0, const int width, + const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0, + const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5], + uint32_t* const square_sum3[4], uint32_t* const square_sum5[5], + uint16_t* const ma343[4], uint16_t* const ma444[3], + uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3], + uint32_t* const b565[2], uint8_t* const dst) { + __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2]; + s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width); + sq[0] = SquareLo8(s[0]); + BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3, + square_sum5, sq, &ma3[0], &ma5[0], &b3[0], + &b5[0]); + + int x = 0; + do { + __m128i ma3x[3], ma5x[3], p[2]; + s[1] = LoadUnaligned16Msan(src0 + x + 16, + x + 16 + kOverreadInBytesPass1 - width); + BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5, + square_sum3, square_sum5, sq, ma3, ma5, b3, b5); + Prepare3_8<0>(ma3, ma3x); + Prepare3_8<0>(ma5, ma5x); + ma[1] = Sum565Lo(ma5x); + Sum565W(b5, b[1]); + ma[2] = Sum343Lo(ma3x); + Sum343W(b3, b[2]); + const __m128i sr = LoadAligned16(src + x); + const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma565[0] + x); + LoadAligned32U32(b565[0] + x, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b); + ma[0] = LoadAligned16(ma343[0] + x); + ma[1] = LoadAligned16(ma444[0] + x); + LoadAligned32U32(b343[0] + x, b[0]); + LoadAligned32U32(b444[0] + x, b[1]); + p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b); + const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2); + + ma[1] = Sum565Hi(ma5x); + Sum565W(b5 + 1, b[1]); + ma[2] = Sum343Hi(ma3x); + Sum343W(b3 + 1, b[2]); + const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128()); + ma[0] = LoadAligned16(ma565[0] + x + 8); + LoadAligned32U32(b565[0] + x + 8, b[0]); + p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b); + ma[0] = LoadAligned16(ma343[0] + x + 8); + ma[1] = LoadAligned16(ma444[0] + x + 8); + LoadAligned32U32(b343[0] + x + 8, 
b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[2];
+    b5[0] = b5[2];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const uint8_t* const top_border, const uint8_t* bottom_border,
+    const ptrdiff_t stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, stride, width, sum_stride, sum_width, sum3[0], sum5[1],
+         square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
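+  // When |height| is 1 there is no second input row; the first bottom border
+  // row stands in for it.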
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444, ma565[0], b343,
+                         b444, b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + stride, width, sum_width, scales,
+                     w0, w2, sum3, sum5, square_sum3, square_sum5, ma343, ma444,
+                     ma565, b343, b444, b565, dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src,
+                                  const uint8_t* const top_border,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t stride, const int width,
+                                  const int height, SgrBuffer* const sgr_buffer,
+                                  uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, stride, width, sum_stride, sum_width, sum5[1],
+            square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + stride, width, sum_width, scale,
+                          w0, sum5, square_sum5, ma565[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src,
+                                  const uint8_t* const top_border,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t stride, const int width,
+                                  const int height, SgrBuffer* const sgr_buffer,
+                                  uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
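+  // |kSgrProjPrecisionBits| is 7, so |w0| and |w1| sum to 128 here (e.g.
+  // w1 = 47 gives w0 = 81).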
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, stride, width, sum_stride, sum_width, sum3[0],
+            square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 8, up to 7 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
+// part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const void* const top_border, const void* const bottom_border,
+    const ptrdiff_t stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, top - 3, bottom - 3,
+                          stride, width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, top - 2, bottom - 2,
+                          stride, width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, top - 3, bottom - 3, stride,
+                     width, height, sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else  // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/loop_restoration_sse4.h b/src/dsp/x86/loop_restoration_sse4.h
new file mode 100644
index 0000000..65b2b11
--- /dev/null
+++ b/src/dsp/x86/loop_restoration_sse4.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/src/dsp/x86/mask_blend_sse4.cc b/src/dsp/x86/mask_blend_sse4.cc
new file mode 100644
index 0000000..d8036be
--- /dev/null
+++ b/src/dsp/x86/mask_blend_sse4.cc
@@ -0,0 +1,447 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/mask_blend.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+// Width can only be 4 when it is subsampled from a block of width 8, hence
+// subsampling_x is always 1 when this function is called.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
+  if (subsampling_x == 1) {
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+    const __m128i mask_val_1 =
+        _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+    if (subsampling_y == 1) {
+      const __m128i next_mask_val_0 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride));
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + mask_stride * 3));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+  }
+  const __m128i mask_val_0 = Load4(mask);
+  const __m128i mask_val_1 = Load4(mask + mask_stride);
+  return _mm_cvtepu8_epi16(
+      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+// This function returns a 16-bit packed mask to fit in _mm_madd_epi16.
+// 16-bit is also the lowest packing for hadd, but without subsampling an
+// unfortunate conversion is required.
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+    if (subsampling_y == 1) {
+      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    return RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+// This version returns 8-bit packed values to fit in _mm_maddubs_epi16 because,
+// when is_inter_intra is true, the prediction values are brought to 8-bit
+// packing as well.
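+// _mm_maddubs_epi16 treats its first operand as unsigned bytes and its second
+// as signed bytes; both |mask| and 64 - |mask| lie in [0, 64], so the signed
+// interpretation of the interleaved mask bytes is harmless.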
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetInterIntraMask8(const uint8_t* mask, ptrdiff_t stride) {
+  if (subsampling_x == 1) {
+    const __m128i row_vals = LoadUnaligned16(mask);
+
+    const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+    const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+    __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+
+    if (subsampling_y == 1) {
+      const __m128i next_row_vals = LoadUnaligned16(mask + stride);
+      const __m128i next_mask_val_0 = _mm_cvtepu8_epi16(next_row_vals);
+      const __m128i next_mask_val_1 =
+          _mm_cvtepu8_epi16(_mm_srli_si128(next_row_vals, 8));
+      subsampled_mask = _mm_add_epi16(
+          subsampled_mask, _mm_hadd_epi16(next_mask_val_0, next_mask_val_1));
+    }
+    const __m128i ret =
+        RightShiftWithRounding_U16(subsampled_mask, 1 + subsampling_y);
+    return _mm_packus_epi16(ret, ret);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  // Unfortunately there is no shift operation for 8-bit packing, or else we
+  // could return everything with 8-bit packing.
+  const __m128i mask_val = LoadLo8(mask);
+  return mask_val;
+}
+
+inline void WriteMaskBlendLine4x2(const int16_t* const pred_0,
+                                  const int16_t* const pred_1,
+                                  const __m128i pred_mask_0,
+                                  const __m128i pred_mask_1, uint8_t* dst,
+                                  const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadAligned16(pred_0);
+  const __m128i pred_val_1 = LoadAligned16(pred_1);
+  const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+  const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+  const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+  const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+  // int res = (mask_value * prediction_0[x] +
+  //            (64 - mask_value) * prediction_1[x]) >> 6;
+  const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+  const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+  const __m128i compound_pred = _mm_packus_epi32(
+      _mm_srli_epi32(compound_pred_lo, 6), _mm_srli_epi32(compound_pred_hi, 6));
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth8) - 1));
+  const __m128i result = RightShiftWithRounding_S16(compound_pred, 4);
+  const __m128i res = _mm_packus_epi16(result, result);
+  Store4(dst, res);
+  Store4(dst + dst_stride, _mm_srli_si128(res, 4));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4x4_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+                                 const uint8_t* mask,
+                                 const ptrdiff_t mask_stride, uint8_t* dst,
+                                 const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  __m128i pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += 4 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                        dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending4xH_SSE4(const int16_t* pred_0, const int16_t* pred_1,
+                                 const uint8_t* const mask_ptr,
+                                 const ptrdiff_t mask_stride, const int height,
+                                 uint8_t* dst, const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlending4x4_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(64);
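+  // Each iteration below emits 8 rows as four 4x2 blocks.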
+  int y = 0;
+  do {
+    __m128i pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+
+    pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
+                          dst_stride);
+    pred_0 += 4 << 1;
+    pred_1 += 4 << 1;
+    mask += mask_stride << (1 + subsampling_y);
+    dst += dst_stride << 1;
+    y += 8;
+  } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend_SSE4(const void* prediction_0, const void* prediction_1,
+                           const ptrdiff_t /*prediction_stride_1*/,
+                           const uint8_t* const mask_ptr,
+                           const ptrdiff_t mask_stride, const int width,
+                           const int height, void* dest,
+                           const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = width;
+  if (width == 4) {
+    MaskBlending4xH_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+      const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
+      const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
+
+      const __m128i pred_val_0 = LoadAligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadAligned16(pred_1 + x);
+      const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+      const __m128i pred_hi = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+      // int res = (mask_value * prediction_0[x] +
+      //            (64 - mask_value) * prediction_1[x]) >> 6;
+      const __m128i compound_pred_lo = _mm_madd_epi16(pred_lo, mask_lo);
+      const __m128i compound_pred_hi = _mm_madd_epi16(pred_hi, mask_hi);
+
+      const __m128i res = _mm_packus_epi32(_mm_srli_epi32(compound_pred_lo, 6),
+                                           _mm_srli_epi32(compound_pred_hi, 6));
+      // dst[x] = static_cast<Pixel>(
+      //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+      //           (1 << kBitdepth8) - 1));
+      const __m128i result = RightShiftWithRounding_S16(res, 4);
+      StoreLo8(dst + x, _mm_packus_epi16(result, result));
+
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride << subsampling_y;
+  } while (++y < height);
+}
+
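+// A scalar model of the blend above, following the formula in the comments;
+// illustrative only (the helper name is not part of the patch).
+inline uint8_t MaskBlendPixel(const int16_t pred_0, const int16_t pred_1,
+                              const uint8_t mask_value) {
+  // res = (mask * pred_0 + (64 - mask) * pred_1) >> 6, then a rounded shift by
+  // inter_post_round_bits (4) and a clip to the 8-bit pixel range.
+  const int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
+  return static_cast<uint8_t>(Clip3(RightShiftWithRounding(res, 4), 0, 255));
+}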
+inline void InterIntraWriteMaskBlendLine8bpp4x2(const uint8_t* const pred_0,
+                                                uint8_t* const pred_1,
+                                                const ptrdiff_t pred_stride_1,
+                                                const __m128i pred_mask_0,
+                                                const __m128i pred_mask_1) {
+  const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+  const __m128i pred_val_0 = LoadLo8(pred_0);
+  // TODO(b/150326556): One load.
+  __m128i pred_val_1 = Load4(pred_1);
+  pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
+                            pred_val_1);
+  const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+  // int res = (mask_value * prediction_1[x] +
+  //            (64 - mask_value) * prediction_0[x]) >> 6;
+  const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask);
+  const __m128i result = RightShiftWithRounding_U16(compound_pred, 6);
+  const __m128i res = _mm_packus_epi16(result, result);
+
+  Store4(pred_1, res);
+  Store4(pred_1 + pred_stride_1, _mm_srli_si128(res, 4));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4x4_SSE4(const uint8_t* pred_0,
+                                               uint8_t* pred_1,
+                                               const ptrdiff_t pred_stride_1,
+                                               const uint8_t* mask,
+                                               const ptrdiff_t mask_stride) {
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const __m128i pred_mask_u16_first =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  mask += mask_stride << (1 + subsampling_y);
+  const __m128i pred_mask_u16_second =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+  mask += mask_stride << (1 + subsampling_y);
+  __m128i pred_mask_1 =
+      _mm_packus_epi16(pred_mask_u16_first, pred_mask_u16_second);
+  __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+
+  pred_mask_1 = _mm_srli_si128(pred_mask_1, 8);
+  pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+  InterIntraWriteMaskBlendLine8bpp4x2(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp4xH_SSE4(const uint8_t* pred_0,
+                                               uint8_t* pred_1,
+                                               const ptrdiff_t pred_stride_1,
+                                               const uint8_t* const mask_ptr,
+                                               const ptrdiff_t mask_stride,
+                                               const int height) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    return;
+  }
+  int y = 0;
+  do {
+    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+
+    InterIntraMaskBlending8bpp4x4_SSE4<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride);
+    pred_0 += 4 << 2;
+    pred_1 += pred_stride_1 << 2;
+    mask += mask_stride << (2 + subsampling_y);
+    y += 8;
+  } while (y < height);
+}
+
+template <int subsampling_x, int subsampling_y>
+void InterIntraMaskBlend8bpp_SSE4(const uint8_t* prediction_0,
+                                  uint8_t* prediction_1,
+                                  const ptrdiff_t prediction_stride_1,
+                                  const uint8_t* const mask_ptr,
+                                  const ptrdiff_t mask_stride, const int width,
+                                  const int height) {
+  if (width == 4) {
+    InterIntraMaskBlending8bpp4xH_SSE4<subsampling_x, subsampling_y>(
+        prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+        height);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  int y = 0;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_1 =
+          GetInterIntraMask8<subsampling_x, subsampling_y>(
+              mask + (x << subsampling_x), mask_stride);
+      // 64 - mask
+      const __m128i pred_mask_0 = _mm_sub_epi8(mask_inverter, pred_mask_1);
+      const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
+
+      const __m128i pred_val_0 = LoadLo8(prediction_0 + x);
+      const __m128i pred_val_1 = LoadLo8(prediction_1 + x);
+      const __m128i pred = _mm_unpacklo_epi8(pred_val_0, pred_val_1);
+      //
int res = (mask_value * prediction_1[x] + + // (64 - mask_value) * prediction_0[x]) >> 6; + const __m128i compound_pred = _mm_maddubs_epi16(pred, pred_mask); + const __m128i result = RightShiftWithRounding_U16(compound_pred, 6); + const __m128i res = _mm_packus_epi16(result, result); + + StoreLo8(prediction_1 + x, res); + + x += 8; + } while (x < width); + prediction_0 += width; + prediction_1 += prediction_stride_1; + mask += mask_stride << subsampling_y; + } while (++y < height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend444) + dsp->mask_blend[0][0] = MaskBlend_SSE4<0, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend422) + dsp->mask_blend[1][0] = MaskBlend_SSE4<1, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(MaskBlend420) + dsp->mask_blend[2][0] = MaskBlend_SSE4<1, 1>; +#endif + // The is_inter_intra index of mask_blend[][] is replaced by + // inter_intra_mask_blend_8bpp[] in 8-bit. +#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp444) + dsp->inter_intra_mask_blend_8bpp[0] = InterIntraMaskBlend8bpp_SSE4<0, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp422) + dsp->inter_intra_mask_blend_8bpp[1] = InterIntraMaskBlend8bpp_SSE4<1, 0>; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(InterIntraMaskBlend8bpp420) + dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_SSE4<1, 1>; +#endif +} + +} // namespace +} // namespace low_bitdepth + +void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void MaskBlendInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/mask_blend_sse4.h b/src/dsp/x86/mask_blend_sse4.h new file mode 100644 index 0000000..52b0b5c --- /dev/null +++ b/src/dsp/x86/mask_blend_sse4.h @@ -0,0 +1,60 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mask_blend. This function is not thread-safe. 
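For reference, the arithmetic that the maddubs/rounding pairs above implement can be cross-checked with a scalar sketch (not part of the patch; BlendPixel is a hypothetical name):

#include <cstdint>

// One pixel of the 8bpp mask blend: |mask_value| weights prediction_1 and
// (64 - mask_value) weights prediction_0; adding 32 before the shift by 6
// rounds to nearest. Each product fits in 16 bits (64 * 255 = 16320), which
// is why the SSE4 path can evaluate it with _mm_maddubs_epi16 lanes.
inline uint8_t BlendPixel(uint8_t pred_0, uint8_t pred_1, uint8_t mask_value) {
  const int res = mask_value * pred_1 + (64 - mask_value) * pred_0;
  return static_cast<uint8_t>((res + 32) >> 6);
}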
+// Initializes Dsp::mask_blend. This function is not thread-safe.
+void MaskBlendInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
+#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
+#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
+#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
+#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/src/dsp/x86/motion_field_projection_sse4.cc b/src/dsp/x86/motion_field_projection_sse4.cc
new file mode 100644
index 0000000..c506941
--- /dev/null
+++ b/src/dsp/x86/motion_field_projection_sse4.cc
@@ -0,0 +1,397 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_field_projection.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline __m128i LoadDivision(const __m128i division_table,
+                            const __m128i reference_offset) {
+  const __m128i kOne = _mm_set1_epi16(0x0100);
+  const __m128i t = _mm_add_epi8(reference_offset, reference_offset);
+  const __m128i tt = _mm_unpacklo_epi8(t, t);
+  const __m128i idx = _mm_add_epi8(tt, kOne);
+  return _mm_shuffle_epi8(division_table, idx);
+}
+
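A scalar model of the projection that the next helpers vectorize, under the same 2.14 fixed-point convention (a sketch; ProjectComponent is a hypothetical name):

// division_table[d] holds roughly 16384 / d, so mv * numerator / d becomes a
// multiply by the table entry followed by a shift by 14. Adding the sign
// (0 or -1) before the rounding constant makes the result round towards
// zero, matching MvProjection() below.
inline int ProjectComponent(int mv, int division_table_entry, int numerator) {
  const int m = mv * division_table_entry * numerator;
  return (m + (m >> 31) + (1 << 13)) >> 14;
}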
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const int numerator) {
+  const __m128i m0 = _mm_madd_epi16(mv, denominator);
+  const __m128i m = _mm_mullo_epi32(m0, _mm_set1_epi32(numerator));
+  // Add the sign (0 or -1) to round towards zero.
+  const __m128i sign = _mm_srai_epi32(m, 31);
+  const __m128i add_sign = _mm_add_epi32(m, sign);
+  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+  return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mv, const __m128i denominator,
+                                const int numerator) {
+  const __m128i mv0 = _mm_unpacklo_epi16(mv, _mm_setzero_si128());
+  const __m128i mv1 = _mm_unpackhi_epi16(mv, _mm_setzero_si128());
+  const __m128i denorm0 = _mm_unpacklo_epi16(denominator, _mm_setzero_si128());
+  const __m128i denorm1 = _mm_unpackhi_epi16(denominator, _mm_setzero_si128());
+  const __m128i s0 = MvProjection(mv0, denorm0, numerator);
+  const __m128i s1 = MvProjection(mv1, denorm1, numerator);
+  const __m128i projection = _mm_packs_epi32(s0, s1);
+  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i projection_mv_clamp_negative =
+      _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i clamp = _mm_min_epi16(projection, projection_mv_clamp);
+  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i Project_SSE4_1(const __m128i delta, const __m128i dst_sign) {
+  // Add 63 to negative delta so that it shifts towards zero.
+  const __m128i delta_sign = _mm_srai_epi16(delta, 15);
+  const __m128i delta_sign_63 = _mm_srli_epi16(delta_sign, 10);
+  const __m128i delta_adjust = _mm_add_epi16(delta, delta_sign_63);
+  const __m128i offset0 = _mm_srai_epi16(delta_adjust, 6);
+  const __m128i offset1 = _mm_xor_si128(offset0, dst_sign);
+  return _mm_sub_epi16(offset1, dst_sign);
+}
+
+inline void GetPosition(
+    const __m128i division_table, const MotionVector* const mv,
+    const int numerator, const int x8_start, const int x8_end, const int x8,
+    const __m128i& r_offsets, const __m128i& source_reference_type8,
+    const __m128i& skip_r, const __m128i& y8_floor8, const __m128i& y8_ceiling8,
+    const __m128i& d_sign, const int delta, __m128i* const r,
+    __m128i* const position_xy, int64_t* const skip_64, __m128i mvs[2]) {
+  const auto* const mv_int = reinterpret_cast<const int32_t*>(mv + x8);
+  *r = _mm_shuffle_epi8(r_offsets, source_reference_type8);
+  const __m128i denorm = LoadDivision(division_table, source_reference_type8);
+  __m128i projection_mv[2];
+  mvs[0] = LoadUnaligned16(mv_int + 0);
+  mvs[1] = LoadUnaligned16(mv_int + 4);
+  // Deinterlace x and y components
+  const __m128i kShuffle =
+      _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+  const __m128i mv0 = _mm_shuffle_epi8(mvs[0], kShuffle);
+  const __m128i mv1 = _mm_shuffle_epi8(mvs[1], kShuffle);
+  const __m128i mv_y = _mm_unpacklo_epi64(mv0, mv1);
+  const __m128i mv_x = _mm_unpackhi_epi64(mv0, mv1);
+  // numerator could be 0.
+  projection_mv[0] = MvProjectionClip(mv_y, denorm, numerator);
+  projection_mv[1] = MvProjectionClip(mv_x, denorm, numerator);
+  // Do not update the motion vector if the block position is not valid or
+  // if position_x8 is outside the current range of x8_start and x8_end.
+  // Note that position_y8 will always be within the range of y8_start and
+  // y8_end.
+  // After subtracting the base, valid projections are within 8-bit.
+  const __m128i position_y = Project_SSE4_1(projection_mv[0], d_sign);
+  const __m128i position_x = Project_SSE4_1(projection_mv[1], d_sign);
+  const __m128i positions = _mm_packs_epi16(position_x, position_y);
+  const __m128i k01234567 =
+      _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
+  *position_xy = _mm_add_epi8(positions, k01234567);
+  const int x8_floor = std::max(
+      x8_start - x8, delta - kProjectionMvMaxHorizontalOffset);  // [-8, 8]
+  const int x8_ceiling =
+      std::min(x8_end - x8, delta + 8 + kProjectionMvMaxHorizontalOffset) -
+      1;  // [-1, 15]
+  const __m128i x8_floor8 = _mm_set1_epi8(x8_floor);
+  const __m128i x8_ceiling8 = _mm_set1_epi8(x8_ceiling);
+  const __m128i floor_xy = _mm_unpacklo_epi64(x8_floor8, y8_floor8);
+  const __m128i ceiling_xy = _mm_unpacklo_epi64(x8_ceiling8, y8_ceiling8);
+  const __m128i underflow = _mm_cmplt_epi8(*position_xy, floor_xy);
+  const __m128i overflow = _mm_cmpgt_epi8(*position_xy, ceiling_xy);
+  const __m128i out = _mm_or_si128(underflow, overflow);
+  const __m128i skip_low = _mm_or_si128(skip_r, out);
+  const __m128i skip = _mm_or_si128(skip_low, _mm_srli_si128(out, 8));
+  StoreLo8(skip_64, skip);
+}
+
+template <int idx>
+inline void Store(const __m128i position, const __m128i reference_offset,
+                  const __m128i mv, int8_t* dst_reference_offset,
+                  MotionVector* dst_mv) {
+  const ptrdiff_t offset =
+      static_cast<int16_t>(_mm_extract_epi16(position, idx));
+  if ((idx & 3) == 0) {
+    dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+  } else {
+    dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+  }
+  dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
+}
+
+template <int idx>
+inline void CheckStore(const int8_t* skips, const __m128i position,
+                       const __m128i reference_offset, const __m128i mv,
+                       int8_t* dst_reference_offset, MotionVector* dst_mv) {
+  if (skips[idx] == 0) {
+    Store<idx>(position, reference_offset, mv, dst_reference_offset, dst_mv);
+  }
+}
+
+// 7.9.2.
+void MotionFieldProjectionKernel_SSE4_1(
+    const ReferenceInfo& reference_info,
+    const int reference_to_current_with_sign, const int dst_sign,
+    const int y8_start, const int y8_end, const int x8_start, const int x8_end,
+    TemporalMotionField* const motion_field) {
+  const ptrdiff_t stride = motion_field->mv.columns();
+  // The column range has to be offset by kProjectionMvMaxHorizontalOffset
+  // since coordinates in that range could end up being position_x8 because of
+  // projection.
+ const int adjusted_x8_start = + std::max(x8_start - kProjectionMvMaxHorizontalOffset, 0); + const int adjusted_x8_end = std::min( + x8_end + kProjectionMvMaxHorizontalOffset, static_cast(stride)); + const int adjusted_x8_end8 = adjusted_x8_end & ~7; + const int leftover = adjusted_x8_end - adjusted_x8_end8; + const int8_t* const reference_offsets = + reference_info.relative_distance_to.data(); + const bool* const skip_references = reference_info.skip_references.data(); + const int16_t* const projection_divisions = + reference_info.projection_divisions.data(); + const ReferenceFrameType* source_reference_types = + &reference_info.motion_field_reference_frame[y8_start][0]; + const MotionVector* mv = &reference_info.motion_field_mv[y8_start][0]; + int8_t* dst_reference_offset = motion_field->reference_offset[y8_start]; + MotionVector* dst_mv = motion_field->mv[y8_start]; + const __m128i d_sign = _mm_set1_epi16(dst_sign); + + static_assert(sizeof(int8_t) == sizeof(bool), ""); + static_assert(sizeof(int8_t) == sizeof(ReferenceFrameType), ""); + static_assert(sizeof(int32_t) == sizeof(MotionVector), ""); + assert(dst_sign == 0 || dst_sign == -1); + assert(stride == motion_field->reference_offset.columns()); + assert((y8_start & 7) == 0); + assert((adjusted_x8_start & 7) == 0); + // The final position calculation is represented with int16_t. Valid + // position_y8 from its base is at most 7. After considering the horizontal + // offset which is at most |stride - 1|, we have the following assertion, + // which means this optimization works for frame width up to 32K (each + // position is a 8x8 block). + assert(8 * stride <= 32768); + const __m128i skip_reference = LoadLo8(skip_references); + const __m128i r_offsets = LoadLo8(reference_offsets); + const __m128i division_table = LoadUnaligned16(projection_divisions); + + int y8 = y8_start; + do { + const int y8_floor = (y8 & ~7) - y8; // [-7, 0] + const int y8_ceiling = std::min(y8_end - y8, y8_floor + 8) - 1; // [0, 7] + const __m128i y8_floor8 = _mm_set1_epi8(y8_floor); + const __m128i y8_ceiling8 = _mm_set1_epi8(y8_ceiling); + int x8; + + for (x8 = adjusted_x8_start; x8 < adjusted_x8_end8; x8 += 8) { + const __m128i source_reference_type8 = + LoadLo8(source_reference_types + x8); + const __m128i skip_r = + _mm_shuffle_epi8(skip_reference, source_reference_type8); + int64_t early_skip; + StoreLo8(&early_skip, skip_r); + // Early termination #1 if all are skips. Chance is typically ~30-40%. + if (early_skip == -1) continue; + int64_t skip_64; + __m128i r, position_xy, mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, x8_start, + x8_end, x8, r_offsets, source_reference_type8, skip_r, + y8_floor8, y8_ceiling8, d_sign, 0, &r, &position_xy, &skip_64, + mvs); + // Early termination #2 if all are skips. + // Chance is typically ~15-25% after Early termination #1. + if (skip_64 == -1) continue; + const __m128i p_y = _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8)); + const __m128i p_x = _mm_cvtepi8_epi16(position_xy); + const __m128i p_y_offset = _mm_mullo_epi16(p_y, _mm_set1_epi16(stride)); + const __m128i pos = _mm_add_epi16(p_y_offset, p_x); + const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8)); + if (skip_64 == 0) { + // Store all. Chance is typically ~70-85% after Early termination #2. 
+ Store<0>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // Chance is typically ~15-30% after Early termination #2. + // The compiler is smart enough to not create the local buffer skips[]. + int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<0>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, dst_mv); + } + } + + // The following leftover processing cannot be moved out of the do...while + // loop. Doing so may change the result storing orders of the same position. + if (leftover > 0) { + // Use SIMD only when leftover is at least 4, and there are at least 8 + // elements in a row. + if (leftover >= 4 && adjusted_x8_start < adjusted_x8_end8) { + // Process the last 8 elements to avoid loading invalid memory. Some + // elements may have been processed in the above loop, which is OK. + const int delta = 8 - leftover; + x8 = adjusted_x8_end - 8; + const __m128i source_reference_type8 = + LoadLo8(source_reference_types + x8); + const __m128i skip_r = + _mm_shuffle_epi8(skip_reference, source_reference_type8); + int64_t early_skip; + StoreLo8(&early_skip, skip_r); + // Early termination #1 if all are skips. + if (early_skip != -1) { + int64_t skip_64; + __m128i r, position_xy, mvs[2]; + GetPosition(division_table, mv, reference_to_current_with_sign, + x8_start, x8_end, x8, r_offsets, source_reference_type8, + skip_r, y8_floor8, y8_ceiling8, d_sign, delta, &r, + &position_xy, &skip_64, mvs); + // Early termination #2 if all are skips. + if (skip_64 != -1) { + const __m128i p_y = + _mm_cvtepi8_epi16(_mm_srli_si128(position_xy, 8)); + const __m128i p_x = _mm_cvtepi8_epi16(position_xy); + const __m128i p_y_offset = + _mm_mullo_epi16(p_y, _mm_set1_epi16(stride)); + const __m128i pos = _mm_add_epi16(p_y_offset, p_x); + const __m128i position = _mm_add_epi16(pos, _mm_set1_epi16(x8)); + // Store up to 7 elements since leftover is at most 7. + if (skip_64 == 0) { + // Store all. + Store<1>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<2>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<3>(position, r, mvs[0], dst_reference_offset, dst_mv); + Store<4>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<5>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<6>(position, r, mvs[1], dst_reference_offset, dst_mv); + Store<7>(position, r, mvs[1], dst_reference_offset, dst_mv); + } else { + // Check and store each. + // The compiler is smart enough to not create the local buffer + // skips[]. 
+ int8_t skips[8]; + memcpy(skips, &skip_64, sizeof(skips)); + CheckStore<1>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<2>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<3>(skips, position, r, mvs[0], dst_reference_offset, + dst_mv); + CheckStore<4>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<5>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<6>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + CheckStore<7>(skips, position, r, mvs[1], dst_reference_offset, + dst_mv); + } + } + } + } else { + for (; x8 < adjusted_x8_end; ++x8) { + const int source_reference_type = source_reference_types[x8]; + if (skip_references[source_reference_type]) continue; + MotionVector projection_mv; + // reference_to_current_with_sign could be 0. + GetMvProjection(mv[x8], reference_to_current_with_sign, + projection_divisions[source_reference_type], + &projection_mv); + // Do not update the motion vector if the block position is not valid + // or if position_x8 is outside the current range of x8_start and + // x8_end. Note that position_y8 will always be within the range of + // y8_start and y8_end. + const int position_y8 = Project(0, projection_mv.mv[0], dst_sign); + if (position_y8 < y8_floor || position_y8 > y8_ceiling) continue; + const int x8_base = x8 & ~7; + const int x8_floor = + std::max(x8_start, x8_base - kProjectionMvMaxHorizontalOffset); + const int x8_ceiling = + std::min(x8_end, x8_base + 8 + kProjectionMvMaxHorizontalOffset); + const int position_x8 = Project(x8, projection_mv.mv[1], dst_sign); + if (position_x8 < x8_floor || position_x8 >= x8_ceiling) continue; + dst_mv[position_y8 * stride + position_x8] = mv[x8]; + dst_reference_offset[position_y8 * stride + position_x8] = + reference_offsets[source_reference_type]; + } + } + } + + source_reference_types += stride; + mv += stride; + dst_reference_offset += stride; + dst_mv += stride; + } while (++y8 < y8_end); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->motion_field_projection_kernel = MotionFieldProjectionKernel_SSE4_1; +} +#endif + +} // namespace + +void MotionFieldProjectionInit_SSE4_1() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionFieldProjectionInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/motion_field_projection_sse4.h b/src/dsp/x86/motion_field_projection_sse4.h new file mode 100644 index 0000000..c05422c --- /dev/null +++ b/src/dsp/x86/motion_field_projection_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::motion_field_projection_kernel. This function is not
+// thread-safe.
+void MotionFieldProjectionInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
+#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/src/dsp/x86/motion_vector_search_sse4.cc b/src/dsp/x86/motion_vector_search_sse4.cc
new file mode 100644
index 0000000..e9cdd4c
--- /dev/null
+++ b/src/dsp/x86/motion_vector_search_sse4.cc
@@ -0,0 +1,262 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/motion_vector_search.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr int kProjectionMvDivisionLookup_32bit[kMaxFrameDistance + 1] = {
+    0,    16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638,
+    1489, 1365,  1260, 1170, 1092, 1024, 963,  910,  862,  819,  780,
+    744,  712,   682,  655,  630,  606,  585,  564,  546,  528};
+
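The 32-bit table above repeats the 2.14 reciprocals used throughout MV projection; entry d is the truncation of 16384 / d. A quick standalone spot-check (a sketch, not part of the patch):

#include <cassert>

int main() {
  // kProjectionMvDivisionLookup_32bit[d] == 16384 / d (C++ integer division).
  assert(16384 / 3 == 5461);
  assert(16384 / 7 == 2340);
  assert(16384 / 29 == 564);  // Truncated, not rounded (564.96...).
  return 0;
}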
+inline __m128i MvProjection(const __m128i mv, const __m128i denominator,
+                            const __m128i numerator) {
+  const __m128i m0 = _mm_madd_epi16(mv, denominator);
+  const __m128i m = _mm_mullo_epi32(m0, numerator);
+  // Add the sign (0 or -1) to round towards zero.
+  const __m128i sign = _mm_srai_epi32(m, 31);
+  const __m128i add_sign = _mm_add_epi32(m, sign);
+  const __m128i sum = _mm_add_epi32(add_sign, _mm_set1_epi32(1 << 13));
+  return _mm_srai_epi32(sum, 14);
+}
+
+inline __m128i MvProjectionClip(const __m128i mvs[2],
+                                const __m128i denominators[2],
+                                const __m128i numerator) {
+  const __m128i s0 = MvProjection(mvs[0], denominators[0], numerator);
+  const __m128i s1 = MvProjection(mvs[1], denominators[1], numerator);
+  const __m128i mv = _mm_packs_epi32(s0, s1);
+  const __m128i projection_mv_clamp = _mm_set1_epi16(kProjectionMvClamp);
+  const __m128i projection_mv_clamp_negative =
+      _mm_set1_epi16(-kProjectionMvClamp);
+  const __m128i clamp = _mm_min_epi16(mv, projection_mv_clamp);
+  return _mm_max_epi16(clamp, projection_mv_clamp_negative);
+}
+
+inline __m128i MvProjectionCompoundClip(
+    const MotionVector* const temporal_mvs,
+    const int8_t temporal_reference_offsets[2],
+    const int reference_offsets[2]) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const __m128i temporal_mv = LoadLo8(tmvs);
+  const __m128i temporal_mv_0 = _mm_cvtepu16_epi32(temporal_mv);
+  __m128i mvs[2], denominators[2];
+  mvs[0] = _mm_unpacklo_epi64(temporal_mv_0, temporal_mv_0);
+  mvs[1] = _mm_unpackhi_epi64(temporal_mv_0, temporal_mv_0);
+  denominators[0] = _mm_set1_epi32(
+      kProjectionMvDivisionLookup[temporal_reference_offsets[0]]);
+  denominators[1] = _mm_set1_epi32(
+      kProjectionMvDivisionLookup[temporal_reference_offsets[1]]);
+  const __m128i offsets = LoadLo8(reference_offsets);
+  const __m128i numerator = _mm_unpacklo_epi32(offsets, offsets);
+  return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline __m128i MvProjectionSingleClip(
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets,
+    const int reference_offset) {
+  const auto* const tmvs = reinterpret_cast<const int32_t*>(temporal_mvs);
+  const __m128i temporal_mv = LoadAligned16(tmvs);
+  __m128i lookup = _mm_cvtsi32_si128(
+      kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[0]]);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[1]],
+      1);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[2]],
+      2);
+  lookup = _mm_insert_epi32(
+      lookup, kProjectionMvDivisionLookup_32bit[temporal_reference_offsets[3]],
+      3);
+  __m128i mvs[2], denominators[2];
+  mvs[0] = _mm_unpacklo_epi16(temporal_mv, _mm_setzero_si128());
+  mvs[1] = _mm_unpackhi_epi16(temporal_mv, _mm_setzero_si128());
+  denominators[0] = _mm_unpacklo_epi32(lookup, lookup);
+  denominators[1] = _mm_unpackhi_epi32(lookup, lookup);
+  const __m128i numerator = _mm_set1_epi32(reference_offset);
+  return MvProjectionClip(mvs, denominators, numerator);
+}
+
+inline void LowPrecision(const __m128i mv, void* const candidate_mvs) {
+  const __m128i kRoundDownMask = _mm_set1_epi16(~1);
+  const __m128i sign = _mm_srai_epi16(mv, 15);
+  const __m128i sub_sign = _mm_sub_epi16(mv, sign);
+  const __m128i d = _mm_and_si128(sub_sign, kRoundDownMask);
+  StoreAligned16(candidate_mvs, d);
+}
+
+inline void ForceInteger(const __m128i mv, void* const candidate_mvs) {
+  const __m128i kRoundDownMask = _mm_set1_epi16(~7);
+  const __m128i sign = _mm_srai_epi16(mv, 15);
+  const __m128i mv1 = _mm_add_epi16(mv, _mm_set1_epi16(3));
+  const __m128i mv2 = _mm_sub_epi16(mv1, sign);
+  const __m128i mv3 = _mm_and_si128(mv2, kRoundDownMask);
+  StoreAligned16(candidate_mvs, mv3);
+}
+
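Scalar restatements of the two rounding helpers above, assuming 16-bit components in 1/8-pel units (a sketch; the names are hypothetical):

#include <cstdint>

inline int16_t LowPrecisionScalar(int16_t mv) {
  // Clear the highest-precision bit, rounding towards zero: negative values
  // are bumped by +1 (subtracting the -1 sign) before the mask.
  return static_cast<int16_t>((mv - (mv >> 15)) & ~1);
}

inline int16_t ForceIntegerScalar(int16_t mv) {
  // Round to the nearest multiple of 8 (a full pel), ties towards zero.
  return static_cast<int16_t>((mv + 3 - (mv >> 15)) & ~7);
}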
+void MvProjectionCompoundLowPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| entries are usually non-zero, so the non-zero
+  // check is skipped.
+  // To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    LowPrecision(mv, candidate_mvs + i);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionCompoundForceInteger_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| entries are usually non-zero, so the non-zero
+  // check is skipped.
+  // To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    ForceInteger(mv, candidate_mvs + i);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionCompoundHighPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offsets[2], const int count,
+    CompoundMotionVector* candidate_mvs) {
+  // The |reference_offsets| entries are usually non-zero, so the non-zero
+  // check is skipped.
+  // To help the compiler, make a local copy of |reference_offsets|.
+  const int offsets[2] = {reference_offsets[0], reference_offsets[1]};
+  // One more element could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionCompoundClip(
+        temporal_mvs + i, temporal_reference_offsets + i, offsets);
+    StoreAligned16(candidate_mvs + i, mv);
+    i += 2;
+  } while (i < count);
+}
+
+void MvProjectionSingleLowPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionSingleClip(
+        temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+    LowPrecision(mv, candidate_mvs + i);
+    i += 4;
+  } while (i < count);
+}
+
+void MvProjectionSingleForceInteger_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Up to three more elements could be calculated.
+  int i = 0;
+  do {
+    const __m128i mv = MvProjectionSingleClip(
+        temporal_mvs + i, temporal_reference_offsets + i, reference_offset);
+    ForceInteger(mv, candidate_mvs + i);
+    i += 4;
+  } while (i < count);
+}
+
+void MvProjectionSingleHighPrecision_SSE4_1(
+    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
+    const int reference_offset, const int count, MotionVector* candidate_mvs) {
+  // Up to three more elements could be calculated.
+ int i = 0; + do { + const __m128i mv = MvProjectionSingleClip( + temporal_mvs + i, temporal_reference_offsets + i, reference_offset); + StoreAligned16(candidate_mvs + i, mv); + i += 4; + } while (i < count); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->mv_projection_compound[0] = MvProjectionCompoundLowPrecision_SSE4_1; + dsp->mv_projection_compound[1] = MvProjectionCompoundForceInteger_SSE4_1; + dsp->mv_projection_compound[2] = MvProjectionCompoundHighPrecision_SSE4_1; + dsp->mv_projection_single[0] = MvProjectionSingleLowPrecision_SSE4_1; + dsp->mv_projection_single[1] = MvProjectionSingleForceInteger_SSE4_1; + dsp->mv_projection_single[2] = MvProjectionSingleHighPrecision_SSE4_1; +} +#endif + +} // namespace + +void MotionVectorSearchInit_SSE4_1() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 +namespace libgav1 { +namespace dsp { + +void MotionVectorSearchInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/motion_vector_search_sse4.h b/src/dsp/x86/motion_vector_search_sse4.h new file mode 100644 index 0000000..d65b392 --- /dev/null +++ b/src/dsp/x86/motion_vector_search_sse4.h @@ -0,0 +1,41 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::mv_projection_compound and Dsp::mv_projection_single. This +// function is not thread-safe. 
+void MotionVectorSearchInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
+#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/src/dsp/x86/obmc_sse4.cc b/src/dsp/x86/obmc_sse4.cc
new file mode 100644
index 0000000..3a1d1fd
--- /dev/null
+++ b/src/dsp/x86/obmc_sse4.cc
@@ -0,0 +1,329 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/obmc.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+  int y = height;
+  do {
+    const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
+    const __m128i obmc_pred_val =
+        Load2x2(obmc_pred, obmc_pred + obmc_prediction_stride);
+
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store2(pred, packed_result);
+    pred += prediction_stride;
+    const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
+    memcpy(pred, &second_row_result, sizeof(second_row_result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride << 1;
+    y -= 2;
+  } while (y != 0);
+}
+
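Between the specializations, a scalar model of the horizontal blend these functions implement (a sketch; OverlapBlendFromLeftScalar is a hypothetical name, and kObmcMask refers to the table pulled in from src/dsp/obmc.inc):

#include <cstddef>
#include <cstdint>

// Column x of the current prediction is weighted by obmc_mask[x] (the slice
// of kObmcMask starting at width - 2) against the neighbour's prediction,
// with the usual round-to-nearest (sum + 32) >> 6.
inline void OverlapBlendFromLeftScalar(uint8_t* pred, ptrdiff_t pred_stride,
                                       int width, int height,
                                       const uint8_t* obmc_pred,
                                       ptrdiff_t obmc_stride,
                                       const uint8_t* obmc_mask) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int m = obmc_mask[x];
      pred[x] = static_cast<uint8_t>(
          (m * pred[x] + (64 - m) * obmc_pred[x] + 32) >> 6);
    }
    pred += pred_stride;
    obmc_pred += obmc_stride;
  }
}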
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = Load4(kObmcMask + 2);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  // Duplicate first half of vector.
+  const __m128i masks =
+      _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
+  int y = height;
+  do {
+    const __m128i pred_val0 = Load4(pred);
+    const __m128i obmc_pred_val0 = Load4(obmc_pred);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+
+    // Place the second row of each source in the second four bytes.
+    const __m128i pred_val =
+        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+    const __m128i obmc_pred_val = _mm_alignr_epi8(
+        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store4(pred - prediction_stride, packed_result);
+    const int second_row_result = _mm_extract_epi32(packed_result, 1);
+    memcpy(pred, &second_row_result, sizeof(second_row_result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft8xH_SSE4_1(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const __m128i mask_val = LoadLo8(kObmcMask + 6);
+  // 64 - mask
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+  int y = height;
+  do {
+    const __m128i pred_val = LoadLo8(pred);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    StoreLo8(pred, _mm_packus_epi16(result, result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (--y != 0);
+}
+
+void OverlapBlendFromLeft_SSE4_1(void* const prediction,
+                                 const ptrdiff_t prediction_stride,
+                                 const int width, const int height,
+                                 const void* const obmc_prediction,
+                                 const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                   obmc_prediction_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                   obmc_prediction_stride);
+    return;
+  }
+  if (width == 8) {
+    OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred,
+                                   obmc_prediction_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint8_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
+    const __m128i mask_val = LoadUnaligned16(mask + x);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
+
+    int y = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+      const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+      const __m128i result_lo =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
+      const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
+      const __m128i result_hi =
+          RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
+      StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
+
+      pred += prediction_stride;
+      obmc_pred += obmc_prediction_stride;
+    } while (++y < height);
+    x += 16;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(
+        _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
+        mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i pred_val0 = Load4(pred);
+
+    const __m128i obmc_pred_val0 = Load4(obmc_pred);
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    const __m128i pred_val =
+        _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
+    const __m128i obmc_pred_val = _mm_alignr_epi8(
+        Load4(obmc_pred), _mm_slli_si128(obmc_pred_val0, 12), 12);
+    const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    const __m128i packed_result = _mm_packus_epi16(result, result);
+    Store4(pred - prediction_stride, packed_result);
+    Store4(pred, _mm_srli_si128(packed_result, 4));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+    y += 2;
+  } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop8xH_SSE4_1(
+    uint8_t* const prediction, const ptrdiff_t prediction_stride,
+    const int height, const uint8_t* const obmc_prediction,
+    const ptrdiff_t obmc_prediction_stride) {
+  uint8_t* pred = prediction;
+  const uint8_t* obmc_pred = obmc_prediction;
+  const uint8_t* mask = kObmcMask + height - 2;
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const int compute_height = height - (height >> 2);
+  int y = compute_height;
+  do {
+    const __m128i mask_val = _mm_set1_epi8(mask[compute_height - y]);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i pred_val = LoadLo8(pred);
+    const __m128i obmc_pred_val = LoadLo8(obmc_pred);
+    const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
+    const __m128i result =
+        RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
+
+    StoreLo8(pred, _mm_packus_epi16(result, result));
+    pred += prediction_stride;
+    obmc_pred += obmc_prediction_stride;
+  } while (--y != 0);
+}
+
+void OverlapBlendFromTop_SSE4_1(void* const prediction,
+                                const ptrdiff_t prediction_stride,
+                                const int width, const int height,
+                                const void* const obmc_prediction,
+                                const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint8_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+
+  if (width <= 4) {
+    OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height,
obmc_pred, + obmc_prediction_stride); + return; + } + if (width == 8) { + OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred, + obmc_prediction_stride); + return; + } + + // Stop when mask value becomes 64. + const int compute_height = height - (height >> 2); + const __m128i mask_inverter = _mm_set1_epi8(64); + int y = 0; + const uint8_t* mask = kObmcMask + height - 2; + do { + const __m128i mask_val = _mm_set1_epi8(mask[y]); + // 64 - mask + const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val); + const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val); + int x = 0; + do { + const __m128i pred_val = LoadUnaligned16(pred + x); + const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x); + const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val); + const __m128i result_lo = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6); + const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val); + const __m128i result_hi = + RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6); + StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi)); + x += 16; + } while (x < width); + pred += prediction_stride; + obmc_pred += obmc_prediction_stride; + } while (++y < compute_height); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); +#if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical) + dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1; +#endif +#if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal) + dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1; +#endif +} + +} // namespace + +void ObmcInit_SSE4_1() { Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void ObmcInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/obmc_sse4.h b/src/dsp/x86/obmc_sse4.h new file mode 100644 index 0000000..bd8b416 --- /dev/null +++ b/src/dsp/x86/obmc_sse4.h @@ -0,0 +1,43 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::obmc_blend[]. This function is not thread-safe. +void ObmcInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +// If sse4 is enabled and the baseline isn't set due to a higher level of +// optimization being enabled, signal the sse4 implementation should be used. 
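The comment above describes the convention shared by all of these headers. A toy model of the scheme, with stand-in names and values so as not to claim the library's real flag constants (illustrative only):

#include <cstdio>

#define MY_CPU_SSE4_1 (1 << 0)  // Stand-in for LIBGAV1_CPU_SSE4_1.

// Each header claims a function for SSE4.1 only if no other specialization
// (e.g. a wider-vector one) already defined the macro:
#ifndef MY_Dsp8bpp_ObmcVertical
#define MY_Dsp8bpp_ObmcVertical MY_CPU_SSE4_1
#endif

int main() {
  // The DSP_ENABLED_8BPP_SSE4_1(...) checks in the .cc files boil down to a
  // comparison like this; the function pointer is installed into the Dsp
  // table only when it holds.
  if (MY_Dsp8bpp_ObmcVertical == MY_CPU_SSE4_1) {
    std::puts("SSE4.1 ObmcVertical selected");
  }
  return 0;
}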
+#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_ObmcVertical +#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1 +#endif +#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal +#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_ diff --git a/src/dsp/x86/super_res_sse4.cc b/src/dsp/x86/super_res_sse4.cc new file mode 100644 index 0000000..b2bdfd2 --- /dev/null +++ b/src/dsp/x86/super_res_sse4.cc @@ -0,0 +1,166 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/super_res.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +// Upscale_Filter as defined in AV1 Section 7.16 +// Negative to make them fit in 8-bit. +alignas(16) const int8_t + kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = { + {0, 0, 0, -128, 0, 0, 0, 0}, {0, 0, 1, -128, -2, 1, 0, 0}, + {0, -1, 3, -127, -4, 2, -1, 0}, {0, -1, 4, -127, -6, 3, -1, 0}, + {0, -2, 6, -126, -8, 3, -1, 0}, {0, -2, 7, -125, -11, 4, -1, 0}, + {1, -2, 8, -125, -13, 5, -2, 0}, {1, -3, 9, -124, -15, 6, -2, 0}, + {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1}, + {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1}, + {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1}, + {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1}, + {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1}, + {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1}, + {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1}, + {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1}, + {1, -6, 20, -97, -58, 17, -6, 1}, {1, -6, 20, -95, -61, 18, -6, 1}, + {2, -7, 20, -93, -64, 18, -6, 2}, {2, -7, 20, -91, -66, 19, -6, 1}, + {2, -7, 20, -88, -69, 19, -6, 1}, {2, -7, 20, -86, -71, 19, -6, 1}, + {2, -7, 20, -84, -74, 20, -7, 2}, {2, -7, 20, -81, -76, 20, -7, 1}, + {2, -7, 20, -79, -79, 20, -7, 2}, {1, -7, 20, -76, -81, 20, -7, 2}, + {2, -7, 20, -74, -84, 20, -7, 2}, {1, -6, 19, -71, -86, 20, -7, 2}, + {1, -6, 19, -69, -88, 20, -7, 2}, {1, -6, 19, -66, -91, 20, -7, 2}, + {2, -6, 18, -64, -93, 20, -7, 2}, {1, -6, 18, -61, -95, 20, -6, 1}, + {1, -6, 17, -58, -97, 20, -6, 1}, {1, -6, 17, -56, -99, 20, -6, 1}, + {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1}, + {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1}, + {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1}, + {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1}, + {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1}, + {1, -3, 9, -28, -118, 14, -4, 1}, 
{1, -3, 9, -25, -120, 13, -4, 1},
+        {1, -3, 8, -22, -121, 12, -4, 1},  {1, -3, 7, -20, -122, 11, -3, 1},
+        {1, -2, 6, -18, -123, 10, -3, 1},  {0, -2, 6, -15, -124, 9, -3, 1},
+        {0, -2, 5, -13, -125, 8, -2, 1},   {0, -1, 4, -11, -125, 7, -2, 0},
+        {0, -1, 3, -8, -126, 6, -2, 0},    {0, -1, 3, -6, -127, 4, -1, 0},
+        {0, -1, 2, -4, -127, 3, -1, 0},    {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 4);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 16) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      remainder = subpixel_x & kSuperResScaleMask;
+      filter = LoadHi8(filter,
+                       kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+                     const ptrdiff_t stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* const dest) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+ int x = RightShiftWithCeiling(upscaled_width, 4); + do { + __m128i weighted_src[8]; + for (int i = 0; i < 8; ++i, filter += 16) { + __m128i s = LoadLo8(&src[subpixel_x >> kSuperResScaleBits]); + subpixel_x += step; + s = LoadHi8(s, &src[subpixel_x >> kSuperResScaleBits]); + subpixel_x += step; + const __m128i f = LoadAligned16(filter); + weighted_src[i] = _mm_maddubs_epi16(s, f); + } + + __m128i a[4]; + a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]); + a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]); + a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]); + a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]); + Transpose2x16_U16(a, a); + a[0] = _mm_adds_epi16(a[0], a[1]); + a[1] = _mm_adds_epi16(a[2], a[3]); + const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1)); + a[0] = _mm_subs_epi16(rounding, a[0]); + a[1] = _mm_subs_epi16(rounding, a[1]); + a[0] = _mm_srai_epi16(a[0], kFilterBits); + a[1] = _mm_srai_epi16(a[1], kFilterBits); + StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1])); + dst_ptr += 16; + } while (--x != 0); + src += stride; + dst += stride; + } while (--y != 0); +} + +void Init8bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + dsp->super_res_coefficients = SuperResCoefficients_SSE4_1; + dsp->super_res = SuperRes_SSE4_1; +} + +} // namespace +} // namespace low_bitdepth + +void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void SuperResInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/super_res_sse4.h b/src/dsp/x86/super_res_sse4.h new file mode 100644 index 0000000..aef5147 --- /dev/null +++ b/src/dsp/x86/super_res_sse4.h @@ -0,0 +1,38 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::super_res_row. This function is not thread-safe. +void SuperResInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 +#ifndef LIBGAV1_Dsp8bpp_SuperRes +#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1 +#endif +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_ diff --git a/src/dsp/x86/transpose_sse4.h b/src/dsp/x86/transpose_sse4.h new file mode 100644 index 0000000..208b301 --- /dev/null +++ b/src/dsp/x86/transpose_sse4.h @@ -0,0 +1,307 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_ + +#include "src/utils/compiler_attributes.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include + +namespace libgav1 { +namespace dsp { + +LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in, + __m128i* const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 10 11 20 21 30 31 + // in[0]: 40 41 50 51 60 61 70 71 + // in[0]: 80 81 90 91 a0 a1 b0 b1 + // in[0]: c0 c1 d0 d1 e0 e1 f0 f1 + // to: + // a0: 00 40 01 41 10 50 11 51 + // a1: 20 60 21 61 30 70 31 71 + // a2: 80 c0 81 c1 90 d0 91 d1 + // a3: a0 e0 a1 e1 b0 f0 b1 f1 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]); + // b0: 00 20 40 60 01 21 41 61 + // b1: 10 30 50 70 11 31 51 71 + // b2: 80 a0 c0 e0 81 a1 c1 e1 + // b3: 90 b0 d0 f0 91 b1 d1 f1 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 80 90 a0 b0 c0 d0 e0 f0 + // out[3]: 81 91 a1 b1 c1 d1 e1 f1 + out[0] = _mm_unpacklo_epi16(b0, b1); + out[1] = _mm_unpackhi_epi16(b0, b1); + out[2] = _mm_unpacklo_epi16(b2, b3); + out[3] = _mm_unpackhi_epi16(b2, b3); +} + +LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x8To4x16_U8(const __m128i* const in, + __m128i* out) { + // Unpack 8 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpacklo_epi16(a2, a3); + const __m128i b2 = _mm_unpackhi_epi16(a0, a1); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // out[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // out[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // out[2]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // out[3]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi32(b0, b1); + out[1] = _mm_unpackhi_epi32(b0, b1); + out[2] = _mm_unpacklo_epi32(b2, b3); + out[3] = _mm_unpackhi_epi32(b2, b3); +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x4_U16(const __m128i* in, __m128i* out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // ba: 00 10 01 11 02 12 03 13 + // dc: 20 30 21 31 22 32 23 33 + const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]); + // Unpack 32 bit elements resulting in: + // dcba_lo: 00 10 20 30 01 11 21 31 + // dcba_hi: 02 12 22 32 03 13 23 33 + const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc); + const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc); + // Assign or shift right by 8 bytes resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 03 13 23 33 + // out[3]: 03 13 23 33 XX XX XX XX + out[0] = dcba_lo; + out[1] = _mm_srli_si128(dcba_lo, 8); + out[2] = dcba_hi; + out[3] = _mm_srli_si128(dcba_hi, 8); +} + +LIBGAV1_ALWAYS_INLINE void Transpose4x8To8x4_U16(const __m128i* in, + __m128i* out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x4To4x8_U16(const __m128i* in, + __m128i* out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +LIBGAV1_ALWAYS_INLINE void Transpose8x8_U16(const __m128i* const in, + __m128i* const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +} // namespace dsp +} // namespace libgav1 + +#endif // LIBGAV1_TARGETING_SSE4_1 +#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_ diff --git a/src/dsp/x86/warp_sse4.cc b/src/dsp/x86/warp_sse4.cc new file mode 100644 index 0000000..43279ab --- /dev/null +++ b/src/dsp/x86/warp_sse4.cc @@ -0,0 +1,525 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
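+
+// Overview of the implementation below: warped motion compensation for 8bpp
+// is computed per 8x8 output block in two passes. HorizontalFilter() applies
+// an 8-tap filter to one source row at a time, producing one row of a 15x8
+// int16_t intermediate array; VerticalFilter() then applies an 8-tap filter
+// down the columns of that array to emit the final 8x8 block (uint8_t pixels,
+// or int16_t values on the compound path). Both passes fetch their taps from
+// kWarpedFilters8 / kWarpedFilters and rearrange them with the transpose
+// helpers from transpose_sse4.h, whose common scalar contract is simply
+// out[column][row] = in[row][column].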
+ +#include "src/dsp/warp.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 + +#include + +#include +#include +#include +#include +#include + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/dsp/x86/transpose_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +// Number of extra bits of precision in warped filtering. +constexpr int kWarpedDiffPrecisionBits = 10; + +// This assumes the two filters contain filter[x] and filter[x+2]. +inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0, + const __m128i filter_1, + const __m128i& src_window) { + const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1); + const __m128i src = + _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2)); + return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps)); +} + +constexpr int kFirstPassOffset = 1 << 14; +constexpr int kOffsetRemoval = + (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128; + +// Applies the horizontal filter to one source row and stores the result in +// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8 +// |intermediate_result| two-dimensional array. +inline void HorizontalFilter(const int sx4, const int16_t alpha, + const __m128i src_row, + int16_t intermediate_result_row[8]) { + int sx = sx4 - MultiplyBy4(alpha); + __m128i filter[8]; + for (__m128i& f : filter) { + const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + f = LoadLo8(kWarpedFilters8[offset]); + sx += alpha; + } + Transpose8x8To4x16_U8(filter, filter); + // |filter| now contains two filters per register. + // Staggered combinations allow us to take advantage of _mm_maddubs_epi16 + // without overflowing the sign bit. The sign bit is hit only where two taps + // paired in a single madd add up to more than 128. This is only possible with + // two adjacent "inner" taps. Therefore, pairing odd with odd and even with + // even guarantees safety. |sum| is given a negative offset to allow for large + // intermediate values. + // k = 0, 2. + __m128i src_row_window = src_row; + __m128i sum = _mm_set1_epi16(-kFirstPassOffset); + sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window); + + // k = 1, 3. + src_row_window = _mm_srli_si128(src_row_window, 1); + sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8), + _mm_srli_si128(filter[1], 8), src_row_window); + // k = 4, 6. + src_row_window = _mm_srli_si128(src_row_window, 3); + sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window); + + // k = 5, 7. + src_row_window = _mm_srli_si128(src_row_window, 1); + sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8), + _mm_srli_si128(filter[3], 8), src_row_window); + + sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal); + StoreUnaligned16(intermediate_result_row, sum); +} + +template +inline void WriteVerticalFilter(const __m128i filter[8], + const int16_t intermediate_result[15][8], int y, + void* dst_row) { + constexpr int kRoundBitsVertical = + is_compound ? 
kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; + __m128i sum_low = _mm_set1_epi32(kOffsetRemoval); + __m128i sum_high = sum_low; + for (int k = 0; k < 8; k += 2) { + const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]); + const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]); + const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]); + const __m128i intermediate_1 = + LoadUnaligned16(intermediate_result[y + k + 1]); + const __m128i intermediate_low = + _mm_unpacklo_epi16(intermediate_0, intermediate_1); + const __m128i intermediate_high = + _mm_unpackhi_epi16(intermediate_0, intermediate_1); + + const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low); + const __m128i product_high = + _mm_madd_epi16(filters_high, intermediate_high); + sum_low = _mm_add_epi32(sum_low, product_low); + sum_high = _mm_add_epi32(sum_high, product_high); + } + sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical); + sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical); + if (is_compound) { + const __m128i sum = _mm_packs_epi32(sum_low, sum_high); + StoreUnaligned16(static_cast(dst_row), sum); + } else { + const __m128i sum = _mm_packus_epi32(sum_low, sum_high); + StoreLo8(static_cast(dst_row), _mm_packus_epi16(sum, sum)); + } +} + +template +inline void WriteVerticalFilter(const __m128i filter[8], + const int16_t* intermediate_result_column, + void* dst_row) { + constexpr int kRoundBitsVertical = + is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical; + __m128i sum_low = _mm_setzero_si128(); + __m128i sum_high = _mm_setzero_si128(); + for (int k = 0; k < 8; k += 2) { + const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]); + const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]); + // Equivalent to unpacking two vectors made by duplicating int16_t values. 
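+    // For example, with intermediate_result_column[k] = a and
+    // intermediate_result_column[k + 1] = b, the _mm_set1_epi32() below
+    // yields the eight int16_t lanes [ a b a b a b a b ], i.e. the same
+    // result as _mm_unpacklo_epi16(_mm_set1_epi16(a), _mm_set1_epi16(b)),
+    // but built with a single broadcast.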
+    const __m128i intermediate =
+        _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
+                       intermediate_result_column[k]);
+    const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
+    const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
+    sum_low = _mm_add_epi32(sum_low, product_low);
+    sum_high = _mm_add_epi32(sum_high, product_high);
+  }
+  sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
+  sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
+  if (is_compound) {
+    const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
+    StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
+  } else {
+    const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
+    StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
+                           int delta, DestType* dest_row,
+                           ptrdiff_t dest_stride) {
+  int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+  for (int y = 0; y < 8; ++y) {
+    int sy = sy4 - MultiplyBy4(gamma);
+    __m128i filter[8];
+    for (__m128i& f : filter) {
+      const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                         kWarpedPixelPrecisionShifts;
+      f = LoadUnaligned16(kWarpedFilters[offset]);
+      sy += gamma;
+    }
+    Transpose8x8_U16(filter, filter);
+    WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
+    dest_row += dest_stride;
+    sy4 += delta;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
+                           int delta, DestType* dest_row,
+                           ptrdiff_t dest_stride) {
+  int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+  for (int y = 0; y < 8; ++y) {
+    int sy = sy4 - MultiplyBy4(gamma);
+    __m128i filter[8];
+    for (__m128i& f : filter) {
+      const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
+                         kWarpedPixelPrecisionShifts;
+      f = LoadUnaligned16(kWarpedFilters[offset]);
+      sy += gamma;
+    }
+    Transpose8x8_U16(filter, filter);
+    WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
+    dest_row += dest_stride;
+    sy4 += delta;
+  }
+}
+
+template <bool is_compound, typename DestType>
+inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
+                        int source_width, int source_height, int ix4, int iy4,
+                        DestType* dst_row, ptrdiff_t dest_stride) {
+  // Region 1.
+  // Points to the left or right border of the first row of |src|.
+  const uint8_t* first_row_border =
+      (ix4 + 7 <= 0) ? src : src + source_width - 1;
+  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
+  // const int row = Clip3(iy4 + y, 0, source_height - 1);
+  // In two special cases, iy4 + y is clipped to either 0 or
+  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
+  // bounded and we can avoid clipping iy4 + y by relying on a reference
+  // frame's boundary extension on the top and bottom.
+  // In this region every sample used to calculate the prediction block has
+  // the same value, so the whole prediction block has the same value.
+  const int row = (iy4 + 7 <= 0) ?
0 : source_height - 1; + const uint8_t row_border_pixel = first_row_border[row * source_stride]; + + if (is_compound) { + const __m128i sum = + _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical - + kInterRoundBitsCompoundVertical)); + StoreUnaligned16(dst_row, sum); + } else { + memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } +} + +template +inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int y4, int ix4, int iy4, int gamma, + int delta, int16_t intermediate_result_column[15], + DestType* dst_row, ptrdiff_t dest_stride) { + // Region 2. + // Points to the left or right border of the first row of |src|. + const uint8_t* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= (kFilterBits - kInterRoundBitsHorizontal); + intermediate_result_column[y + 7] = sum; + } + // Region 2 vertical filter. + VerticalFilter(intermediate_result_column, y4, gamma, + delta, dst_row, dest_stride); +} + +template +inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride, + int source_height, int alpha, int beta, int x4, int ix4, + int iy4, int16_t intermediate_result[15][8]) { + // Region 3 + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. 
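+  // All 15 intermediate rows in this region filter the same (clipped) source
+  // row, so the 16-byte window below is loaded once, outside the y loop;
+  // only the filter phase sx4 advances per iteration.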
+ const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template +inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha, + int beta, int x4, int ix4, int iy4, + int16_t intermediate_result[15][8]) { + // Region 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + // Horizontal filter. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved in + // warp.cc. + const int row = iy4 + y; + const uint8_t* const src_row = src + row * source_stride; + // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also + // read but is ignored. + // + // NOTE: This may read up to 13 bytes before src_row[0] or up to 14 + // bytes after src_row[source_width - 1]. We assume the source frame + // has left and right borders of at least 13 bytes that extend the + // frame boundary pixels. We also assume there is at least one extra + // padding byte after the right border of the last source row. + const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]); + // Convert src_row_v to int8 (subtract 128). + HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]); + sx4 += beta; + } +} + +template +inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride, + int source_width, int source_height, + const int* warp_params, int subsampling_x, + int subsampling_y, int src_x, int src_y, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta, DestType* dst_row, + ptrdiff_t dest_stride) { + union { + // Intermediate_result is the output of the horizontal filtering and + // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 - + // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t + // type so that we can start with a negative offset and restore it on the + // final filter sum. + int16_t intermediate_result[15][8]; // 15 rows, 8 columns. + // In the simple special cases where the samples in each row are all the + // same, store one sample per row in a column vector. + int16_t intermediate_result_column[15]; + }; + + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. We can divide the plane + // into several regions and handle them differently. 
+ // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame in both directions. One repeated value. + WarpRegion1(src, source_stride, source_width, + source_height, ix4, iy4, dst_row, + dest_stride); + return; + } + // Outside the frame horizontally. Rows repeated. + WarpRegion2( + src, source_stride, source_width, y4, ix4, iy4, gamma, delta, + intermediate_result_column, dst_row, dest_stride); + return; + } + + if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) { + // Outside the frame vertically. + WarpRegion3(src, source_stride, source_height, alpha, + beta, x4, ix4, iy4, intermediate_result); + } else { + // Inside the frame. + WarpRegion4(src, source_stride, alpha, beta, x4, ix4, + iy4, intermediate_result); + } + // Region 3 and 4 vertical filter. + VerticalFilter(intermediate_result, y4, gamma, delta, + dst_row, dest_stride); +} + +template +void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width, + int source_height, const int* warp_params, int subsampling_x, + int subsampling_y, int block_start_x, int block_start_y, + int block_width, int block_height, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta, void* dest, + ptrdiff_t dest_stride) { + const auto* const src = static_cast(source); + using DestType = + typename std::conditional::type; + auto* dst = static_cast(dest); + + // Warp process applies for each 8x8 block. 
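+  // Each iteration of the nested loops below covers one 8x8 tile of the
+  // prediction: HandleWarpBlock() is invoked once per tile, anchored at the
+  // tile's center sample (start_x + 4, start_y + 4), scaled to source
+  // coordinates by the subsampling factors.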
+ assert(block_width >= 8); + assert(block_height >= 8); + const int block_end_x = block_start_x + block_width; + const int block_end_y = block_start_y + block_height; + + const int start_x = block_start_x; + const int start_y = block_start_y; + int src_x = (start_x + 4) << subsampling_x; + int src_y = (start_y + 4) << subsampling_y; + const int end_x = (block_end_x + 4) << subsampling_x; + const int end_y = (block_end_y + 4) << subsampling_y; + do { + DestType* dst_row = dst; + src_x = (start_x + 4) << subsampling_x; + do { + HandleWarpBlock( + src, source_stride, source_width, source_height, warp_params, + subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta, + dst_row, dest_stride); + src_x += (8 << subsampling_x); + dst_row += 8; + } while (src_x < end_x); + dst += 8 * dest_stride; + src_y += (8 << subsampling_y); + } while (src_y < end_y); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->warp = Warp_SSE4_1; + dsp->warp_compound = Warp_SSE4_1; +} + +} // namespace +} // namespace low_bitdepth + +void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 +#else // !LIBGAV1_TARGETING_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void WarpInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 diff --git a/src/dsp/x86/warp_sse4.h b/src/dsp/x86/warp_sse4.h new file mode 100644 index 0000000..a2dc5ca --- /dev/null +++ b/src/dsp/x86/warp_sse4.h @@ -0,0 +1,44 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_ +#define LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_ + +#include "src/dsp/dsp.h" +#include "src/utils/cpu.h" + +namespace libgav1 { +namespace dsp { + +// Initializes Dsp::warp. This function is not thread-safe. +void WarpInit_SSE4_1(); + +} // namespace dsp +} // namespace libgav1 + +#if LIBGAV1_TARGETING_SSE4_1 + +#ifndef LIBGAV1_Dsp8bpp_Warp +#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1 +#endif + +#ifndef LIBGAV1_Dsp8bpp_WarpCompound +#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1 +#endif + +#endif // LIBGAV1_TARGETING_SSE4_1 + +#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_ diff --git a/src/dsp/x86/weight_mask_sse4.cc b/src/dsp/x86/weight_mask_sse4.cc new file mode 100644 index 0000000..dfd5662 --- /dev/null +++ b/src/dsp/x86/weight_mask_sse4.cc @@ -0,0 +1,464 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/x86/weight_mask_sse4.h"
+
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+constexpr int kRoundingBits8bpp = 4;
+
+template <bool mask_is_inverse>
+inline void WeightMask8_SSE4(const int16_t* prediction_0,
+                             const int16_t* prediction_1, uint8_t* mask) {
+  const __m128i pred_0 = LoadAligned16(prediction_0);
+  const __m128i pred_1 = LoadAligned16(prediction_1);
+  const __m128i difference = RightShiftWithRounding_U16(
+      _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
+  const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+  const __m128i difference_offset = _mm_set1_epi8(38);
+  const __m128i adjusted_difference =
+      _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+                    difference_offset);
+  const __m128i mask_ceiling = _mm_set1_epi8(64);
+  const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
+  if (mask_is_inverse) {
+    const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+    StoreLo8(mask, inverted_mask_value);
+  } else {
+    StoreLo8(mask, mask_value);
+  }
+}
+
+#define WEIGHT8_WITHOUT_STRIDE \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+
+#define WEIGHT8_AND_STRIDE \
+  WEIGHT8_WITHOUT_STRIDE;  \
+  pred_0 += 8;             \
+  pred_1 += 8;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
+                        uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+  } while (++y < 7);
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_SSE4(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_SSE4(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+    WEIGHT8_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT8_AND_STRIDE;
+  WEIGHT8_WITHOUT_STRIDE;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE                            \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+
+#define WEIGHT16_AND_STRIDE \
+  WEIGHT16_WITHOUT_STRIDE;  \
+  pred_0 += 16;             \
+  pred_1 += 16;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_SSE4(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+  } while (++y < 7);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT16_AND_STRIDE;
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+    WEIGHT16_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT16_WITHOUT_STRIDE;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE                                           \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask);                \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8);    \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+
+#define WEIGHT32_AND_STRIDE \
+  WEIGHT32_WITHOUT_STRIDE;  \
+  pred_0 += 32;             \
+  pred_1 += 32;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_SSE4(const void* prediction_0, const void* prediction_1,
+                         uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT32_AND_STRIDE;
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+    WEIGHT32_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT32_WITHOUT_STRIDE;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE                                           \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask);                \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8);    \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
+  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+
+#define WEIGHT64_AND_STRIDE \
+  WEIGHT64_WITHOUT_STRIDE;  \
+  pred_0 += 64;             \
+  pred_1 += 64;             \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 5);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y5 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y5 < 6);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_SSE4(const void* prediction_0, const void* prediction_1,
+                          uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_SSE4(const void* prediction_0, const void* prediction_1,
+                           uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  do {
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+    WEIGHT64_AND_STRIDE;
+  } while (++y3 < 42);
+  WEIGHT64_AND_STRIDE;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_SSE4(const void* prediction_0, const void* prediction_1,
+                           uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 21);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_SSE4(const void* prediction_0, const void* prediction_1,
+                            uint8_t* mask, ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int y3 = 0;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (++y3 < 42);
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += adjusted_mask_stride;
+
+  WEIGHT64_WITHOUT_STRIDE;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE;
+}
+
+#define INIT_WEIGHT_MASK_8BPP(width, height, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                      \
+      WeightMask##width##x##height##_SSE4<0>;                  \
+  dsp->weight_mask[w_index][h_index][1] = WeightMask##width##x##height##_SSE4<1>
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  INIT_WEIGHT_MASK_8BPP(8, 8, 0, 0);
+  INIT_WEIGHT_MASK_8BPP(8, 16, 0, 1);
+  INIT_WEIGHT_MASK_8BPP(8, 32, 0, 2);
+  INIT_WEIGHT_MASK_8BPP(16, 8, 1, 0);
+  INIT_WEIGHT_MASK_8BPP(16, 16, 1, 1);
+  INIT_WEIGHT_MASK_8BPP(16, 32, 1, 2);
+  INIT_WEIGHT_MASK_8BPP(16, 64, 1, 3);
+  INIT_WEIGHT_MASK_8BPP(32, 8, 2, 0);
+  INIT_WEIGHT_MASK_8BPP(32, 16, 2, 1);
+  INIT_WEIGHT_MASK_8BPP(32, 32, 2, 2);
+  INIT_WEIGHT_MASK_8BPP(32, 64, 2, 3);
+  INIT_WEIGHT_MASK_8BPP(64, 16, 3, 1);
+  INIT_WEIGHT_MASK_8BPP(64, 32, 3, 2);
+  INIT_WEIGHT_MASK_8BPP(64, 64, 3, 3);
+  INIT_WEIGHT_MASK_8BPP(64, 128, 3, 4);
+  INIT_WEIGHT_MASK_8BPP(128, 64, 4, 3);
+  INIT_WEIGHT_MASK_8BPP(128, 128, 4, 4);
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void WeightMaskInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/src/dsp/x86/weight_mask_sse4.h b/src/dsp/x86/weight_mask_sse4.h
new file mode 100644
index 0000000..07636b7
--- /dev/null
+++ b/src/dsp/x86/weight_mask_sse4.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::weight_mask. This function is not thread-safe.
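+// For each pixel of the two 8bpp compound prediction buffers, the registered
+// functions compute
+//   m = min(38 + (RightShiftWithRounding(|pred_0 - pred_1|, 4) >> 4), 64)
+// with unsigned saturation (a scalar restatement of WeightMask8_SSE4 in
+// weight_mask_sse4.cc); the [1] ("inverse") entries of Dsp::weight_mask
+// store 64 - m instead.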
+void WeightMaskInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
+#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
+#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
+#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
+#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
+#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
+#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
+#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
+#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
+#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
+#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
+#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
+#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
+#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
+#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
+#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
+#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
+#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
diff --git a/src/film_grain.cc b/src/film_grain.cc
new file mode 100644
index 0000000..dac37b5
--- /dev/null
+++ b/src/film_grain.cc
@@ -0,0 +1,817 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ +#include "src/film_grain.h" + +#include +#include +#include +#include +#include +#include + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/array_2d.h" +#include "src/utils/blocking_counter.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/threadpool.h" + +namespace libgav1 { + +namespace { + +// The kGaussianSequence array contains random samples from a Gaussian +// distribution with zero mean and standard deviation of about 512 clipped to +// the range of [-2048, 2047] (representable by a signed integer using 12 bits +// of precision) and rounded to the nearest multiple of 4. +// +// Note: It is important that every element in the kGaussianSequence array be +// less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is +// less than 128 for bitdepth=8 (GrainType=int8_t). +constexpr int16_t kGaussianSequence[/*2048*/] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, 
-332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, 
-600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 
528, 48, 476, -168, -608, 448, 160, -532, -272,
+    28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+    -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+    -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+    -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+    -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+    252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+    312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+    732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+    124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+    -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+    440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+    -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+    648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+    680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+    -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+    -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+    -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+    -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+    372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+    -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+    -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+    -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+    -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+    52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+    716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+    -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+    -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+    104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+    428, -484};
+static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048,
+              "");
+
+// The number of rows in a contiguous group computed by a single worker thread
+// before checking for the next available group.
+constexpr int kFrameChunkHeight = 8;
+
+// |width| and |height| refer to the plane, not the frame, meaning any
+// subsampling should be applied by the caller.
+template <typename Pixel>
+inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride,
+                           int width, int height, uint8_t* dest_plane,
+                           ptrdiff_t dest_stride) {
+  // If it's the same buffer there's nothing to do.
+  if (source_plane == dest_plane) return;
+
+  int y = 0;
+  do {
+    memcpy(dest_plane, source_plane, width * sizeof(Pixel));
+    source_plane += source_stride;
+    dest_plane += dest_stride;
+  } while (++y < height);
+}
+
+}  // namespace
+
+template <int bitdepth>
+FilmGrain<bitdepth>::FilmGrain(const FilmGrainParams& params,
+                               bool is_monochrome,
+                               bool color_matrix_is_identity, int subsampling_x,
+                               int subsampling_y, int width, int height,
+                               ThreadPool* thread_pool)
+    : params_(params),
+      is_monochrome_(is_monochrome),
+      color_matrix_is_identity_(color_matrix_is_identity),
+      subsampling_x_(subsampling_x),
+      subsampling_y_(subsampling_y),
+      width_(width),
+      height_(height),
+      template_uv_width_((subsampling_x != 0) ? kMinChromaWidth
+                                              : kMaxChromaWidth),
+      template_uv_height_((subsampling_y != 0) ? kMinChromaHeight
+                                               : kMaxChromaHeight),
+      thread_pool_(thread_pool) {}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::Init() {
+  // Section 7.18.3.3. Generate grain process.
+  const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+  // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't
+  // need to generate it.
+  const bool use_luma = params_.num_y_points > 0;
+  if (use_luma) {
+    GenerateLumaGrain(params_, luma_grain_);
+    // If params_.auto_regression_coeff_lag is 0, the filter is the identity
+    // filter and therefore can be skipped.
+    if (params_.auto_regression_coeff_lag > 0) {
+      dsp.film_grain
+          .luma_auto_regression[params_.auto_regression_coeff_lag - 1](
+              params_, luma_grain_);
+    }
+  } else {
+    // Have AddressSanitizer warn if luma_grain_ is used.
+    ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
+  }
+  if (!is_monochrome_) {
+    GenerateChromaGrains(params_, template_uv_width_, template_uv_height_,
+                         u_grain_, v_grain_);
+    if (params_.auto_regression_coeff_lag > 0 || use_luma) {
+      dsp.film_grain.chroma_auto_regression[static_cast<int>(
+          use_luma)][params_.auto_regression_coeff_lag](
+          params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_,
+          v_grain_);
+    }
+  }
+
+  // Section 7.18.3.4. Scaling lookup initialization process.
+
+  // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_
+  // is used for the Y plane. If params_.chroma_scaling_from_luma is true,
+  // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are
+  // set up as aliases. So we need to initialize scaling_lut_y_ under these
+  // two conditions.
+  //
+  // Note: Although it does not seem to make sense, there are test vectors
+  // with chroma_scaling_from_luma=true and params_.num_y_points=0.
+  if (use_luma || params_.chroma_scaling_from_luma) {
+    dsp.film_grain.initialize_scaling_lut(
+        params_.num_y_points, params_.point_y_value, params_.point_y_scaling,
+        scaling_lut_y_);
+  } else {
+    ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
+  }
+  if (!is_monochrome_) {
+    if (params_.chroma_scaling_from_luma) {
+      scaling_lut_u_ = scaling_lut_y_;
+      scaling_lut_v_ = scaling_lut_y_;
+    } else if (params_.num_u_points > 0 || params_.num_v_points > 0) {
+      const size_t buffer_size =
+          (kScalingLookupTableSize + kScalingLookupTablePadding) *
+          (static_cast<int>(params_.num_u_points > 0) +
+           static_cast<int>(params_.num_v_points > 0));
+      scaling_lut_chroma_buffer_.reset(new (std::nothrow) uint8_t[buffer_size]);
+      if (scaling_lut_chroma_buffer_ == nullptr) return false;
+
+      uint8_t* buffer = scaling_lut_chroma_buffer_.get();
+      if (params_.num_u_points > 0) {
+        scaling_lut_u_ = buffer;
+        dsp.film_grain.initialize_scaling_lut(
+            params_.num_u_points, params_.point_u_value,
+            params_.point_u_scaling, scaling_lut_u_);
+        buffer += kScalingLookupTableSize + kScalingLookupTablePadding;
+      }
+      if (params_.num_v_points > 0) {
+        scaling_lut_v_ = buffer;
+        dsp.film_grain.initialize_scaling_lut(
+            params_.num_v_points, params_.point_v_value,
+            params_.point_v_scaling, scaling_lut_v_);
+      }
+    }
+  }
+  return true;
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateLumaGrain(const FilmGrainParams& params,
+                                            GrainType* luma_grain) {
+  // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set
+  // the luma_grain array to all zeros. But the Note at the end of Section
+  // 7.18.3.3 says luma_grain "will never be read in this case". So we don't
+  // call GenerateLumaGrain if params.num_y_points is equal to 0.
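+  //
+  // Editorial worked example (values assumed, not from the source): with
+  // bitdepth == 8 and params.grain_scale_shift == 0, shift is 12 - 8 + 0 = 4.
+  // A Gaussian sequence entry of 1652 then yields
+  // RightShiftWithRounding(1652, 4) == (1652 + 8) >> 4 == 103, mapping the
+  // 12-bit scaled noise back into the 8-bit grain range.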
+  assert(params.num_y_points > 0);
+  const int shift = 12 - bitdepth + params.grain_scale_shift;
+  uint16_t seed = params.grain_seed;
+  GrainType* luma_grain_row = luma_grain;
+  for (int y = 0; y < kLumaHeight; ++y) {
+    for (int x = 0; x < kLumaWidth; ++x) {
+      luma_grain_row[x] = RightShiftWithRounding(
+          kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+    }
+    luma_grain_row += kLumaWidth;
+  }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::GenerateChromaGrains(const FilmGrainParams& params,
+                                               int chroma_width,
+                                               int chroma_height,
+                                               GrainType* u_grain,
+                                               GrainType* v_grain) {
+  const int shift = 12 - bitdepth + params.grain_scale_shift;
+  if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) {
+    memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain));
+  } else {
+    uint16_t seed = params.grain_seed ^ 0xb524;
+    GrainType* u_grain_row = u_grain;
+    assert(chroma_width > 0);
+    assert(chroma_height > 0);
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        u_grain_row[x] = RightShiftWithRounding(
+            kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+      } while (++x < chroma_width);
+
+      u_grain_row += chroma_width;
+    } while (++y < chroma_height);
+  }
+  if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) {
+    memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain));
+  } else {
+    GrainType* v_grain_row = v_grain;
+    uint16_t seed = params.grain_seed ^ 0x49d8;
+    int y = 0;
+    do {
+      int x = 0;
+      do {
+        v_grain_row[x] = RightShiftWithRounding(
+            kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
+      } while (++x < chroma_width);
+
+      v_grain_row += chroma_width;
+    } while (++y < chroma_height);
+  }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseStripes() {
+  const int half_height = DivideBy2(height_ + 1);
+  assert(half_height > 0);
+  // ceil(half_height / 16.0)
+  const int max_luma_num = DivideBy16(half_height + 15);
+  constexpr int kNoiseStripeHeight = 34;
+  size_t noise_buffer_size = kNoiseStripePadding;
+  if (params_.num_y_points > 0) {
+    noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_;
+  }
+  if (!is_monochrome_) {
+    noise_buffer_size += 2 * max_luma_num *
+                         (kNoiseStripeHeight >> subsampling_y_) *
+                         SubsampledValue(width_, subsampling_x_);
+  }
+  noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
+  if (noise_buffer_ == nullptr) return false;
+  GrainType* noise_buffer = noise_buffer_.get();
+  if (params_.num_y_points > 0) {
+    noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_,
+                                  noise_buffer);
+    noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
+  }
+  if (!is_monochrome_) {
+    noise_stripes_[kPlaneU].Reset(max_luma_num,
+                                  (kNoiseStripeHeight >> subsampling_y_) *
+                                      SubsampledValue(width_, subsampling_x_),
+                                  noise_buffer);
+    noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
+                    SubsampledValue(width_, subsampling_x_);
+    noise_stripes_[kPlaneV].Reset(max_luma_num,
+                                  (kNoiseStripeHeight >> subsampling_y_) *
+                                      SubsampledValue(width_, subsampling_x_),
+                                  noise_buffer);
+  }
+  return true;
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AllocateNoiseImage() {
+  if (params_.num_y_points > 0 &&
+      !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
+                                   /*zero_initialize=*/false)) {
+    return false;
+  }
+  if (!is_monochrome_) {
+    if (!noise_image_[kPlaneU].Reset(
+            (height_ + subsampling_y_) >> subsampling_y_,
+            ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+            /*zero_initialize=*/false)) {
+      return false;
+    }
+    if (!noise_image_[kPlaneV].Reset(
+            (height_ + subsampling_y_) >> subsampling_y_,
+            ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
+            /*zero_initialize=*/false)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Uses |overlap_flag| to skip rows that are covered by the overlap computation.
+template <int bitdepth>
+void FilmGrain<bitdepth>::ConstructNoiseImage(
+    const Array2DView<GrainType>* noise_stripes, int width, int height,
+    int subsampling_x, int subsampling_y, int stripe_start_offset,
+    Array2D<GrainType>* noise_image) {
+  const int plane_width = (width + subsampling_x) >> subsampling_x;
+  const int plane_height = (height + subsampling_y) >> subsampling_y;
+  const int stripe_height = 32 >> subsampling_y;
+  const int stripe_mask = stripe_height - 1;
+  int y = 0;
+  // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y up
+  // to either 16 or 32.
+  const GrainType* first_noise_stripe = (*noise_stripes)[0];
+  do {
+    memcpy((*noise_image)[y], first_noise_stripe + y * plane_width,
+           plane_width * sizeof(first_noise_stripe[0]));
+  } while (++y < std::min(stripe_height, plane_height));
+  // End special iterations for luma_num == 0.
+
+  int luma_num = 1;
+  for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    int i = stripe_start_offset;
+    do {
+      memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+             plane_width * sizeof(noise_stripe[0]));
+    } while (++i < stripe_height);
+  }
+
+  // If there is a partial stripe, copy any rows beyond the overlap rows.
+  const int remaining_height = plane_height - y;
+  if (remaining_height > stripe_start_offset) {
+    assert(luma_num < noise_stripes->rows());
+    const GrainType* noise_stripe = (*noise_stripes)[luma_num];
+    int i = stripe_start_offset;
+    do {
+      memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
+             plane_width * sizeof(noise_stripe[0]));
+    } while (++i < remaining_height);
+  }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseChromaWorker(
+    const dsp::Dsp& dsp, const Plane* planes, int num_planes,
+    std::atomic<int>* job_counter, int min_value, int max_chroma,
+    const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+    const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+    ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v,
+    ptrdiff_t dest_stride_uv) {
+  assert(num_planes > 0);
+  const int full_jobs_per_plane = height_ / kFrameChunkHeight;
+  const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+  const int total_full_jobs = full_jobs_per_plane * num_planes;
+  // If the frame height is not a multiple of kFrameChunkHeight, one job with
+  // a smaller number of rows is necessary at the end of each plane.
+  const int total_jobs =
+      total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes);
+  int job_index;
+  // Each job corresponds to a slice of kFrameChunkHeight rows in the luma
+  // plane. dsp->blend_noise_chroma handles subsampling.
+  // This loop body handles a slice of one plane or the other, depending on
+  // which are active. That way, threads working on consecutive jobs will keep
+  // the same region of luma source in working memory.
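+  //
+  // Editorial worked example (values assumed, not from the source): with
+  // height_ == 100, kFrameChunkHeight == 8 and num_planes == 2, there are
+  // 12 full jobs per plane plus one 4-row remainder job each, so
+  // total_jobs == 26. Job 5 then selects plane planes[5 % 2] and
+  // slice_index 5 / 2 == 2, i.e. luma rows [16, 24).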
+  while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+         total_jobs) {
+    const Plane plane = planes[job_index % num_planes];
+    const int slice_index = job_index / num_planes;
+    const int start_height = slice_index * kFrameChunkHeight;
+    const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+    const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+        source_plane_y + start_height * source_stride_y);
+    const uint8_t* scaling_lut_uv;
+    const uint8_t* source_plane_uv;
+    uint8_t* dest_plane_uv;
+
+    if (plane == kPlaneU) {
+      scaling_lut_uv = scaling_lut_u_;
+      source_plane_uv = source_plane_u;
+      dest_plane_uv = dest_plane_u;
+    } else {
+      assert(plane == kPlaneV);
+      scaling_lut_uv = scaling_lut_v_;
+      source_plane_uv = source_plane_v;
+      dest_plane_uv = dest_plane_v;
+    }
+    const auto* source_cursor_uv = reinterpret_cast<const Pixel*>(
+        source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv);
+    auto* dest_cursor_uv = reinterpret_cast<Pixel*>(
+        dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv);
+    dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+        plane, params_, noise_image_, min_value, max_chroma, width_, job_height,
+        start_height, subsampling_x_, subsampling_y_, scaling_lut_uv,
+        source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv,
+        dest_cursor_uv, dest_stride_uv);
+  }
+}
+
+template <int bitdepth>
+void FilmGrain<bitdepth>::BlendNoiseLumaWorker(
+    const dsp::Dsp& dsp, std::atomic<int>* job_counter, int min_value,
+    int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+    uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) {
+  const int total_full_jobs = height_ / kFrameChunkHeight;
+  const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
+  const int total_jobs =
+      total_full_jobs + static_cast<int>(remainder_job_height > 0);
+  int job_index;
+  // Each job is some number of rows in a plane.
+  while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
+         total_jobs) {
+    const int start_height = job_index * kFrameChunkHeight;
+    const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
+
+    const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
+        source_plane_y + start_height * source_stride_y);
+    auto* dest_cursor_y =
+        reinterpret_cast<Pixel*>(dest_plane_y + start_height * dest_stride_y);
+    dsp.film_grain.blend_noise_luma(
+        noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+        job_height, start_height, scaling_lut_y_, source_cursor_y,
+        source_stride_y, dest_cursor_y, dest_stride_y);
+  }
+}
+
+template <int bitdepth>
+bool FilmGrain<bitdepth>::AddNoise(
+    const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+    const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+    ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y,
+    uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) {
+  if (!Init()) {
+    LIBGAV1_DLOG(ERROR, "Init() failed.");
+    return false;
+  }
+  if (!AllocateNoiseStripes()) {
+    LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
+    return false;
+  }
+
+  const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
+  const bool use_luma = params_.num_y_points > 0;
+
+  // Construct noise stripes.
+  if (use_luma) {
+    // The luma plane is never subsampled.
+    dsp.film_grain
+        .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+            luma_grain_, params_.grain_seed, width_, height_,
+            /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]);
+  }
+  if (!is_monochrome_) {
+    dsp.film_grain
+        .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+            u_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+            subsampling_y_, &noise_stripes_[kPlaneU]);
+    dsp.film_grain
+        .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
+            v_grain_, params_.grain_seed, width_, height_, subsampling_x_,
+            subsampling_y_, &noise_stripes_[kPlaneV]);
+  }
+
+  if (!AllocateNoiseImage()) {
+    LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
+    return false;
+  }
+
+  // Construct noise image.
+  if (use_luma) {
+    ConstructNoiseImage(
+        &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+        /*subsampling_y=*/0, static_cast<int>(params_.overlap_flag) << 1,
+        &noise_image_[kPlaneY]);
+    if (params_.overlap_flag) {
+      dsp.film_grain.construct_noise_image_overlap(
+          &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
+          /*subsampling_y=*/0, &noise_image_[kPlaneY]);
+    }
+  }
+  if (!is_monochrome_) {
+    ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_,
+                        subsampling_x_, subsampling_y_,
+                        static_cast<int>(params_.overlap_flag)
+                            << (1 - subsampling_y_),
+                        &noise_image_[kPlaneU]);
+    ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_,
+                        subsampling_x_, subsampling_y_,
+                        static_cast<int>(params_.overlap_flag)
+                            << (1 - subsampling_y_),
+                        &noise_image_[kPlaneV]);
+    if (params_.overlap_flag) {
+      dsp.film_grain.construct_noise_image_overlap(
+          &noise_stripes_[kPlaneU], width_, height_, subsampling_x_,
+          subsampling_y_, &noise_image_[kPlaneU]);
+      dsp.film_grain.construct_noise_image_overlap(
+          &noise_stripes_[kPlaneV], width_, height_, subsampling_x_,
+          subsampling_y_, &noise_image_[kPlaneV]);
+    }
+  }
+
+  // Blend noise image.
+  int min_value;
+  int max_luma;
+  int max_chroma;
+  if (params_.clip_to_restricted_range) {
+    min_value = 16 << (bitdepth - 8);
+    max_luma = 235 << (bitdepth - 8);
+    if (color_matrix_is_identity_) {
+      max_chroma = max_luma;
+    } else {
+      max_chroma = 240 << (bitdepth - 8);
+    }
+  } else {
+    min_value = 0;
+    max_luma = (256 << (bitdepth - 8)) - 1;
+    max_chroma = max_luma;
+  }
+
+  // Handle all chroma planes first because luma source may be altered in place.
+  if (!is_monochrome_) {
+    // This is done in a strange way but Vector can't be passed by copy to the
+    // lambda capture that spawns the thread.
+    Plane planes_to_blend[2];
+    int num_planes = 0;
+    if (params_.chroma_scaling_from_luma) {
+      // Both noise planes are computed from the luma scaling lookup table.
+      planes_to_blend[num_planes++] = kPlaneU;
+      planes_to_blend[num_planes++] = kPlaneV;
+    } else {
+      const int height_uv = SubsampledValue(height_, subsampling_y_);
+      const int width_uv = SubsampledValue(width_, subsampling_x_);
+
+      // Noise is applied according to a lookup table defined by piecewise
+      // linear "points." If the lookup table is empty, that corresponds to
+      // outputting zero noise.
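+      //
+      // Editorial example (hypothetical parameter values, not from the
+      // source): num_u_points == 2 with point_u_value == {0, 255} and
+      // point_u_scaling == {0, 64} defines a table that ramps linearly from
+      // 0 to 64, so brighter U samples receive proportionally stronger
+      // noise; num_u_points == 0 leaves the U plane untouched, as handled
+      // below.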
+      if (params_.num_u_points == 0) {
+        CopyImagePlane<Pixel>(source_plane_u, source_stride_uv, width_uv,
+                              height_uv, dest_plane_u, dest_stride_uv);
+      } else {
+        planes_to_blend[num_planes++] = kPlaneU;
+      }
+      if (params_.num_v_points == 0) {
+        CopyImagePlane<Pixel>(source_plane_v, source_stride_uv, width_uv,
+                              height_uv, dest_plane_v, dest_stride_uv);
+      } else {
+        planes_to_blend[num_planes++] = kPlaneV;
+      }
+    }
+    if (thread_pool_ != nullptr && num_planes > 0) {
+      const int num_workers = thread_pool_->num_threads();
+      BlockingCounter pending_workers(num_workers);
+      std::atomic<int> job_counter(0);
+      for (int i = 0; i < num_workers; ++i) {
+        thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend,
+                                num_planes, &job_counter, min_value, max_chroma,
+                                source_plane_y, source_stride_y, source_plane_u,
+                                source_plane_v, source_stride_uv, dest_plane_u,
+                                dest_plane_v, dest_stride_uv]() {
+          BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter,
+                                 min_value, max_chroma, source_plane_y,
+                                 source_stride_y, source_plane_u,
+                                 source_plane_v, source_stride_uv, dest_plane_u,
+                                 dest_plane_v, dest_stride_uv);
+          pending_workers.Decrement();
+        });
+      }
+      BlendNoiseChromaWorker(
+          dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma,
+          source_plane_y, source_stride_y, source_plane_u, source_plane_v,
+          source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv);
+
+      pending_workers.Wait();
+    } else {
+      // Single threaded.
+      if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) {
+        dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+            kPlaneU, params_, noise_image_, min_value, max_chroma, width_,
+            height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+            scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u,
+            source_stride_uv, dest_plane_u, dest_stride_uv);
+      }
+      if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) {
+        dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
+            kPlaneV, params_, noise_image_, min_value, max_chroma, width_,
+            height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
+            scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v,
+            source_stride_uv, dest_plane_v, dest_stride_uv);
+      }
+    }
+  }
+  if (use_luma) {
+    if (thread_pool_ != nullptr) {
+      const int num_workers = thread_pool_->num_threads();
+      BlockingCounter pending_workers(num_workers);
+      std::atomic<int> job_counter(0);
+      for (int i = 0; i < num_workers; ++i) {
+        thread_pool_->Schedule(
+            [this, dsp, &pending_workers, &job_counter, min_value, max_luma,
+             source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() {
+              BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+                                   source_plane_y, source_stride_y,
+                                   dest_plane_y, dest_stride_y);
+              pending_workers.Decrement();
+            });
+      }
+
+      BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
+                           source_plane_y, source_stride_y, dest_plane_y,
+                           dest_stride_y);
+      pending_workers.Wait();
+    } else {
+      dsp.film_grain.blend_noise_luma(
+          noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
+          height_, /*start_height=*/0, scaling_lut_y_, source_plane_y,
+          source_stride_y, dest_plane_y, dest_stride_y);
+    }
+  } else {
+    CopyImagePlane<Pixel>(source_plane_y, source_stride_y, width_, height_,
+                          dest_plane_y, dest_stride_y);
+  }
+
+  return true;
+}
+
+// Explicit instantiations.
+template class FilmGrain<8>;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template class FilmGrain<10>;
+#endif
+
+}  // namespace libgav1
diff --git a/src/film_grain.h b/src/film_grain.h
new file mode 100644
index 0000000..b588f6d
--- /dev/null
+++ b/src/film_grain.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FILM_GRAIN_H_
+#define LIBGAV1_SRC_FILM_GRAIN_H_
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Film grain synthesis function signature. Section 7.18.3.
+// This function generates film grain noise and blends the noise with the
+// decoded frame.
+// |source_plane_y|, |source_plane_u|, and |source_plane_v| are the plane
+// buffers of the decoded frame. They are blended with the film grain noise and
+// written to |dest_plane_y|, |dest_plane_u|, and |dest_plane_v| as final
+// output for display. |source_plane_p| and |dest_plane_p| (where p is y, u, or
+// v) may point to the same buffer, in which case the film grain noise is added
+// in place.
+// |film_grain_params| are the parameters read from the frame header.
+// If |is_monochrome| is true, only the Y plane needs to be processed.
+// |color_matrix_is_identity| is true if the matrix_coefficients field in the
+// sequence header's color config is MC_IDENTITY.
+// |width| is the upscaled width of the frame.
+// |height| is the frame height.
+// |subsampling_x| and |subsampling_y| are subsamplings for UV planes, not used
+// if |is_monochrome| is true.
+// Returns true on success, or false on failure (e.g., out of memory).
+using FilmGrainSynthesisFunc = bool (*)(
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_u, ptrdiff_t source_stride_u,
+    const void* source_plane_v, ptrdiff_t source_stride_v,
+    const FilmGrainParams& film_grain_params, bool is_monochrome,
+    bool color_matrix_is_identity, int width, int height, int subsampling_x,
+    int subsampling_y, void* dest_plane_y, ptrdiff_t dest_stride_y,
+    void* dest_plane_u, ptrdiff_t dest_stride_u, void* dest_plane_v,
+    ptrdiff_t dest_stride_v);
+
+// Section 7.18.3.5. Add noise synthesis process.
+template <int bitdepth>
+class FilmGrain {
+ public:
+  using GrainType =
+      typename std::conditional<bitdepth == 8, int8_t, int16_t>::type;
+
+  FilmGrain(const FilmGrainParams& params, bool is_monochrome,
+            bool color_matrix_is_identity, int subsampling_x, int subsampling_y,
+            int width, int height, ThreadPool* thread_pool);
+
+  // Note: These static methods are declared public so that the unit tests can
+  // call them.
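+  //
+  // Editorial sketch of such a test call (hypothetical code, not upstream):
+  //   FilmGrain<8>::GrainType luma_grain[kLumaHeight * kLumaWidth];
+  //   FilmGrain<8>::GenerateLumaGrain(film_grain_params, luma_grain);
+  // where |film_grain_params| has num_y_points > 0.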
+
+  static void GenerateLumaGrain(const FilmGrainParams& params,
+                                GrainType* luma_grain);
+
+  // Generates white noise arrays u_grain and v_grain chroma_width samples wide
+  // and chroma_height samples high.
+  static void GenerateChromaGrains(const FilmGrainParams& params,
+                                   int chroma_width, int chroma_height,
+                                   GrainType* u_grain, GrainType* v_grain);
+
+  // Copies rows from |noise_stripes| to |noise_image|, skipping rows that are
+  // subject to overlap.
+  static void ConstructNoiseImage(const Array2DView<GrainType>* noise_stripes,
+                                  int width, int height, int subsampling_x,
+                                  int subsampling_y, int stripe_start_offset,
+                                  Array2D<GrainType>* noise_image);
+
+  // Combines the film grain with the image data.
+  bool AddNoise(const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
+                const uint8_t* source_plane_u, const uint8_t* source_plane_v,
+                ptrdiff_t source_stride_uv, uint8_t* dest_plane_y,
+                ptrdiff_t dest_stride_y, uint8_t* dest_plane_u,
+                uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+ private:
+  using Pixel =
+      typename std::conditional<bitdepth == 8, uint8_t, uint16_t>::type;
+
+  bool Init();
+
+  // Allocates noise_stripes_.
+  bool AllocateNoiseStripes();
+
+  bool AllocateNoiseImage();
+
+  void BlendNoiseChromaWorker(const dsp::Dsp& dsp, const Plane* planes,
+                              int num_planes, std::atomic<int>* job_counter,
+                              int min_value, int max_chroma,
+                              const uint8_t* source_plane_y,
+                              ptrdiff_t source_stride_y,
+                              const uint8_t* source_plane_u,
+                              const uint8_t* source_plane_v,
+                              ptrdiff_t source_stride_uv, uint8_t* dest_plane_u,
+                              uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv);
+
+  void BlendNoiseLumaWorker(const dsp::Dsp& dsp, std::atomic<int>* job_counter,
+                            int min_value, int max_luma,
+                            const uint8_t* source_plane_y,
+                            ptrdiff_t source_stride_y, uint8_t* dest_plane_y,
+                            ptrdiff_t dest_stride_y);
+
+  const FilmGrainParams& params_;
+  const bool is_monochrome_;
+  const bool color_matrix_is_identity_;
+  const int subsampling_x_;
+  const int subsampling_y_;
+  // Frame width and height.
+  const int width_;
+  const int height_;
+  // Section 7.18.3.3, Dimensions of the noise templates for chroma, which are
+  // known as CbGrain and CrGrain.
+  // These templates are used to construct the noise image for each plane by
+  // copying 32x32 blocks with pseudorandom offsets, into "noise stripes."
+  // The noise template known as LumaGrain array is an 82x73 block.
+  // The height and width of the templates for chroma become 44 and 38 under
+  // subsampling, respectively.
+  // For more details see:
+  // A. Norkin and N. Birkbeck, "Film Grain Synthesis for AV1 Video Codec," 2018
+  // Data Compression Conference, Snowbird, UT, 2018, pp. 3-12.
+  const int template_uv_width_;
+  const int template_uv_height_;
+  // LumaGrain. The luma_grain array contains white noise generated for luma.
+  // The array size is fixed but subject to further optimization for SIMD.
+  GrainType luma_grain_[kLumaHeight * kLumaWidth];
+  // CbGrain and CrGrain. The maximum size of the u_grain and v_grain arrays is
+  // kMaxChromaHeight * kMaxChromaWidth. The actual size is
+  // template_uv_height_ * template_uv_width_.
+  GrainType u_grain_[kMaxChromaHeight * kMaxChromaWidth];
+  GrainType v_grain_[kMaxChromaHeight * kMaxChromaWidth];
+  // Scaling lookup tables.
+  uint8_t scaling_lut_y_[kScalingLookupTableSize + kScalingLookupTablePadding];
+  uint8_t* scaling_lut_u_ = nullptr;
+  uint8_t* scaling_lut_v_ = nullptr;
+  // If allocated, this buffer is 256 * 2 bytes long and scaling_lut_u_ and
+  // scaling_lut_v_ point into this buffer. Otherwise, scaling_lut_u_ and
+  // scaling_lut_v_ point to scaling_lut_y_.
+  std::unique_ptr<uint8_t[]> scaling_lut_chroma_buffer_;
+
+  // A two-dimensional array of noise data for each plane. Generated for each 32
+  // luma sample high stripe of the image. The first dimension is called
+  // luma_num. The second dimension is the size of one noise stripe.
+  //
+  // Each row of the Array2DView noise_stripes_[plane] is a conceptually
+  // two-dimensional array of |GrainType|s. The two-dimensional array of
+  // |GrainType|s is flattened into a one-dimensional buffer in this
+  // implementation.
+  //
+  // noise_stripes_[kPlaneY][luma_num] is an array that has 34 rows and
+  // |width_| columns and contains noise for the luma component.
+  //
+  // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
+  // is an array that has (34 >> subsampling_y_) rows and
+  // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+  // chroma components.
+  Array2DView<GrainType> noise_stripes_[kMaxPlanes];
+  // Owns the memory that the elements of noise_stripes_ point to.
+  std::unique_ptr<GrainType[]> noise_buffer_;
+
+  Array2D<GrainType> noise_image_[kMaxPlanes];
+  ThreadPool* const thread_pool_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FILM_GRAIN_H_
diff --git a/src/frame_buffer.cc b/src/frame_buffer.cc
new file mode 100644
index 0000000..50c7756
--- /dev/null
+++ b/src/frame_buffer.cc
@@ -0,0 +1,151 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/frame_buffer.h"
+
+#include <cstdint>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+
+extern "C" {
+
+Libgav1StatusCode Libgav1ComputeFrameBufferInfo(
+    int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment, Libgav1FrameBufferInfo* info) {
+  switch (bitdepth) {
+    case 8:
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    case 10:
+#endif
+#if LIBGAV1_MAX_BITDEPTH == 12
+    case 12:
+#endif
+      break;
+    default:
+      return kLibgav1StatusInvalidArgument;
+  }
+  switch (image_format) {
+    case kLibgav1ImageFormatYuv420:
+    case kLibgav1ImageFormatYuv422:
+    case kLibgav1ImageFormatYuv444:
+    case kLibgav1ImageFormatMonochrome400:
+      break;
+    default:
+      return kLibgav1StatusInvalidArgument;
+  }
+  // All int arguments must be nonnegative. Borders must be a multiple of 2.
+  // |stride_alignment| must be a power of 2.
+  if ((width | height | left_border | right_border | top_border |
+       bottom_border | stride_alignment) < 0 ||
+      ((left_border | right_border | top_border | bottom_border) & 1) != 0 ||
+      (stride_alignment & (stride_alignment - 1)) != 0 || info == nullptr) {
+    return kLibgav1StatusInvalidArgument;
+  }
+
+  bool is_monochrome;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+  libgav1::DecomposeImageFormat(image_format, &is_monochrome, &subsampling_x,
+                                &subsampling_y);
+
+  // Calculate y_stride (in bytes). It is padded to a multiple of
+  // |stride_alignment| bytes.
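+  //
+  // Editorial worked example (values assumed, not from the source): with
+  // width == 1280, left_border == right_border == 8, bitdepth == 8 and
+  // stride_alignment == 16, y_stride is Align(1296, 16) == 1296 bytes; at
+  // bitdepth 10 the row doubles to 2592 bytes, which is already 16-byte
+  // aligned.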
+  int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+  y_stride = libgav1::Align(y_stride, stride_alignment);
+  // Size of the Y buffer in bytes.
+  const uint64_t y_buffer_size =
+      (height + top_border + bottom_border) * static_cast<uint64_t>(y_stride) +
+      (stride_alignment - 1);
+
+  const int uv_width =
+      is_monochrome ? 0 : libgav1::SubsampledValue(width, subsampling_x);
+  const int uv_height =
+      is_monochrome ? 0 : libgav1::SubsampledValue(height, subsampling_y);
+  const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+  const int uv_right_border = is_monochrome ? 0 : right_border >> subsampling_x;
+  const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+  const int uv_bottom_border =
+      is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+  // Calculate uv_stride (in bytes). It is padded to a multiple of
+  // |stride_alignment| bytes.
+  int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+  uv_stride = libgav1::Align(uv_stride, stride_alignment);
+  // Size of the U or V buffer in bytes.
+  const uint64_t uv_buffer_size =
+      is_monochrome ? 0
+                    : (uv_height + uv_top_border + uv_bottom_border) *
+                              static_cast<uint64_t>(uv_stride) +
+                          (stride_alignment - 1);
+
+  // Check if it is safe to cast y_buffer_size and uv_buffer_size to size_t.
+  if (y_buffer_size > SIZE_MAX || uv_buffer_size > SIZE_MAX) {
+    return kLibgav1StatusInvalidArgument;
+  }
+
+  int left_border_bytes = left_border;
+  int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth > 8) {
+    left_border_bytes *= sizeof(uint16_t);
+    uv_left_border_bytes *= sizeof(uint16_t);
+  }
+#endif
+
+  info->y_stride = y_stride;
+  info->uv_stride = uv_stride;
+  info->y_buffer_size = static_cast<size_t>(y_buffer_size);
+  info->uv_buffer_size = static_cast<size_t>(uv_buffer_size);
+  info->y_plane_offset = top_border * y_stride + left_border_bytes;
+  info->uv_plane_offset = uv_top_border * uv_stride + uv_left_border_bytes;
+  info->stride_alignment = stride_alignment;
+  return kLibgav1StatusOk;
+}
+
+Libgav1StatusCode Libgav1SetFrameBuffer(const Libgav1FrameBufferInfo* info,
+                                        uint8_t* y_buffer, uint8_t* u_buffer,
+                                        uint8_t* v_buffer,
+                                        void* buffer_private_data,
+                                        Libgav1FrameBuffer* frame_buffer) {
+  if (info == nullptr ||
+      (info->uv_buffer_size == 0 &&
+       (u_buffer != nullptr || v_buffer != nullptr)) ||
+      frame_buffer == nullptr) {
+    return kLibgav1StatusInvalidArgument;
+  }
+  if (y_buffer == nullptr || (info->uv_buffer_size != 0 &&
+                              (u_buffer == nullptr || v_buffer == nullptr))) {
+    return kLibgav1StatusOutOfMemory;
+  }
+  frame_buffer->plane[0] = libgav1::AlignAddr(y_buffer + info->y_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->plane[1] = libgav1::AlignAddr(u_buffer + info->uv_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->plane[2] = libgav1::AlignAddr(v_buffer + info->uv_plane_offset,
+                                              info->stride_alignment);
+  frame_buffer->stride[0] = info->y_stride;
+  frame_buffer->stride[1] = frame_buffer->stride[2] = info->uv_stride;
+  frame_buffer->private_data = buffer_private_data;
+  return kLibgav1StatusOk;
+}
+
+}  // extern "C"
diff --git a/src/frame_buffer_utils.h b/src/frame_buffer_utils.h
new file mode 100644
index 0000000..d41437e
--- /dev/null
+++ b/src/frame_buffer_utils.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+#define LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// The following table is from Section 6.4.2 of the spec.
+//
+// subsampling_x   subsampling_y   mono_chrome   Description
+// -----------------------------------------------------------
+// 0               0               0             YUV 4:4:4
+// 1               0               0             YUV 4:2:2
+// 1               1               0             YUV 4:2:0
+// 1               1               1             Monochrome 4:0:0
+
+inline Libgav1ImageFormat ComposeImageFormat(bool is_monochrome,
+                                             int8_t subsampling_x,
+                                             int8_t subsampling_y) {
+  Libgav1ImageFormat image_format;
+  if (subsampling_x == 0) {
+    assert(subsampling_y == 0 && !is_monochrome);
+    image_format = kLibgav1ImageFormatYuv444;
+  } else if (subsampling_y == 0) {
+    assert(!is_monochrome);
+    image_format = kLibgav1ImageFormatYuv422;
+  } else if (!is_monochrome) {
+    image_format = kLibgav1ImageFormatYuv420;
+  } else {
+    image_format = kLibgav1ImageFormatMonochrome400;
+  }
+  return image_format;
+}
+
+inline void DecomposeImageFormat(Libgav1ImageFormat image_format,
+                                 bool* is_monochrome, int8_t* subsampling_x,
+                                 int8_t* subsampling_y) {
+  *is_monochrome = false;
+  *subsampling_x = 1;
+  *subsampling_y = 1;
+  switch (image_format) {
+    case kLibgav1ImageFormatYuv420:
+      break;
+    case kLibgav1ImageFormatYuv422:
+      *subsampling_y = 0;
+      break;
+    case kLibgav1ImageFormatYuv444:
+      *subsampling_x = *subsampling_y = 0;
+      break;
+    default:
+      assert(image_format == kLibgav1ImageFormatMonochrome400);
+      *is_monochrome = true;
+      break;
+  }
+}
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FRAME_BUFFER_UTILS_H_
diff --git a/src/frame_scratch_buffer.h b/src/frame_scratch_buffer.h
new file mode 100644
index 0000000..90c3bb8
--- /dev/null
+++ b/src/frame_scratch_buffer.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+#define LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
+
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+
+#include "src/loop_restoration_info.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/threading_strategy.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Buffer used to store the unfiltered pixels that are necessary for decoding
+// the next superblock row (for the intra prediction process).
+using IntraPredictionBuffer =
+    std::array<AlignedUniquePtr<uint8_t>, kMaxPlanes>;
+
+// Buffer to facilitate decoding a frame. This struct is used only within
+// DecoderImpl::DecodeTiles().
+struct FrameScratchBuffer {
+  LoopRestorationInfo loop_restoration_info;
+  Array2D<int16_t> cdef_index;
+  Array2D<TransformSize> inter_transform_sizes;
+  BlockParametersHolder block_parameters_holder;
+  TemporalMotionField motion_field;
+  SymbolDecoderContext symbol_decoder_context;
+  std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
+  // Buffer used to store the cdef borders. This buffer will store 4 rows for
+  // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+  // indices of the rows that are stored are specified in |kCdefBorderRows|.
+  YuvBuffer cdef_border;
+  AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
+  // Buffer used to temporarily store the input row for applying SuperRes.
+  YuvBuffer superres_line_buffer;
+  // Buffer used to store the loop restoration borders. This buffer will store 4
+  // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+  // subsampling). The indices of the rows that are stored are specified in
+  // |kLoopRestorationBorderRows|.
+  YuvBuffer loop_restoration_border;
+  // The size of this dynamic buffer is |tile_rows|.
+  DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
+  TileScratchBufferPool tile_scratch_buffer_pool;
+  ThreadingStrategy threading_strategy;
+  std::mutex superblock_row_mutex;
+  // The size of this buffer is the number of superblock rows.
+  // |superblock_row_progress[i]| is incremented whenever a tile finishes
+  // decoding superblock row at index i. If the count reaches tile_columns, then
+  // |superblock_row_progress_condvar[i]| is notified.
+  DynamicBuffer<int> superblock_row_progress
+      LIBGAV1_GUARDED_BY(superblock_row_mutex);
+  // The size of this buffer is the number of superblock rows. Used to wait for
+  // |superblock_row_progress[i]| to reach tile_columns.
+  DynamicBuffer<std::condition_variable> superblock_row_progress_condvar;
+  // Used to signal tile decoding failure in the combined multithreading mode.
+  bool tile_decoding_failed LIBGAV1_GUARDED_BY(superblock_row_mutex);
+};
+
+class FrameScratchBufferPool {
+ public:
+  std::unique_ptr<FrameScratchBuffer> Get() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (!buffers_.Empty()) {
+      return buffers_.Pop();
+    }
+    lock.unlock();
+    std::unique_ptr<FrameScratchBuffer> scratch_buffer(new (std::nothrow)
+                                                           FrameScratchBuffer);
+    return scratch_buffer;
+  }
+
+  void Release(std::unique_ptr<FrameScratchBuffer> scratch_buffer) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffers_.Push(std::move(scratch_buffer));
+  }
+
+ private:
+  std::mutex mutex_;
+  Stack<std::unique_ptr<FrameScratchBuffer>, kMaxThreads> buffers_
+      LIBGAV1_GUARDED_BY(mutex_);
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_FRAME_SCRATCH_BUFFER_H_
diff --git a/src/gav1/decoder.h b/src/gav1/decoder.h
new file mode 100644
index 0000000..da08da9
--- /dev/null
+++ b/src/gav1/decoder.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_H_
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+// IWYU pragma: begin_exports
+#include "gav1/decoder_buffer.h"
+#include "gav1/decoder_settings.h"
+#include "gav1/frame_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+#include "gav1/version.h"
+// IWYU pragma: end_exports
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct Libgav1Decoder;
+typedef struct Libgav1Decoder Libgav1Decoder;
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderCreate(
+    const Libgav1DecoderSettings* settings, Libgav1Decoder** decoder_out);
+
+LIBGAV1_PUBLIC void Libgav1DecoderDestroy(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderEnqueueFrame(
+    Libgav1Decoder* decoder, const uint8_t* data, size_t size,
+    int64_t user_private_data, void* buffer_private_data);
+
+LIBGAV1_PUBLIC Libgav1StatusCode Libgav1DecoderDequeueFrame(
+    Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr);
+
+LIBGAV1_PUBLIC Libgav1StatusCode
+Libgav1DecoderSignalEOS(Libgav1Decoder* decoder);
+
+LIBGAV1_PUBLIC int Libgav1DecoderGetMaxBitdepth(void);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Forward declaration.
+class DecoderImpl;
+
+class LIBGAV1_PUBLIC Decoder {
+ public:
+  Decoder();
+  ~Decoder();
+
+  // Init must be called exactly once per instance. Subsequent calls will do
+  // nothing. If |settings| is nullptr, the decoder will be initialized with
+  // default settings. Returns kStatusOk on success, an error status otherwise.
+  StatusCode Init(const DecoderSettings* settings);
+
+  // Enqueues a compressed frame to be decoded.
+  //
+  // This function returns:
+  //   * kStatusOk on success
+  //   * kStatusTryAgain if the decoder queue is full
+  //   * an error status otherwise.
+  //
+  // |user_private_data| may be used to associate application specific private
+  // data with the compressed frame. It will be copied to the user_private_data
+  // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
+  // call.
+  //
+  // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
+  // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
+  // alive until:
+  // 1) If |settings_.release_input_buffer| is not nullptr, then |data| buffer
+  //    must be kept alive until release_input_buffer is called with the
+  //    |buffer_private_data| passed into this EnqueueFrame call.
+  // 2) If |settings_.release_input_buffer| is nullptr, then |data| buffer must
+  //    be kept alive until the corresponding DequeueFrame() call is completed.
+  //
+  // If the call to |EnqueueFrame()| is not successful, then libgav1 will not
+  // hold any references to the |data| buffer. |settings_.release_input_buffer|
+  // callback will not be called in that case.
+  StatusCode EnqueueFrame(const uint8_t* data, size_t size,
+                          int64_t user_private_data, void* buffer_private_data);
+
+  // Dequeues a decompressed frame. If there are enqueued compressed frames,
+  // decodes one and sets |*out_ptr| to the last displayable frame in the
+  // compressed frame. If there are no displayable frames available, sets
+  // |*out_ptr| to nullptr.
+  //
+  // Returns kStatusOk on success. Returns kStatusNothingToDequeue if there are
+  // no enqueued frames (in this case out_ptr will always be set to nullptr).
+  // Returns one of the other error statuses if there is an error.
+  //
+  // If |settings_.blocking_dequeue| is false and the decoder is operating in
+  // frame parallel mode (|settings_.frame_parallel| is true and the video
+  // stream passes the decoder's heuristics for enabling frame parallel mode),
+  // then this call will return kStatusTryAgain if an enqueued frame is not yet
+  // decoded (it is a non blocking call in this case). In all other cases, this
+  // call will block until an enqueued frame has been decoded.
+  StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
+
+  // Signals the end of stream.
+  //
+  // In non-frame-parallel mode, this function will release all the frames held
+  // by the decoder. If the frame buffers were allocated by libgav1, then the
+  // pointer obtained by the prior DequeueFrame call will no longer be valid. If
+  // the frame buffers were allocated by the application, then any references
+  // that libgav1 is holding on to will be released.
+  //
+  // Once this function returns successfully, the decoder state will be reset
+  // and the decoder is ready to start decoding a new coded video sequence.
+  StatusCode SignalEOS();
+
+  // Returns the maximum bitdepth that is supported by this decoder.
+  static int GetMaxBitdepth();
+
+ private:
+  DecoderSettings settings_;
+  // The object is initialized if and only if impl_ != nullptr.
+  std::unique_ptr<DecoderImpl> impl_;
+};
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_DECODER_H_
diff --git a/src/gav1/decoder_buffer.h b/src/gav1/decoder_buffer.h
new file mode 100644
index 0000000..37bcb29
--- /dev/null
+++ b/src/gav1/decoder_buffer.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
+
+// The documentation for the enum values in this file can be found in Section
+// 6.4.2 of the AV1 spec.
+
+typedef enum Libgav1ChromaSamplePosition {
+  kLibgav1ChromaSamplePositionUnknown,
+  kLibgav1ChromaSamplePositionVertical,
+  kLibgav1ChromaSamplePositionColocated,
+  kLibgav1ChromaSamplePositionReserved
+} Libgav1ChromaSamplePosition;
+
+typedef enum Libgav1ImageFormat {
+  kLibgav1ImageFormatYuv420,
+  kLibgav1ImageFormatYuv422,
+  kLibgav1ImageFormatYuv444,
+  kLibgav1ImageFormatMonochrome400
+} Libgav1ImageFormat;
+
+typedef enum Libgav1ColorPrimary {
+  // 0 is reserved.
+  kLibgav1ColorPrimaryBt709 = 1,
+  kLibgav1ColorPrimaryUnspecified,
+  // 3 is reserved.
+  kLibgav1ColorPrimaryBt470M = 4,
+  kLibgav1ColorPrimaryBt470Bg,
+  kLibgav1ColorPrimaryBt601,
+  kLibgav1ColorPrimarySmpte240,
+  kLibgav1ColorPrimaryGenericFilm,
+  kLibgav1ColorPrimaryBt2020,
+  kLibgav1ColorPrimaryXyz,
+  kLibgav1ColorPrimarySmpte431,
+  kLibgav1ColorPrimarySmpte432,
+  // 13-21 are reserved.
+  kLibgav1ColorPrimaryEbu3213 = 22,
+  // 23-254 are reserved.
+  kLibgav1MaxColorPrimaries = 255
+} Libgav1ColorPrimary;
+
+typedef enum Libgav1TransferCharacteristics {
+  // 0 is reserved.
+  kLibgav1TransferCharacteristicsBt709 = 1,
+  kLibgav1TransferCharacteristicsUnspecified,
+  // 3 is reserved.
+  kLibgav1TransferCharacteristicsBt470M = 4,
+  kLibgav1TransferCharacteristicsBt470Bg,
+  kLibgav1TransferCharacteristicsBt601,
+  kLibgav1TransferCharacteristicsSmpte240,
+  kLibgav1TransferCharacteristicsLinear,
+  kLibgav1TransferCharacteristicsLog100,
+  kLibgav1TransferCharacteristicsLog100Sqrt10,
+  kLibgav1TransferCharacteristicsIec61966,
+  kLibgav1TransferCharacteristicsBt1361,
+  kLibgav1TransferCharacteristicsSrgb,
+  kLibgav1TransferCharacteristicsBt2020TenBit,
+  kLibgav1TransferCharacteristicsBt2020TwelveBit,
+  kLibgav1TransferCharacteristicsSmpte2084,
+  kLibgav1TransferCharacteristicsSmpte428,
+  kLibgav1TransferCharacteristicsHlg,
+  // 19-254 are reserved.
+  kLibgav1MaxTransferCharacteristics = 255
+} Libgav1TransferCharacteristics;
+
+typedef enum Libgav1MatrixCoefficients {
+  kLibgav1MatrixCoefficientsIdentity,
+  kLibgav1MatrixCoefficientsBt709,
+  kLibgav1MatrixCoefficientsUnspecified,
+  // 3 is reserved.
+  kLibgav1MatrixCoefficientsFcc = 4,
+  kLibgav1MatrixCoefficientsBt470BG,
+  kLibgav1MatrixCoefficientsBt601,
+  kLibgav1MatrixCoefficientsSmpte240,
+  kLibgav1MatrixCoefficientsSmpteYcgco,
+  kLibgav1MatrixCoefficientsBt2020Ncl,
+  kLibgav1MatrixCoefficientsBt2020Cl,
+  kLibgav1MatrixCoefficientsSmpte2085,
+  kLibgav1MatrixCoefficientsChromatNcl,
+  kLibgav1MatrixCoefficientsChromatCl,
+  kLibgav1MatrixCoefficientsIctcp,
+  // 15-254 are reserved.
+ kLibgav1MaxMatrixCoefficients = 255 +} Libgav1MatrixCoefficients; + +typedef enum Libgav1ColorRange { + // The color ranges are scaled by value << (bitdepth - 8) for 10 and 12bit + // streams. + kLibgav1ColorRangeStudio, // Y [16..235], UV [16..240] + kLibgav1ColorRangeFull // YUV/RGB [0..255] +} Libgav1ColorRange; + +typedef struct Libgav1DecoderBuffer { +#if defined(__cplusplus) + LIBGAV1_PUBLIC int NumPlanes() const { + return (image_format == kLibgav1ImageFormatMonochrome400) ? 1 : 3; + } +#endif // defined(__cplusplus) + + Libgav1ChromaSamplePosition chroma_sample_position; + Libgav1ImageFormat image_format; + Libgav1ColorRange color_range; + Libgav1ColorPrimary color_primary; + Libgav1TransferCharacteristics transfer_characteristics; + Libgav1MatrixCoefficients matrix_coefficients; + + // Image storage dimensions. + // NOTE: These fields are named w and h in vpx_image_t and aom_image_t. + // uint32_t width; // Stored image width. + // uint32_t height; // Stored image height. + int bitdepth; // Stored image bitdepth. + + // Image display dimensions. + // NOTES: + // 1. These fields are named d_w and d_h in vpx_image_t and aom_image_t. + // 2. libvpx and libaom clients use d_w and d_h much more often than w and h. + // 3. These fields can just be stored for the Y plane and the clients can + // calculate the values for the U and V planes if the image format or + // subsampling is exposed. + int displayed_width[3]; // Displayed image width. + int displayed_height[3]; // Displayed image height. + + int stride[3]; + uint8_t* plane[3]; + + // Spatial id of this frame. + int spatial_id; + // Temporal id of this frame. + int temporal_id; + + // The |user_private_data| argument passed to Decoder::EnqueueFrame(). + int64_t user_private_data; + // The |private_data| field of FrameBuffer. Set by the get frame buffer + // callback when it allocates a frame buffer. 
+ void* buffer_private_data; +} Libgav1DecoderBuffer; + +#if defined(__cplusplus) +namespace libgav1 { + +using ChromaSamplePosition = Libgav1ChromaSamplePosition; +constexpr ChromaSamplePosition kChromaSamplePositionUnknown = + kLibgav1ChromaSamplePositionUnknown; +constexpr ChromaSamplePosition kChromaSamplePositionVertical = + kLibgav1ChromaSamplePositionVertical; +constexpr ChromaSamplePosition kChromaSamplePositionColocated = + kLibgav1ChromaSamplePositionColocated; +constexpr ChromaSamplePosition kChromaSamplePositionReserved = + kLibgav1ChromaSamplePositionReserved; + +using ImageFormat = Libgav1ImageFormat; +constexpr ImageFormat kImageFormatYuv420 = kLibgav1ImageFormatYuv420; +constexpr ImageFormat kImageFormatYuv422 = kLibgav1ImageFormatYuv422; +constexpr ImageFormat kImageFormatYuv444 = kLibgav1ImageFormatYuv444; +constexpr ImageFormat kImageFormatMonochrome400 = + kLibgav1ImageFormatMonochrome400; + +using ColorPrimary = Libgav1ColorPrimary; +constexpr ColorPrimary kColorPrimaryBt709 = kLibgav1ColorPrimaryBt709; +constexpr ColorPrimary kColorPrimaryUnspecified = + kLibgav1ColorPrimaryUnspecified; +constexpr ColorPrimary kColorPrimaryBt470M = kLibgav1ColorPrimaryBt470M; +constexpr ColorPrimary kColorPrimaryBt470Bg = kLibgav1ColorPrimaryBt470Bg; +constexpr ColorPrimary kColorPrimaryBt601 = kLibgav1ColorPrimaryBt601; +constexpr ColorPrimary kColorPrimarySmpte240 = kLibgav1ColorPrimarySmpte240; +constexpr ColorPrimary kColorPrimaryGenericFilm = + kLibgav1ColorPrimaryGenericFilm; +constexpr ColorPrimary kColorPrimaryBt2020 = kLibgav1ColorPrimaryBt2020; +constexpr ColorPrimary kColorPrimaryXyz = kLibgav1ColorPrimaryXyz; +constexpr ColorPrimary kColorPrimarySmpte431 = kLibgav1ColorPrimarySmpte431; +constexpr ColorPrimary kColorPrimarySmpte432 = kLibgav1ColorPrimarySmpte432; +constexpr ColorPrimary kColorPrimaryEbu3213 = kLibgav1ColorPrimaryEbu3213; +constexpr ColorPrimary kMaxColorPrimaries = kLibgav1MaxColorPrimaries; + +using TransferCharacteristics = Libgav1TransferCharacteristics; +constexpr TransferCharacteristics kTransferCharacteristicsBt709 = + kLibgav1TransferCharacteristicsBt709; +constexpr TransferCharacteristics kTransferCharacteristicsUnspecified = + kLibgav1TransferCharacteristicsUnspecified; +constexpr TransferCharacteristics kTransferCharacteristicsBt470M = + kLibgav1TransferCharacteristicsBt470M; +constexpr TransferCharacteristics kTransferCharacteristicsBt470Bg = + kLibgav1TransferCharacteristicsBt470Bg; +constexpr TransferCharacteristics kTransferCharacteristicsBt601 = + kLibgav1TransferCharacteristicsBt601; +constexpr TransferCharacteristics kTransferCharacteristicsSmpte240 = + kLibgav1TransferCharacteristicsSmpte240; +constexpr TransferCharacteristics kTransferCharacteristicsLinear = + kLibgav1TransferCharacteristicsLinear; +constexpr TransferCharacteristics kTransferCharacteristicsLog100 = + kLibgav1TransferCharacteristicsLog100; +constexpr TransferCharacteristics kTransferCharacteristicsLog100Sqrt10 = + kLibgav1TransferCharacteristicsLog100Sqrt10; +constexpr TransferCharacteristics kTransferCharacteristicsIec61966 = + kLibgav1TransferCharacteristicsIec61966; +constexpr TransferCharacteristics kTransferCharacteristicsBt1361 = + kLibgav1TransferCharacteristicsBt1361; +constexpr TransferCharacteristics kTransferCharacteristicsSrgb = + kLibgav1TransferCharacteristicsSrgb; +constexpr TransferCharacteristics kTransferCharacteristicsBt2020TenBit = + kLibgav1TransferCharacteristicsBt2020TenBit; +constexpr TransferCharacteristics kTransferCharacteristicsBt2020TwelveBit = 
+    kLibgav1TransferCharacteristicsBt2020TwelveBit;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte2084 =
+    kLibgav1TransferCharacteristicsSmpte2084;
+constexpr TransferCharacteristics kTransferCharacteristicsSmpte428 =
+    kLibgav1TransferCharacteristicsSmpte428;
+constexpr TransferCharacteristics kTransferCharacteristicsHlg =
+    kLibgav1TransferCharacteristicsHlg;
+constexpr TransferCharacteristics kMaxTransferCharacteristics =
+    kLibgav1MaxTransferCharacteristics;
+
+using MatrixCoefficients = Libgav1MatrixCoefficients;
+constexpr MatrixCoefficients kMatrixCoefficientsIdentity =
+    kLibgav1MatrixCoefficientsIdentity;
+constexpr MatrixCoefficients kMatrixCoefficientsBt709 =
+    kLibgav1MatrixCoefficientsBt709;
+constexpr MatrixCoefficients kMatrixCoefficientsUnspecified =
+    kLibgav1MatrixCoefficientsUnspecified;
+constexpr MatrixCoefficients kMatrixCoefficientsFcc =
+    kLibgav1MatrixCoefficientsFcc;
+constexpr MatrixCoefficients kMatrixCoefficientsBt470BG =
+    kLibgav1MatrixCoefficientsBt470BG;
+constexpr MatrixCoefficients kMatrixCoefficientsBt601 =
+    kLibgav1MatrixCoefficientsBt601;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte240 =
+    kLibgav1MatrixCoefficientsSmpte240;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpteYcgco =
+    kLibgav1MatrixCoefficientsSmpteYcgco;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Ncl =
+    kLibgav1MatrixCoefficientsBt2020Ncl;
+constexpr MatrixCoefficients kMatrixCoefficientsBt2020Cl =
+    kLibgav1MatrixCoefficientsBt2020Cl;
+constexpr MatrixCoefficients kMatrixCoefficientsSmpte2085 =
+    kLibgav1MatrixCoefficientsSmpte2085;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatNcl =
+    kLibgav1MatrixCoefficientsChromatNcl;
+constexpr MatrixCoefficients kMatrixCoefficientsChromatCl =
+    kLibgav1MatrixCoefficientsChromatCl;
+constexpr MatrixCoefficients kMatrixCoefficientsIctcp =
+    kLibgav1MatrixCoefficientsIctcp;
+constexpr MatrixCoefficients kMaxMatrixCoefficients =
+    kLibgav1MaxMatrixCoefficients;
+
+using ColorRange = Libgav1ColorRange;
+constexpr ColorRange kColorRangeStudio = kLibgav1ColorRangeStudio;
+constexpr ColorRange kColorRangeFull = kLibgav1ColorRangeFull;
+
+using DecoderBuffer = Libgav1DecoderBuffer;
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_DECODER_BUFFER_H_
diff --git a/src/gav1/decoder_settings.h b/src/gav1/decoder_settings.h
new file mode 100644
index 0000000..ab22a4d
--- /dev/null
+++ b/src/gav1/decoder_settings.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+#define LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
+
+#if defined(__cplusplus)
+#include <cstdint>
+#else
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/frame_buffer.h"
+#include "gav1/symbol_visibility.h"
+
+// All the declarations in this file are part of the public ABI.
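+//
+// As an illustrative sketch (not itself part of the header): a typical C
+// caller initializes the defaults declared below and then overrides
+// individual fields, e.g.
+//
+//   Libgav1DecoderSettings settings;
+//   Libgav1DecoderSettingsInitDefault(&settings);
+//   settings.threads = 4;
+//   settings.output_all_layers = 1;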
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This callback is invoked by the decoder when it is done using an input frame
+// buffer. When frame_parallel is set to true, this callback must not be
+// nullptr. Otherwise, this callback is optional.
+//
+// |buffer_private_data| is the value passed in the EnqueueFrame() call.
+typedef void (*Libgav1ReleaseInputBufferCallback)(void* callback_private_data,
+                                                  void* buffer_private_data);
+
+typedef struct Libgav1DecoderSettings {
+  // Number of threads to use when decoding. Must be greater than 0. The
+  // library will create at most |threads| new threads. Defaults to 1 (no new
+  // threads will be created).
+  int threads;
+  // A boolean. Indicates to the decoder that frame parallel decoding is
+  // allowed. Note that this is just a request and the decoder will decide the
+  // number of frames to be decoded in parallel based on the video stream
+  // being decoded.
+  int frame_parallel;
+  // A boolean. In frame parallel mode, should Libgav1DecoderDequeueFrame wait
+  // until an enqueued frame is available for dequeuing.
+  //
+  // If frame_parallel is 0, this setting is ignored.
+  int blocking_dequeue;
+  // Called when the first sequence header or a sequence header with a
+  // different frame size (which includes bitdepth, monochrome, subsampling_x,
+  // subsampling_y, maximum frame width, or maximum frame height) is received.
+  Libgav1FrameBufferSizeChangedCallback on_frame_buffer_size_changed;
+  // Get frame buffer callback.
+  Libgav1GetFrameBufferCallback get_frame_buffer;
+  // Release frame buffer callback.
+  Libgav1ReleaseFrameBufferCallback release_frame_buffer;
+  // Release input frame buffer callback.
+  Libgav1ReleaseInputBufferCallback release_input_buffer;
+  // Passed as the private_data argument to the callbacks.
+  void* callback_private_data;
+  // A boolean. If set to 1, the decoder will output all the spatial and
+  // temporal layers.
+  int output_all_layers;
+  // Index of the operating point to decode.
+  int operating_point;
+  // Mask indicating the post processing filters that need to be applied to the
+  // reconstructed frame. Note this is an advanced setting and does not
+  // typically need to be changed.
+  // From LSB:
+  //   Bit 0: Loop filter (deblocking filter).
+  //   Bit 1: Cdef.
+  //   Bit 2: SuperRes.
+  //   Bit 3: Loop restoration.
+  //   Bit 4: Film grain synthesis.
+  // All the bits other than the last 5 are ignored.
+  uint8_t post_filter_mask;
+} Libgav1DecoderSettings;
+
+LIBGAV1_PUBLIC void Libgav1DecoderSettingsInitDefault(
+    Libgav1DecoderSettings* settings);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+using ReleaseInputBufferCallback = Libgav1ReleaseInputBufferCallback;
+
+// Applications must populate this structure before creating a decoder
+// instance.
+struct DecoderSettings {
+  // Number of threads to use when decoding. Must be greater than 0. The
+  // library will create at most |threads| new threads. Defaults to 1 (no new
+  // threads will be created).
+  int threads = 1;
+  // Indicates to the decoder that frame parallel decoding is allowed. Note
+  // that this is just a request and the decoder will decide the number of
+  // frames to be decoded in parallel based on the video stream being decoded.
+  bool frame_parallel = false;
+  // In frame parallel mode, should DequeueFrame wait until an enqueued frame
+  // is available for dequeuing.
+  //
+  // If frame_parallel is false, this setting is ignored.
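+  //
+  // As an illustrative sketch only (|decoder| stands in for a hypothetical
+  // libgav1::Decoder instance; error handling elided), a non-blocking drain
+  // loop could look like:
+  //
+  //   const libgav1::DecoderBuffer* frame;
+  //   const libgav1::StatusCode status = decoder.DequeueFrame(&frame);
+  //   if (status == libgav1::kStatusTryAgain ||
+  //       status == libgav1::kStatusNothingToDequeue) {
+  //     // No frame is ready yet; enqueue more input or retry later.
+  //   }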
+  bool blocking_dequeue = false;
+  // Called when the first sequence header or a sequence header with a
+  // different frame size (which includes bitdepth, monochrome, subsampling_x,
+  // subsampling_y, maximum frame width, or maximum frame height) is received.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed = nullptr;
+  // Get frame buffer callback.
+  GetFrameBufferCallback get_frame_buffer = nullptr;
+  // Release frame buffer callback.
+  ReleaseFrameBufferCallback release_frame_buffer = nullptr;
+  // Release input frame buffer callback.
+  ReleaseInputBufferCallback release_input_buffer = nullptr;
+  // Passed as the private_data argument to the callbacks.
+  void* callback_private_data = nullptr;
+  // If set to true, the decoder will output all the spatial and temporal
+  // layers.
+  bool output_all_layers = false;
+  // Index of the operating point to decode.
+  int operating_point = 0;
+  // Mask indicating the post processing filters that need to be applied to the
+  // reconstructed frame. Note this is an advanced setting and does not
+  // typically need to be changed.
+  // From LSB:
+  //   Bit 0: Loop filter (deblocking filter).
+  //   Bit 1: Cdef.
+  //   Bit 2: SuperRes.
+  //   Bit 3: Loop restoration.
+  //   Bit 4: Film grain synthesis.
+  // All the bits other than the last 5 are ignored.
+  uint8_t post_filter_mask = 0x1f;
+};
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+#endif  // LIBGAV1_SRC_GAV1_DECODER_SETTINGS_H_
diff --git a/src/gav1/frame_buffer.h b/src/gav1/frame_buffer.h
new file mode 100644
index 0000000..8132b61
--- /dev/null
+++ b/src/gav1/frame_buffer.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+#define LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_
+
+// All the declarations in this file are part of the public ABI. This file may
+// be included by both C and C++ files.
+
+#if defined(__cplusplus)
+#include <cstddef>
+#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif  // defined(__cplusplus)
+
+#include "gav1/decoder_buffer.h"
+#include "gav1/status_code.h"
+#include "gav1/symbol_visibility.h"
+
+// The callback functions use the C linkage conventions.
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// This structure represents an allocated frame buffer.
+typedef struct Libgav1FrameBuffer {
+  // In the |plane| and |stride| arrays, the elements at indexes 0, 1, and 2
+  // are for the Y, U, and V planes, respectively.
+  uint8_t* plane[3];   // Pointers to the frame (excluding the borders) in the
+                       // data buffers.
+  int stride[3];       // Row strides in bytes.
+  void* private_data;  // Frame buffer's private data. Available for use by the
+                       // release frame buffer callback. Also copied to the
+                       // |buffer_private_data| field of DecoderBuffer for use
+                       // by the consumer of a DecoderBuffer.
+} Libgav1FrameBuffer; + +// This callback is invoked by the decoder to provide information on the +// subsequent frames in the video, until the next invocation of this callback +// or the end of the video. +// +// |width| and |height| are the maximum frame width and height in pixels. +// |left_border|, |right_border|, |top_border|, and |bottom_border| are the +// maximum left, right, top, and bottom border sizes in pixels. +// |stride_alignment| specifies the alignment of the row stride in bytes. +// +// Returns kLibgav1StatusOk on success, an error status on failure. +// +// NOTE: This callback may be omitted if the information is not useful to the +// application. +typedef Libgav1StatusCode (*Libgav1FrameBufferSizeChangedCallback)( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment); + +// This callback is invoked by the decoder to allocate a frame buffer, which +// consists of three data buffers, for the Y, U, and V planes, respectively. +// +// The callback must set |frame_buffer->plane[i]| to point to the data buffers +// of the planes, and set |frame_buffer->stride[i]| to the row strides of the +// planes. If |image_format| is kLibgav1ImageFormatMonochrome400, the callback +// should set |frame_buffer->plane[1]| and |frame_buffer->plane[2]| to a null +// pointer and set |frame_buffer->stride[1]| and |frame_buffer->stride[2]| to +// 0. The callback may set |frame_buffer->private_data| to a value that will +// be useful to the release frame buffer callback and the consumer of a +// DecoderBuffer. +// +// Returns kLibgav1StatusOk on success, an error status on failure. + +// |width| and |height| are the frame width and height in pixels. +// |left_border|, |right_border|, |top_border|, and |bottom_border| are the +// left, right, top, and bottom border sizes in pixels. |stride_alignment| +// specifies the alignment of the row stride in bytes. +typedef Libgav1StatusCode (*Libgav1GetFrameBufferCallback)( + void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format, + int width, int height, int left_border, int right_border, int top_border, + int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer); + +// After a frame buffer is allocated, the decoder starts to write decoded video +// to the frame buffer. When the frame buffer is ready for consumption, it is +// made available to the application in a Decoder::DequeueFrame() call. +// Afterwards, the decoder may continue to use the frame buffer in read-only +// mode. When the decoder is finished using the frame buffer, it notifies the +// application by calling the Libgav1ReleaseFrameBufferCallback. + +// This callback is invoked by the decoder to release a frame buffer. +typedef void (*Libgav1ReleaseFrameBufferCallback)(void* callback_private_data, + void* buffer_private_data); + +// Libgav1ComputeFrameBufferInfo() and Libgav1SetFrameBuffer() are intended to +// help clients implement frame buffer callbacks using memory buffers. First, +// call Libgav1ComputeFrameBufferInfo(). If it succeeds, allocate y_buffer of +// size info.y_buffer_size and allocate u_buffer and v_buffer, both of size +// info.uv_buffer_size. Finally, pass y_buffer, u_buffer, v_buffer, and +// buffer_private_data to Libgav1SetFrameBuffer(). + +// This structure contains information useful for allocating memory for a frame +// buffer. 
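+//
+// A minimal sketch of the Libgav1ComputeFrameBufferInfo() /
+// Libgav1SetFrameBuffer() flow described above (illustrative only, not part
+// of the API; error handling is elided and malloc() stands in for a real
+// allocator):
+//
+//   Libgav1FrameBufferInfo info;
+//   Libgav1StatusCode status = Libgav1ComputeFrameBufferInfo(
+//       bitdepth, image_format, width, height, left_border, right_border,
+//       top_border, bottom_border, stride_alignment, &info);
+//   if (status != kLibgav1StatusOk) return status;
+//   uint8_t* const y_buffer = (uint8_t*)malloc(info.y_buffer_size);
+//   uint8_t* const u_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   uint8_t* const v_buffer = (uint8_t*)malloc(info.uv_buffer_size);
+//   return Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer,
+//                                buffer_private_data, frame_buffer);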
+typedef struct Libgav1FrameBufferInfo { + size_t y_buffer_size; // Size in bytes of the Y buffer. + size_t uv_buffer_size; // Size in bytes of the U or V buffer. + + // The following fields are consumed by Libgav1SetFrameBuffer(). Do not use + // them directly. + int y_stride; // Row stride in bytes of the Y buffer. + int uv_stride; // Row stride in bytes of the U or V buffer. + size_t y_plane_offset; // Offset in bytes of the frame (excluding the + // borders) in the Y buffer. + size_t uv_plane_offset; // Offset in bytes of the frame (excluding the + // borders) in the U or V buffer. + int stride_alignment; // The stride_alignment argument passed to + // Libgav1ComputeFrameBufferInfo(). +} Libgav1FrameBufferInfo; + +// Computes the information useful for allocating memory for a frame buffer. +// On success, stores the output in |info|. +LIBGAV1_PUBLIC Libgav1StatusCode Libgav1ComputeFrameBufferInfo( + int bitdepth, Libgav1ImageFormat image_format, int width, int height, + int left_border, int right_border, int top_border, int bottom_border, + int stride_alignment, Libgav1FrameBufferInfo* info); + +// Sets the |frame_buffer| struct. +LIBGAV1_PUBLIC Libgav1StatusCode Libgav1SetFrameBuffer( + const Libgav1FrameBufferInfo* info, uint8_t* y_buffer, uint8_t* u_buffer, + uint8_t* v_buffer, void* buffer_private_data, + Libgav1FrameBuffer* frame_buffer); + +#if defined(__cplusplus) +} // extern "C" + +// Declare type aliases for C++. +namespace libgav1 { + +using FrameBuffer = Libgav1FrameBuffer; +using FrameBufferSizeChangedCallback = Libgav1FrameBufferSizeChangedCallback; +using GetFrameBufferCallback = Libgav1GetFrameBufferCallback; +using ReleaseFrameBufferCallback = Libgav1ReleaseFrameBufferCallback; +using FrameBufferInfo = Libgav1FrameBufferInfo; + +inline StatusCode ComputeFrameBufferInfo(int bitdepth, ImageFormat image_format, + int width, int height, int left_border, + int right_border, int top_border, + int bottom_border, + int stride_alignment, + FrameBufferInfo* info) { + return Libgav1ComputeFrameBufferInfo(bitdepth, image_format, width, height, + left_border, right_border, top_border, + bottom_border, stride_alignment, info); +} + +inline StatusCode SetFrameBuffer(const FrameBufferInfo* info, uint8_t* y_buffer, + uint8_t* u_buffer, uint8_t* v_buffer, + void* buffer_private_data, + FrameBuffer* frame_buffer) { + return Libgav1SetFrameBuffer(info, y_buffer, u_buffer, v_buffer, + buffer_private_data, frame_buffer); +} + +} // namespace libgav1 +#endif // defined(__cplusplus) + +#endif // LIBGAV1_SRC_GAV1_FRAME_BUFFER_H_ diff --git a/src/gav1/status_code.h b/src/gav1/status_code.h new file mode 100644 index 0000000..d7476ca --- /dev/null +++ b/src/gav1/status_code.h @@ -0,0 +1,118 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_GAV1_STATUS_CODE_H_ +#define LIBGAV1_SRC_GAV1_STATUS_CODE_H_ + +#include "gav1/symbol_visibility.h" + +// All the declarations in this file are part of the public ABI. 
This file may +// be included by both C and C++ files. + +// The Libgav1StatusCode enum type: A libgav1 function may return +// Libgav1StatusCode to indicate success or the reason for failure. +typedef enum { + // Success. + kLibgav1StatusOk = 0, + + // An unknown error. Used as the default error status if error detail is not + // available. + kLibgav1StatusUnknownError = -1, + + // An invalid function argument. + kLibgav1StatusInvalidArgument = -2, + + // Memory allocation failure. + kLibgav1StatusOutOfMemory = -3, + + // Ran out of a resource (other than memory). + kLibgav1StatusResourceExhausted = -4, + + // The object is not initialized. + kLibgav1StatusNotInitialized = -5, + + // An operation that can only be performed once has already been performed. + kLibgav1StatusAlready = -6, + + // Not implemented, or not supported. + kLibgav1StatusUnimplemented = -7, + + // An internal error in libgav1. Usually this indicates a programming error. + kLibgav1StatusInternalError = -8, + + // The bitstream is not encoded correctly or violates a bitstream conformance + // requirement. + kLibgav1StatusBitstreamError = -9, + + // The operation is not allowed at the moment. This is not a fatal error. Try + // again later. + kLibgav1StatusTryAgain = -10, + + // Used only by DequeueFrame(). There are no enqueued frames, so there is + // nothing to dequeue. This is not a fatal error. Try enqueuing a frame before + // trying to dequeue again. + kLibgav1StatusNothingToDequeue = -11, + + // An extra enumerator to prevent people from writing code that fails to + // compile when a new status code is added. + // + // Do not reference this enumerator. In particular, if you write code that + // switches on Libgav1StatusCode, add a default: case instead of a case that + // mentions this enumerator. + // + // Do not depend on the value (currently -1000) listed here. It may change in + // the future. + kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_ = -1000 +} Libgav1StatusCode; + +#if defined(__cplusplus) +extern "C" { +#endif + +// Returns a human readable error string in en-US for the status code |status|. +// Always returns a valid (non-NULL) string. +LIBGAV1_PUBLIC const char* Libgav1GetErrorString(Libgav1StatusCode status); + +#if defined(__cplusplus) +} // extern "C" + +namespace libgav1 { + +// Declare type aliases for C++. +using StatusCode = Libgav1StatusCode; +constexpr StatusCode kStatusOk = kLibgav1StatusOk; +constexpr StatusCode kStatusUnknownError = kLibgav1StatusUnknownError; +constexpr StatusCode kStatusInvalidArgument = kLibgav1StatusInvalidArgument; +constexpr StatusCode kStatusOutOfMemory = kLibgav1StatusOutOfMemory; +constexpr StatusCode kStatusResourceExhausted = kLibgav1StatusResourceExhausted; +constexpr StatusCode kStatusNotInitialized = kLibgav1StatusNotInitialized; +constexpr StatusCode kStatusAlready = kLibgav1StatusAlready; +constexpr StatusCode kStatusUnimplemented = kLibgav1StatusUnimplemented; +constexpr StatusCode kStatusInternalError = kLibgav1StatusInternalError; +constexpr StatusCode kStatusBitstreamError = kLibgav1StatusBitstreamError; +constexpr StatusCode kStatusTryAgain = kLibgav1StatusTryAgain; +constexpr StatusCode kStatusNothingToDequeue = kLibgav1StatusNothingToDequeue; + +// Returns a human readable error string in en-US for the status code |status|. +// Always returns a valid (non-NULL) string. 
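+//
+// Illustrative use only (|status| is any StatusCode returned by the library;
+// fprintf is just an example sink):
+//
+//   fprintf(stderr, "libgav1 error: %s\n", libgav1::GetErrorString(status));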
+inline const char* GetErrorString(StatusCode status) {
+  return Libgav1GetErrorString(status);
+}
+
+}  // namespace libgav1
+#endif  // defined(__cplusplus)
+
+#endif  // LIBGAV1_SRC_GAV1_STATUS_CODE_H_
diff --git a/src/gav1/symbol_visibility.h b/src/gav1/symbol_visibility.h
new file mode 100644
index 0000000..ad7498c
--- /dev/null
+++ b/src/gav1/symbol_visibility.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+#define LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
+
+// This module defines the LIBGAV1_PUBLIC macro. LIBGAV1_PUBLIC, when combined
+// with the flags -fvisibility=hidden and -fvisibility-inlines-hidden,
+// restricts symbol availability when users use the shared object form of
+// libgav1. The intent is to prevent exposure of libgav1 internals to users of
+// the library, and to avoid ABI compatibility problems that changes to
+// libgav1 internals would cause for users of the libgav1 shared object.
+//
+// Examples:
+//
+// This form makes a class and all of its members part of the public API:
+//
+// class LIBGAV1_PUBLIC A {
+//  public:
+//   A();
+//   ~A();
+//   void Foo();
+//   int Bar();
+// };
+//
+// A::A(), A::~A(), A::Foo(), and A::Bar() are all available to code linking
+// to the shared object when this form is used.
+//
+// This form exposes a single class method as part of the public API:
+//
+// class B {
+//  public:
+//   B();
+//   ~B();
+//   LIBGAV1_PUBLIC int Foo();
+// };
+//
+// In this example, only B::Foo() is available to the user of the shared
+// object.
+//
+// Non-class member functions can also be exposed individually:
+//
+// LIBGAV1_PUBLIC void Bar();
+//
+// In this example, Bar() would be available to users of the shared object.
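+//
+// Note that LIBGAV1_PUBLIC only restricts symbol availability when the
+// hidden-visibility flags named above are used when building the shared
+// object; an illustrative (hypothetical) compiler invocation:
+//
+//   g++ -fvisibility=hidden -fvisibility-inlines-hidden -fPIC -shared ...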
+//
+// Much of the above information and more can be found at
+// https://gcc.gnu.org/wiki/Visibility
+
+#if !defined(LIBGAV1_PUBLIC)
+#if defined(_WIN32)
+#if defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#if defined(__GNUC__)
+#define LIBGAV1_PUBLIC __attribute__((dllexport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllexport)
+#endif  // defined(__GNUC__)
+#elif defined(LIBGAV1_BUILDING_DLL)
+#ifdef __GNUC__
+#define LIBGAV1_PUBLIC __attribute__((dllimport))
+#else
+#define LIBGAV1_PUBLIC __declspec(dllimport)
+#endif  // defined(__GNUC__)
+#else
+#define LIBGAV1_PUBLIC
+#endif  // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
+#else
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
+#else
+#define LIBGAV1_PUBLIC
+#endif
+#endif  // defined(_WIN32)
+#endif  // defined(LIBGAV1_PUBLIC)
+
+#endif  // LIBGAV1_SRC_GAV1_SYMBOL_VISIBILITY_H_
diff --git a/src/gav1/version.h b/src/gav1/version.h
new file mode 100644
index 0000000..78a573e
--- /dev/null
+++ b/src/gav1/version.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_GAV1_VERSION_H_
+#define LIBGAV1_SRC_GAV1_VERSION_H_
+
+#include "gav1/symbol_visibility.h"
+
+// This library follows the principles described by Semantic Versioning
+// (https://semver.org).
+
+#define LIBGAV1_MAJOR_VERSION 0
+#define LIBGAV1_MINOR_VERSION 16
+#define LIBGAV1_PATCH_VERSION 1
+
+#define LIBGAV1_VERSION                                           \
+  ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
+   LIBGAV1_PATCH_VERSION)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch, e.g., 1.2.3 is 0x010203.
+LIBGAV1_PUBLIC int Libgav1GetVersion(void);
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetVersionString(void);
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
+LIBGAV1_PUBLIC const char* Libgav1GetBuildConfiguration(void);
+
+#if defined(__cplusplus)
+}  // extern "C"
+
+namespace libgav1 {
+
+// Returns the library's version number, packed in an int using 8 bits for
+// each of major/minor/patch, e.g., 1.2.3 is 0x010203.
+inline int GetVersion() { return Libgav1GetVersion(); }
+
+// Returns the library's version number as a string in the format
+// 'MAJOR.MINOR.PATCH'. Always returns a valid (non-NULL) string.
+inline const char* GetVersionString() { return Libgav1GetVersionString(); }
+
+// Returns the build configuration used to produce the library. Always returns
+// a valid (non-NULL) string.
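+//
+// Illustrative use only: the packed layout makes ordered comparisons work,
+// e.g., LIBGAV1_VERSION >= 0x001000 tests for version 0.16.0 or newer, and a
+// caller might log the version and build configuration at startup:
+//
+//   std::printf("libgav1 %s (%s)\n", libgav1::GetVersionString(),
+//               libgav1::GetBuildConfiguration());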
+inline const char* GetBuildConfiguration() { + return Libgav1GetBuildConfiguration(); +} + +} // namespace libgav1 +#endif // defined(__cplusplus) + +#endif // LIBGAV1_SRC_GAV1_VERSION_H_ diff --git a/src/inter_intra_masks.inc b/src/inter_intra_masks.inc new file mode 100644 index 0000000..2c15f9c --- /dev/null +++ b/src/inter_intra_masks.inc @@ -0,0 +1,581 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is just a convenience to separate out all the inter intra masks +// from the code where it is used. + +// The tables in this file are computed based on section 7.11.3.13 in the spec. + +constexpr uint8_t kInterIntraMaskDc[] = { + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; + +constexpr uint8_t kInterIntraMaskVertical4x4[] = { + 60, 60, 60, 60, 19, 19, 19, 19, 6, 6, 6, 6, 2, 2, 2, 2}; +constexpr uint8_t kInterIntraMaskVertical4x8[] = { + 60, 60, 60, 60, 34, 34, 34, 34, 19, 19, 19, 19, 11, 11, 11, 11, + 6, 6, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical8x4[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34, + 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11}; +constexpr uint8_t kInterIntraMaskVertical8x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, 34, + 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 11, + 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical8x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, + 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, + 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, 11, 11, 11, 8, + 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, + 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical16x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 
15, 15, 15, + 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; +constexpr uint8_t kInterIntraMaskVertical16x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical16x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskVertical32x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 
34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; +constexpr uint8_t kInterIntraMaskVertical32x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + +constexpr uint8_t kInterIntraMaskHorizontal4x4[] = {60, 19, 6, 2, 60, 19, 6, 2, + 60, 19, 6, 2, 60, 19, 6, 2}; +constexpr uint8_t kInterIntraMaskHorizontal4x8[] = { + 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, + 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11}; +constexpr uint8_t kInterIntraMaskHorizontal8x4[] = { + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1}; +constexpr uint8_t kInterIntraMaskHorizontal8x8[] = { + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1, + 60, 34, 19, 11, 6, 4, 2, 1, 60, 34, 19, 11, 6, 4, 2, 1}; +constexpr uint8_t kInterIntraMaskHorizontal8x16[] = { + 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, + 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, + 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, + 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, + 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, + 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, + 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8}; +constexpr uint8_t kInterIntraMaskHorizontal16x8[] = { + 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, + 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, + 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, + 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, + 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, + 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, + 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1}; +constexpr uint8_t kInterIntraMaskHorizontal16x16[] = { + 60, 45, 34, 26, 19, 15, 11, 
8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, + 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, + 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, + 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, + 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, + 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, + 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, + 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, + 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, + 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, + 1, 1, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, + 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, + 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, + 8, 6, 5, 4, 3, 2, 2, 1, 1}; +constexpr uint8_t kInterIntraMaskHorizontal16x32[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7}; +constexpr uint8_t kInterIntraMaskHorizontal32x16[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, + 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 
5, 4, 4, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, + 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, + 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1}; +constexpr uint8_t kInterIntraMaskHorizontal32x32[] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, + 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, + 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, + 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, + 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 
10, 8, 7, 6, 6, 5, + 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, + 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, + 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, + 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1}; + +constexpr uint8_t kInterIntraMaskSmooth4x4[] = {60, 60, 60, 60, 60, 19, 19, 19, + 60, 19, 6, 6, 60, 19, 6, 2}; +constexpr uint8_t kInterIntraMaskSmooth4x8[] = { + 60, 60, 60, 60, 60, 34, 34, 34, 60, 34, 19, 19, 60, 34, 19, 11, + 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11, 60, 34, 19, 11}; +constexpr uint8_t kInterIntraMaskSmooth8x4[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, + 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11}; +constexpr uint8_t kInterIntraMaskSmooth8x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 34, 34, 34, 34, 34, 34, 34, + 60, 34, 19, 19, 19, 19, 19, 19, 60, 34, 19, 11, 11, 11, 11, 11, + 60, 34, 19, 11, 6, 6, 6, 6, 60, 34, 19, 11, 6, 4, 4, 4, + 60, 34, 19, 11, 6, 4, 2, 2, 60, 34, 19, 11, 6, 4, 2, 1}; +constexpr uint8_t kInterIntraMaskSmooth8x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, + 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, + 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 60, 45, 34, 26, 19, 15, 11, 11, 60, + 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, + 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, + 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8, 60, 45, + 34, 26, 19, 15, 11, 8, 60, 45, 34, 26, 19, 15, 11, 8}; +constexpr uint8_t kInterIntraMaskSmooth16x8[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45, + 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8}; +constexpr uint8_t kInterIntraMaskSmooth16x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 45, 34, 34, 34, 34, + 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 60, 45, 34, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 60, 45, 34, 26, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 60, 45, 34, 26, 19, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 60, 45, 34, 26, 19, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 45, + 34, 26, 19, 15, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 60, 45, 34, 26, 19, + 15, 11, 8, 6, 6, 6, 6, 6, 6, 6, 6, 60, 45, 34, 26, 19, 15, 11, 8, + 6, 5, 5, 5, 5, 5, 5, 5, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, + 4, 4, 4, 4, 4, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 3, 3, + 3, 3, 60, 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, + 45, 34, 26, 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 2, 2, 60, 45, 34, 26, + 19, 15, 11, 8, 6, 5, 4, 3, 2, 2, 1, 1, 60, 45, 34, 26, 19, 15, 11, + 8, 6, 5, 4, 3, 2, 2, 1, 1}; +constexpr uint8_t kInterIntraMaskSmooth16x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, + 45, 39, 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 19, 19, 19, 19, 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 13, 13, + 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, 11, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 8, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7}; +constexpr uint8_t kInterIntraMaskSmooth32x16[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, + 34, 34, 34, 34, 34, 34, 
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39, + 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; +constexpr uint8_t kInterIntraMaskSmooth32x32[] = { + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 60, 52, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 60, 52, 45, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 60, 52, 45, 39, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 60, 52, 45, 39, 34, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 60, 52, 45, 39, 34, 30, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 60, 52, 45, 39, + 34, 30, 26, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 60, 52, 45, 39, 34, 30, 26, 22, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 60, 52, + 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 60, 52, 45, 39, 34, 30, 26, 22, + 19, 17, 15, 13, 11, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 
10, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 60, + 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, + 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, + 10, 8, 7, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, + 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, + 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, + 5, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 60, 52, 45, 39, 34, + 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, + 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, + 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, + 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, + 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, + 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, + 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, + 17, 15, 13, 11, 10, 8, 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, + 7, 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1}; + +// For each 2D array within this array, the indices are mapped as follows: 0, 1, +// 2 and 3 in each dimension maps to prediction dimension 4, 8, 16 and 32 +// respectively. For example, the entry in [1][2] corresponds to a prediction +// size of 8x16 (width == 8 and height == 16). +const uint8_t* kInterIntraMasks[kNumInterIntraModes][4][4] = { + // kInterIntraModeDc. This is a special case where all the non-nullptr + // entries point to kInterIntraMaskDc (all entries of the array are 32). The + // width can be set according to the prediction size to achieve the desired + // result. 
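+    // Worked example of the mapping (illustrative): a smooth-mode 8x16
+    // prediction (width == 8, height == 16) maps to indices
+    // FloorLog2(8) - 2 == 1 and FloorLog2(16) - 2 == 2, so it uses
+    // kInterIntraMasks[kInterIntraModeSmooth][1][2], i.e.
+    // kInterIntraMaskSmooth8x16.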
+ {{kInterIntraMaskDc, kInterIntraMaskDc, nullptr, nullptr}, + {kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc, nullptr}, + {nullptr, kInterIntraMaskDc, kInterIntraMaskDc, kInterIntraMaskDc}, + {nullptr, nullptr, kInterIntraMaskDc, kInterIntraMaskDc}}, + // kInterIntraModeVertical + {{kInterIntraMaskVertical4x4, kInterIntraMaskVertical4x8, nullptr, nullptr}, + {kInterIntraMaskVertical8x4, kInterIntraMaskVertical8x8, + kInterIntraMaskVertical8x16, nullptr}, + {nullptr, kInterIntraMaskVertical16x8, kInterIntraMaskVertical16x16, + kInterIntraMaskVertical16x32}, + {nullptr, nullptr, kInterIntraMaskVertical32x16, + kInterIntraMaskVertical32x32}}, + // kInterIntraModeHorizontal + {{kInterIntraMaskHorizontal4x4, kInterIntraMaskHorizontal4x8, nullptr, + nullptr}, + {kInterIntraMaskHorizontal8x4, kInterIntraMaskHorizontal8x8, + kInterIntraMaskHorizontal8x16, nullptr}, + {nullptr, kInterIntraMaskHorizontal16x8, kInterIntraMaskHorizontal16x16, + kInterIntraMaskHorizontal16x32}, + {nullptr, nullptr, kInterIntraMaskHorizontal32x16, + kInterIntraMaskHorizontal32x32}}, + // kInterIntraModeSmooth + {{kInterIntraMaskSmooth4x4, kInterIntraMaskSmooth4x8, nullptr, nullptr}, + {kInterIntraMaskSmooth8x4, kInterIntraMaskSmooth8x8, + kInterIntraMaskSmooth8x16, nullptr}, + {nullptr, kInterIntraMaskSmooth16x8, kInterIntraMaskSmooth16x16, + kInterIntraMaskSmooth16x32}, + {nullptr, nullptr, kInterIntraMaskSmooth32x16, + kInterIntraMaskSmooth32x32}}}; diff --git a/src/internal_frame_buffer_list.cc b/src/internal_frame_buffer_list.cc new file mode 100644 index 0000000..e2d2273 --- /dev/null +++ b/src/internal_frame_buffer_list.cc @@ -0,0 +1,122 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "src/internal_frame_buffer_list.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+extern "C" {
+
+Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment) {
+  auto* buffer_list =
+      static_cast<InternalFrameBufferList*>(callback_private_data);
+  return buffer_list->OnFrameBufferSizeChanged(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment);
+}
+
+Libgav1StatusCode GetInternalFrameBuffer(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+  auto* buffer_list =
+      static_cast<InternalFrameBufferList*>(callback_private_data);
+  return buffer_list->GetFrameBuffer(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void ReleaseInternalFrameBuffer(void* callback_private_data,
+                                void* buffer_private_data) {
+  auto* buffer_list =
+      static_cast<InternalFrameBufferList*>(callback_private_data);
+  buffer_list->ReleaseFrameBuffer(buffer_private_data);
+}
+
+}  // extern "C"
+
+StatusCode InternalFrameBufferList::OnFrameBufferSizeChanged(
+    int /*bitdepth*/, Libgav1ImageFormat /*image_format*/, int /*width*/,
+    int /*height*/, int /*left_border*/, int /*right_border*/,
+    int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/) {
+  return kStatusOk;
+}
+
+StatusCode InternalFrameBufferList::GetFrameBuffer(
+    int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment, Libgav1FrameBuffer* frame_buffer) {
+  FrameBufferInfo info;
+  StatusCode status = ComputeFrameBufferInfo(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, &info);
+  if (status != kStatusOk) return status;
+
+  if (info.uv_buffer_size > SIZE_MAX / 2 ||
+      info.y_buffer_size > SIZE_MAX - 2 * info.uv_buffer_size) {
+    return kStatusInvalidArgument;
+  }
+  const size_t min_size = info.y_buffer_size + 2 * info.uv_buffer_size;
+
+  Buffer* buffer = nullptr;
+  for (auto& buffer_ptr : buffers_) {
+    if (!buffer_ptr->in_use) {
+      buffer = buffer_ptr.get();
+      break;
+    }
+  }
+  if (buffer == nullptr) {
+    std::unique_ptr<Buffer> new_buffer(new (std::nothrow) Buffer);
+    if (new_buffer == nullptr || !buffers_.push_back(std::move(new_buffer))) {
+      return kStatusOutOfMemory;
+    }
+    buffer = buffers_.back().get();
+  }
+
+  if (buffer->size < min_size) {
+    std::unique_ptr<uint8_t[], MallocDeleter> new_data(
+        static_cast<uint8_t*>(malloc(min_size)));
+    if (new_data == nullptr) return kStatusOutOfMemory;
+    buffer->data = std::move(new_data);
+    buffer->size = min_size;
+  }
+
+  uint8_t* const y_buffer = buffer->data.get();
+  uint8_t* const u_buffer =
+      (info.uv_buffer_size == 0) ? nullptr : y_buffer + info.y_buffer_size;
+  uint8_t* const v_buffer =
+      (info.uv_buffer_size == 0) ? nullptr : u_buffer + info.uv_buffer_size;
+  status = Libgav1SetFrameBuffer(&info, y_buffer, u_buffer, v_buffer, buffer,
+                                 frame_buffer);
+  if (status != kStatusOk) return status;
+  buffer->in_use = true;
+  return kStatusOk;
+}
+
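+// Note on the overflow guard in GetFrameBuffer() above (illustrative, not
+// upstream text): the two comparisons together guarantee that
+// y_buffer_size + 2 * uv_buffer_size cannot wrap around size_t before
+// |min_size| is computed.
+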
+void InternalFrameBufferList::ReleaseFrameBuffer(void* buffer_private_data) {
+  auto* const buffer = static_cast<Buffer*>(buffer_private_data);
+  buffer->in_use = false;
+}
+
+}  // namespace libgav1
diff --git a/src/internal_frame_buffer_list.h b/src/internal_frame_buffer_list.h
new file mode 100644
index 0000000..1c50b48
--- /dev/null
+++ b/src/internal_frame_buffer_list.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+#define LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/memory.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+extern "C" Libgav1StatusCode OnInternalFrameBufferSizeChanged(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment);
+
+extern "C" Libgav1StatusCode GetInternalFrameBuffer(
+    void* callback_private_data, int bitdepth, Libgav1ImageFormat image_format,
+    int width, int height, int left_border, int right_border, int top_border,
+    int bottom_border, int stride_alignment, Libgav1FrameBuffer* frame_buffer);
+
+extern "C" void ReleaseInternalFrameBuffer(void* callback_private_data,
+                                           void* buffer_private_data);
+
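+// A minimal wiring sketch (not part of the upstream sources): the three
+// callbacks above are meant to be installed on the decoder settings, with
+// the list itself as |callback_private_data|. Member names here follow
+// src/gav1/decoder_settings.h.
+//
+//   InternalFrameBufferList buffer_list;
+//   DecoderSettings settings;
+//   settings.on_frame_buffer_size_changed = OnInternalFrameBufferSizeChanged;
+//   settings.get_frame_buffer = GetInternalFrameBuffer;
+//   settings.release_frame_buffer = ReleaseInternalFrameBuffer;
+//   settings.callback_private_data = &buffer_list;
+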
+class InternalFrameBufferList : public Allocable {
+ public:
+  InternalFrameBufferList() = default;
+
+  // Not copyable or movable.
+  InternalFrameBufferList(const InternalFrameBufferList&) = delete;
+  InternalFrameBufferList& operator=(const InternalFrameBufferList&) = delete;
+
+  ~InternalFrameBufferList() = default;
+
+  Libgav1StatusCode OnFrameBufferSizeChanged(int bitdepth,
+                                             Libgav1ImageFormat image_format,
+                                             int width, int height,
+                                             int left_border, int right_border,
+                                             int top_border, int bottom_border,
+                                             int stride_alignment);
+
+  Libgav1StatusCode GetFrameBuffer(int bitdepth,
+                                   Libgav1ImageFormat image_format, int width,
+                                   int height, int left_border,
+                                   int right_border, int top_border,
+                                   int bottom_border, int stride_alignment,
+                                   Libgav1FrameBuffer* frame_buffer);
+
+  void ReleaseFrameBuffer(void* buffer_private_data);
+
+ private:
+  struct Buffer : public Allocable {
+    std::unique_ptr<uint8_t[], MallocDeleter> data;
+    size_t size = 0;
+    bool in_use = false;
+  };
+
+  Vector<std::unique_ptr<Buffer>> buffers_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_INTERNAL_FRAME_BUFFER_LIST_H_
diff --git a/src/libgav1_decoder.cmake b/src/libgav1_decoder.cmake
new file mode 100644
index 0000000..b97d09d
--- /dev/null
+++ b/src/libgav1_decoder.cmake
@@ -0,0 +1,157 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_)
+  return()
+endif() # LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_
+set(LIBGAV1_SRC_LIBGAV1_DECODER_CMAKE_ 1)
+
+list(APPEND libgav1_decoder_sources
+            "${libgav1_source}/buffer_pool.cc"
+            "${libgav1_source}/buffer_pool.h"
+            "${libgav1_source}/decoder_impl.cc"
+            "${libgav1_source}/decoder_impl.h"
+            "${libgav1_source}/decoder_state.h"
+            "${libgav1_source}/tile_scratch_buffer.cc"
+            "${libgav1_source}/tile_scratch_buffer.h"
+            "${libgav1_source}/film_grain.cc"
+            "${libgav1_source}/film_grain.h"
+            "${libgav1_source}/frame_buffer.cc"
+            "${libgav1_source}/frame_buffer_utils.h"
+            "${libgav1_source}/frame_scratch_buffer.h"
+            "${libgav1_source}/inter_intra_masks.inc"
+            "${libgav1_source}/internal_frame_buffer_list.cc"
+            "${libgav1_source}/internal_frame_buffer_list.h"
+            "${libgav1_source}/loop_restoration_info.cc"
+            "${libgav1_source}/loop_restoration_info.h"
+            "${libgav1_source}/motion_vector.cc"
+            "${libgav1_source}/motion_vector.h"
+            "${libgav1_source}/obu_parser.cc"
+            "${libgav1_source}/obu_parser.h"
+            "${libgav1_source}/post_filter/cdef.cc"
+            "${libgav1_source}/post_filter/deblock.cc"
+            "${libgav1_source}/post_filter/deblock_thresholds.inc"
+            "${libgav1_source}/post_filter/loop_restoration.cc"
+            "${libgav1_source}/post_filter/post_filter.cc"
+            "${libgav1_source}/post_filter/super_res.cc"
+            "${libgav1_source}/post_filter.h"
+            "${libgav1_source}/prediction_mask.cc"
+            "${libgav1_source}/prediction_mask.h"
+            "${libgav1_source}/quantizer.cc"
+            "${libgav1_source}/quantizer.h"
+            "${libgav1_source}/quantizer_tables.inc"
+            "${libgav1_source}/reconstruction.cc"
+            "${libgav1_source}/reconstruction.h"
+            "${libgav1_source}/residual_buffer_pool.cc"
+            "${libgav1_source}/residual_buffer_pool.h"
+            "${libgav1_source}/scan_tables.inc"
+            "${libgav1_source}/symbol_decoder_context.cc"
+
"${libgav1_source}/symbol_decoder_context.h" + "${libgav1_source}/symbol_decoder_context_cdfs.inc" + "${libgav1_source}/threading_strategy.cc" + "${libgav1_source}/threading_strategy.h" + "${libgav1_source}/tile.h" + "${libgav1_source}/tile/bitstream/mode_info.cc" + "${libgav1_source}/tile/bitstream/palette.cc" + "${libgav1_source}/tile/bitstream/partition.cc" + "${libgav1_source}/tile/bitstream/transform_size.cc" + "${libgav1_source}/tile/prediction.cc" + "${libgav1_source}/tile/tile.cc" + "${libgav1_source}/warp_prediction.cc" + "${libgav1_source}/warp_prediction.h" + "${libgav1_source}/yuv_buffer.cc" + "${libgav1_source}/yuv_buffer.h") + +list(APPEND libgav1_api_includes "${libgav1_source}/gav1/decoder.h" + "${libgav1_source}/gav1/decoder_buffer.h" + "${libgav1_source}/gav1/decoder_settings.h" + "${libgav1_source}/gav1/frame_buffer.h" + "${libgav1_source}/gav1/status_code.h" + "${libgav1_source}/gav1/symbol_visibility.h" + "${libgav1_source}/gav1/version.h") + +list(APPEND libgav1_api_sources "${libgav1_source}/decoder.cc" + "${libgav1_source}/decoder_settings.cc" + "${libgav1_source}/status_code.cc" + "${libgav1_source}/version.cc" + ${libgav1_api_includes}) + +macro(libgav1_add_decoder_targets) + if(BUILD_SHARED_LIBS) + if(MSVC OR WIN32) + # In order to produce a DLL and import library the Windows tools require + # that the exported symbols are part of the DLL target. The unfortunate + # side effect of this is that a single configuration cannot output both + # the static library and the DLL: This results in an either/or situation. + # Windows users of the libgav1 build can have a DLL and an import library, + # or they can have a static library; they cannot have both from a single + # configuration of the build. + list(APPEND libgav1_shared_lib_sources ${libgav1_api_sources}) + list(APPEND libgav1_static_lib_sources ${libgav1_api_includes}) + else() + list(APPEND libgav1_shared_lib_sources ${libgav1_api_includes}) + list(APPEND libgav1_static_lib_sources ${libgav1_api_sources}) + endif() + else() + list(APPEND libgav1_static_lib_sources ${libgav1_api_sources}) + endif() + + if(NOT ANDROID) + list(APPEND libgav1_absl_deps absl::base absl::synchronization) + endif() + + libgav1_add_library(NAME libgav1_decoder TYPE OBJECT SOURCES + ${libgav1_decoder_sources} DEFINES ${libgav1_defines} + INCLUDES ${libgav1_include_paths}) + + libgav1_add_library(NAME + libgav1_static + OUTPUT_NAME + libgav1 + TYPE + STATIC + SOURCES + ${libgav1_static_lib_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + LIB_DEPS + ${libgav1_absl_deps} + OBJLIB_DEPS + libgav1_dsp + libgav1_decoder + libgav1_utils + PUBLIC_INCLUDES + ${libgav1_source}) + + if(BUILD_SHARED_LIBS) + libgav1_add_library(NAME + libgav1_shared + OUTPUT_NAME + libgav1 + TYPE + SHARED + SOURCES + ${libgav1_shared_lib_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + LIB_DEPS + libgav1_static + PUBLIC_INCLUDES + ${libgav1_source}) + endif() +endmacro() diff --git a/src/loop_restoration_info.cc b/src/loop_restoration_info.cc new file mode 100644 index 0000000..2dba57d --- /dev/null +++ b/src/loop_restoration_info.cc @@ -0,0 +1,240 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/loop_restoration_info.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Controls how self-guided deltas are read.
+constexpr int kSgrProjReadControl = 4;
+// Maps the restoration type encoded in the compressed headers (restoration_type
+// element in the spec) of the bitstream to LoopRestorationType. This is used
+// only when the restoration type in the frame header is
+// LoopRestorationTypeSwitchable.
+constexpr LoopRestorationType kBitstreamRestorationTypeMap[] = {
+    kLoopRestorationTypeNone, kLoopRestorationTypeWiener,
+    kLoopRestorationTypeSgrProj};
+
+inline int CountLeadingZeroCoefficients(const int16_t* const filter) {
+  int number_zero_coefficients = 0;
+  if (filter[0] == 0) {
+    number_zero_coefficients++;
+    if (filter[1] == 0) {
+      number_zero_coefficients++;
+      if (filter[2] == 0) {
+        number_zero_coefficients++;
+      }
+    }
+  }
+  return number_zero_coefficients;
+}
+
+}  // namespace
+
+bool LoopRestorationInfo::Reset(const LoopRestoration* const loop_restoration,
+                                uint32_t width, uint32_t height,
+                                int8_t subsampling_x, int8_t subsampling_y,
+                                bool is_monochrome) {
+  loop_restoration_ = loop_restoration;
+  subsampling_x_ = subsampling_x;
+  subsampling_y_ = subsampling_y;
+
+  const int num_planes = is_monochrome ? kMaxPlanesMonochrome : kMaxPlanes;
+  int total_num_units = 0;
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+      plane_needs_filtering_[plane] = false;
+      continue;
+    }
+    plane_needs_filtering_[plane] = true;
+    const int plane_width =
+        (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
+    const int plane_height =
+        (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+    num_horizontal_units_[plane] =
+        std::max(1, RightShiftWithRounding(
+                        plane_width, loop_restoration_->unit_size_log2[plane]));
+    num_vertical_units_[plane] = std::max(
+        1, RightShiftWithRounding(plane_height,
+                                  loop_restoration_->unit_size_log2[plane]));
+    num_units_[plane] =
+        num_horizontal_units_[plane] * num_vertical_units_[plane];
+    total_num_units += num_units_[plane];
+  }
+  // Allocate the RestorationUnitInfo arrays for all planes in a single heap
+  // allocation and divide up the buffer into arrays of the right sizes.
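+  // Illustrative example (assumed unit counts): with num_units_ = {6, 2, 2},
+  // the single buffer holds 10 RestorationUnitInfo entries and the loop
+  // below hands out [0, 6) to Y, [6, 8) to U and [8, 10) to V.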
+  if (!loop_restoration_info_buffer_.Resize(total_num_units)) {
+    return false;
+  }
+  RestorationUnitInfo* loop_restoration_info =
+      loop_restoration_info_buffer_.get();
+  for (int plane = kPlaneY; plane < num_planes; ++plane) {
+    if (loop_restoration_->type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    loop_restoration_info_[plane] = loop_restoration_info;
+    loop_restoration_info += num_units_[plane];
+  }
+  return true;
+}
+
+bool LoopRestorationInfo::PopulateUnitInfoForSuperBlock(
+    Plane plane, BlockSize block_size, bool is_superres_scaled,
+    uint8_t superres_scale_denominator, int row4x4, int column4x4,
+    LoopRestorationUnitInfo* const unit_info) const {
+  assert(unit_info != nullptr);
+  if (!plane_needs_filtering_[plane]) return false;
+  const int numerator_column =
+      is_superres_scaled ? superres_scale_denominator : 1;
+  const int pixel_column_start =
+      RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
+  const int pixel_column_end = RowOrColumn4x4ToPixel(
+      column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
+  const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+  const int denominator_column_log2 =
+      unit_row_log2 + (is_superres_scaled ? 3 : 0);
+  const int pixel_row_start =
+      RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
+  const int pixel_row_end = RowOrColumn4x4ToPixel(
+      row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
+  unit_info->column_start = RightShiftWithCeiling(
+      pixel_column_start * numerator_column, denominator_column_log2);
+  unit_info->column_end = RightShiftWithCeiling(
+      pixel_column_end * numerator_column, denominator_column_log2);
+  unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+  unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
+  unit_info->column_end =
+      std::min(unit_info->column_end, num_horizontal_units_[plane]);
+  unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
+  return true;
+}
+
+void LoopRestorationInfo::ReadUnitCoefficients(
+    DaalaBitReader* const reader,
+    SymbolDecoderContext* const symbol_decoder_context, Plane plane,
+    int unit_id,
+    std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+  LoopRestorationType unit_restoration_type = kLoopRestorationTypeNone;
+  if (loop_restoration_->type[plane] == kLoopRestorationTypeSwitchable) {
+    unit_restoration_type = kBitstreamRestorationTypeMap
+        [reader->ReadSymbol<kRestorationTypeSymbolCount>(
+            symbol_decoder_context->restoration_type_cdf)];
+  } else if (loop_restoration_->type[plane] == kLoopRestorationTypeWiener) {
+    const bool use_wiener =
+        reader->ReadSymbol(symbol_decoder_context->use_wiener_cdf);
+    if (use_wiener) unit_restoration_type = kLoopRestorationTypeWiener;
+  } else if (loop_restoration_->type[plane] == kLoopRestorationTypeSgrProj) {
+    const bool use_sgrproj =
+        reader->ReadSymbol(symbol_decoder_context->use_sgrproj_cdf);
+    if (use_sgrproj) unit_restoration_type = kLoopRestorationTypeSgrProj;
+  }
+  loop_restoration_info_[plane][unit_id].type = unit_restoration_type;
+
+  if (unit_restoration_type == kLoopRestorationTypeWiener) {
+    ReadWienerInfo(reader, plane, unit_id, reference_unit_info);
+  } else if (unit_restoration_type == kLoopRestorationTypeSgrProj) {
+    ReadSgrProjInfo(reader, plane, unit_id, reference_unit_info);
+  }
+}
+
+void LoopRestorationInfo::ReadWienerInfo(
+    DaalaBitReader* const reader, Plane plane, int unit_id,
+    std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+  for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+    if (plane != kPlaneY) {
+      loop_restoration_info_[plane][unit_id].wiener_info.filter[i][0] = 0;
+    }
+    int sum = 0;
+    for (int j = static_cast<int>(plane != kPlaneY); j < kNumWienerCoefficients;
+         ++j) {
+      const int8_t wiener_min = kWienerTapsMin[j];
+      const int8_t wiener_max = kWienerTapsMax[j];
+      const int control = j + 1;
+      int value;
+      if (!reader->DecodeSignedSubexpWithReference(
+              wiener_min, wiener_max + 1,
+              (*reference_unit_info)[plane].wiener_info.filter[i][j], control,
+              &value)) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "Error decoding Wiener filter coefficients: plane %d, unit_id %d",
+            static_cast<int>(plane), unit_id);
+        return;
+      }
+      loop_restoration_info_[plane][unit_id].wiener_info.filter[i][j] = value;
+      (*reference_unit_info)[plane].wiener_info.filter[i][j] = value;
+      sum += value;
+    }
+    loop_restoration_info_[plane][unit_id].wiener_info.filter[i][3] =
+        128 - 2 * sum;
+    loop_restoration_info_[plane][unit_id]
+        .wiener_info.number_leading_zero_coefficients[i] =
+        CountLeadingZeroCoefficients(
+            loop_restoration_info_[plane][unit_id].wiener_info.filter[i]);
+  }
+}
+
+void LoopRestorationInfo::ReadSgrProjInfo(
+    DaalaBitReader* const reader, Plane plane, int unit_id,
+    std::array<RestorationUnitInfo, kMaxPlanes>* const reference_unit_info) {
+  const int sgr_proj_index =
+      static_cast<int>(reader->ReadLiteral(kSgrProjParamsBits));
+  loop_restoration_info_[plane][unit_id].sgr_proj_info.index = sgr_proj_index;
+  for (int i = 0; i < 2; ++i) {
+    const uint8_t radius = kSgrProjParams[sgr_proj_index][i * 2];
+    const int8_t multiplier_min = kSgrProjMultiplierMin[i];
+    const int8_t multiplier_max = kSgrProjMultiplierMax[i];
+    int multiplier;
+    if (radius != 0) {
+      if (!reader->DecodeSignedSubexpWithReference(
+              multiplier_min, multiplier_max + 1,
+              (*reference_unit_info)[plane].sgr_proj_info.multiplier[i],
+              kSgrProjReadControl, &multiplier)) {
+        LIBGAV1_DLOG(ERROR,
+                     "Error decoding Self-guided filter coefficients: plane "
+                     "%d, unit_id %d",
+                     static_cast<int>(plane), unit_id);
+        return;
+      }
+    } else {
+      // The range of (*reference_unit_info)[plane].sgr_proj_info.multiplier[0]
+      // from DecodeSignedSubexpWithReference() is [-96, 31], the default is
+      // -32, making Clip3(128 - 31, -32, 95) unnecessary.
+      static constexpr int kMultiplier[2] = {0, 95};
+      multiplier = kMultiplier[i];
+      assert(
+          i == 0 ||
+          Clip3((1 << kSgrProjPrecisionBits) -
+                    (*reference_unit_info)[plane].sgr_proj_info.multiplier[0],
+                multiplier_min, multiplier_max) == kMultiplier[1]);
+    }
+    loop_restoration_info_[plane][unit_id].sgr_proj_info.multiplier[i] =
+        multiplier;
+    (*reference_unit_info)[plane].sgr_proj_info.multiplier[i] = multiplier;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/loop_restoration_info.h b/src/loop_restoration_info.h
new file mode 100644
index 0000000..f174b89
--- /dev/null
+++ b/src/loop_restoration_info.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+#define LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include "src/dsp/common.h"
+#include "src/symbol_decoder_context.h"
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
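+// Note (editorial sketch, inferred from usage): the four fields below form
+// half-open ranges in restoration-unit coordinates, so a consumer iterates
+// [row_start, row_end) x [column_start, column_end); e.g. {0, 2, 0, 3}
+// covers the first two unit rows and the first three unit columns.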
+struct LoopRestorationUnitInfo {
+  int row_start;
+  int row_end;
+  int column_start;
+  int column_end;
+};
+
+class LoopRestorationInfo {
+ public:
+  LoopRestorationInfo() = default;
+
+  // Non copyable/movable.
+  LoopRestorationInfo(const LoopRestorationInfo&) = delete;
+  LoopRestorationInfo& operator=(const LoopRestorationInfo&) = delete;
+  LoopRestorationInfo(LoopRestorationInfo&&) = delete;
+  LoopRestorationInfo& operator=(LoopRestorationInfo&&) = delete;
+
+  bool Reset(const LoopRestoration* loop_restoration, uint32_t width,
+             uint32_t height, int8_t subsampling_x, int8_t subsampling_y,
+             bool is_monochrome);
+  // Populates the |unit_info| for the super block at |row4x4|, |column4x4|.
+  // Returns true on success, false otherwise.
+  bool PopulateUnitInfoForSuperBlock(Plane plane, BlockSize block_size,
+                                     bool is_superres_scaled,
+                                     uint8_t superres_scale_denominator,
+                                     int row4x4, int column4x4,
+                                     LoopRestorationUnitInfo* unit_info) const;
+  void ReadUnitCoefficients(DaalaBitReader* reader,
+                            SymbolDecoderContext* symbol_decoder_context,
+                            Plane plane, int unit_id,
+                            std::array<RestorationUnitInfo, kMaxPlanes>*
+                                reference_unit_info);  // 5.11.58.
+  void ReadWienerInfo(
+      DaalaBitReader* reader, Plane plane, int unit_id,
+      std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+  void ReadSgrProjInfo(
+      DaalaBitReader* reader, Plane plane, int unit_id,
+      std::array<RestorationUnitInfo, kMaxPlanes>* reference_unit_info);
+
+  // Getters.
+  const RestorationUnitInfo* loop_restoration_info(Plane plane,
+                                                   int unit_id) const {
+    return &loop_restoration_info_[plane][unit_id];
+  }
+
+  int num_horizontal_units(Plane plane) const {
+    return num_horizontal_units_[plane];
+  }
+  int num_vertical_units(Plane plane) const {
+    return num_vertical_units_[plane];
+  }
+  int num_units(Plane plane) const { return num_units_[plane]; }
+
+ private:
+  // If plane_needs_filtering_[plane] is true, loop_restoration_info_[plane]
+  // points to an array of num_units_[plane] elements.
+  RestorationUnitInfo* loop_restoration_info_[kMaxPlanes];
+  // Owns the memory that loop_restoration_info_[plane] points to.
+  DynamicBuffer<RestorationUnitInfo> loop_restoration_info_buffer_;
+  bool plane_needs_filtering_[kMaxPlanes];
+  const LoopRestoration* loop_restoration_;
+  int8_t subsampling_x_;
+  int8_t subsampling_y_;
+  int num_horizontal_units_[kMaxPlanes];
+  int num_vertical_units_[kMaxPlanes];
+  int num_units_[kMaxPlanes];
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_LOOP_RESTORATION_INFO_H_
diff --git a/src/motion_vector.cc b/src/motion_vector.cc
new file mode 100644
index 0000000..fdb1875
--- /dev/null
+++ b/src/motion_vector.cc
@@ -0,0 +1,1001 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/motion_vector.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// Entry at index i is computed as:
+// Clip3(std::max(kBlockWidthPixels[i], kBlockHeightPixels[i]), 16, 112).
+constexpr int kWarpValidThreshold[kMaxBlockSizes] = {
+    16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 32,
+    64, 32, 32, 32, 64, 64, 64, 64, 112, 112, 112};
+
+// 7.10.2.10.
+void LowerMvPrecision(const ObuFrameHeader& frame_header,
+                      MotionVector* const mvs) {
+  if (frame_header.allow_high_precision_mv) return;
+  if (frame_header.force_integer_mv != 0) {
+    for (auto& mv : mvs->mv) {
+      // The next line is equivalent to:
+      // const int value = (std::abs(static_cast<int>(mv)) + 3) & ~7;
+      // const int sign = mv >> 15;
+      // mv = ApplySign(value, sign);
+      mv = (mv + 3 - (mv >> 15)) & ~7;
+    }
+  } else {
+    for (auto& mv : mvs->mv) {
+      // The next line is equivalent to:
+      // if ((mv & 1) != 0) mv += (mv > 0) ? -1 : 1;
+      mv = (mv - (mv >> 15)) & ~1;
+    }
+  }
+}
+
+// 7.10.2.1.
+void SetupGlobalMv(const Tile::Block& block, int index,
+                   MotionVector* const mv) {
+  const BlockParameters& bp = *block.bp;
+  const ObuFrameHeader& frame_header = block.tile.frame_header();
+  ReferenceFrameType reference_type = bp.reference_frame[index];
+  const auto& gm = frame_header.global_motion[reference_type];
+  if (reference_type == kReferenceFrameIntra ||
+      gm.type == kGlobalMotionTransformationTypeIdentity) {
+    mv->mv32 = 0;
+    return;
+  }
+  if (gm.type == kGlobalMotionTransformationTypeTranslation) {
+    for (int i = 0; i < 2; ++i) {
+      mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
+    }
+    LowerMvPrecision(frame_header, mv);
+    return;
+  }
+  const int x = MultiplyBy4(block.column4x4) + DivideBy2(block.width) - 1;
+  const int y = MultiplyBy4(block.row4x4) + DivideBy2(block.height) - 1;
+  const int xc = (gm.params[2] - (1 << kWarpedModelPrecisionBits)) * x +
+                 gm.params[3] * y + gm.params[0];
+  const int yc = gm.params[4] * x +
+                 (gm.params[5] - (1 << kWarpedModelPrecisionBits)) * y +
+                 gm.params[1];
+  if (frame_header.allow_high_precision_mv) {
+    mv->mv[MotionVector::kRow] =
+        RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 3);
+    mv->mv[MotionVector::kColumn] =
+        RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 3);
+  } else {
+    mv->mv[MotionVector::kRow] = MultiplyBy2(
+        RightShiftWithRoundingSigned(yc, kWarpedModelPrecisionBits - 2));
+    mv->mv[MotionVector::kColumn] = MultiplyBy2(
+        RightShiftWithRoundingSigned(xc, kWarpedModelPrecisionBits - 2));
+    LowerMvPrecision(frame_header, mv);
+  }
+}
+
+constexpr BitMaskSet kPredictionModeNewMvMask(kPredictionModeNewMv,
+                                              kPredictionModeNewNewMv,
+                                              kPredictionModeNearNewMv,
+                                              kPredictionModeNewNearMv,
+                                              kPredictionModeNearestNewMv,
+                                              kPredictionModeNewNearestMv);
+
+// 7.10.2.8.
+void SearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+                 int index, int weight, bool* const found_new_mv,
+                 bool* const found_match, int* const num_mv_found) {
+  const BlockParameters& bp = *block.bp;
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+      block.tile.frame_header().global_motion;
+  PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  MotionVector candidate_mv;
+  // LowerMvPrecision() is not necessary, since the values in
+  // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+  const auto global_motion_type = global_motion[bp.reference_frame[0]].type;
+  if (IsGlobalMvBlock(mv_bp.is_global_mv_block, global_motion_type)) {
+    candidate_mv = prediction_parameters.global_mv[0];
+  } else {
+    candidate_mv = mv_bp.mv.mv[index];
+  }
+  *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+  *found_match = true;
+  MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+  const int num_found = *num_mv_found;
+  const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+                                   [&candidate_mv](const MotionVector& ref_mv) {
+                                     return ref_mv == candidate_mv;
+                                   });
+  if (result != ref_mv_stack + num_found) {
+    prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
+                                         weight);
+    return;
+  }
+  if (num_found >= kMaxRefMvStackSize) return;
+  ref_mv_stack[num_found] = candidate_mv;
+  prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+  ++*num_mv_found;
+}
+
+// 7.10.2.9.
+void CompoundSearchStack(const Tile::Block& block, const BlockParameters& mv_bp,
+                         int weight, bool* const found_new_mv,
+                         bool* const found_match, int* const num_mv_found) {
+  const BlockParameters& bp = *block.bp;
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>& global_motion =
+      block.tile.frame_header().global_motion;
+  PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  // LowerMvPrecision() is not necessary, since the values in
+  // |prediction_parameters.global_mv| and |mv_bp.mv| were generated by it.
+  CompoundMotionVector candidate_mv = mv_bp.mv;
+  for (int i = 0; i < 2; ++i) {
+    const auto global_motion_type = global_motion[bp.reference_frame[i]].type;
+    if (IsGlobalMvBlock(mv_bp.is_global_mv_block, global_motion_type)) {
+      candidate_mv.mv[i] = prediction_parameters.global_mv[i];
+    }
+  }
+  *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
+  *found_match = true;
+  CompoundMotionVector* const compound_ref_mv_stack =
+      prediction_parameters.compound_ref_mv_stack;
+  const int num_found = *num_mv_found;
+  const auto result =
+      std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+                   [&candidate_mv](const CompoundMotionVector& ref_mv) {
+                     return ref_mv == candidate_mv;
+                   });
+  if (result != compound_ref_mv_stack + num_found) {
+    prediction_parameters.IncreaseWeight(
+        std::distance(compound_ref_mv_stack, result), weight);
+    return;
+  }
+  if (num_found >= kMaxRefMvStackSize) return;
+  compound_ref_mv_stack[num_found] = candidate_mv;
+  prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
+  ++*num_mv_found;
+}
+
+// 7.10.2.7.
+void AddReferenceMvCandidate(const Tile::Block& block,
+                             const BlockParameters& mv_bp, bool is_compound,
+                             int weight, bool* const found_new_mv,
+                             bool* const found_match, int* const num_mv_found) {
+  if (!mv_bp.is_inter) return;
+  const BlockParameters& bp = *block.bp;
+  if (is_compound) {
+    if (mv_bp.reference_frame[0] == bp.reference_frame[0] &&
+        mv_bp.reference_frame[1] == bp.reference_frame[1]) {
+      CompoundSearchStack(block, mv_bp, weight, found_new_mv, found_match,
+                          num_mv_found);
+    }
+    return;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (mv_bp.reference_frame[i] == bp.reference_frame[0]) {
+      SearchStack(block, mv_bp, i, weight, found_new_mv, found_match,
+                  num_mv_found);
+    }
+  }
+}
+
+int GetMinimumStep(int block_width_or_height4x4, int delta_row_or_column) {
+  assert(delta_row_or_column < 0);
+  if (block_width_or_height4x4 >= 16) return 4;
+  if (delta_row_or_column < -1) return 2;
+  return 0;
+}
+
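+// Worked example for GetMinimumStep() (illustrative): a block with
+// width4x4 == 16 (64 pixels wide) scanning the row above (delta_row == -1)
+// gets a minimum step of 4, so at most 4 of its 16 candidate columns are
+// visited; for a narrower block the returned 0 lets each neighbor's own
+// width drive the stride of the scan below.
+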
+// 7.10.2.2.
+void ScanRow(const Tile::Block& block, int mv_column, int delta_row,
+             bool is_compound, bool* const found_new_mv,
+             bool* const found_match, int* const num_mv_found) {
+  const int mv_row = block.row4x4 + delta_row;
+  const Tile& tile = block.tile;
+  if (!tile.IsTopInside(mv_row + 1)) return;
+  const int width4x4 = block.width4x4;
+  const int min_step = GetMinimumStep(width4x4, delta_row);
+  BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+  BlockParameters** const end_bps =
+      bps + std::min({static_cast<int>(width4x4),
+                      tile.frame_header().columns4x4 - block.column4x4, 16});
+  do {
+    const BlockParameters& mv_bp = **bps;
+    const int step = std::max(
+        std::min(width4x4, static_cast<int>(kNum4x4BlocksWide[mv_bp.size])),
+        min_step);
+    AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+                            found_new_mv, found_match, num_mv_found);
+    bps += step;
+  } while (bps < end_bps);
+}
+
+// 7.10.2.3.
+void ScanColumn(const Tile::Block& block, int mv_row, int delta_column,
+                bool is_compound, bool* const found_new_mv,
+                bool* const found_match, int* const num_mv_found) {
+  const int mv_column = block.column4x4 + delta_column;
+  const Tile& tile = block.tile;
+  if (!tile.IsLeftInside(mv_column + 1)) return;
+  const int height4x4 = block.height4x4;
+  const int min_step = GetMinimumStep(height4x4, delta_column);
+  const ptrdiff_t stride = tile.BlockParametersStride();
+  BlockParameters** bps = tile.BlockParametersAddress(mv_row, mv_column);
+  BlockParameters** const end_bps =
+      bps + stride * std::min({static_cast<int>(height4x4),
+                               tile.frame_header().rows4x4 - block.row4x4, 16});
+  do {
+    const BlockParameters& mv_bp = **bps;
+    const int step = std::max(
+        std::min(height4x4, static_cast<int>(kNum4x4BlocksHigh[mv_bp.size])),
+        min_step);
+    AddReferenceMvCandidate(block, mv_bp, is_compound, MultiplyBy2(step),
+                            found_new_mv, found_match, num_mv_found);
+    bps += step * stride;
+  } while (bps < end_bps);
+}
+
+// 7.10.2.4.
+void ScanPoint(const Tile::Block& block, int delta_row, int delta_column,
+               bool is_compound, bool* const found_new_mv,
+               bool* const found_match, int* const num_mv_found) {
+  const int mv_row = block.row4x4 + delta_row;
+  const int mv_column = block.column4x4 + delta_column;
+  const Tile& tile = block.tile;
+  if (!tile.IsInside(mv_row, mv_column) ||
+      !tile.HasParameters(mv_row, mv_column)) {
+    return;
+  }
+  const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column);
+  if (mv_bp.reference_frame[0] == kReferenceFrameNone) return;
+  AddReferenceMvCandidate(block, mv_bp, is_compound, 4, found_new_mv,
+                          found_match, num_mv_found);
+}
+
+// 7.10.2.6.
+void AddTemporalReferenceMvCandidate(
+    const ObuFrameHeader& frame_header, const int reference_offsets[2],
+    const MotionVector* const temporal_mvs,
+    const int8_t* const temporal_reference_offsets, int count, bool is_compound,
+    int* const zero_mv_context, int* const num_mv_found,
+    PredictionParameters* const prediction_parameters) {
+  const int mv_projection_function_index =
+      frame_header.allow_high_precision_mv ? 2 : frame_header.force_integer_mv;
+  const MotionVector* const global_mv = prediction_parameters->global_mv;
+  if (is_compound) {
+    CompoundMotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+    const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+    dsp.mv_projection_compound[mv_projection_function_index](
+        temporal_mvs, temporal_reference_offsets, reference_offsets, count,
+        candidate_mvs);
+    if (*zero_mv_context == -1) {
+      int max_difference =
+          std::max(std::abs(candidate_mvs[0].mv[0].mv[0] - global_mv[0].mv[0]),
+                   std::abs(candidate_mvs[0].mv[0].mv[1] - global_mv[0].mv[1]));
+      max_difference =
+          std::max(max_difference,
+                   std::abs(candidate_mvs[0].mv[1].mv[0] - global_mv[1].mv[0]));
+      max_difference =
+          std::max(max_difference,
+                   std::abs(candidate_mvs[0].mv[1].mv[1] - global_mv[1].mv[1]));
+      *zero_mv_context = static_cast<int>(max_difference >= 16);
+    }
+    CompoundMotionVector* const compound_ref_mv_stack =
+        prediction_parameters->compound_ref_mv_stack;
+    int num_found = *num_mv_found;
+    int index = 0;
+    do {
+      const CompoundMotionVector& candidate_mv = candidate_mvs[index];
+      const auto result =
+          std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+                       [&candidate_mv](const CompoundMotionVector& ref_mv) {
+                         return ref_mv == candidate_mv;
+                       });
+      if (result != compound_ref_mv_stack + num_found) {
+        prediction_parameters->IncreaseWeight(
+            std::distance(compound_ref_mv_stack, result), 2);
+        continue;
+      }
+      if (num_found >= kMaxRefMvStackSize) continue;
+      compound_ref_mv_stack[num_found] = candidate_mv;
+      prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+      ++num_found;
+    } while (++index < count);
+    *num_mv_found = num_found;
+    return;
+  }
+  MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
+  if (reference_offsets[0] == 0) {
+    if (*zero_mv_context == -1) {
+      const int max_difference =
+          std::max(std::abs(global_mv[0].mv[0]), std::abs(global_mv[0].mv[1]));
+      *zero_mv_context = static_cast<int>(max_difference >= 16);
+    }
+    const MotionVector candidate_mv = {};
+    const int num_found = *num_mv_found;
+    const auto result =
+        std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+                     [&candidate_mv](const MotionVector& ref_mv) {
+                       return ref_mv == candidate_mv;
+                     });
+    if (result != ref_mv_stack + num_found) {
+      prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+                                            2 * count);
+      return;
+    }
+    if (num_found >= kMaxRefMvStackSize) return;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
+    ++*num_mv_found;
+    return;
+  }
+  alignas(kMaxAlignment)
+      MotionVector candidate_mvs[kMaxTemporalMvCandidatesWithPadding];
+  const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+  dsp.mv_projection_single[mv_projection_function_index](
+      temporal_mvs, temporal_reference_offsets, reference_offsets[0], count,
+      candidate_mvs);
+  if (*zero_mv_context == -1) {
+    const int max_difference =
+        std::max(std::abs(candidate_mvs[0].mv[0] - global_mv[0].mv[0]),
+                 std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
+    *zero_mv_context = static_cast<int>(max_difference >= 16);
+  }
+  int num_found = *num_mv_found;
+  int index = 0;
+  do {
+    const MotionVector& candidate_mv = candidate_mvs[index];
+    const auto result =
+        std::find_if(ref_mv_stack, ref_mv_stack + num_found,
+                     [&candidate_mv](const MotionVector& ref_mv) {
+                       return ref_mv == candidate_mv;
+                     });
+    if (result != ref_mv_stack + num_found) {
+      prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
+                                            2);
+      continue;
+    }
+    if (num_found >= kMaxRefMvStackSize) continue;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+    ++num_found;
+  } while (++index < count);
+  *num_mv_found = num_found;
+}
+
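+// Note on the |max_difference >= 16| checks in
+// AddTemporalReferenceMvCandidate() above (illustrative): motion vectors are
+// stored in 1/8-pel units, so the zero-mv context flips to 1 once a projected
+// candidate differs from the global mv by two or more full pixels in any
+// component.
+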
+// Part of 7.10.2.5.
+bool IsWithinTheSame64x64Block(const Tile::Block& block, int delta_row,
+                               int delta_column) {
+  const int row = (block.row4x4 & 15) + delta_row;
+  const int column = (block.column4x4 & 15) + delta_column;
+  // |block.height4x4| is at least 2 for all elements in |kTemporalScanMask|.
+  // So |row| is always non-negative.
+  assert(row >= 0);
+  return row < 16 && column >= 0 && column < 16;
+}
+
+constexpr BitMaskSet kTemporalScanMask(kBlock8x8, kBlock8x16, kBlock8x32,
+                                       kBlock16x8, kBlock16x16, kBlock16x32,
+                                       kBlock32x8, kBlock32x16, kBlock32x32);
+
+// 7.10.2.5.
+void TemporalScan(const Tile::Block& block, bool is_compound,
+                  int* const zero_mv_context, int* const num_mv_found) {
+  const int step_w = (block.width4x4 >= 16) ? 4 : 2;
+  const int step_h = (block.height4x4 >= 16) ? 4 : 2;
+  const int row_start = block.row4x4 | 1;
+  const int column_start = block.column4x4 | 1;
+  const int row_end =
+      row_start + std::min(static_cast<int>(block.height4x4), 16);
+  const int column_end =
+      column_start + std::min(static_cast<int>(block.width4x4), 16);
+  const Tile& tile = block.tile;
+  const TemporalMotionField& motion_field = tile.motion_field();
+  const int stride = motion_field.mv.columns();
+  const MotionVector* motion_field_mv = motion_field.mv[0];
+  const int8_t* motion_field_reference_offset =
+      motion_field.reference_offset[0];
+  alignas(kMaxAlignment)
+      MotionVector temporal_mvs[kMaxTemporalMvCandidatesWithPadding];
+  int8_t temporal_reference_offsets[kMaxTemporalMvCandidatesWithPadding];
+  int count = 0;
+  int offset = stride * (row_start >> 1);
+  int mv_row = row_start;
+  do {
+    int mv_column = column_start;
+    do {
+      // Both horizontal and vertical offsets are positive. Only bottom and
+      // right boundaries need to be checked.
+      if (tile.IsBottomRightInside(mv_row, mv_column)) {
+        const int x8 = mv_column >> 1;
+        const MotionVector temporal_mv = motion_field_mv[offset + x8];
+        if (temporal_mv.mv[0] == kInvalidMvValue) {
+          if (mv_row == row_start && mv_column == column_start) {
+            *zero_mv_context = 1;
+          }
+        } else {
+          temporal_mvs[count] = temporal_mv;
+          temporal_reference_offsets[count++] =
+              motion_field_reference_offset[offset + x8];
+        }
+      }
+      mv_column += step_w;
+    } while (mv_column < column_end);
+    offset += stride * step_h >> 1;
+    mv_row += step_h;
+  } while (mv_row < row_end);
+  if (kTemporalScanMask.Contains(block.size)) {
+    const int temporal_sample_positions[3][2] = {
+        {block.height4x4, -2},
+        {block.height4x4, block.width4x4},
+        {block.height4x4 - 2, block.width4x4}};
+    // Getting the address of an element in Array2D is slow. Precalculate the
+    // offsets.
+    int temporal_sample_offsets[3];
+    temporal_sample_offsets[0] = stride * ((row_start + block.height4x4) >> 1) +
+                                 ((column_start - 2) >> 1);
+    temporal_sample_offsets[1] =
+        temporal_sample_offsets[0] + ((block.width4x4 + 2) >> 1);
+    temporal_sample_offsets[2] = temporal_sample_offsets[1] - stride;
+    for (int i = 0; i < 3; i++) {
+      const int row = temporal_sample_positions[i][0];
+      const int column = temporal_sample_positions[i][1];
+      if (!IsWithinTheSame64x64Block(block, row, column)) continue;
+      const int mv_row = row_start + row;
+      const int mv_column = column_start + column;
+      // IsWithinTheSame64x64Block() guarantees the reference block is inside
+      // the top and left boundary.
+      if (!tile.IsBottomRightInside(mv_row, mv_column)) continue;
+      const MotionVector temporal_mv =
+          motion_field_mv[temporal_sample_offsets[i]];
+      if (temporal_mv.mv[0] != kInvalidMvValue) {
+        temporal_mvs[count] = temporal_mv;
+        temporal_reference_offsets[count++] =
+            motion_field_reference_offset[temporal_sample_offsets[i]];
+      }
+    }
+  }
+  if (count != 0) {
+    BlockParameters* const bp = block.bp;
+    int reference_offsets[2];
+    const int offset_0 = tile.current_frame()
+                             .reference_info()
+                             ->relative_distance_to[bp->reference_frame[0]];
+    reference_offsets[0] =
+        Clip3(offset_0, -kMaxFrameDistance, kMaxFrameDistance);
+    if (is_compound) {
+      const int offset_1 = tile.current_frame()
+                               .reference_info()
+                               ->relative_distance_to[bp->reference_frame[1]];
+      reference_offsets[1] =
+          Clip3(offset_1, -kMaxFrameDistance, kMaxFrameDistance);
+      // Pad so that SIMD implementations won't read uninitialized memory.
+      if ((count & 1) != 0) {
+        temporal_mvs[count].mv32 = 0;
+        temporal_reference_offsets[count] = 0;
+      }
+    } else {
+      // Pad so that SIMD implementations won't read uninitialized memory.
+      for (int i = count; i < ((count + 3) & ~3); ++i) {
+        temporal_mvs[i].mv32 = 0;
+        temporal_reference_offsets[i] = 0;
+      }
+    }
+    AddTemporalReferenceMvCandidate(
+        tile.frame_header(), reference_offsets, temporal_mvs,
+        temporal_reference_offsets, count, is_compound, zero_mv_context,
+        num_mv_found, &(*bp->prediction_parameters));
+  }
+}
+
+// Part of 7.10.2.13.
+void AddExtraCompoundMvCandidate(const Tile::Block& block, int mv_row,
+                                 int mv_column, int* const ref_id_count,
+                                 MotionVector ref_id[2][2],
+                                 int* const ref_diff_count,
+                                 MotionVector ref_diff[2][2]) {
+  const auto& bp = block.tile.Parameters(mv_row, mv_column);
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+      block.tile.reference_frame_sign_bias();
+  for (int i = 0; i < 2; ++i) {
+    const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+    if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+    for (int j = 0; j < 2; ++j) {
+      MotionVector candidate_mv = bp.mv.mv[i];
+      const ReferenceFrameType block_reference_frame =
+          block.bp->reference_frame[j];
+      if (candidate_reference_frame == block_reference_frame &&
+          ref_id_count[j] < 2) {
+        ref_id[j][ref_id_count[j]] = candidate_mv;
+        ++ref_id_count[j];
+      } else if (ref_diff_count[j] < 2) {
+        if (reference_frame_sign_bias[candidate_reference_frame] !=
+            reference_frame_sign_bias[block_reference_frame]) {
+          candidate_mv.mv[0] *= -1;
+          candidate_mv.mv[1] *= -1;
+        }
+        ref_diff[j][ref_diff_count[j]] = candidate_mv;
+        ++ref_diff_count[j];
+      }
+    }
+  }
+}
+
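+// Worked example for the sign-bias handling above and in
+// AddExtraSingleMvCandidate() below (illustrative): when a candidate's
+// reference frame lies on the opposite temporal side of the current frame
+// (differing sign bias), its motion vector is negated before reuse, e.g.
+// {row, column} = {4, -8} becomes {-4, 8}.
+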
+// Part of 7.10.2.13.
+void AddExtraSingleMvCandidate(const Tile::Block& block, int mv_row,
+                               int mv_column, int* const num_mv_found) {
+  const auto& bp = block.tile.Parameters(mv_row, mv_column);
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias =
+      block.tile.reference_frame_sign_bias();
+  const ReferenceFrameType block_reference_frame = block.bp->reference_frame[0];
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+  int num_found = *num_mv_found;
+  for (int i = 0; i < 2; ++i) {
+    const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
+    if (candidate_reference_frame <= kReferenceFrameIntra) continue;
+    MotionVector candidate_mv = bp.mv.mv[i];
+    if (reference_frame_sign_bias[candidate_reference_frame] !=
+        reference_frame_sign_bias[block_reference_frame]) {
+      candidate_mv.mv[0] *= -1;
+      candidate_mv.mv[1] *= -1;
+    }
+    assert(num_found <= 2);
+    if ((num_found != 0 && ref_mv_stack[0] == candidate_mv) ||
+        (num_found == 2 && ref_mv_stack[1] == candidate_mv)) {
+      continue;
+    }
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+    ++num_found;
+  }
+  *num_mv_found = num_found;
+}
+
+// 7.10.2.12.
+void ExtraSearch(const Tile::Block& block, bool is_compound,
+                 int* const num_mv_found) {
+  const Tile& tile = block.tile;
+  const int num4x4 = std::min({static_cast<int>(block.width4x4),
+                               tile.frame_header().columns4x4 - block.column4x4,
+                               static_cast<int>(block.height4x4),
+                               tile.frame_header().rows4x4 - block.row4x4, 16});
+  int ref_id_count[2] = {};
+  MotionVector ref_id[2][2] = {};
+  int ref_diff_count[2] = {};
+  MotionVector ref_diff[2][2] = {};
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  for (int pass = 0; pass < 2 && *num_mv_found < 2; ++pass) {
+    for (int i = 0; i < num4x4;) {
+      const int mv_row = block.row4x4 + ((pass == 0) ? -1 : i);
+      const int mv_column = block.column4x4 + ((pass == 0) ? i : -1);
+      if (!tile.IsTopLeftInside(mv_row + 1, mv_column + 1)) break;
+      if (is_compound) {
+        AddExtraCompoundMvCandidate(block, mv_row, mv_column, ref_id_count,
+                                    ref_id, ref_diff_count, ref_diff);
+      } else {
+        AddExtraSingleMvCandidate(block, mv_row, mv_column, num_mv_found);
+        if (*num_mv_found >= 2) break;
+      }
+      const auto& bp = tile.Parameters(mv_row, mv_column);
+      i +=
+          (pass == 0) ? kNum4x4BlocksWide[bp.size] : kNum4x4BlocksHigh[bp.size];
+    }
+  }
+  if (is_compound) {
+    // Merge compound mode extra search into mv stack.
+    CompoundMotionVector* const compound_ref_mv_stack =
+        prediction_parameters.compound_ref_mv_stack;
+    CompoundMotionVector combined_mvs[2] = {};
+    for (int i = 0; i < 2; ++i) {
+      int count = 0;
+      assert(ref_id_count[i] <= 2);
+      for (int j = 0; j < ref_id_count[i]; ++j, ++count) {
+        combined_mvs[count].mv[i] = ref_id[i][j];
+      }
+      for (int j = 0; j < ref_diff_count[i] && count < 2; ++j, ++count) {
+        combined_mvs[count].mv[i] = ref_diff[i][j];
+      }
+      for (; count < 2; ++count) {
+        combined_mvs[count].mv[i] = prediction_parameters.global_mv[i];
+      }
+    }
+    if (*num_mv_found == 1) {
+      if (combined_mvs[0] == compound_ref_mv_stack[0]) {
+        compound_ref_mv_stack[1] = combined_mvs[1];
+      } else {
+        compound_ref_mv_stack[1] = combined_mvs[0];
+      }
+      prediction_parameters.SetWeightIndexStackEntry(1, 0);
+    } else {
+      assert(*num_mv_found == 0);
+      for (int i = 0; i < 2; ++i) {
+        compound_ref_mv_stack[i] = combined_mvs[i];
+        prediction_parameters.SetWeightIndexStackEntry(i, 0);
+      }
+    }
+    *num_mv_found = 2;
+  } else {
+    // single prediction mode
+    MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+    for (int i = *num_mv_found; i < 2; ++i) {
+      ref_mv_stack[i] = prediction_parameters.global_mv[0];
+      prediction_parameters.SetWeightIndexStackEntry(i, 0);
+    }
+  }
+}
+
+void DescendingOrderTwo(int* const a, int* const b) {
+  if (*a < *b) {
+    std::swap(*a, *b);
+  }
+}
+
+// Comparator used for sorting candidate motion vectors in descending order of
+// their weights (as specified in 7.10.2.11).
+bool CompareCandidateMotionVectors(const int16_t& lhs, const int16_t& rhs) {
+  return lhs > rhs;
+}
+
+void SortWeightIndexStack(const int size, const int sort_to_n,
+                          int16_t* const weight_index_stack) {
+  if (size <= 1) return;
+  if (size <= 3) {
+    // Specialize small sort sizes to speed up.
+    int weight_index_0 = weight_index_stack[0];
+    int weight_index_1 = weight_index_stack[1];
+    DescendingOrderTwo(&weight_index_0, &weight_index_1);
+    if (size == 3) {
+      int weight_index_2 = weight_index_stack[2];
+      DescendingOrderTwo(&weight_index_1, &weight_index_2);
+      DescendingOrderTwo(&weight_index_0, &weight_index_1);
+      weight_index_stack[2] = weight_index_2;
+    }
+    weight_index_stack[0] = weight_index_0;
+    weight_index_stack[1] = weight_index_1;
+    return;
+  }
+  if (sort_to_n == 1) {
+    // std::max_element() is not efficient. Find the max element in a loop.
+    int16_t max_element = weight_index_stack[0];
+    int i = 1;
+    do {
+      max_element = std::max(max_element, weight_index_stack[i]);
+    } while (++i < size);
+    weight_index_stack[0] = max_element;
+    return;
+  }
+  std::partial_sort(&weight_index_stack[0], &weight_index_stack[sort_to_n],
+                    &weight_index_stack[size], CompareCandidateMotionVectors);
+}
+
+// 7.10.2.14 (part 2).
+void ComputeContexts(bool found_new_mv, int nearest_matches, int total_matches,
+                     int* new_mv_context, int* reference_mv_context) {
+  switch (nearest_matches) {
+    case 0:
+      *new_mv_context = std::min(total_matches, 1);
+      *reference_mv_context = total_matches;
+      break;
+    case 1:
+      *new_mv_context = 3 - static_cast<int>(found_new_mv);
+      *reference_mv_context = 2 + total_matches;
+      break;
+    default:
+      *new_mv_context = 5 - static_cast<int>(found_new_mv);
+      *reference_mv_context = 5;
+      break;
+  }
+}
+
+// 7.10.4.2.
+void AddSample(const Tile::Block& block, int delta_row, int delta_column, + int* const num_warp_samples, int* const num_samples_scanned, + int candidates[kMaxLeastSquaresSamples][4]) { + if (*num_samples_scanned >= kMaxLeastSquaresSamples) return; + const int mv_row = block.row4x4 + delta_row; + const int mv_column = block.column4x4 + delta_column; + const Tile& tile = block.tile; + if (!tile.IsInside(mv_row, mv_column) || + !tile.HasParameters(mv_row, mv_column)) { + return; + } + const BlockParameters& bp = *block.bp; + const BlockParameters& mv_bp = tile.Parameters(mv_row, mv_column); + if (mv_bp.reference_frame[0] != bp.reference_frame[0] || + mv_bp.reference_frame[1] != kReferenceFrameNone) { + return; + } + ++*num_samples_scanned; + const int candidate_height4x4 = kNum4x4BlocksHigh[mv_bp.size]; + const int candidate_row = mv_row & ~(candidate_height4x4 - 1); + const int candidate_width4x4 = kNum4x4BlocksWide[mv_bp.size]; + const int candidate_column = mv_column & ~(candidate_width4x4 - 1); + const BlockParameters& candidate_bp = + tile.Parameters(candidate_row, candidate_column); + const int mv_diff_row = + std::abs(candidate_bp.mv.mv[0].mv[0] - bp.mv.mv[0].mv[0]); + const int mv_diff_column = + std::abs(candidate_bp.mv.mv[0].mv[1] - bp.mv.mv[0].mv[1]); + const bool is_valid = + mv_diff_row + mv_diff_column <= kWarpValidThreshold[block.size]; + if (!is_valid && *num_samples_scanned > 1) { + return; + } + const int mid_y = + MultiplyBy4(candidate_row) + MultiplyBy2(candidate_height4x4) - 1; + const int mid_x = + MultiplyBy4(candidate_column) + MultiplyBy2(candidate_width4x4) - 1; + candidates[*num_warp_samples][0] = MultiplyBy8(mid_y); + candidates[*num_warp_samples][1] = MultiplyBy8(mid_x); + candidates[*num_warp_samples][2] = + MultiplyBy8(mid_y) + candidate_bp.mv.mv[0].mv[0]; + candidates[*num_warp_samples][3] = + MultiplyBy8(mid_x) + candidate_bp.mv.mv[0].mv[1]; + if (is_valid) ++*num_warp_samples; +} + +// 7.9.2. +// In the spec, |dst_sign| is either 1 or -1. Here we set |dst_sign| to either 0 +// or -1 so that it can be XORed and subtracted directly in ApplySign() and +// corresponding SIMD implementations. 
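Concretely, encoding the sign as 0 or -1 allows negation without a branch:
(value ^ sign) - sign is value when sign == 0 and -value when sign == -1 under
two's complement. A minimal sketch of the idea behind the ApplySign() mentioned
above (not the library's exact definition):

    #include <cassert>

    inline int ApplySignSketch(int value, int sign) {
      return (value ^ sign) - sign;  // identity for sign == 0, negation for -1
    }

    int main() {
      assert(ApplySignSketch(7, 0) == 7);
      assert(ApplySignSketch(7, -1) == -7);
      return 0;
    }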
+bool MotionFieldProjection(
+    const ObuFrameHeader& frame_header,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    ReferenceFrameType source, int reference_to_current_with_sign, int dst_sign,
+    int y8_start, int y8_end, int x8_start, int x8_end,
+    TemporalMotionField* const motion_field) {
+  const int source_index =
+      frame_header.reference_frame_index[source - kReferenceFrameLast];
+  auto* const source_frame = reference_frames[source_index].get();
+  assert(source_frame != nullptr);
+  assert(dst_sign == 0 || dst_sign == -1);
+  if (source_frame->rows4x4() != frame_header.rows4x4 ||
+      source_frame->columns4x4() != frame_header.columns4x4 ||
+      IsIntraFrame(source_frame->frame_type())) {
+    return false;
+  }
+  assert(reference_to_current_with_sign >= -kMaxFrameDistance);
+  if (reference_to_current_with_sign > kMaxFrameDistance) return true;
+  const ReferenceInfo& reference_info = *source_frame->reference_info();
+  const dsp::Dsp& dsp = *dsp::GetDspTable(8);
+  dsp.motion_field_projection_kernel(
+      reference_info, reference_to_current_with_sign, dst_sign, y8_start,
+      y8_end, x8_start, x8_end, motion_field);
+  return true;
+}
+
+}  // namespace
+
+void FindMvStack(const Tile::Block& block, bool is_compound,
+                 MvContexts* const contexts) {
+  PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  SetupGlobalMv(block, 0, &prediction_parameters.global_mv[0]);
+  if (is_compound) SetupGlobalMv(block, 1, &prediction_parameters.global_mv[1]);
+  bool found_new_mv = false;
+  bool found_row_match = false;
+  int num_mv_found = 0;
+  ScanRow(block, block.column4x4, -1, is_compound, &found_new_mv,
+          &found_row_match, &num_mv_found);
+  bool found_column_match = false;
+  ScanColumn(block, block.row4x4, -1, is_compound, &found_new_mv,
+             &found_column_match, &num_mv_found);
+  if (std::max(block.width4x4, block.height4x4) <= 16) {
+    ScanPoint(block, -1, block.width4x4, is_compound, &found_new_mv,
+              &found_row_match, &num_mv_found);
+  }
+  const int nearest_matches =
+      static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+  prediction_parameters.nearest_mv_count = num_mv_found;
+  if (block.tile.frame_header().use_ref_frame_mvs) {
+    // Initialize to an invalid value; it is set when the temporal mv is zero.
+    contexts->zero_mv = -1;
+    TemporalScan(block, is_compound, &contexts->zero_mv, &num_mv_found);
+  } else {
+    contexts->zero_mv = 0;
+  }
+  bool dummy_bool = false;
+  ScanPoint(block, -1, -1, is_compound, &dummy_bool, &found_row_match,
+            &num_mv_found);
+  static constexpr int deltas[2] = {-3, -5};
+  for (int i = 0; i < 2; ++i) {
+    if (i == 0 || block.height4x4 > 1) {
+      ScanRow(block, block.column4x4 | 1, deltas[i] + (block.row4x4 & 1),
+              is_compound, &dummy_bool, &found_row_match, &num_mv_found);
+    }
+    if (i == 0 || block.width4x4 > 1) {
+      ScanColumn(block, block.row4x4 | 1, deltas[i] + (block.column4x4 & 1),
+                 is_compound, &dummy_bool, &found_column_match, &num_mv_found);
+    }
+  }
+  if (num_mv_found < 2) {
+    ExtraSearch(block, is_compound, &num_mv_found);
+  } else {
+    // The sort of |weight_index_stack| could be moved to Tile::AssignIntraMv()
+    // and Tile::AssignInterMv() and reduced to a partial sort up to the max
+    // index we need; however, the speed gain is trivial.
+    // In the intra case, only the first 1 or 2 mvs in the stack will be used.
+    // In the inter case, |prediction_parameters.ref_mv_index| is at most 3, so
+    // we only need the partial sort up to the first 4 mvs.
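In effect, the two sorts below perform a bounded top-4 selection over the
weight indices. A behavioural equivalent for a single range, assuming nothing
beyond the standard library (SortWeightIndexStack() above specializes sizes 2
and 3 for speed; this sketch shows only the generic shape):

    #include <algorithm>
    #include <cstdint>
    #include <functional>

    // Place the largest min(4, size) weights first, in descending order; the
    // order of the remaining tail is unspecified, which is all later stages
    // need.
    void SortTop4Descending(int16_t* stack, int size) {
      const int n = std::min(4, size);
      std::partial_sort(stack, stack + n, stack + size,
                        std::greater<int16_t>());
    }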
+    SortWeightIndexStack(prediction_parameters.nearest_mv_count, 4,
+                         prediction_parameters.weight_index_stack);
+    // When there are 4 or more nearest mvs, the other mvs will not be used.
+    if (prediction_parameters.nearest_mv_count < 4) {
+      SortWeightIndexStack(
+          num_mv_found - prediction_parameters.nearest_mv_count,
+          4 - prediction_parameters.nearest_mv_count,
+          prediction_parameters.weight_index_stack +
+              prediction_parameters.nearest_mv_count);
+    }
+  }
+  prediction_parameters.ref_mv_count = num_mv_found;
+  const int total_matches =
+      static_cast<int>(found_row_match) + static_cast<int>(found_column_match);
+  ComputeContexts(found_new_mv, nearest_matches, total_matches,
+                  &contexts->new_mv, &contexts->reference_mv);
+  // The mv stack clamping process is in Tile::AssignIntraMv() and
+  // Tile::AssignInterMv(), and only up to two mvs are clamped.
+}
+
+void FindWarpSamples(const Tile::Block& block, int* const num_warp_samples,
+                     int* const num_samples_scanned,
+                     int candidates[kMaxLeastSquaresSamples][4]) {
+  const Tile& tile = block.tile;
+  bool top_left = true;
+  bool top_right = true;
+  int step = 1;
+  if (block.top_available[kPlaneY]) {
+    BlockSize source_size =
+        tile.Parameters(block.row4x4 - 1, block.column4x4).size;
+    const int source_width4x4 = kNum4x4BlocksWide[source_size];
+    if (block.width4x4 <= source_width4x4) {
+      // The & here is equivalent to % since source_width4x4 is a power of two.
+      const int column_offset = -(block.column4x4 & (source_width4x4 - 1));
+      if (column_offset < 0) top_left = false;
+      if (column_offset + source_width4x4 > block.width4x4) top_right = false;
+      AddSample(block, -1, 0, num_warp_samples, num_samples_scanned,
+                candidates);
+    } else {
+      for (int i = 0;
+           i < std::min(static_cast<int>(block.width4x4),
+                        tile.frame_header().columns4x4 - block.column4x4);
+           i += step) {
+        source_size =
+            tile.Parameters(block.row4x4 - 1, block.column4x4 + i).size;
+        step = std::min(static_cast<int>(block.width4x4),
+                        static_cast<int>(kNum4x4BlocksWide[source_size]));
+        AddSample(block, -1, i, num_warp_samples, num_samples_scanned,
+                  candidates);
+      }
+    }
+  }
+  if (block.left_available[kPlaneY]) {
+    BlockSize source_size =
+        tile.Parameters(block.row4x4, block.column4x4 - 1).size;
+    const int source_height4x4 = kNum4x4BlocksHigh[source_size];
+    if (block.height4x4 <= source_height4x4) {
+      const int row_offset = -(block.row4x4 & (source_height4x4 - 1));
+      if (row_offset < 0) top_left = false;
+      AddSample(block, 0, -1, num_warp_samples, num_samples_scanned,
+                candidates);
+    } else {
+      for (int i = 0; i < std::min(static_cast<int>(block.height4x4),
+                                   tile.frame_header().rows4x4 - block.row4x4);
+           i += step) {
+        source_size =
+            tile.Parameters(block.row4x4 + i, block.column4x4 - 1).size;
+        step = std::min(static_cast<int>(block.height4x4),
+                        static_cast<int>(kNum4x4BlocksHigh[source_size]));
+        AddSample(block, i, -1, num_warp_samples, num_samples_scanned,
+                  candidates);
+      }
+    }
+  }
+  if (top_left) {
+    AddSample(block, -1, -1, num_warp_samples, num_samples_scanned, candidates);
+  }
+  if (top_right && block.size <= kBlock64x64) {
+    AddSample(block, -1, block.width4x4, num_warp_samples, num_samples_scanned,
+              candidates);
+  }
+  if (*num_warp_samples == 0 && *num_samples_scanned > 0) *num_warp_samples = 1;
+}
+
+void SetupMotionField(
+    const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+    TemporalMotionField* const motion_field) {
+  assert(frame_header.use_ref_frame_mvs);
+
const int y8_start = DivideBy2(row4x4_start); + const int y8_end = DivideBy2(std::min(row4x4_end, frame_header.rows4x4)); + const int x8_start = DivideBy2(column4x4_start); + const int x8_end = + DivideBy2(std::min(column4x4_end, frame_header.columns4x4)); + const int last_index = frame_header.reference_frame_index[0]; + const ReferenceInfo& reference_info = *current_frame.reference_info(); + if (!IsIntraFrame(reference_frames[last_index]->frame_type())) { + const int last_alternate_order_hint = + reference_frames[last_index] + ->reference_info() + ->order_hint[kReferenceFrameAlternate]; + const int current_gold_order_hint = + reference_info.order_hint[kReferenceFrameGolden]; + if (last_alternate_order_hint != current_gold_order_hint) { + const int reference_offset_last = + -reference_info.relative_distance_from[kReferenceFrameLast]; + if (std::abs(reference_offset_last) <= kMaxFrameDistance) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast, reference_offset_last, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); + } + } + } + int ref_stamp = 1; + const int reference_offset_backward = + reference_info.relative_distance_from[kReferenceFrameBackward]; + if (reference_offset_backward > 0 && + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameBackward, reference_offset_backward, + 0, y8_start, y8_end, x8_start, x8_end, + motion_field)) { + --ref_stamp; + } + const int reference_offset_alternate2 = + reference_info.relative_distance_from[kReferenceFrameAlternate2]; + if (reference_offset_alternate2 > 0 && + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate2, + reference_offset_alternate2, 0, y8_start, y8_end, + x8_start, x8_end, motion_field)) { + --ref_stamp; + } + if (ref_stamp >= 0) { + const int reference_offset_alternate = + reference_info.relative_distance_from[kReferenceFrameAlternate]; + if (reference_offset_alternate > 0 && + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameAlternate, + reference_offset_alternate, 0, y8_start, y8_end, + x8_start, x8_end, motion_field)) { + --ref_stamp; + } + } + if (ref_stamp >= 0) { + const int reference_offset_last2 = + -reference_info.relative_distance_from[kReferenceFrameLast2]; + if (std::abs(reference_offset_last2) <= kMaxFrameDistance) { + MotionFieldProjection(frame_header, reference_frames, + kReferenceFrameLast2, reference_offset_last2, -1, + y8_start, y8_end, x8_start, x8_end, motion_field); + } + } +} + +} // namespace libgav1 diff --git a/src/motion_vector.h b/src/motion_vector.h new file mode 100644 index 0000000..d739e80 --- /dev/null +++ b/src/motion_vector.h @@ -0,0 +1,59 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LIBGAV1_SRC_MOTION_VECTOR_H_
+#define LIBGAV1_SRC_MOTION_VECTOR_H_
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/obu_parser.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr bool IsGlobalMvBlock(bool is_global_mv_block,
+                               GlobalMotionTransformationType type) {
+  return is_global_mv_block &&
+         type > kGlobalMotionTransformationTypeTranslation;
+}
+
+// The |contexts| output parameter may be null. If the caller does not need
+// the |contexts| output, pass nullptr as the argument.
+void FindMvStack(const Tile::Block& block, bool is_compound,
+                 MvContexts* contexts);  // 7.10.2
+
+void FindWarpSamples(const Tile::Block& block, int* num_warp_samples,
+                     int* num_samples_scanned,
+                     int candidates[kMaxLeastSquaresSamples][4]);  // 7.10.4.
+
+// Section 7.9.1 in the spec. But this is done per tile instead of for the
+// whole frame.
+void SetupMotionField(
+    const ObuFrameHeader& frame_header, const RefCountedBuffer& current_frame,
+    const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+        reference_frames,
+    int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end,
+    TemporalMotionField* motion_field);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_MOTION_VECTOR_H_
diff --git a/src/obu_parser.cc b/src/obu_parser.cc
new file mode 100644
index 0000000..bbf00ed
--- /dev/null
+++ b/src/obu_parser.cc
@@ -0,0 +1,2885 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/obu_parser.h"
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/motion_vector.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace {
+
+// 5.9.16.
+// Find the smallest value of k such that block_size << k is greater than or
+// equal to target.
+//
+// NOTE: TileLog2(block_size, target) is equal to
+//   CeilLog2(ceil((double)target / block_size))
+// where the division is a floating-point number division. (This equality holds
+// even when |target| is equal to 0.) In the special case of block_size == 1,
+// TileLog2(1, target) is equal to CeilLog2(target).
+int TileLog2(int block_size, int target) {
+  int k = 0;
+  for (; (block_size << k) < target; ++k) {
+  }
+  return k;
+}
+
+void ParseBitStreamLevel(BitStreamLevel* const level, uint8_t level_bits) {
+  level->major = kMinimumMajorBitstreamLevel + (level_bits >> 2);
+  level->minor = level_bits & 3;
+}
+
+// This function assumes loop_filter is zero-initialized, so it only needs to
+// set the nonzero default values.
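(A brief aside on TileLog2() above before the next function: a few concrete
values, written as hypothetical asserts that would hold in this translation
unit.)

    #include <cassert>

    int main() {
      assert(TileLog2(64, 640) == 4);  // 64 << 3 = 512 < 640 <= 1024 = 64 << 4
      assert(TileLog2(1, 5) == 3);     // equals CeilLog2(5)
      assert(TileLog2(32, 0) == 0);    // target 0: the loop body never runs
      return 0;
    }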
+void SetDefaultRefDeltas(LoopFilter* const loop_filter) { + loop_filter->ref_deltas[kReferenceFrameIntra] = 1; + loop_filter->ref_deltas[kReferenceFrameGolden] = -1; + loop_filter->ref_deltas[kReferenceFrameAlternate] = -1; + loop_filter->ref_deltas[kReferenceFrameAlternate2] = -1; +} + +bool InTemporalLayer(int operating_point_idc, int temporal_id) { + return ((operating_point_idc >> temporal_id) & 1) != 0; +} + +bool InSpatialLayer(int operating_point_idc, int spatial_id) { + return ((operating_point_idc >> (spatial_id + 8)) & 1) != 0; +} + +// Returns the index of the last nonzero byte in the |data| buffer of |size| +// bytes. If there is no nonzero byte in the |data| buffer, returns -1. +int GetLastNonzeroByteIndex(const uint8_t* data, size_t size) { + // Scan backward for a nonzero byte. + if (size > INT_MAX) return -1; + int i = static_cast(size) - 1; + while (i >= 0 && data[i] == 0) { + --i; + } + return i; +} + +// A cleanup helper class that releases the frame buffer reference held in +// |frame| in the destructor. +class RefCountedBufferPtrCleanup { + public: + explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame) + : frame_(*frame) {} + + // Not copyable or movable. + RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete; + RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) = + delete; + + ~RefCountedBufferPtrCleanup() { frame_ = nullptr; } + + private: + RefCountedBufferPtr& frame_; +}; + +} // namespace + +bool ObuSequenceHeader::ParametersChanged(const ObuSequenceHeader& old) const { + // Note that the operating_parameters field is not compared per Section 7.5: + // Within a particular coded video sequence, the contents of + // sequence_header_obu must be bit-identical each time the sequence header + // appears except for the contents of operating_parameters_info. + return memcmp(this, &old, + offsetof(ObuSequenceHeader, operating_parameters)) != 0; +} + +// Macros to avoid repeated error checks in the parser code. +#define OBU_LOG_AND_RETURN_FALSE \ + do { \ + LIBGAV1_DLOG(ERROR, "%s:%d (%s): Not enough bits.", __FILE__, __LINE__, \ + __func__); \ + return false; \ + } while (false) +#define OBU_PARSER_FAIL \ + do { \ + if (scratch == -1) { \ + OBU_LOG_AND_RETURN_FALSE; \ + } \ + } while (false) +#define OBU_READ_BIT_OR_FAIL \ + scratch = bit_reader_->ReadBit(); \ + OBU_PARSER_FAIL +#define OBU_READ_LITERAL_OR_FAIL(n) \ + scratch = bit_reader_->ReadLiteral(n); \ + OBU_PARSER_FAIL +#define OBU_READ_UVLC_OR_FAIL(x) \ + do { \ + if (!bit_reader_->ReadUvlc(&(x))) { \ + OBU_LOG_AND_RETURN_FALSE; \ + } \ + } while (false) + +bool ObuParser::ParseColorConfig(ObuSequenceHeader* sequence_header) { + int64_t scratch; + ColorConfig* const color_config = &sequence_header->color_config; + OBU_READ_BIT_OR_FAIL; + const auto high_bitdepth = static_cast(scratch); + if (sequence_header->profile == kProfile2 && high_bitdepth) { + OBU_READ_BIT_OR_FAIL; + const auto is_twelve_bit = static_cast(scratch); + color_config->bitdepth = is_twelve_bit ? 12 : 10; + } else { + color_config->bitdepth = high_bitdepth ? 
10 : 8; + } + if (sequence_header->profile == kProfile1) { + color_config->is_monochrome = false; + } else { + OBU_READ_BIT_OR_FAIL; + color_config->is_monochrome = static_cast(scratch); + } + OBU_READ_BIT_OR_FAIL; + const auto color_description_present_flag = static_cast(scratch); + if (color_description_present_flag) { + OBU_READ_LITERAL_OR_FAIL(8); + color_config->color_primary = static_cast(scratch); + OBU_READ_LITERAL_OR_FAIL(8); + color_config->transfer_characteristics = + static_cast(scratch); + OBU_READ_LITERAL_OR_FAIL(8); + color_config->matrix_coefficients = + static_cast(scratch); + } else { + color_config->color_primary = kColorPrimaryUnspecified; + color_config->transfer_characteristics = + kTransferCharacteristicsUnspecified; + color_config->matrix_coefficients = kMatrixCoefficientsUnspecified; + } + if (color_config->is_monochrome) { + OBU_READ_BIT_OR_FAIL; + color_config->color_range = static_cast(scratch); + // Set subsampling_x and subsampling_y to 1 for monochrome. This makes it + // easy to allow monochrome to be supported in profile 0. Profile 0 + // requires subsampling_x and subsampling_y to be 1. + color_config->subsampling_x = 1; + color_config->subsampling_y = 1; + color_config->chroma_sample_position = kChromaSamplePositionUnknown; + } else { + if (color_config->color_primary == kColorPrimaryBt709 && + color_config->transfer_characteristics == + kTransferCharacteristicsSrgb && + color_config->matrix_coefficients == kMatrixCoefficientsIdentity) { + color_config->color_range = kColorRangeFull; + color_config->subsampling_x = 0; + color_config->subsampling_y = 0; + // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12. + // See the table at the beginning of Section 6.4.1. + if (sequence_header->profile != kProfile1 && + (sequence_header->profile != kProfile2 || + color_config->bitdepth != 12)) { + LIBGAV1_DLOG(ERROR, + "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.", + sequence_header->profile, color_config->bitdepth); + return false; + } + } else { + OBU_READ_BIT_OR_FAIL; + color_config->color_range = static_cast(scratch); + if (sequence_header->profile == kProfile0) { + color_config->subsampling_x = 1; + color_config->subsampling_y = 1; + } else if (sequence_header->profile == kProfile1) { + color_config->subsampling_x = 0; + color_config->subsampling_y = 0; + } else { + if (color_config->bitdepth == 12) { + OBU_READ_BIT_OR_FAIL; + color_config->subsampling_x = scratch; + if (color_config->subsampling_x == 1) { + OBU_READ_BIT_OR_FAIL; + color_config->subsampling_y = scratch; + } else { + color_config->subsampling_y = 0; + } + } else { + color_config->subsampling_x = 1; + color_config->subsampling_y = 0; + } + } + if (color_config->subsampling_x == 1 && + color_config->subsampling_y == 1) { + OBU_READ_LITERAL_OR_FAIL(2); + color_config->chroma_sample_position = + static_cast(scratch); + } + } + OBU_READ_BIT_OR_FAIL; + color_config->separate_uv_delta_q = static_cast(scratch); + } + if (color_config->matrix_coefficients == kMatrixCoefficientsIdentity && + (color_config->subsampling_x != 0 || color_config->subsampling_y != 0)) { + LIBGAV1_DLOG(ERROR, + "matrix_coefficients is MC_IDENTITY, but subsampling_x (%d) " + "and subsampling_y (%d) are not both 0.", + color_config->subsampling_x, color_config->subsampling_y); + return false; + } + return true; +} + +bool ObuParser::ParseTimingInfo(ObuSequenceHeader* sequence_header) { + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + sequence_header->timing_info_present_flag = static_cast(scratch); + if 
(!sequence_header->timing_info_present_flag) return true; + TimingInfo* const info = &sequence_header->timing_info; + OBU_READ_LITERAL_OR_FAIL(32); + info->num_units_in_tick = static_cast(scratch); + if (info->num_units_in_tick == 0) { + LIBGAV1_DLOG(ERROR, "num_units_in_tick is 0."); + return false; + } + OBU_READ_LITERAL_OR_FAIL(32); + info->time_scale = static_cast(scratch); + if (info->time_scale == 0) { + LIBGAV1_DLOG(ERROR, "time_scale is 0."); + return false; + } + OBU_READ_BIT_OR_FAIL; + info->equal_picture_interval = static_cast(scratch); + if (info->equal_picture_interval) { + OBU_READ_UVLC_OR_FAIL(info->num_ticks_per_picture); + ++info->num_ticks_per_picture; + } + return true; +} + +bool ObuParser::ParseDecoderModelInfo(ObuSequenceHeader* sequence_header) { + if (!sequence_header->timing_info_present_flag) return true; + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + sequence_header->decoder_model_info_present_flag = static_cast(scratch); + if (!sequence_header->decoder_model_info_present_flag) return true; + DecoderModelInfo* const info = &sequence_header->decoder_model_info; + OBU_READ_LITERAL_OR_FAIL(5); + info->encoder_decoder_buffer_delay_length = 1 + scratch; + OBU_READ_LITERAL_OR_FAIL(32); + info->num_units_in_decoding_tick = static_cast(scratch); + OBU_READ_LITERAL_OR_FAIL(5); + info->buffer_removal_time_length = 1 + scratch; + OBU_READ_LITERAL_OR_FAIL(5); + info->frame_presentation_time_length = 1 + scratch; + return true; +} + +bool ObuParser::ParseOperatingParameters(ObuSequenceHeader* sequence_header, + int index) { + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + sequence_header->decoder_model_present_for_operating_point[index] = + static_cast(scratch); + if (!sequence_header->decoder_model_present_for_operating_point[index]) { + return true; + } + OperatingParameters* const params = &sequence_header->operating_parameters; + OBU_READ_LITERAL_OR_FAIL( + sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length); + params->decoder_buffer_delay[index] = static_cast(scratch); + OBU_READ_LITERAL_OR_FAIL( + sequence_header->decoder_model_info.encoder_decoder_buffer_delay_length); + params->encoder_buffer_delay[index] = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + params->low_delay_mode_flag[index] = static_cast(scratch); + return true; +} + +bool ObuParser::ParseSequenceHeader(bool seen_frame_header) { + ObuSequenceHeader sequence_header = {}; + int64_t scratch; + OBU_READ_LITERAL_OR_FAIL(3); + if (scratch >= kMaxProfiles) { + LIBGAV1_DLOG(ERROR, "Invalid profile: %d.", static_cast(scratch)); + return false; + } + sequence_header.profile = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.still_picture = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.reduced_still_picture_header = static_cast(scratch); + if (sequence_header.reduced_still_picture_header) { + if (!sequence_header.still_picture) { + LIBGAV1_DLOG( + ERROR, "reduced_still_picture_header is 1, but still_picture is 0."); + return false; + } + sequence_header.operating_points = 1; + sequence_header.operating_point_idc[0] = 0; + OBU_READ_LITERAL_OR_FAIL(5); + ParseBitStreamLevel(&sequence_header.level[0], scratch); + } else { + if (!ParseTimingInfo(&sequence_header) || + !ParseDecoderModelInfo(&sequence_header)) { + return false; + } + OBU_READ_BIT_OR_FAIL; + const auto initial_display_delay_present_flag = static_cast(scratch); + OBU_READ_LITERAL_OR_FAIL(5); + sequence_header.operating_points = static_cast(1 + scratch); + if (operating_point_ >= 
sequence_header.operating_points) { + LIBGAV1_DLOG( + ERROR, + "Invalid operating point: %d (valid range is [0,%d] inclusive).", + operating_point_, sequence_header.operating_points - 1); + return false; + } + for (int i = 0; i < sequence_header.operating_points; ++i) { + OBU_READ_LITERAL_OR_FAIL(12); + sequence_header.operating_point_idc[i] = static_cast(scratch); + for (int j = 0; j < i; ++j) { + if (sequence_header.operating_point_idc[i] == + sequence_header.operating_point_idc[j]) { + LIBGAV1_DLOG(ERROR, + "operating_point_idc[%d] (%d) is equal to " + "operating_point_idc[%d] (%d).", + i, sequence_header.operating_point_idc[i], j, + sequence_header.operating_point_idc[j]); + return false; + } + } + OBU_READ_LITERAL_OR_FAIL(5); + ParseBitStreamLevel(&sequence_header.level[i], scratch); + if (sequence_header.level[i].major > 3) { + OBU_READ_BIT_OR_FAIL; + sequence_header.tier[i] = scratch; + } + if (sequence_header.decoder_model_info_present_flag && + !ParseOperatingParameters(&sequence_header, i)) { + return false; + } + if (initial_display_delay_present_flag) { + OBU_READ_BIT_OR_FAIL; + if (static_cast(scratch)) { + OBU_READ_LITERAL_OR_FAIL(4); + sequence_header.initial_display_delay[i] = 1 + scratch; + } + } + } + } + OBU_READ_LITERAL_OR_FAIL(4); + sequence_header.frame_width_bits = 1 + scratch; + OBU_READ_LITERAL_OR_FAIL(4); + sequence_header.frame_height_bits = 1 + scratch; + OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_width_bits); + sequence_header.max_frame_width = static_cast(1 + scratch); + OBU_READ_LITERAL_OR_FAIL(sequence_header.frame_height_bits); + sequence_header.max_frame_height = static_cast(1 + scratch); + if (!sequence_header.reduced_still_picture_header) { + OBU_READ_BIT_OR_FAIL; + sequence_header.frame_id_numbers_present = static_cast(scratch); + } + if (sequence_header.frame_id_numbers_present) { + OBU_READ_LITERAL_OR_FAIL(4); + sequence_header.delta_frame_id_length_bits = 2 + scratch; + OBU_READ_LITERAL_OR_FAIL(3); + sequence_header.frame_id_length_bits = + sequence_header.delta_frame_id_length_bits + 1 + scratch; + // Section 6.8.2: It is a requirement of bitstream conformance that the + // number of bits needed to read display_frame_id does not exceed 16. This + // is equivalent to the constraint that idLen <= 16. 
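To see why the explicit check below is needed, write L(n) for an n-bit literal
in [0, 2^n - 1] and trace the two reads above:

    delta_frame_id_length_bits = 2 + L(4)                  -> at most 17
    frame_id_length_bits = delta_frame_id_length_bits
                           + 1 + L(3)                      -> at most 17 + 1 + 7 = 25

A syntactically valid parse can therefore produce an idLen of up to 25, so
values above 16 must be rejected explicitly.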
+ if (sequence_header.frame_id_length_bits > 16) { + LIBGAV1_DLOG(ERROR, "Invalid frame_id_length_bits: %d.", + sequence_header.frame_id_length_bits); + return false; + } + } + OBU_READ_BIT_OR_FAIL; + sequence_header.use_128x128_superblock = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_filter_intra = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_intra_edge_filter = static_cast(scratch); + if (sequence_header.reduced_still_picture_header) { + sequence_header.force_screen_content_tools = kSelectScreenContentTools; + sequence_header.force_integer_mv = kSelectIntegerMv; + } else { + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_interintra_compound = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_masked_compound = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_warped_motion = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_dual_filter = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_order_hint = static_cast(scratch); + if (sequence_header.enable_order_hint) { + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_jnt_comp = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_ref_frame_mvs = static_cast(scratch); + } + OBU_READ_BIT_OR_FAIL; + sequence_header.choose_screen_content_tools = static_cast(scratch); + if (sequence_header.choose_screen_content_tools) { + sequence_header.force_screen_content_tools = kSelectScreenContentTools; + } else { + OBU_READ_BIT_OR_FAIL; + sequence_header.force_screen_content_tools = scratch; + } + if (sequence_header.force_screen_content_tools > 0) { + OBU_READ_BIT_OR_FAIL; + sequence_header.choose_integer_mv = static_cast(scratch); + if (sequence_header.choose_integer_mv) { + sequence_header.force_integer_mv = kSelectIntegerMv; + } else { + OBU_READ_BIT_OR_FAIL; + sequence_header.force_integer_mv = scratch; + } + } else { + sequence_header.force_integer_mv = kSelectIntegerMv; + } + if (sequence_header.enable_order_hint) { + OBU_READ_LITERAL_OR_FAIL(3); + sequence_header.order_hint_bits = 1 + scratch; + sequence_header.order_hint_shift_bits = + Mod32(32 - sequence_header.order_hint_bits); + } + } + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_superres = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_cdef = static_cast(scratch); + OBU_READ_BIT_OR_FAIL; + sequence_header.enable_restoration = static_cast(scratch); + if (!ParseColorConfig(&sequence_header)) return false; + OBU_READ_BIT_OR_FAIL; + sequence_header.film_grain_params_present = static_cast(scratch); + // Compare new sequence header with old sequence header. + if (has_sequence_header_ && + sequence_header.ParametersChanged(sequence_header_)) { + // Between the frame header OBU and the last tile group OBU of the frame, + // do not allow the sequence header to change. + if (seen_frame_header) { + LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame."); + return false; + } + decoder_state_.ClearReferenceFrames(); + } + sequence_header_ = sequence_header; + has_sequence_header_ = true; + // Section 6.4.1: It is a requirement of bitstream conformance that if + // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for + // all OBUs that follow this sequence header until the next sequence header. 
+ extension_disallowed_ = + (sequence_header_.operating_point_idc[operating_point_] == 0); + return true; +} + +// Marks reference frames as invalid for referencing when they are too far in +// the past to be referenced by the frame id mechanism. +void ObuParser::MarkInvalidReferenceFrames() { + // The current lower bound of the frame ids for reference frames. + int lower_bound = decoder_state_.current_frame_id - + (1 << sequence_header_.delta_frame_id_length_bits); + // True if lower_bound is smaller than current_frame_id. False if lower_bound + // wraps around (in modular arithmetic) to the other side of current_frame_id. + bool lower_bound_is_smaller = true; + if (lower_bound <= 0) { + lower_bound += 1 << sequence_header_.frame_id_length_bits; + lower_bound_is_smaller = false; + } + for (int i = 0; i < kNumReferenceFrameTypes; ++i) { + const uint16_t reference_frame_id = decoder_state_.reference_frame_id[i]; + if (lower_bound_is_smaller) { + if (reference_frame_id > decoder_state_.current_frame_id || + reference_frame_id < lower_bound) { + decoder_state_.reference_valid[i] = false; + } + } else { + if (reference_frame_id > decoder_state_.current_frame_id && + reference_frame_id < lower_bound) { + decoder_state_.reference_valid[i] = false; + } + } + } +} + +bool ObuParser::ParseFrameSizeAndRenderSize() { + int64_t scratch; + // Frame Size. + if (frame_header_.frame_size_override_flag) { + OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_width_bits); + frame_header_.width = static_cast(1 + scratch); + OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_height_bits); + frame_header_.height = static_cast(1 + scratch); + if (frame_header_.width > sequence_header_.max_frame_width || + frame_header_.height > sequence_header_.max_frame_height) { + LIBGAV1_DLOG(ERROR, + "Frame dimensions are larger than the maximum values"); + return false; + } + } else { + frame_header_.width = sequence_header_.max_frame_width; + frame_header_.height = sequence_header_.max_frame_height; + } + if (!ParseSuperResParametersAndComputeImageSize()) return false; + + // Render Size. + OBU_READ_BIT_OR_FAIL; + frame_header_.render_and_frame_size_different = static_cast(scratch); + if (frame_header_.render_and_frame_size_different) { + OBU_READ_LITERAL_OR_FAIL(16); + frame_header_.render_width = static_cast(1 + scratch); + OBU_READ_LITERAL_OR_FAIL(16); + frame_header_.render_height = static_cast(1 + scratch); + } else { + frame_header_.render_width = frame_header_.upscaled_width; + frame_header_.render_height = frame_header_.height; + } + + return true; +} + +bool ObuParser::ParseSuperResParametersAndComputeImageSize() { + int64_t scratch; + // SuperRes. + frame_header_.upscaled_width = frame_header_.width; + frame_header_.use_superres = false; + if (sequence_header_.enable_superres) { + OBU_READ_BIT_OR_FAIL; + frame_header_.use_superres = static_cast(scratch); + } + if (frame_header_.use_superres) { + OBU_READ_LITERAL_OR_FAIL(3); + // 9 is the smallest value for the denominator. + frame_header_.superres_scale_denominator = scratch + 9; + frame_header_.width = + (frame_header_.upscaled_width * kSuperResScaleNumerator + + (frame_header_.superres_scale_denominator / 2)) / + frame_header_.superres_scale_denominator; + } else { + frame_header_.superres_scale_denominator = kSuperResScaleNumerator; + } + assert(frame_header_.width != 0); + assert(frame_header_.height != 0); + // Check if multiplying upscaled_width by height would overflow. 
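The guard that follows is the standard divide-before-multiply overflow test,
isolated here as a hypothetical helper (positive operands assumed):

    #include <cstdint>

    // True exactly when a * b would exceed INT32_MAX, for a, b > 0, computed
    // without performing the (possibly overflowing) multiplication itself.
    bool MultiplyWouldOverflow(int32_t a, int32_t b) {
      return a > INT32_MAX / b;
    }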
+  assert(frame_header_.upscaled_width >= frame_header_.width);
+  if (frame_header_.upscaled_width > INT32_MAX / frame_header_.height) {
+    LIBGAV1_DLOG(ERROR, "Frame dimensions too big: width=%d height=%d.",
+                 frame_header_.width, frame_header_.height);
+    return false;
+  }
+  frame_header_.columns4x4 = ((frame_header_.width + 7) >> 3) << 1;
+  frame_header_.rows4x4 = ((frame_header_.height + 7) >> 3) << 1;
+  return true;
+}
+
+bool ObuParser::ValidateInterFrameSize() const {
+  for (int index : frame_header_.reference_frame_index) {
+    const RefCountedBuffer* reference_frame =
+        decoder_state_.reference_frame[index].get();
+    if (2 * frame_header_.width < reference_frame->upscaled_width() ||
+        2 * frame_header_.height < reference_frame->frame_height() ||
+        frame_header_.width > 16 * reference_frame->upscaled_width() ||
+        frame_header_.height > 16 * reference_frame->frame_height()) {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid inter frame size: width=%d, height=%d. Reference "
+                   "frame: index=%d, upscaled width=%d, height=%d.",
+                   frame_header_.width, frame_header_.height, index,
+                   reference_frame->upscaled_width(),
+                   reference_frame->frame_height());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseReferenceOrderHint() {
+  if (!frame_header_.error_resilient_mode ||
+      !sequence_header_.enable_order_hint) {
+    return true;
+  }
+  int64_t scratch;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+    frame_header_.reference_order_hint[i] = scratch;
+    if (frame_header_.reference_order_hint[i] !=
+        decoder_state_.reference_order_hint[i]) {
+      decoder_state_.reference_valid[i] = false;
+    }
+  }
+  return true;
+}
+
+// static
+int ObuParser::FindLatestBackwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int latest_order_hint = INT_MIN;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint >= current_frame_hint &&
+        hint >= latest_order_hint) {
+      ref = i;
+      latest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindEarliestBackwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int earliest_order_hint = INT_MAX;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint >= current_frame_hint &&
+        hint < earliest_order_hint) {
+      ref = i;
+      earliest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindLatestForwardReference(
+    const int current_frame_hint,
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+    const std::array<bool, kNumReferenceFrameTypes>& used_frame) {
+  int ref = -1;
+  int latest_order_hint = INT_MIN;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (!used_frame[i] && hint < current_frame_hint &&
+        hint >= latest_order_hint) {
+      ref = i;
+      latest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// static
+int ObuParser::FindReferenceWithSmallestOutputOrder(
+    const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints) {
+  int ref = -1;
+  int earliest_order_hint = INT_MAX;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int hint = shifted_order_hints[i];
+    if (hint < earliest_order_hint) {
+      ref = i;
+      earliest_order_hint = hint;
+    }
+  }
+  return ref;
+}
+
+// Computes the elements in the frame_header_.reference_frame_index array
+// based on:
+// * the syntax elements last_frame_idx and gold_frame_idx, and
+// * the values stored within the decoder_state_.reference_order_hint array
+//   (these values represent the least significant bits of the expected output
+//   order of the frames).
+//
+// Frame type: {
+//   libgav1_name               spec_name      int
+//   kReferenceFrameLast,       LAST_FRAME     1
+//   kReferenceFrameLast2,      LAST2_FRAME    2
+//   kReferenceFrameLast3,      LAST3_FRAME    3
+//   kReferenceFrameGolden,     GOLDEN_FRAME   4
+//   kReferenceFrameBackward,   BWDREF_FRAME   5
+//   kReferenceFrameAlternate2, ALTREF2_FRAME  6
+//   kReferenceFrameAlternate,  ALTREF_FRAME   7
+// }
+//
+// A typical case of a group of pictures (frames) in display order:
+// (However, more complex cases are possibly allowed in terms of
+// bitstream conformance.)
+//
+//   |    |    |    |         |          |    |    |
+//   |    |    |    |         |          |    |    |
+//   |    |    |    |         |          |    |    |
+//   |    |    |    |         |          |    |    |
+//
+//   4    3    2    1    current_frame   5    6    7
+//
+bool ObuParser::SetFrameReferences(const int8_t last_frame_idx,
+                                   const int8_t gold_frame_idx) {
+  // Set the ref_frame_idx entries for kReferenceFrameLast and
+  // kReferenceFrameGolden to last_frame_idx and gold_frame_idx. Initialize
+  // the other entries to -1.
+  for (int8_t& reference_frame_index : frame_header_.reference_frame_index) {
+    reference_frame_index = -1;
+  }
+  frame_header_
+      .reference_frame_index[kReferenceFrameLast - kReferenceFrameLast] =
+      last_frame_idx;
+  frame_header_
+      .reference_frame_index[kReferenceFrameGolden - kReferenceFrameLast] =
+      gold_frame_idx;
+
+  // used_frame records which reference frames have been used.
+  std::array<bool, kNumReferenceFrameTypes> used_frame;
+  used_frame.fill(false);
+  used_frame[last_frame_idx] = true;
+  used_frame[gold_frame_idx] = true;
+
+  assert(sequence_header_.order_hint_bits >= 1);
+  const int current_frame_hint = 1 << (sequence_header_.order_hint_bits - 1);
+  // shifted_order_hints contains the expected output order shifted such that
+  // the current frame has hint equal to current_frame_hint.
+  std::array<int, kNumReferenceFrameTypes> shifted_order_hints;
+  for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+    const int relative_distance = GetRelativeDistance(
+        decoder_state_.reference_order_hint[i], frame_header_.order_hint,
+        sequence_header_.order_hint_shift_bits);
+    shifted_order_hints[i] = current_frame_hint + relative_distance;
+  }
+
+  // The expected output orders for kReferenceFrameLast and
+  // kReferenceFrameGolden.
+  const int last_order_hint = shifted_order_hints[last_frame_idx];
+  const int gold_order_hint = shifted_order_hints[gold_frame_idx];
+
+  // Section 7.8: It is a requirement of bitstream conformance that
+  // lastOrderHint and goldOrderHint are strictly less than curFrameHint.
+  if (last_order_hint >= current_frame_hint ||
+      gold_order_hint >= current_frame_hint) {
+    return false;
+  }
+
+  // Find a backward reference to the frame with highest output order. If
+  // found, set the kReferenceFrameAlternate reference to that backward
+  // reference.
+  int ref = FindLatestBackwardReference(current_frame_hint,
+                                        shifted_order_hints, used_frame);
+  if (ref >= 0) {
+    frame_header_
+        .reference_frame_index[kReferenceFrameAlternate - kReferenceFrameLast] =
+        ref;
+    used_frame[ref] = true;
+  }
+
+  // Find a backward reference to the closest frame. If found, set the
+  // kReferenceFrameBackward reference to that backward reference.
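Before the remaining searches, a numeric view of the shifted-hint space they
operate in (illustrative values; order_hint_bits == 3, so current_frame_hint
== 1 << 2 == 4):

    relative distance:  -3  -2  -1   +1  +2  +3
    shifted hint:        1   2   3    5   6   7

Hints below 4 are forward references (earlier in output order); hints of 4 or
more are backward references, which is exactly the hint >= current_frame_hint
test used by the searches above and below.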
+ ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints, + used_frame); + if (ref >= 0) { + frame_header_ + .reference_frame_index[kReferenceFrameBackward - kReferenceFrameLast] = + ref; + used_frame[ref] = true; + } + + // Set the kReferenceFrameAlternate2 reference to the next closest backward + // reference. + ref = FindEarliestBackwardReference(current_frame_hint, shifted_order_hints, + used_frame); + if (ref >= 0) { + frame_header_.reference_frame_index[kReferenceFrameAlternate2 - + kReferenceFrameLast] = ref; + used_frame[ref] = true; + } + + // The remaining references are set to be forward references in + // reverse chronological order. + static constexpr ReferenceFrameType + kRefFrameList[kNumInterReferenceFrameTypes - 2] = { + kReferenceFrameLast2, kReferenceFrameLast3, kReferenceFrameBackward, + kReferenceFrameAlternate2, kReferenceFrameAlternate}; + for (const ReferenceFrameType ref_frame : kRefFrameList) { + if (frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] < + 0) { + ref = FindLatestForwardReference(current_frame_hint, shifted_order_hints, + used_frame); + if (ref >= 0) { + frame_header_.reference_frame_index[ref_frame - kReferenceFrameLast] = + ref; + used_frame[ref] = true; + } + } + } + + // Finally, any remaining references are set to the reference frame with + // smallest output order. + ref = FindReferenceWithSmallestOutputOrder(shifted_order_hints); + assert(ref >= 0); + for (int8_t& reference_frame_index : frame_header_.reference_frame_index) { + if (reference_frame_index < 0) { + reference_frame_index = ref; + } + } + + return true; +} + +bool ObuParser::ParseLoopFilterParameters() { + LoopFilter* const loop_filter = &frame_header_.loop_filter; + if (frame_header_.coded_lossless || frame_header_.allow_intrabc) { + SetDefaultRefDeltas(loop_filter); + return true; + } + // IsIntraFrame implies kPrimaryReferenceNone. + assert(!IsIntraFrame(frame_header_.frame_type) || + frame_header_.primary_reference_frame == kPrimaryReferenceNone); + if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) { + // Part of the setup_past_independence() function in the spec. It is not + // necessary to set loop_filter->delta_enabled to true. See + // https://crbug.com/aomedia/2305. + SetDefaultRefDeltas(loop_filter); + } else { + // Part of the load_previous() function in the spec. 
+ const int prev_frame_index = + frame_header_ + .reference_frame_index[frame_header_.primary_reference_frame]; + const RefCountedBuffer* prev_frame = + decoder_state_.reference_frame[prev_frame_index].get(); + loop_filter->ref_deltas = prev_frame->loop_filter_ref_deltas(); + loop_filter->mode_deltas = prev_frame->loop_filter_mode_deltas(); + } + int64_t scratch; + for (int i = 0; i < 2; ++i) { + OBU_READ_LITERAL_OR_FAIL(6); + loop_filter->level[i] = scratch; + } + if (!sequence_header_.color_config.is_monochrome && + (loop_filter->level[0] != 0 || loop_filter->level[1] != 0)) { + for (int i = 2; i < 4; ++i) { + OBU_READ_LITERAL_OR_FAIL(6); + loop_filter->level[i] = scratch; + } + } + OBU_READ_LITERAL_OR_FAIL(3); + loop_filter->sharpness = scratch; + OBU_READ_BIT_OR_FAIL; + loop_filter->delta_enabled = static_cast(scratch); + if (loop_filter->delta_enabled) { + OBU_READ_BIT_OR_FAIL; + loop_filter->delta_update = static_cast(scratch); + if (loop_filter->delta_update) { + for (auto& ref_delta : loop_filter->ref_deltas) { + OBU_READ_BIT_OR_FAIL; + const auto update_ref_delta = static_cast(scratch); + if (update_ref_delta) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + ref_delta = scratch_int; + } + } + for (auto& mode_delta : loop_filter->mode_deltas) { + OBU_READ_BIT_OR_FAIL; + const auto update_mode_delta = static_cast(scratch); + if (update_mode_delta) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + mode_delta = scratch_int; + } + } + } + } else { + loop_filter->delta_update = false; + } + return true; +} + +bool ObuParser::ParseDeltaQuantizer(int8_t* const delta) { + int64_t scratch; + *delta = 0; + OBU_READ_BIT_OR_FAIL; + const auto delta_coded = static_cast(scratch); + if (delta_coded) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral(6, &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + *delta = scratch_int; + } + return true; +} + +bool ObuParser::ParseQuantizerParameters() { + int64_t scratch; + QuantizerParameters* const quantizer = &frame_header_.quantizer; + OBU_READ_LITERAL_OR_FAIL(8); + quantizer->base_index = scratch; + if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneY])) return false; + if (!sequence_header_.color_config.is_monochrome) { + bool diff_uv_delta = false; + if (sequence_header_.color_config.separate_uv_delta_q) { + OBU_READ_BIT_OR_FAIL; + diff_uv_delta = static_cast(scratch); + } + if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneU]) || + !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneU])) { + return false; + } + if (diff_uv_delta) { + if (!ParseDeltaQuantizer(&quantizer->delta_dc[kPlaneV]) || + !ParseDeltaQuantizer(&quantizer->delta_ac[kPlaneV])) { + return false; + } + } else { + quantizer->delta_dc[kPlaneV] = quantizer->delta_dc[kPlaneU]; + quantizer->delta_ac[kPlaneV] = quantizer->delta_ac[kPlaneU]; + } + } + OBU_READ_BIT_OR_FAIL; + quantizer->use_matrix = static_cast(scratch); + if (quantizer->use_matrix) { + OBU_READ_LITERAL_OR_FAIL(4); + quantizer->matrix_level[kPlaneY] = scratch; + OBU_READ_LITERAL_OR_FAIL(4); + quantizer->matrix_level[kPlaneU] = scratch; + if (sequence_header_.color_config.separate_uv_delta_q) { + OBU_READ_LITERAL_OR_FAIL(4); + quantizer->matrix_level[kPlaneV] = scratch; + } else { + quantizer->matrix_level[kPlaneV] = quantizer->matrix_level[kPlaneU]; + } + } + return true; +} + +// 
This method implements the following functions in the spec: +// - segmentation_params() +// - part of setup_past_independence(): Set the FeatureData and FeatureEnabled +// arrays to all 0. +// - part of load_previous(): Call load_segmentation_params(). +// +// A careful analysis of the spec shows the part of setup_past_independence() +// can be optimized away and the part of load_previous() only needs to be +// invoked under a specific condition. Although the logic looks different from +// the spec, it is equivalent and more efficient. +bool ObuParser::ParseSegmentationParameters() { + int64_t scratch; + Segmentation* const segmentation = &frame_header_.segmentation; + OBU_READ_BIT_OR_FAIL; + segmentation->enabled = static_cast(scratch); + if (!segmentation->enabled) return true; + if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) { + segmentation->update_map = true; + segmentation->update_data = true; + } else { + OBU_READ_BIT_OR_FAIL; + segmentation->update_map = static_cast(scratch); + if (segmentation->update_map) { + OBU_READ_BIT_OR_FAIL; + segmentation->temporal_update = static_cast(scratch); + } + OBU_READ_BIT_OR_FAIL; + segmentation->update_data = static_cast(scratch); + if (!segmentation->update_data) { + // Part of the load_previous() function in the spec. + const int prev_frame_index = + frame_header_ + .reference_frame_index[frame_header_.primary_reference_frame]; + decoder_state_.reference_frame[prev_frame_index] + ->GetSegmentationParameters(segmentation); + return true; + } + } + for (int8_t i = 0; i < kMaxSegments; ++i) { + for (int8_t j = 0; j < kSegmentFeatureMax; ++j) { + OBU_READ_BIT_OR_FAIL; + segmentation->feature_enabled[i][j] = static_cast(scratch); + if (segmentation->feature_enabled[i][j]) { + if (Segmentation::FeatureSigned(static_cast(j))) { + int scratch_int; + if (!bit_reader_->ReadInverseSignedLiteral( + kSegmentationFeatureBits[j], &scratch_int)) { + LIBGAV1_DLOG(ERROR, "Not enough bits."); + return false; + } + segmentation->feature_data[i][j] = + Clip3(scratch_int, -kSegmentationFeatureMaxValues[j], + kSegmentationFeatureMaxValues[j]); + } else { + if (kSegmentationFeatureBits[j] > 0) { + OBU_READ_LITERAL_OR_FAIL(kSegmentationFeatureBits[j]); + segmentation->feature_data[i][j] = Clip3( + static_cast(scratch), 0, kSegmentationFeatureMaxValues[j]); + } else { + segmentation->feature_data[i][j] = 0; + } + } + segmentation->last_active_segment_id = i; + if (j >= kSegmentFeatureReferenceFrame) { + segmentation->segment_id_pre_skip = true; + } + } + } + } + return true; +} + +bool ObuParser::ParseQuantizerIndexDeltaParameters() { + int64_t scratch; + if (frame_header_.quantizer.base_index > 0) { + OBU_READ_BIT_OR_FAIL; + frame_header_.delta_q.present = static_cast(scratch); + if (frame_header_.delta_q.present) { + OBU_READ_LITERAL_OR_FAIL(2); + frame_header_.delta_q.scale = scratch; + } + } + return true; +} + +bool ObuParser::ParseLoopFilterDeltaParameters() { + int64_t scratch; + if (frame_header_.delta_q.present) { + if (!frame_header_.allow_intrabc) { + OBU_READ_BIT_OR_FAIL; + frame_header_.delta_lf.present = static_cast(scratch); + } + if (frame_header_.delta_lf.present) { + OBU_READ_LITERAL_OR_FAIL(2); + frame_header_.delta_lf.scale = scratch; + OBU_READ_BIT_OR_FAIL; + frame_header_.delta_lf.multi = static_cast(scratch); + } + } + return true; +} + +void ObuParser::ComputeSegmentLosslessAndQIndex() { + frame_header_.coded_lossless = true; + Segmentation* const segmentation = &frame_header_.segmentation; + const QuantizerParameters* const 
quantizer = &frame_header_.quantizer; + for (int i = 0; i < kMaxSegments; ++i) { + segmentation->qindex[i] = + GetQIndex(*segmentation, i, quantizer->base_index); + segmentation->lossless[i] = + segmentation->qindex[i] == 0 && quantizer->delta_dc[kPlaneY] == 0 && + quantizer->delta_dc[kPlaneU] == 0 && + quantizer->delta_ac[kPlaneU] == 0 && + quantizer->delta_dc[kPlaneV] == 0 && quantizer->delta_ac[kPlaneV] == 0; + if (!segmentation->lossless[i]) frame_header_.coded_lossless = false; + // The spec calls for setting up a two-dimensional SegQMLevel array here. + // We avoid the SegQMLevel array by using segmentation->lossless[i] and + // quantizer->matrix_level[plane] directly in the reconstruct process of + // Section 7.12.3. + } + frame_header_.upscaled_lossless = + frame_header_.coded_lossless && + frame_header_.width == frame_header_.upscaled_width; +} + +bool ObuParser::ParseCdefParameters() { + const int coeff_shift = sequence_header_.color_config.bitdepth - 8; + if (frame_header_.coded_lossless || frame_header_.allow_intrabc || + !sequence_header_.enable_cdef) { + frame_header_.cdef.damping = 3 + coeff_shift; + return true; + } + Cdef* const cdef = &frame_header_.cdef; + int64_t scratch; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->damping = scratch + 3 + coeff_shift; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->bits = scratch; + for (int i = 0; i < (1 << cdef->bits); ++i) { + OBU_READ_LITERAL_OR_FAIL(4); + cdef->y_primary_strength[i] = scratch << coeff_shift; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->y_secondary_strength[i] = scratch; + if (cdef->y_secondary_strength[i] == 3) ++cdef->y_secondary_strength[i]; + cdef->y_secondary_strength[i] <<= coeff_shift; + if (sequence_header_.color_config.is_monochrome) continue; + OBU_READ_LITERAL_OR_FAIL(4); + cdef->uv_primary_strength[i] = scratch << coeff_shift; + OBU_READ_LITERAL_OR_FAIL(2); + cdef->uv_secondary_strength[i] = scratch; + if (cdef->uv_secondary_strength[i] == 3) ++cdef->uv_secondary_strength[i]; + cdef->uv_secondary_strength[i] <<= coeff_shift; + } + return true; +} + +bool ObuParser::ParseLoopRestorationParameters() { + if (frame_header_.upscaled_lossless || frame_header_.allow_intrabc || + !sequence_header_.enable_restoration) { + return true; + } + int64_t scratch; + bool uses_loop_restoration = false; + bool uses_chroma_loop_restoration = false; + LoopRestoration* const loop_restoration = &frame_header_.loop_restoration; + const int num_planes = sequence_header_.color_config.is_monochrome + ? 
kMaxPlanesMonochrome + : kMaxPlanes; + for (int i = 0; i < num_planes; ++i) { + OBU_READ_LITERAL_OR_FAIL(2); + loop_restoration->type[i] = static_cast(scratch); + if (loop_restoration->type[i] != kLoopRestorationTypeNone) { + uses_loop_restoration = true; + if (i > 0) uses_chroma_loop_restoration = true; + } + } + if (uses_loop_restoration) { + uint8_t unit_shift; + if (sequence_header_.use_128x128_superblock) { + OBU_READ_BIT_OR_FAIL; + unit_shift = scratch + 1; + } else { + OBU_READ_BIT_OR_FAIL; + unit_shift = scratch; + if (unit_shift != 0) { + OBU_READ_BIT_OR_FAIL; + const uint8_t unit_extra_shift = scratch; + unit_shift += unit_extra_shift; + } + } + loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift; + uint8_t uv_shift = 0; + if (sequence_header_.color_config.subsampling_x != 0 && + sequence_header_.color_config.subsampling_y != 0 && + uses_chroma_loop_restoration) { + OBU_READ_BIT_OR_FAIL; + uv_shift = scratch; + } + loop_restoration->unit_size_log2[kPlaneU] = + loop_restoration->unit_size_log2[kPlaneV] = + loop_restoration->unit_size_log2[0] - uv_shift; + } + return true; +} + +bool ObuParser::ParseTxModeSyntax() { + if (frame_header_.coded_lossless) { + frame_header_.tx_mode = kTxModeOnly4x4; + return true; + } + int64_t scratch; + OBU_READ_BIT_OR_FAIL; + frame_header_.tx_mode = (scratch == 1) ? kTxModeSelect : kTxModeLargest; + return true; +} + +bool ObuParser::ParseFrameReferenceModeSyntax() { + int64_t scratch; + if (!IsIntraFrame(frame_header_.frame_type)) { + OBU_READ_BIT_OR_FAIL; + frame_header_.reference_mode_select = static_cast(scratch); + } + return true; +} + +bool ObuParser::IsSkipModeAllowed() { + if (IsIntraFrame(frame_header_.frame_type) || + !frame_header_.reference_mode_select || + !sequence_header_.enable_order_hint) { + return false; + } + // Identify the nearest forward and backward references. + int forward_index = -1; + int backward_index = -1; + int forward_hint = -1; + int backward_hint = -1; + for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) { + const unsigned int reference_hint = + decoder_state_ + .reference_order_hint[frame_header_.reference_frame_index[i]]; + // TODO(linfengz): |relative_distance| equals + // current_frame_->reference_info()-> + // relative_distance_from[i + kReferenceFrameLast]; + // However, the unit test ObuParserTest.SkipModeParameters() would fail. + // Will figure out how to initialize |current_frame_.reference_info_| in the + // RefCountedBuffer later. + const int relative_distance = + GetRelativeDistance(reference_hint, frame_header_.order_hint, + sequence_header_.order_hint_shift_bits); + if (relative_distance < 0) { + if (forward_index < 0 || + GetRelativeDistance(reference_hint, forward_hint, + sequence_header_.order_hint_shift_bits) > 0) { + forward_index = i; + forward_hint = reference_hint; + } + } else if (relative_distance > 0) { + if (backward_index < 0 || + GetRelativeDistance(reference_hint, backward_hint, + sequence_header_.order_hint_shift_bits) < 0) { + backward_index = i; + backward_hint = reference_hint; + } + } + } + if (forward_index < 0) return false; + if (backward_index >= 0) { + // Bidirectional prediction. + frame_header_.skip_mode_frame[0] = static_cast( + kReferenceFrameLast + std::min(forward_index, backward_index)); + frame_header_.skip_mode_frame[1] = static_cast( + kReferenceFrameLast + std::max(forward_index, backward_index)); + return true; + } + // Forward prediction only. Identify the second nearest forward reference. 
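Both the nearest and second-nearest searches lean on GetRelativeDistance() to
compare order hints that may have wrapped modulo 2^order_hint_bits. A
self-contained sketch of the idea (the library's actual helper is defined
elsewhere in libgav1; names and details here are illustrative):

    #include <cassert>
    #include <cstdint>

    // Interpret the low |bits| bits of (a - b) as a signed quantity so that
    // wrapped-around order hints still compare correctly. Relies on two's
    // complement and arithmetic right shift of negative values (true on all
    // mainstream compilers).
    int RelativeDistanceSketch(uint32_t a, uint32_t b, int bits) {
      const int shift = 32 - bits;  // plays the role of order_hint_shift_bits
      return static_cast<int32_t>((a - b) << shift) >> shift;
    }

    int main() {
      // order_hint_bits == 3: hints live in [0, 7]; 1 is two frames after 7.
      assert(RelativeDistanceSketch(1, 7, 3) == 2);
      assert(RelativeDistanceSketch(7, 1, 3) == -2);
      return 0;
    }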
+  int second_forward_index = -1;
+  int second_forward_hint = -1;
+  for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+    const unsigned int reference_hint =
+        decoder_state_
+            .reference_order_hint[frame_header_.reference_frame_index[i]];
+    if (GetRelativeDistance(reference_hint, forward_hint,
+                            sequence_header_.order_hint_shift_bits) < 0) {
+      if (second_forward_index < 0 ||
+          GetRelativeDistance(reference_hint, second_forward_hint,
+                              sequence_header_.order_hint_shift_bits) > 0) {
+        second_forward_index = i;
+        second_forward_hint = reference_hint;
+      }
+    }
+  }
+  if (second_forward_index < 0) return false;
+  frame_header_.skip_mode_frame[0] = static_cast<ReferenceFrameType>(
+      kReferenceFrameLast + std::min(forward_index, second_forward_index));
+  frame_header_.skip_mode_frame[1] = static_cast<ReferenceFrameType>(
+      kReferenceFrameLast + std::max(forward_index, second_forward_index));
+  return true;
+}
+
+bool ObuParser::ParseSkipModeParameters() {
+  if (!IsSkipModeAllowed()) return true;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.skip_mode_present = static_cast<bool>(scratch);
+  return true;
+}
+
+// Sets frame_header_.global_motion[ref].params[index].
+bool ObuParser::ParseGlobalParamSyntax(
+    int ref, int index,
+    const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+        prev_global_motions) {
+  GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+  const GlobalMotion* const prev_global_motion = &prev_global_motions[ref];
+  int abs_bits = kGlobalMotionAlphaBits;
+  int precision_bits = kGlobalMotionAlphaPrecisionBits;
+  if (index < 2) {
+    if (global_motion->type == kGlobalMotionTransformationTypeTranslation) {
+      const auto high_precision_mv_factor =
+          static_cast<int>(!frame_header_.allow_high_precision_mv);
+      abs_bits = kGlobalMotionTranslationOnlyBits - high_precision_mv_factor;
+      precision_bits =
+          kGlobalMotionTranslationOnlyPrecisionBits - high_precision_mv_factor;
+    } else {
+      abs_bits = kGlobalMotionTranslationBits;
+      precision_bits = kGlobalMotionTranslationPrecisionBits;
+    }
+  }
+  const int precision_diff = kWarpedModelPrecisionBits - precision_bits;
+  const int round = (index % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+  const int sub = (index % 3 == 2) ? 1 << precision_bits : 0;
+  const int mx = 1 << abs_bits;
+  const int reference =
+      (prev_global_motion->params[index] >> precision_diff) - sub;
+  int scratch;
+  if (!bit_reader_->DecodeSignedSubexpWithReference(
+          -mx, mx + 1, reference, kGlobalMotionReadControl, &scratch)) {
+    LIBGAV1_DLOG(ERROR, "Not enough bits.");
+    return false;
+  }
+  global_motion->params[index] = LeftShift(scratch, precision_diff) + round;
+  return true;
+}
+
+bool ObuParser::ParseGlobalMotionParameters() {
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    frame_header_.global_motion[ref].type =
+        kGlobalMotionTransformationTypeIdentity;
+    for (int i = 0; i < 6; ++i) {
+      frame_header_.global_motion[ref].params[i] =
+          (i % 3 == 2) ? 1 << kWarpedModelPrecisionBits : 0;
+    }
+  }
+  if (IsIntraFrame(frame_header_.frame_type)) return true;
+  const std::array<GlobalMotion, kNumReferenceFrameTypes>*
+      prev_global_motions = nullptr;
+  if (frame_header_.primary_reference_frame == kPrimaryReferenceNone) {
+    // Part of the setup_past_independence() function in the spec. The value
+    // that the spec says PrevGmParams[ref][i] should be set to is exactly
+    // the value frame_header_.global_motion[ref].params[i] is set to by the
+    // for loop above. Therefore prev_global_motions can simply point to
+    // frame_header_.global_motion.
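+    // With kWarpedModelPrecisionBits equal to 16, the identity model set up
+    // above is params = {0, 0, 65536, 0, 0, 65536}: zero translation and a
+    // unit diagonal.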
+    prev_global_motions = &frame_header_.global_motion;
+  } else {
+    // Part of the load_previous() function in the spec.
+    const int prev_frame_index =
+        frame_header_
+            .reference_frame_index[frame_header_.primary_reference_frame];
+    prev_global_motions =
+        &decoder_state_.reference_frame[prev_frame_index]->GlobalMotions();
+  }
+  for (int ref = kReferenceFrameLast; ref <= kReferenceFrameAlternate; ++ref) {
+    GlobalMotion* const global_motion = &frame_header_.global_motion[ref];
+    int64_t scratch;
+    OBU_READ_BIT_OR_FAIL;
+    const auto is_global = static_cast<bool>(scratch);
+    if (is_global) {
+      OBU_READ_BIT_OR_FAIL;
+      const auto is_rot_zoom = static_cast<bool>(scratch);
+      if (is_rot_zoom) {
+        global_motion->type = kGlobalMotionTransformationTypeRotZoom;
+      } else {
+        OBU_READ_BIT_OR_FAIL;
+        const auto is_translation = static_cast<bool>(scratch);
+        global_motion->type = is_translation
+                                  ? kGlobalMotionTransformationTypeTranslation
+                                  : kGlobalMotionTransformationTypeAffine;
+      }
+    } else {
+      global_motion->type = kGlobalMotionTransformationTypeIdentity;
+    }
+    if (global_motion->type >= kGlobalMotionTransformationTypeRotZoom) {
+      if (!ParseGlobalParamSyntax(ref, 2, *prev_global_motions) ||
+          !ParseGlobalParamSyntax(ref, 3, *prev_global_motions)) {
+        return false;
+      }
+      if (global_motion->type == kGlobalMotionTransformationTypeAffine) {
+        if (!ParseGlobalParamSyntax(ref, 4, *prev_global_motions) ||
+            !ParseGlobalParamSyntax(ref, 5, *prev_global_motions)) {
+          return false;
+        }
+      } else {
+        global_motion->params[4] = -global_motion->params[3];
+        global_motion->params[5] = global_motion->params[2];
+      }
+    }
+    if (global_motion->type >= kGlobalMotionTransformationTypeTranslation) {
+      if (!ParseGlobalParamSyntax(ref, 0, *prev_global_motions) ||
+          !ParseGlobalParamSyntax(ref, 1, *prev_global_motions)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseFilmGrainParameters() {
+  if (!sequence_header_.film_grain_params_present ||
+      (!frame_header_.show_frame && !frame_header_.showable_frame)) {
+    // frame_header_.film_grain_params is already zero-initialized.
+    return true;
+  }
+
+  FilmGrainParams& film_grain_params = frame_header_.film_grain_params;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  film_grain_params.apply_grain = static_cast<bool>(scratch);
+  if (!film_grain_params.apply_grain) {
+    // film_grain_params is already zero-initialized.
+    return true;
+  }
+
+  OBU_READ_LITERAL_OR_FAIL(16);
+  film_grain_params.grain_seed = static_cast<int>(scratch);
+  film_grain_params.update_grain = true;
+  if (frame_header_.frame_type == kFrameInter) {
+    OBU_READ_BIT_OR_FAIL;
+    film_grain_params.update_grain = static_cast<bool>(scratch);
+  }
+  if (!film_grain_params.update_grain) {
+    OBU_READ_LITERAL_OR_FAIL(3);
+    film_grain_params.reference_index = static_cast<int>(scratch);
+    bool found = false;
+    for (const auto index : frame_header_.reference_frame_index) {
+      if (film_grain_params.reference_index == index) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      static_assert(sizeof(frame_header_.reference_frame_index) /
+                            sizeof(frame_header_.reference_frame_index[0]) ==
+                        7,
+                    "");
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid value for film_grain_params_ref_idx (%d). "
" + "ref_frame_idx = {%d, %d, %d, %d, %d, %d, %d}", + film_grain_params.reference_index, + frame_header_.reference_frame_index[0], + frame_header_.reference_frame_index[1], + frame_header_.reference_frame_index[2], + frame_header_.reference_frame_index[3], + frame_header_.reference_frame_index[4], + frame_header_.reference_frame_index[5], + frame_header_.reference_frame_index[6]); + return false; + } + const RefCountedBuffer* grain_params_reference_frame = + decoder_state_.reference_frame[film_grain_params.reference_index].get(); + if (grain_params_reference_frame == nullptr) { + LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame", + film_grain_params.reference_index); + return false; + } + const int temp_grain_seed = film_grain_params.grain_seed; + const bool temp_update_grain = film_grain_params.update_grain; + const int temp_reference_index = film_grain_params.reference_index; + film_grain_params = grain_params_reference_frame->film_grain_params(); + film_grain_params.grain_seed = temp_grain_seed; + film_grain_params.update_grain = temp_update_grain; + film_grain_params.reference_index = temp_reference_index; + return true; + } + + OBU_READ_LITERAL_OR_FAIL(4); + film_grain_params.num_y_points = scratch; + if (film_grain_params.num_y_points > 14) { + LIBGAV1_DLOG(ERROR, "Invalid value for num_y_points (%d).", + film_grain_params.num_y_points); + return false; + } + for (int i = 0; i < film_grain_params.num_y_points; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_y_value[i] = scratch; + if (i != 0 && film_grain_params.point_y_value[i - 1] >= + film_grain_params.point_y_value[i]) { + LIBGAV1_DLOG(ERROR, "point_y_value[%d] (%d) >= point_y_value[%d] (%d).", + i - 1, film_grain_params.point_y_value[i - 1], i, + film_grain_params.point_y_value[i]); + return false; + } + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_y_scaling[i] = scratch; + } + if (sequence_header_.color_config.is_monochrome) { + film_grain_params.chroma_scaling_from_luma = false; + } else { + OBU_READ_BIT_OR_FAIL; + film_grain_params.chroma_scaling_from_luma = static_cast(scratch); + } + if (sequence_header_.color_config.is_monochrome || + film_grain_params.chroma_scaling_from_luma || + (sequence_header_.color_config.subsampling_x == 1 && + sequence_header_.color_config.subsampling_y == 1 && + film_grain_params.num_y_points == 0)) { + film_grain_params.num_u_points = 0; + film_grain_params.num_v_points = 0; + } else { + OBU_READ_LITERAL_OR_FAIL(4); + film_grain_params.num_u_points = scratch; + if (film_grain_params.num_u_points > 10) { + LIBGAV1_DLOG(ERROR, "Invalid value for num_u_points (%d).", + film_grain_params.num_u_points); + return false; + } + for (int i = 0; i < film_grain_params.num_u_points; ++i) { + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_u_value[i] = scratch; + if (i != 0 && film_grain_params.point_u_value[i - 1] >= + film_grain_params.point_u_value[i]) { + LIBGAV1_DLOG(ERROR, "point_u_value[%d] (%d) >= point_u_value[%d] (%d).", + i - 1, film_grain_params.point_u_value[i - 1], i, + film_grain_params.point_u_value[i]); + return false; + } + OBU_READ_LITERAL_OR_FAIL(8); + film_grain_params.point_u_scaling[i] = scratch; + } + OBU_READ_LITERAL_OR_FAIL(4); + film_grain_params.num_v_points = scratch; + if (film_grain_params.num_v_points > 10) { + LIBGAV1_DLOG(ERROR, "Invalid value for num_v_points (%d).", + film_grain_params.num_v_points); + return false; + } + if (sequence_header_.color_config.subsampling_x == 1 && + sequence_header_.color_config.subsampling_y == 
+        sequence_header_.color_config.subsampling_y == 1 &&
+        (film_grain_params.num_u_points == 0) !=
+            (film_grain_params.num_v_points == 0)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Invalid values for num_u_points (%d) and num_v_points "
+                   "(%d) for 4:2:0 chroma subsampling.",
+                   film_grain_params.num_u_points,
+                   film_grain_params.num_v_points);
+      return false;
+    }
+    for (int i = 0; i < film_grain_params.num_v_points; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.point_v_value[i] = scratch;
+      if (i != 0 && film_grain_params.point_v_value[i - 1] >=
+                        film_grain_params.point_v_value[i]) {
+        LIBGAV1_DLOG(ERROR,
+                     "point_v_value[%d] (%d) >= point_v_value[%d] (%d).",
+                     i - 1, film_grain_params.point_v_value[i - 1], i,
+                     film_grain_params.point_v_value[i]);
+        return false;
+      }
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.point_v_scaling[i] = scratch;
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.chroma_scaling = scratch + 8;
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.auto_regression_coeff_lag = scratch;
+
+  const int num_pos_y =
+      MultiplyBy2(film_grain_params.auto_regression_coeff_lag) *
+      (film_grain_params.auto_regression_coeff_lag + 1);
+  int num_pos_uv = num_pos_y;
+  if (film_grain_params.num_y_points > 0) {
+    ++num_pos_uv;
+    for (int i = 0; i < num_pos_y; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.auto_regression_coeff_y[i] =
+          static_cast<int8_t>(scratch - 128);
+    }
+  }
+  if (film_grain_params.chroma_scaling_from_luma ||
+      film_grain_params.num_u_points > 0) {
+    for (int i = 0; i < num_pos_uv; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.auto_regression_coeff_u[i] =
+          static_cast<int8_t>(scratch - 128);
+    }
+  }
+  if (film_grain_params.chroma_scaling_from_luma ||
+      film_grain_params.num_v_points > 0) {
+    for (int i = 0; i < num_pos_uv; ++i) {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      film_grain_params.auto_regression_coeff_v[i] =
+          static_cast<int8_t>(scratch - 128);
+    }
+  }
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.auto_regression_shift = static_cast<int8_t>(scratch + 6);
+  OBU_READ_LITERAL_OR_FAIL(2);
+  film_grain_params.grain_scale_shift = static_cast<int>(scratch);
+  if (film_grain_params.num_u_points > 0) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.u_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.u_luma_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(9);
+    film_grain_params.u_offset = static_cast<int16_t>(scratch - 256);
+  }
+  if (film_grain_params.num_v_points > 0) {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.v_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(8);
+    film_grain_params.v_luma_multiplier = static_cast<int8_t>(scratch - 128);
+    OBU_READ_LITERAL_OR_FAIL(9);
+    film_grain_params.v_offset = static_cast<int16_t>(scratch - 256);
+  }
+  OBU_READ_BIT_OR_FAIL;
+  film_grain_params.overlap_flag = static_cast<bool>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  film_grain_params.clip_to_restricted_range = static_cast<bool>(scratch);
+  return true;
+}
+
+bool ObuParser::ParseTileInfoSyntax() {
+  TileInfo* const tile_info = &frame_header_.tile_info;
+  const int sb_columns = sequence_header_.use_128x128_superblock
+                             ? ((frame_header_.columns4x4 + 31) >> 5)
+                             : ((frame_header_.columns4x4 + 15) >> 4);
+  const int sb_rows = sequence_header_.use_128x128_superblock
+                          ? ((frame_header_.rows4x4 + 31) >> 5)
+                          : ((frame_header_.rows4x4 + 15) >> 4);
+  tile_info->sb_columns = sb_columns;
+  tile_info->sb_rows = sb_rows;
+  const int sb_shift = sequence_header_.use_128x128_superblock ? 5 : 4;
+  const int sb_size = 2 + sb_shift;
+  const int sb_max_tile_width = kMaxTileWidth >> sb_size;
+  const int sb_max_tile_area = kMaxTileArea >> MultiplyBy2(sb_size);
+  const int minlog2_tile_columns = TileLog2(sb_max_tile_width, sb_columns);
+  const int maxlog2_tile_columns =
+      CeilLog2(std::min(sb_columns, static_cast<int>(kMaxTileColumns)));
+  const int maxlog2_tile_rows =
+      CeilLog2(std::min(sb_rows, static_cast<int>(kMaxTileRows)));
+  const int min_log2_tiles = std::max(
+      minlog2_tile_columns, TileLog2(sb_max_tile_area, sb_rows * sb_columns));
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  tile_info->uniform_spacing = static_cast<bool>(scratch);
+  if (tile_info->uniform_spacing) {
+    // Read tile columns.
+    tile_info->tile_columns_log2 = minlog2_tile_columns;
+    while (tile_info->tile_columns_log2 < maxlog2_tile_columns) {
+      OBU_READ_BIT_OR_FAIL;
+      if (scratch == 0) break;
+      ++tile_info->tile_columns_log2;
+    }
+
+    // Compute tile column starts.
+    const int sb_tile_width =
+        (sb_columns + (1 << tile_info->tile_columns_log2) - 1) >>
+        tile_info->tile_columns_log2;
+    if (sb_tile_width <= 0) return false;
+    int i = 0;
+    for (int sb_start = 0; sb_start < sb_columns; sb_start += sb_tile_width) {
+      if (i >= kMaxTileColumns) {
+        LIBGAV1_DLOG(ERROR,
+                     "tile_columns would be greater than kMaxTileColumns.");
+        return false;
+      }
+      tile_info->tile_column_start[i++] = sb_start << sb_shift;
+    }
+    tile_info->tile_column_start[i] = frame_header_.columns4x4;
+    tile_info->tile_columns = i;
+
+    // Read tile rows.
+    const int minlog2_tile_rows =
+        std::max(min_log2_tiles - tile_info->tile_columns_log2, 0);
+    tile_info->tile_rows_log2 = minlog2_tile_rows;
+    while (tile_info->tile_rows_log2 < maxlog2_tile_rows) {
+      OBU_READ_BIT_OR_FAIL;
+      if (scratch == 0) break;
+      ++tile_info->tile_rows_log2;
+    }
+
+    // Compute tile row starts.
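+    // For example, a 1920x1080 frame with 64x64 superblocks has sb_rows =
+    // 17; with tile_rows_log2 = 1 this gives sb_tile_height = (17 + 1) >> 1
+    // = 9, so the tile rows start at superblock rows 0 and 9.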
+    const int sb_tile_height =
+        (sb_rows + (1 << tile_info->tile_rows_log2) - 1) >>
+        tile_info->tile_rows_log2;
+    if (sb_tile_height <= 0) return false;
+    i = 0;
+    for (int sb_start = 0; sb_start < sb_rows; sb_start += sb_tile_height) {
+      if (i >= kMaxTileRows) {
+        LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+        return false;
+      }
+      tile_info->tile_row_start[i++] = sb_start << sb_shift;
+    }
+    tile_info->tile_row_start[i] = frame_header_.rows4x4;
+    tile_info->tile_rows = i;
+  } else {
+    int widest_tile_sb = 1;
+    int i = 0;
+    for (int sb_start = 0; sb_start < sb_columns; ++i) {
+      if (i >= kMaxTileColumns) {
+        LIBGAV1_DLOG(ERROR,
+                     "tile_columns would be greater than kMaxTileColumns.");
+        return false;
+      }
+      tile_info->tile_column_start[i] = sb_start << sb_shift;
+      const int max_width =
+          std::min(sb_columns - sb_start, static_cast<int>(sb_max_tile_width));
+      if (!bit_reader_->DecodeUniform(
+              max_width, &tile_info->tile_column_width_in_superblocks[i])) {
+        LIBGAV1_DLOG(ERROR, "Not enough bits.");
+        return false;
+      }
+      ++tile_info->tile_column_width_in_superblocks[i];
+      widest_tile_sb = std::max(tile_info->tile_column_width_in_superblocks[i],
+                                widest_tile_sb);
+      sb_start += tile_info->tile_column_width_in_superblocks[i];
+    }
+    tile_info->tile_column_start[i] = frame_header_.columns4x4;
+    tile_info->tile_columns = i;
+    tile_info->tile_columns_log2 = CeilLog2(tile_info->tile_columns);
+
+    int max_tile_area_sb = sb_rows * sb_columns;
+    if (min_log2_tiles > 0) max_tile_area_sb >>= min_log2_tiles + 1;
+    const int max_tile_height_sb =
+        std::max(max_tile_area_sb / widest_tile_sb, 1);
+
+    i = 0;
+    for (int sb_start = 0; sb_start < sb_rows; ++i) {
+      if (i >= kMaxTileRows) {
+        LIBGAV1_DLOG(ERROR, "tile_rows would be greater than kMaxTileRows.");
+        return false;
+      }
+      tile_info->tile_row_start[i] = sb_start << sb_shift;
+      const int max_height = std::min(sb_rows - sb_start, max_tile_height_sb);
+      if (!bit_reader_->DecodeUniform(
+              max_height, &tile_info->tile_row_height_in_superblocks[i])) {
+        LIBGAV1_DLOG(ERROR, "Not enough bits.");
+        return false;
+      }
+      ++tile_info->tile_row_height_in_superblocks[i];
+      sb_start += tile_info->tile_row_height_in_superblocks[i];
+    }
+    tile_info->tile_row_start[i] = frame_header_.rows4x4;
+    tile_info->tile_rows = i;
+    tile_info->tile_rows_log2 = CeilLog2(tile_info->tile_rows);
+  }
+  tile_info->tile_count = tile_info->tile_rows * tile_info->tile_columns;
+  if (!tile_buffers_.reserve(tile_info->tile_count)) {
+    LIBGAV1_DLOG(ERROR, "Unable to allocate memory for tile_buffers_.");
+    return false;
+  }
+  tile_info->context_update_id = 0;
+  const int tile_bits =
+      tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+  if (tile_bits != 0) {
+    OBU_READ_LITERAL_OR_FAIL(tile_bits);
+    tile_info->context_update_id = static_cast<int16_t>(scratch);
+    if (tile_info->context_update_id >= tile_info->tile_count) {
+      LIBGAV1_DLOG(ERROR, "Invalid context_update_tile_id (%d) >= %d.",
+                   tile_info->context_update_id, tile_info->tile_count);
+      return false;
+    }
+    OBU_READ_LITERAL_OR_FAIL(2);
+    tile_info->tile_size_bytes = 1 + scratch;
+  }
+  return true;
+}
+
+bool ObuParser::ReadAllowWarpedMotion() {
+  if (IsIntraFrame(frame_header_.frame_type) ||
+      frame_header_.error_resilient_mode ||
+      !sequence_header_.enable_warped_motion) {
+    return true;
+  }
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.allow_warped_motion = static_cast<bool>(scratch);
+  return true;
+}
+
+bool ObuParser::ParseFrameParameters() {
+  int64_t scratch;
+  if (sequence_header_.reduced_still_picture_header) {
+    frame_header_.show_frame = true;
+    current_frame_ = buffer_pool_->GetFreeBuffer();
+    if (current_frame_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+      return false;
+    }
+  } else {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.show_existing_frame = static_cast<bool>(scratch);
+    if (frame_header_.show_existing_frame) {
+      OBU_READ_LITERAL_OR_FAIL(3);
+      frame_header_.frame_to_show = scratch;
+      if (sequence_header_.decoder_model_info_present_flag &&
+          !sequence_header_.timing_info.equal_picture_interval) {
+        OBU_READ_LITERAL_OR_FAIL(
+            sequence_header_.decoder_model_info.frame_presentation_time_length);
+        frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+      }
+      if (sequence_header_.frame_id_numbers_present) {
+        OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+        frame_header_.display_frame_id = static_cast<uint16_t>(scratch);
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // whenever display_frame_id is read, the value matches
+        // RefFrameId[ frame_to_show_map_idx ] ..., and that
+        // RefValid[ frame_to_show_map_idx ] is equal to 1.
+        if (frame_header_.display_frame_id !=
+                decoder_state_
+                    .reference_frame_id[frame_header_.frame_to_show] ||
+            !decoder_state_.reference_valid[frame_header_.frame_to_show]) {
+          LIBGAV1_DLOG(ERROR,
+                       "Reference buffer %d has a frame id number mismatch.",
+                       frame_header_.frame_to_show);
+          return false;
+        }
+      }
+      // Section 7.18.2. Note: This is also needed for Section 7.21 if
+      // frame_type is kFrameKey.
+      current_frame_ =
+          decoder_state_.reference_frame[frame_header_.frame_to_show];
+      if (current_frame_ == nullptr) {
+        LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a decoded frame",
+                     frame_header_.frame_to_show);
+        return false;
+      }
+      // Section 6.8.2: It is a requirement of bitstream conformance that
+      // when show_existing_frame is used to show a previous frame, that the
+      // value of showable_frame for the previous frame was equal to 1.
+      if (!current_frame_->showable_frame()) {
+        LIBGAV1_DLOG(ERROR, "Buffer %d does not contain a showable frame",
+                     frame_header_.frame_to_show);
+        return false;
+      }
+      if (current_frame_->frame_type() == kFrameKey) {
+        frame_header_.refresh_frame_flags = 0xff;
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // when show_existing_frame is used to show a previous frame with
+        // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that
+        // the frame is output via the show_existing_frame mechanism at most
+        // once.
+        current_frame_->set_showable_frame(false);
+
+        // Section 7.21. Note: decoder_state_.current_frame_id must be set
+        // only when frame_type is kFrameKey per the spec. Among all the
+        // variables set in Section 7.21, current_frame_id is the only one
+        // whose value lives across frames. (PrevFrameID is set equal to the
+        // current_frame_id value for the previous frame.)
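+        // Showing an existing key frame also restores that frame's id and
+        // order hint, as if the key frame had just been decoded.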
+        decoder_state_.current_frame_id =
+            decoder_state_.reference_frame_id[frame_header_.frame_to_show];
+        decoder_state_.order_hint =
+            decoder_state_.reference_order_hint[frame_header_.frame_to_show];
+      }
+      return true;
+    }
+    current_frame_ = buffer_pool_->GetFreeBuffer();
+    if (current_frame_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
+      return false;
+    }
+    OBU_READ_LITERAL_OR_FAIL(2);
+    frame_header_.frame_type = static_cast<FrameType>(scratch);
+    current_frame_->set_frame_type(frame_header_.frame_type);
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.show_frame = static_cast<bool>(scratch);
+    if (frame_header_.show_frame &&
+        sequence_header_.decoder_model_info_present_flag &&
+        !sequence_header_.timing_info.equal_picture_interval) {
+      OBU_READ_LITERAL_OR_FAIL(
+          sequence_header_.decoder_model_info.frame_presentation_time_length);
+      frame_header_.frame_presentation_time = static_cast<uint32_t>(scratch);
+    }
+    if (frame_header_.show_frame) {
+      frame_header_.showable_frame = (frame_header_.frame_type != kFrameKey);
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.showable_frame = static_cast<bool>(scratch);
+    }
+    current_frame_->set_showable_frame(frame_header_.showable_frame);
+    if (frame_header_.frame_type == kFrameSwitch ||
+        (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+      frame_header_.error_resilient_mode = true;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.error_resilient_mode = static_cast<bool>(scratch);
+    }
+  }
+  if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
+    decoder_state_.reference_valid.fill(false);
+    decoder_state_.reference_order_hint.fill(0);
+    decoder_state_.reference_frame.fill(nullptr);
+  }
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.enable_cdf_update = !static_cast<bool>(scratch);
+  if (sequence_header_.force_screen_content_tools ==
+      kSelectScreenContentTools) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.allow_screen_content_tools = static_cast<bool>(scratch);
+  } else {
+    frame_header_.allow_screen_content_tools =
+        static_cast<bool>(sequence_header_.force_screen_content_tools);
+  }
+  if (frame_header_.allow_screen_content_tools) {
+    if (sequence_header_.force_integer_mv == kSelectIntegerMv) {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.force_integer_mv = scratch;
+    } else {
+      frame_header_.force_integer_mv = sequence_header_.force_integer_mv;
+    }
+  } else {
+    frame_header_.force_integer_mv = 0;
+  }
+  if (IsIntraFrame(frame_header_.frame_type)) {
+    frame_header_.force_integer_mv = 1;
+  }
+  if (sequence_header_.frame_id_numbers_present) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.frame_id_length_bits);
+    frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
+    const int previous_frame_id = decoder_state_.current_frame_id;
+    decoder_state_.current_frame_id = frame_header_.current_frame_id;
+    if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+      if (previous_frame_id >= 0) {
+        // Section 6.8.2: ..., it is a requirement of bitstream conformance
+        // that all of the following conditions are true:
+        //   * current_frame_id is not equal to PrevFrameID,
+        //   * DiffFrameID is less than 1 << ( idLen - 1 )
+        int diff_frame_id =
+            decoder_state_.current_frame_id - previous_frame_id;
+        const int id_length_max_value =
+            1 << sequence_header_.frame_id_length_bits;
+        if (diff_frame_id <= 0) {
+          diff_frame_id += id_length_max_value;
+        }
+        if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+          LIBGAV1_DLOG(ERROR,
+                       "current_frame_id (%d) equals or differs too much "
+                       "from previous_frame_id (%d).",
+                       decoder_state_.current_frame_id, previous_frame_id);
+          return false;
+        }
+      }
+      MarkInvalidReferenceFrames();
+    }
+  } else {
+    frame_header_.current_frame_id = 0;
+    decoder_state_.current_frame_id = frame_header_.current_frame_id;
+  }
+  if (frame_header_.frame_type == kFrameSwitch) {
+    frame_header_.frame_size_override_flag = true;
+  } else if (!sequence_header_.reduced_still_picture_header) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.frame_size_override_flag = static_cast<bool>(scratch);
+  }
+  if (sequence_header_.order_hint_bits > 0) {
+    OBU_READ_LITERAL_OR_FAIL(sequence_header_.order_hint_bits);
+    frame_header_.order_hint = scratch;
+  }
+  decoder_state_.order_hint = frame_header_.order_hint;
+  if (IsIntraFrame(frame_header_.frame_type) ||
+      frame_header_.error_resilient_mode) {
+    frame_header_.primary_reference_frame = kPrimaryReferenceNone;
+  } else {
+    OBU_READ_LITERAL_OR_FAIL(3);
+    frame_header_.primary_reference_frame = scratch;
+  }
+  if (sequence_header_.decoder_model_info_present_flag) {
+    OBU_READ_BIT_OR_FAIL;
+    const auto buffer_removal_time_present = static_cast<bool>(scratch);
+    if (buffer_removal_time_present) {
+      for (int i = 0; i < sequence_header_.operating_points; ++i) {
+        if (!sequence_header_.decoder_model_present_for_operating_point[i]) {
+          continue;
+        }
+        const int index = sequence_header_.operating_point_idc[i];
+        if (index == 0 ||
+            (InTemporalLayer(index, obu_headers_.back().temporal_id) &&
+             InSpatialLayer(index, obu_headers_.back().spatial_id))) {
+          OBU_READ_LITERAL_OR_FAIL(
+              sequence_header_.decoder_model_info.buffer_removal_time_length);
+          frame_header_.buffer_removal_time[i] =
+              static_cast<uint32_t>(scratch);
+        }
+      }
+    }
+  }
+  if (frame_header_.frame_type == kFrameSwitch ||
+      (frame_header_.frame_type == kFrameKey && frame_header_.show_frame)) {
+    frame_header_.refresh_frame_flags = 0xff;
+  } else {
+    OBU_READ_LITERAL_OR_FAIL(8);
+    frame_header_.refresh_frame_flags = scratch;
+    // Section 6.8.2: If frame_type is equal to INTRA_ONLY_FRAME, it is a
+    // requirement of bitstream conformance that refresh_frame_flags is not
+    // equal to 0xff.
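+    // refresh_frame_flags is a bitmask: bit j set means the current frame
+    // is saved into reference frame slot j, so 0xff refreshes all eight
+    // slots.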
+    if (frame_header_.frame_type == kFrameIntraOnly &&
+        frame_header_.refresh_frame_flags == 0xff) {
+      LIBGAV1_DLOG(ERROR, "Intra only frames cannot have refresh flags 0xFF.");
+      return false;
+    }
+  }
+  if ((!IsIntraFrame(frame_header_.frame_type) ||
+       frame_header_.refresh_frame_flags != 0xff) &&
+      !ParseReferenceOrderHint()) {
+    return false;
+  }
+  if (IsIntraFrame(frame_header_.frame_type)) {
+    if (!ParseFrameSizeAndRenderSize()) return false;
+    if (frame_header_.allow_screen_content_tools &&
+        frame_header_.width == frame_header_.upscaled_width) {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.allow_intrabc = static_cast<bool>(scratch);
+    }
+  } else {
+    if (!sequence_header_.enable_order_hint) {
+      frame_header_.frame_refs_short_signaling = false;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.frame_refs_short_signaling = static_cast<bool>(scratch);
+      if (frame_header_.frame_refs_short_signaling) {
+        OBU_READ_LITERAL_OR_FAIL(3);
+        const int8_t last_frame_idx = scratch;
+        OBU_READ_LITERAL_OR_FAIL(3);
+        const int8_t gold_frame_idx = scratch;
+        if (!SetFrameReferences(last_frame_idx, gold_frame_idx)) {
+          return false;
+        }
+      }
+    }
+    for (int i = 0; i < kNumInterReferenceFrameTypes; ++i) {
+      if (!frame_header_.frame_refs_short_signaling) {
+        OBU_READ_LITERAL_OR_FAIL(3);
+        frame_header_.reference_frame_index[i] = scratch;
+      }
+      const int reference_frame_index = frame_header_.reference_frame_index[i];
+      assert(reference_frame_index >= 0);
+      // Section 6.8.2: It is a requirement of bitstream conformance that
+      // RefValid[ ref_frame_idx[ i ] ] is equal to 1 ...
+      // The remainder of the statement is handled by ParseSequenceHeader().
+      // Note if support for Annex C: Error resilience behavior is added this
+      // check should be omitted per C.5 Decoder consequences of processable
+      // frames.
+      if (!decoder_state_.reference_valid[reference_frame_index]) {
+        LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
+                     reference_frame_index);
+        return false;
+      }
+      // Check if the inter frame requests a nonexistent reference, whether
+      // or not frame_refs_short_signaling is used.
+      if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
+        LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.",
+                     i, reference_frame_index);
+        return false;
+      }
+      if (sequence_header_.frame_id_numbers_present) {
+        OBU_READ_LITERAL_OR_FAIL(sequence_header_.delta_frame_id_length_bits);
+        const int delta_frame_id = static_cast<int>(1 + scratch);
+        const int id_length_max_value =
+            1 << sequence_header_.frame_id_length_bits;
+        frame_header_.expected_frame_id[i] =
+            (frame_header_.current_frame_id + id_length_max_value -
+             delta_frame_id) %
+            id_length_max_value;
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // whenever expectedFrameId[ i ] is calculated, the value matches
+        // RefFrameId[ ref_frame_idx[ i ] ] ...
+        //
+        // Section 6.8.2: It is a requirement of bitstream conformance that
+        // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ...
+        if (frame_header_.expected_frame_id[i] !=
+                decoder_state_.reference_frame_id[reference_frame_index] ||
+            !decoder_state_.reference_valid[reference_frame_index]) {
+          LIBGAV1_DLOG(ERROR,
+                       "Reference buffer %d has a frame id number mismatch.",
+                       reference_frame_index);
+          return false;
+        }
+      }
+    }
+    if (frame_header_.frame_size_override_flag &&
+        !frame_header_.error_resilient_mode) {
+      // Section 5.9.7.
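+      // frame_size_with_refs(): the first reference with found_reference
+      // set supplies the frame, render, and upscaled sizes; otherwise the
+      // sizes are read explicitly below.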
+      for (int index : frame_header_.reference_frame_index) {
+        OBU_READ_BIT_OR_FAIL;
+        frame_header_.found_reference = static_cast<bool>(scratch);
+        if (frame_header_.found_reference) {
+          const RefCountedBuffer* reference_frame =
+              decoder_state_.reference_frame[index].get();
+          // frame_header_.upscaled_width will be set in the
+          // ParseSuperResParametersAndComputeImageSize() call below.
+          frame_header_.width = reference_frame->upscaled_width();
+          frame_header_.height = reference_frame->frame_height();
+          frame_header_.render_width = reference_frame->render_width();
+          frame_header_.render_height = reference_frame->render_height();
+          if (!ParseSuperResParametersAndComputeImageSize()) return false;
+          break;
+        }
+      }
+      if (!frame_header_.found_reference && !ParseFrameSizeAndRenderSize()) {
+        return false;
+      }
+    } else {
+      if (!ParseFrameSizeAndRenderSize()) return false;
+    }
+    if (!ValidateInterFrameSize()) return false;
+    if (frame_header_.force_integer_mv != 0) {
+      frame_header_.allow_high_precision_mv = false;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.allow_high_precision_mv = static_cast<bool>(scratch);
+    }
+    OBU_READ_BIT_OR_FAIL;
+    const auto is_filter_switchable = static_cast<bool>(scratch);
+    if (is_filter_switchable) {
+      frame_header_.interpolation_filter = kInterpolationFilterSwitchable;
+    } else {
+      OBU_READ_LITERAL_OR_FAIL(2);
+      frame_header_.interpolation_filter =
+          static_cast<InterpolationFilter>(scratch);
+    }
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.is_motion_mode_switchable = static_cast<bool>(scratch);
+    if (frame_header_.error_resilient_mode ||
+        !sequence_header_.enable_ref_frame_mvs) {
+      frame_header_.use_ref_frame_mvs = false;
+    } else {
+      OBU_READ_BIT_OR_FAIL;
+      frame_header_.use_ref_frame_mvs = static_cast<bool>(scratch);
+    }
+  }
+  // At this point, we have parsed the frame and render sizes and computed
+  // the image size, whether it's an intra or inter frame. So we can save
+  // the sizes in the current frame now.
+  if (!current_frame_->SetFrameDimensions(frame_header_)) {
+    LIBGAV1_DLOG(ERROR, "Setting current frame dimensions failed.");
+    return false;
+  }
+  if (!IsIntraFrame(frame_header_.frame_type)) {
+    // Initialize the kReferenceFrameIntra type reference frame information
+    // to simplify the frame type validation in motion field projection.
+    // Set the kReferenceFrameIntra type |order_hint_| to
+    // |frame_header_.order_hint|. This guarantees that in SIMD
+    // implementations, the other reference frame information of the
+    // kReferenceFrameIntra type could be correctly initialized using the
+    // following loop with |frame_header_.order_hint| being the |hint|.
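+    // GetRelativeDistance() used below interprets order hint differences
+    // modulo the order hint range. For example, with 3 order hint bits, a
+    // reference with hint 7 is 2 frames before a current frame with hint 1,
+    // because the hints wrap around at 8.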
+    ReferenceInfo* const reference_info = current_frame_->reference_info();
+    reference_info->order_hint[kReferenceFrameIntra] =
+        frame_header_.order_hint;
+    reference_info->relative_distance_from[kReferenceFrameIntra] = 0;
+    reference_info->relative_distance_to[kReferenceFrameIntra] = 0;
+    reference_info->skip_references[kReferenceFrameIntra] = true;
+    reference_info->projection_divisions[kReferenceFrameIntra] = 0;
+
+    for (int i = kReferenceFrameLast; i <= kNumInterReferenceFrameTypes;
+         ++i) {
+      const auto reference_frame = static_cast<ReferenceFrameType>(i);
+      const uint8_t hint =
+          decoder_state_.reference_order_hint
+              [frame_header_.reference_frame_index[i - kReferenceFrameLast]];
+      reference_info->order_hint[reference_frame] = hint;
+      const int relative_distance_from =
+          GetRelativeDistance(hint, frame_header_.order_hint,
+                              sequence_header_.order_hint_shift_bits);
+      const int relative_distance_to =
+          GetRelativeDistance(frame_header_.order_hint, hint,
+                              sequence_header_.order_hint_shift_bits);
+      reference_info->relative_distance_from[reference_frame] =
+          relative_distance_from;
+      reference_info->relative_distance_to[reference_frame] =
+          relative_distance_to;
+      reference_info->skip_references[reference_frame] =
+          relative_distance_to > kMaxFrameDistance ||
+          relative_distance_to <= 0;
+      reference_info->projection_divisions[reference_frame] =
+          reference_info->skip_references[reference_frame]
+              ? 0
+              : kProjectionMvDivisionLookup[relative_distance_to];
+      decoder_state_.reference_frame_sign_bias[reference_frame] =
+          relative_distance_from > 0;
+    }
+  }
+  if (frame_header_.enable_cdf_update &&
+      !sequence_header_.reduced_still_picture_header) {
+    OBU_READ_BIT_OR_FAIL;
+    frame_header_.enable_frame_end_update_cdf = !static_cast<bool>(scratch);
+  } else {
+    frame_header_.enable_frame_end_update_cdf = false;
+  }
+  return true;
+}
+
+bool ObuParser::ParseFrameHeader() {
+  // Section 6.8.1: It is a requirement of bitstream conformance that a
+  // sequence header OBU has been received before a frame header OBU.
+  if (!has_sequence_header_) return false;
+  if (!ParseFrameParameters()) return false;
+  if (frame_header_.show_existing_frame) return true;
+  assert(!obu_headers_.empty());
+  current_frame_->set_spatial_id(obu_headers_.back().spatial_id);
+  current_frame_->set_temporal_id(obu_headers_.back().temporal_id);
+  bool status = ParseTileInfoSyntax() && ParseQuantizerParameters() &&
+                ParseSegmentationParameters();
+  if (!status) return false;
+  current_frame_->SetSegmentationParameters(frame_header_.segmentation);
+  status =
+      ParseQuantizerIndexDeltaParameters() && ParseLoopFilterDeltaParameters();
+  if (!status) return false;
+  ComputeSegmentLosslessAndQIndex();
+  // Section 6.8.2: It is a requirement of bitstream conformance that
+  // delta_q_present is equal to 0 when CodedLossless is equal to 1.
+  if (frame_header_.coded_lossless && frame_header_.delta_q.present) {
+    return false;
+  }
+  status = ParseLoopFilterParameters();
+  if (!status) return false;
+  current_frame_->SetLoopFilterDeltas(frame_header_.loop_filter);
+  status = ParseCdefParameters() && ParseLoopRestorationParameters() &&
+           ParseTxModeSyntax() && ParseFrameReferenceModeSyntax() &&
+           ParseSkipModeParameters() && ReadAllowWarpedMotion();
+  if (!status) return false;
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  frame_header_.reduced_tx_set = static_cast<bool>(scratch);
+  status = ParseGlobalMotionParameters();
+  if (!status) return false;
+  current_frame_->SetGlobalMotions(frame_header_.global_motion);
+  status = ParseFilmGrainParameters();
+  if (!status) return false;
+  if (sequence_header_.film_grain_params_present) {
+    current_frame_->set_film_grain_params(frame_header_.film_grain_params);
+  }
+  return true;
+}
+
+bool ObuParser::ParsePadding(const uint8_t* data, size_t size) {
+  // The spec allows a padding OBU to be header-only (i.e., |size| = 0). So
+  // check trailing bits only if |size| > 0.
+  if (size == 0) return true;
+  // The payload of a padding OBU is byte aligned. Therefore the first
+  // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
+  const int i = GetLastNonzeroByteIndex(data, size);
+  if (i < 0) {
+    LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+    return false;
+  }
+  if (data[i] != 0x80) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "The last nonzero byte of the payload data is 0x%x, should be 0x80.",
+        data[i]);
+    return false;
+  }
+  // Skip all bits before the trailing bit.
+  bit_reader_->SkipBytes(i);
+  return true;
+}
+
+bool ObuParser::ParseMetadataScalability() {
+  int64_t scratch;
+  // scalability_mode_idc
+  OBU_READ_LITERAL_OR_FAIL(8);
+  const auto scalability_mode_idc = static_cast<int>(scratch);
+  if (scalability_mode_idc == kScalabilitySS) {
+    // Parse scalability_structure().
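+    // Other scalability_mode_idc values (0..13) name preset layer
+    // structures (e.g. L1T2, L1T3, S2T1) and carry no further payload in
+    // this OBU.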
+    // spatial_layers_cnt_minus_1
+    OBU_READ_LITERAL_OR_FAIL(2);
+    const auto spatial_layers_count = static_cast<int>(scratch) + 1;
+    // spatial_layer_dimensions_present_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto spatial_layer_dimensions_present_flag =
+        static_cast<bool>(scratch);
+    // spatial_layer_description_present_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto spatial_layer_description_present_flag =
+        static_cast<bool>(scratch);
+    // temporal_group_description_present_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto temporal_group_description_present_flag =
+        static_cast<bool>(scratch);
+    // scalability_structure_reserved_3bits
+    OBU_READ_LITERAL_OR_FAIL(3);
+    if (scratch != 0) {
+      LIBGAV1_DLOG(WARNING,
+                   "scalability_structure_reserved_3bits is not zero.");
+    }
+    if (spatial_layer_dimensions_present_flag) {
+      for (int i = 0; i < spatial_layers_count; ++i) {
+        // spatial_layer_max_width[i]
+        OBU_READ_LITERAL_OR_FAIL(16);
+        // spatial_layer_max_height[i]
+        OBU_READ_LITERAL_OR_FAIL(16);
+      }
+    }
+    if (spatial_layer_description_present_flag) {
+      for (int i = 0; i < spatial_layers_count; ++i) {
+        // spatial_layer_ref_id[i]
+        OBU_READ_LITERAL_OR_FAIL(8);
+      }
+    }
+    if (temporal_group_description_present_flag) {
+      // temporal_group_size
+      OBU_READ_LITERAL_OR_FAIL(8);
+      const auto temporal_group_size = static_cast<int>(scratch);
+      for (int i = 0; i < temporal_group_size; ++i) {
+        // temporal_group_temporal_id[i]
+        OBU_READ_LITERAL_OR_FAIL(3);
+        // temporal_group_temporal_switching_up_point_flag[i]
+        OBU_READ_BIT_OR_FAIL;
+        // temporal_group_spatial_switching_up_point_flag[i]
+        OBU_READ_BIT_OR_FAIL;
+        // temporal_group_ref_cnt[i]
+        OBU_READ_LITERAL_OR_FAIL(3);
+        const auto temporal_group_ref_count = static_cast<int>(scratch);
+        for (int j = 0; j < temporal_group_ref_count; ++j) {
+          // temporal_group_ref_pic_diff[i][j]
+          OBU_READ_LITERAL_OR_FAIL(8);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool ObuParser::ParseMetadataTimecode() {
+  int64_t scratch;
+  // counting_type: should be the same for all pictures in the coded video
+  // sequence. 7..31 are reserved.
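+  // The timecode payload carries an hours/minutes/seconds/n_frames clock
+  // timestamp; e.g. 14:25:37 with n_frames 11 identifies frame 11 (0-based)
+  // within that second.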
+  OBU_READ_LITERAL_OR_FAIL(5);
+  // full_timestamp_flag
+  OBU_READ_BIT_OR_FAIL;
+  const auto full_timestamp_flag = static_cast<bool>(scratch);
+  // discontinuity_flag
+  OBU_READ_BIT_OR_FAIL;
+  // cnt_dropped_flag
+  OBU_READ_BIT_OR_FAIL;
+  // n_frames
+  OBU_READ_LITERAL_OR_FAIL(9);
+  if (full_timestamp_flag) {
+    // seconds_value
+    OBU_READ_LITERAL_OR_FAIL(6);
+    const auto seconds_value = static_cast<int>(scratch);
+    if (seconds_value > 59) {
+      LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+      return false;
+    }
+    // minutes_value
+    OBU_READ_LITERAL_OR_FAIL(6);
+    const auto minutes_value = static_cast<int>(scratch);
+    if (minutes_value > 59) {
+      LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+      return false;
+    }
+    // hours_value
+    OBU_READ_LITERAL_OR_FAIL(5);
+    const auto hours_value = static_cast<int>(scratch);
+    if (hours_value > 23) {
+      LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+      return false;
+    }
+  } else {
+    // seconds_flag
+    OBU_READ_BIT_OR_FAIL;
+    const auto seconds_flag = static_cast<bool>(scratch);
+    if (seconds_flag) {
+      // seconds_value
+      OBU_READ_LITERAL_OR_FAIL(6);
+      const auto seconds_value = static_cast<int>(scratch);
+      if (seconds_value > 59) {
+        LIBGAV1_DLOG(ERROR, "Invalid seconds_value %d.", seconds_value);
+        return false;
+      }
+      // minutes_flag
+      OBU_READ_BIT_OR_FAIL;
+      const auto minutes_flag = static_cast<bool>(scratch);
+      if (minutes_flag) {
+        // minutes_value
+        OBU_READ_LITERAL_OR_FAIL(6);
+        const auto minutes_value = static_cast<int>(scratch);
+        if (minutes_value > 59) {
+          LIBGAV1_DLOG(ERROR, "Invalid minutes_value %d.", minutes_value);
+          return false;
+        }
+        // hours_flag
+        OBU_READ_BIT_OR_FAIL;
+        const auto hours_flag = static_cast<bool>(scratch);
+        if (hours_flag) {
+          // hours_value
+          OBU_READ_LITERAL_OR_FAIL(5);
+          const auto hours_value = static_cast<int>(scratch);
+          if (hours_value > 23) {
+            LIBGAV1_DLOG(ERROR, "Invalid hours_value %d.", hours_value);
+            return false;
+          }
+        }
+      }
+    }
+  }
+  // time_offset_length: should be the same for all pictures in the coded
+  // video sequence.
+  OBU_READ_LITERAL_OR_FAIL(5);
+  const auto time_offset_length = static_cast<int>(scratch);
+  if (time_offset_length > 0) {
+    // time_offset_value
+    OBU_READ_LITERAL_OR_FAIL(time_offset_length);
+  }
+  // Compute clockTimestamp. Section 6.7.7:
+  //   When timing_info_present_flag is equal to 1 and discontinuity_flag is
+  //   equal to 0, the value of clockTimestamp shall be greater than or equal
+  //   to the value of clockTimestamp for the previous set of clock timestamp
+  //   syntax elements in output order.
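+  // libgav1 validates these fields but does not retain them, so
+  // clockTimestamp is not actually computed here.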
+  return true;
+}
+
+bool ObuParser::ParseMetadata(const uint8_t* data, size_t size) {
+  const size_t start_offset = bit_reader_->byte_offset();
+  size_t metadata_type;
+  if (!bit_reader_->ReadUnsignedLeb128(&metadata_type)) {
+    LIBGAV1_DLOG(ERROR, "Could not read metadata_type.");
+    return false;
+  }
+  const size_t metadata_type_size = bit_reader_->byte_offset() - start_offset;
+  if (size < metadata_type_size) {
+    LIBGAV1_DLOG(
+        ERROR, "metadata_type is longer than metadata OBU payload %zu vs %zu.",
+        metadata_type_size, size);
+    return false;
+  }
+  data += metadata_type_size;
+  size -= metadata_type_size;
+  int64_t scratch;
+  switch (metadata_type) {
+    case kMetadataTypeHdrContentLightLevel:
+      OBU_READ_LITERAL_OR_FAIL(16);
+      metadata_.max_cll = scratch;
+      OBU_READ_LITERAL_OR_FAIL(16);
+      metadata_.max_fall = scratch;
+      break;
+    case kMetadataTypeHdrMasteringDisplayColorVolume:
+      for (int i = 0; i < 3; ++i) {
+        OBU_READ_LITERAL_OR_FAIL(16);
+        metadata_.primary_chromaticity_x[i] = scratch;
+        OBU_READ_LITERAL_OR_FAIL(16);
+        metadata_.primary_chromaticity_y[i] = scratch;
+      }
+      OBU_READ_LITERAL_OR_FAIL(16);
+      metadata_.white_point_chromaticity_x = scratch;
+      OBU_READ_LITERAL_OR_FAIL(16);
+      metadata_.white_point_chromaticity_y = scratch;
+      OBU_READ_LITERAL_OR_FAIL(32);
+      metadata_.luminance_max = static_cast<uint32_t>(scratch);
+      OBU_READ_LITERAL_OR_FAIL(32);
+      metadata_.luminance_min = static_cast<uint32_t>(scratch);
+      break;
+    case kMetadataTypeScalability:
+      if (!ParseMetadataScalability()) return false;
+      break;
+    case kMetadataTypeItutT35: {
+      OBU_READ_LITERAL_OR_FAIL(8);
+      metadata_.itu_t_t35_country_code = static_cast<uint8_t>(scratch);
+      ++data;
+      --size;
+      if (metadata_.itu_t_t35_country_code == 0xFF) {
+        OBU_READ_LITERAL_OR_FAIL(8);
+        metadata_.itu_t_t35_country_code_extension_byte =
+            static_cast<uint8_t>(scratch);
+        ++data;
+        --size;
+      }
+      // Read itu_t_t35_payload_bytes. Section 6.7.2 of the spec says:
+      //   itu_t_t35_payload_bytes shall be bytes containing data registered
+      //   as specified in Recommendation ITU-T T.35.
+      // Therefore itu_t_t35_payload_bytes is byte aligned and the first
+      // trailing byte should be 0x80. Since the exact syntax of
+      // itu_t_t35_payload_bytes is not defined in the AV1 spec, identify the
+      // end of itu_t_t35_payload_bytes by searching for the trailing bit.
+      const int i = GetLastNonzeroByteIndex(data, size);
+      if (i < 0) {
+        LIBGAV1_DLOG(ERROR, "Trailing bit is missing.");
+        return false;
+      }
+      if (data[i] != 0x80) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "itu_t_t35_payload_bytes is not byte aligned. The last nonzero "
+            "byte of the payload data is 0x%x, should be 0x80.",
+            data[i]);
+        return false;
+      }
+      if (i != 0) {
+        // data[0]..data[i - 1] are itu_t_t35_payload_bytes.
+        metadata_.itu_t_t35_payload_bytes.reset(new (std::nothrow) uint8_t[i]);
+        if (metadata_.itu_t_t35_payload_bytes == nullptr) {
+          LIBGAV1_DLOG(ERROR, "Allocation of itu_t_t35_payload_bytes failed.");
+          return false;
+        }
+        memcpy(metadata_.itu_t_t35_payload_bytes.get(), data, i);
+        metadata_.itu_t_t35_payload_size = i;
+      }
+      // Skip all bits before the trailing bit.
+      bit_reader_->SkipBytes(i);
+      break;
+    }
+    case kMetadataTypeTimecode:
+      if (!ParseMetadataTimecode()) return false;
+      break;
+    default: {
+      // metadata_type is equal to a value reserved for future use or a user
+      // private value.
+      //
+      // The Note in Section 5.8.1 says "Decoders should ignore the entire
+      // OBU if they do not understand the metadata_type." Find the trailing
+      // bit and skip all bits before the trailing bit.
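+      // For example, if the last nonzero byte is 0x60 (0110 0000), the
+      // trailing bit is the second 1 and the remaining low bits are
+      // alignment zeros.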
+      const int i = GetLastNonzeroByteIndex(data, size);
+      if (i >= 0) {
+        // The last 1 bit in the last nonzero byte is the trailing bit. Skip
+        // all bits before the trailing bit.
+        const int n = CountTrailingZeros(data[i]);
+        bit_reader_->SkipBits(i * 8 + 7 - n);
+      }
+      break;
+    }
+  }
+  return true;
+}
+
+bool ObuParser::AddTileBuffers(int start, int end, size_t total_size,
+                               size_t tg_header_size,
+                               size_t bytes_consumed_so_far) {
+  // Validate that the tile group start and end are within the allowed range.
+  if (start != next_tile_group_start_ || start > end ||
+      end >= frame_header_.tile_info.tile_count) {
+    LIBGAV1_DLOG(ERROR,
+                 "Invalid tile group start %d or end %d: expected tile group "
+                 "start %d, tile_count %d.",
+                 start, end, next_tile_group_start_,
+                 frame_header_.tile_info.tile_count);
+    return false;
+  }
+  next_tile_group_start_ = end + 1;
+
+  if (total_size < tg_header_size) {
+    LIBGAV1_DLOG(ERROR, "total_size (%zu) is less than tg_header_size (%zu).",
+                 total_size, tg_header_size);
+    return false;
+  }
+  size_t bytes_left = total_size - tg_header_size;
+  const uint8_t* data = data_ + bytes_consumed_so_far + tg_header_size;
+  for (int tile_number = start; tile_number <= end; ++tile_number) {
+    size_t tile_size = 0;
+    if (tile_number != end) {
+      RawBitReader bit_reader(data, bytes_left);
+      if (!bit_reader.ReadLittleEndian(frame_header_.tile_info.tile_size_bytes,
+                                       &tile_size)) {
+        LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
+                     tile_number);
+        return false;
+      }
+      ++tile_size;
+      data += frame_header_.tile_info.tile_size_bytes;
+      bytes_left -= frame_header_.tile_info.tile_size_bytes;
+      if (tile_size > bytes_left) {
+        LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+                     tile_number);
+        return false;
+      }
+    } else {
+      tile_size = bytes_left;
+      if (tile_size == 0) {
+        LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
+                     tile_number);
+        return false;
+      }
+    }
+    // The memory for this has been allocated in ParseTileInfoSyntax(). So it
+    // is safe to use push_back_unchecked here.
+    tile_buffers_.push_back_unchecked({data, tile_size});
+    data += tile_size;
+    bytes_left -= tile_size;
+  }
+  bit_reader_->SkipBytes(total_size - tg_header_size);
+  return true;
+}
+
+bool ObuParser::ParseTileGroup(size_t size, size_t bytes_consumed_so_far) {
+  const TileInfo* const tile_info = &frame_header_.tile_info;
+  const size_t start_offset = bit_reader_->byte_offset();
+  const int tile_bits =
+      tile_info->tile_columns_log2 + tile_info->tile_rows_log2;
+  if (tile_bits == 0) {
+    return AddTileBuffers(0, 0, size, 0, bytes_consumed_so_far);
+  }
+  int64_t scratch;
+  OBU_READ_BIT_OR_FAIL;
+  const auto tile_start_and_end_present_flag = static_cast<bool>(scratch);
+  if (!tile_start_and_end_present_flag) {
+    if (!bit_reader_->AlignToNextByte()) {
+      LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+      return false;
+    }
+    return AddTileBuffers(0, tile_info->tile_count - 1, size, 1,
+                          bytes_consumed_so_far);
+  }
+  if (obu_headers_.back().type == kObuFrame) {
+    // 6.10.1: If obu_type is equal to OBU_FRAME, it is a requirement of
+    // bitstream conformance that the value of
+    // tile_start_and_end_present_flag is equal to 0.
+    LIBGAV1_DLOG(ERROR,
+                 "tile_start_and_end_present_flag must be 0 in Frame OBU");
+    return false;
+  }
+  OBU_READ_LITERAL_OR_FAIL(tile_bits);
+  const int start = static_cast<int>(scratch);
+  OBU_READ_LITERAL_OR_FAIL(tile_bits);
+  const int end = static_cast<int>(scratch);
+  if (!bit_reader_->AlignToNextByte()) {
+    LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+    return false;
+  }
+  const size_t tg_header_size = bit_reader_->byte_offset() - start_offset;
+  return AddTileBuffers(start, end, size, tg_header_size,
+                        bytes_consumed_so_far);
+}
+
+bool ObuParser::ParseHeader() {
+  ObuHeader obu_header;
+  int64_t scratch = bit_reader_->ReadBit();
+  if (scratch != 0) {
+    LIBGAV1_DLOG(ERROR, "forbidden_bit is not zero.");
+    return false;
+  }
+  OBU_READ_LITERAL_OR_FAIL(4);
+  obu_header.type = static_cast<ObuType>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  const auto extension_flag = static_cast<bool>(scratch);
+  OBU_READ_BIT_OR_FAIL;
+  obu_header.has_size_field = static_cast<bool>(scratch);
+  OBU_READ_BIT_OR_FAIL;  // reserved.
+  if (scratch != 0) {
+    LIBGAV1_DLOG(WARNING, "obu_reserved_1bit is not zero.");
+  }
+  obu_header.has_extension = extension_flag;
+  if (extension_flag) {
+    if (extension_disallowed_) {
+      LIBGAV1_DLOG(ERROR,
+                   "OperatingPointIdc is 0, but obu_extension_flag is 1.");
+      return false;
+    }
+    OBU_READ_LITERAL_OR_FAIL(3);
+    obu_header.temporal_id = scratch;
+    OBU_READ_LITERAL_OR_FAIL(2);
+    obu_header.spatial_id = scratch;
+    OBU_READ_LITERAL_OR_FAIL(3);  // reserved.
+    if (scratch != 0) {
+      LIBGAV1_DLOG(WARNING, "extension_header_reserved_3bits is not zero.");
+    }
+  } else {
+    obu_header.temporal_id = 0;
+    obu_header.spatial_id = 0;
+  }
+  return obu_headers_.push_back(obu_header);
+}
+
+#undef OBU_READ_UVLC_OR_FAIL
+#undef OBU_READ_LITERAL_OR_FAIL
+#undef OBU_READ_BIT_OR_FAIL
+#undef OBU_PARSER_FAIL
+#undef OBU_LOG_AND_RETURN_FALSE
+
+bool ObuParser::InitBitReader(const uint8_t* const data, size_t size) {
+  bit_reader_.reset(new (std::nothrow) RawBitReader(data, size));
+  return bit_reader_ != nullptr;
+}
+
+bool ObuParser::HasData() const { return size_ > 0; }
+
+StatusCode ObuParser::ParseOneFrame(RefCountedBufferPtr* const current_frame) {
+  if (data_ == nullptr || size_ == 0) return kStatusInvalidArgument;
+
+  assert(current_frame_ == nullptr);
+  // This is used to release any references held in case of parsing failure.
+  RefCountedBufferPtrCleanup current_frame_cleanup(&current_frame_);
+
+  const uint8_t* data = data_;
+  size_t size = size_;
+
+  // Clear everything except the sequence header.
+  obu_headers_.clear();
+  frame_header_ = {};
+  metadata_ = {};
+  tile_buffers_.clear();
+  next_tile_group_start_ = 0;
+
+  bool parsed_one_full_frame = false;
+  bool seen_frame_header = false;
+  const uint8_t* frame_header = nullptr;
+  size_t frame_header_size_in_bits = 0;
+  while (size > 0 && !parsed_one_full_frame) {
+    if (!InitBitReader(data, size)) {
+      LIBGAV1_DLOG(ERROR, "Failed to initialize bit reader.");
+      return kStatusOutOfMemory;
+    }
+    if (!ParseHeader()) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU Header.");
+      return kStatusBitstreamError;
+    }
+    const ObuHeader& obu_header = obu_headers_.back();
+    if (!obu_header.has_size_field) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "has_size_field is zero. libgav1 does not support such streams.");
+      return kStatusUnimplemented;
+    }
+    const size_t obu_header_size = bit_reader_->byte_offset();
+    size_t obu_size;
+    if (!bit_reader_->ReadUnsignedLeb128(&obu_size)) {
+      LIBGAV1_DLOG(ERROR, "Could not read OBU size.");
+      return kStatusBitstreamError;
+    }
+    const size_t obu_length_size =
+        bit_reader_->byte_offset() - obu_header_size;
+    if (size - bit_reader_->byte_offset() < obu_size) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits left to parse OBU %zu vs %zu.",
+                   size - bit_reader_->bit_offset(), obu_size);
+      return kStatusBitstreamError;
+    }
+
+    const ObuType obu_type = obu_header.type;
+    if (obu_type != kObuSequenceHeader && obu_type != kObuTemporalDelimiter &&
+        has_sequence_header_ &&
+        sequence_header_.operating_point_idc[operating_point_] != 0 &&
+        obu_header.has_extension &&
+        (!InTemporalLayer(
+             sequence_header_.operating_point_idc[operating_point_],
+             obu_header.temporal_id) ||
+         !InSpatialLayer(
+             sequence_header_.operating_point_idc[operating_point_],
+             obu_header.spatial_id))) {
+      obu_headers_.pop_back();
+      bit_reader_->SkipBytes(obu_size);
+      data += bit_reader_->byte_offset();
+      size -= bit_reader_->byte_offset();
+      continue;
+    }
+
+    const size_t obu_start_position = bit_reader_->bit_offset();
+    // The bit_reader_ is byte aligned after reading obu_header and obu_size.
+    // Therefore the byte offset can be computed as obu_start_position >> 3
+    // below.
+    assert((obu_start_position & 7) == 0);
+    bool obu_skipped = false;
+    switch (obu_type) {
+      case kObuTemporalDelimiter:
+        break;
+      case kObuSequenceHeader:
+        if (!ParseSequenceHeader(seen_frame_header)) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse SequenceHeader OBU.");
+          return kStatusBitstreamError;
+        }
+        if (sequence_header_.color_config.bitdepth > LIBGAV1_MAX_BITDEPTH) {
+          LIBGAV1_DLOG(
+              ERROR,
+              "Bitdepth %d is not supported. The maximum bitdepth is %d.",
+              sequence_header_.color_config.bitdepth, LIBGAV1_MAX_BITDEPTH);
+          return kStatusUnimplemented;
+        }
+        break;
+      case kObuFrameHeader:
+        if (seen_frame_header) {
+          LIBGAV1_DLOG(ERROR,
+                       "Frame header found but frame header was already "
+                       "seen.");
+          return kStatusBitstreamError;
+        }
+        if (!ParseFrameHeader()) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader OBU.");
+          return kStatusBitstreamError;
+        }
+        frame_header = &data[obu_start_position >> 3];
+        frame_header_size_in_bits =
+            bit_reader_->bit_offset() - obu_start_position;
+        seen_frame_header = true;
+        parsed_one_full_frame = frame_header_.show_existing_frame;
+        break;
+      case kObuRedundantFrameHeader: {
+        if (!seen_frame_header) {
+          LIBGAV1_DLOG(ERROR,
+                       "Redundant frame header found but frame header was "
+                       "not yet seen.");
+          return kStatusBitstreamError;
+        }
+        const size_t fh_size = (frame_header_size_in_bits + 7) >> 3;
+        if (obu_size < fh_size ||
+            memcmp(frame_header, &data[obu_start_position >> 3], fh_size) !=
+                0) {
+          LIBGAV1_DLOG(ERROR,
+                       "Redundant frame header differs from frame header.");
+          return kStatusBitstreamError;
+        }
+        bit_reader_->SkipBits(frame_header_size_in_bits);
+        break;
+      }
+      case kObuFrame: {
+        const size_t fh_start_offset = bit_reader_->byte_offset();
+        if (seen_frame_header) {
+          LIBGAV1_DLOG(ERROR,
+                       "Frame header found but frame header was already "
+                       "seen.");
+          return kStatusBitstreamError;
+        }
+        if (!ParseFrameHeader()) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse FrameHeader in Frame OBU.");
+          return kStatusBitstreamError;
+        }
+        // Section 6.8.2: If obu_type is equal to OBU_FRAME, it is a
+        // requirement of bitstream conformance that show_existing_frame is
+        // equal to 0.
+        if (frame_header_.show_existing_frame) {
+          LIBGAV1_DLOG(ERROR,
+                       "Frame OBU cannot set show_existing_frame to 1.");
+          return kStatusBitstreamError;
+        }
+        if (!bit_reader_->AlignToNextByte()) {
+          LIBGAV1_DLOG(ERROR, "Byte alignment has non zero bits.");
+          return kStatusBitstreamError;
+        }
+        const size_t fh_size = bit_reader_->byte_offset() - fh_start_offset;
+        if (fh_size >= obu_size) {
+          LIBGAV1_DLOG(ERROR, "Frame header size (%zu) >= obu_size (%zu).",
+                       fh_size, obu_size);
+          return kStatusBitstreamError;
+        }
+        if (!ParseTileGroup(obu_size - fh_size,
+                            size_ - size + bit_reader_->byte_offset())) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup in Frame OBU.");
+          return kStatusBitstreamError;
+        }
+        parsed_one_full_frame = true;
+        break;
+      }
+      case kObuTileGroup:
+        if (!ParseTileGroup(obu_size,
+                            size_ - size + bit_reader_->byte_offset())) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse TileGroup OBU.");
+          return kStatusBitstreamError;
+        }
+        parsed_one_full_frame =
+            (next_tile_group_start_ == frame_header_.tile_info.tile_count);
+        break;
+      case kObuTileList:
+        LIBGAV1_DLOG(ERROR, "Decoding of tile list OBUs is not supported.");
+        return kStatusUnimplemented;
+      case kObuPadding:
+        if (!ParsePadding(&data[obu_start_position >> 3], obu_size)) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse Padding OBU.");
+          return kStatusBitstreamError;
+        }
+        break;
+      case kObuMetadata:
+        if (!ParseMetadata(&data[obu_start_position >> 3], obu_size)) {
+          LIBGAV1_DLOG(ERROR, "Failed to parse Metadata OBU.");
+          return kStatusBitstreamError;
+        }
+        break;
+      default:
+        // Skip reserved OBUs. Section 6.2.2: Reserved units are for future
+        // use and shall be ignored by AV1 decoder.
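+        // In the current spec, obu_type values 0 and 9..14 are reserved;
+        // they are consumed here without being interpreted.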
+        bit_reader_->SkipBytes(obu_size);
+        obu_skipped = true;
+        break;
+    }
+    if (obu_size > 0 && !obu_skipped && obu_type != kObuFrame &&
+        obu_type != kObuTileGroup) {
+      const size_t parsed_obu_size_in_bits =
+          bit_reader_->bit_offset() - obu_start_position;
+      if (obu_size * 8 < parsed_obu_size_in_bits) {
+        LIBGAV1_DLOG(
+            ERROR,
+            "Parsed OBU size (%zu bits) is greater than expected OBU size "
+            "(%zu bytes) obu_type: %d.",
+            parsed_obu_size_in_bits, obu_size, obu_type);
+        return kStatusBitstreamError;
+      }
+      if (!bit_reader_->VerifyAndSkipTrailingBits(obu_size * 8 -
+                                                  parsed_obu_size_in_bits)) {
+        LIBGAV1_DLOG(ERROR,
+                     "Error when verifying trailing bits for obu type: %d",
+                     obu_type);
+        return kStatusBitstreamError;
+      }
+    }
+    const size_t bytes_consumed = bit_reader_->byte_offset();
+    const size_t consumed_obu_size =
+        bytes_consumed - obu_length_size - obu_header_size;
+    if (consumed_obu_size != obu_size) {
+      LIBGAV1_DLOG(ERROR,
+                   "OBU size (%zu) and consumed size (%zu) do not match for "
+                   "obu_type: %d.",
+                   obu_size, consumed_obu_size, obu_type);
+      return kStatusBitstreamError;
+    }
+    data += bytes_consumed;
+    size -= bytes_consumed;
+  }
+  if (!parsed_one_full_frame && seen_frame_header) {
+    LIBGAV1_DLOG(ERROR, "The last tile group in the frame was not received.");
+    return kStatusBitstreamError;
+  }
+  data_ = data;
+  size_ = size;
+  *current_frame = std::move(current_frame_);
+  return kStatusOk;
+}
+
+}  // namespace libgav1
diff --git a/src/obu_parser.h b/src/obu_parser.h
new file mode 100644
index 0000000..86d165f
--- /dev/null
+++ b/src/obu_parser.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_OBU_PARSER_H_
+#define LIBGAV1_SRC_OBU_PARSER_H_
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/status_code.h"
+#include "src/quantizer.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/raw_bit_reader.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+
+// Structs and enums related to Open Bitstream Units (OBU).
+
+enum {
+  kMinimumMajorBitstreamLevel = 2,
+  kSelectScreenContentTools = 2,
+  kSelectIntegerMv = 2,
+  kLoopRestorationTileSizeMax = 256,
+  kGlobalMotionAlphaBits = 12,
+  kGlobalMotionTranslationBits = 12,
+  kGlobalMotionTranslationOnlyBits = 9,
+  kGlobalMotionAlphaPrecisionBits = 15,
+  kGlobalMotionTranslationPrecisionBits = 6,
+  kGlobalMotionTranslationOnlyPrecisionBits = 3,
+  kMaxTileWidth = 4096,
+  kMaxTileArea = 4096 * 2304,
+  kPrimaryReferenceNone = 7,
+  // A special value of the scalability_mode_idc syntax element that indicates
+  // the picture prediction structure is specified in scalability_structure().
+  kScalabilitySS = 14
+};  // anonymous enum
+
+struct ObuHeader {
+  ObuType type;
+  bool has_extension;
+  bool has_size_field;
+  int8_t temporal_id;
+  int8_t spatial_id;
+};
+
+enum BitstreamProfile : uint8_t {
+  kProfile0,
+  kProfile1,
+  kProfile2,
+  kMaxProfiles
+};
+
+// In the bitstream the level is encoded in five bits: the first three bits
+// encode |major| - 2 and the last two bits encode |minor|.
+//
+// If the mapped level (major.minor) is in the tables in Annex A.3, there are
+// bitstream conformance requirements on the maximum or minimum values of
+// several variables. The encoded value of 31 (which corresponds to the mapped
+// level 9.3) is the "maximum parameters" level and imposes no level-based
+// constraints on the bitstream.
+struct BitStreamLevel {
+  uint8_t major;  // Range: 2-9.
+  uint8_t minor;  // Range: 0-3.
+};
+
+struct ColorConfig {
+  int8_t bitdepth;
+  bool is_monochrome;
+  ColorPrimary color_primary;
+  TransferCharacteristics transfer_characteristics;
+  MatrixCoefficients matrix_coefficients;
+  // A binary value (0 or 1) that is associated with the VideoFullRangeFlag
+  // variable specified in ISO/IEC 23091-4/ITU-T H.273.
+  // * 0: the studio swing representation.
+  // * 1: the full swing representation.
+  ColorRange color_range;
+  int8_t subsampling_x;
+  int8_t subsampling_y;
+  ChromaSamplePosition chroma_sample_position;
+  bool separate_uv_delta_q;
+};
+
+struct TimingInfo {
+  uint32_t num_units_in_tick;
+  uint32_t time_scale;
+  bool equal_picture_interval;
+  uint32_t num_ticks_per_picture;
+};
+
+struct DecoderModelInfo {
+  uint8_t encoder_decoder_buffer_delay_length;
+  uint32_t num_units_in_decoding_tick;
+  uint8_t buffer_removal_time_length;
+  uint8_t frame_presentation_time_length;
+};
+
+struct OperatingParameters {
+  uint32_t decoder_buffer_delay[kMaxOperatingPoints];
+  uint32_t encoder_buffer_delay[kMaxOperatingPoints];
+  bool low_delay_mode_flag[kMaxOperatingPoints];
+};
+
+struct ObuSequenceHeader {
+  // Section 7.5:
+  //   Within a particular coded video sequence, the contents of
+  //   sequence_header_obu must be bit-identical each time the sequence header
+  //   appears except for the contents of operating_parameters_info. A new
+  //   coded video sequence is required if the sequence header parameters
+  //   change.
+  //
+  // IMPORTANT: ParametersChanged() is implemented with a memcmp() call. For
+  // this to work, this object and the |old| object must be initialized with
+  // an empty brace-enclosed list, which initializes any padding to zero bits.
+  // See https://en.cppreference.com/w/cpp/language/zero_initialization.
+  bool ParametersChanged(const ObuSequenceHeader& old) const;
+
+  BitstreamProfile profile;
+  bool still_picture;
+  bool reduced_still_picture_header;
+  int operating_points;
+  int operating_point_idc[kMaxOperatingPoints];
+  BitStreamLevel level[kMaxOperatingPoints];
+  int8_t tier[kMaxOperatingPoints];
+  int8_t frame_width_bits;
+  int8_t frame_height_bits;
+  int32_t max_frame_width;
+  int32_t max_frame_height;
+  bool frame_id_numbers_present;
+  int8_t frame_id_length_bits;
+  int8_t delta_frame_id_length_bits;
+  bool use_128x128_superblock;
+  bool enable_filter_intra;
+  bool enable_intra_edge_filter;
+  bool enable_interintra_compound;
+  bool enable_masked_compound;
+  bool enable_warped_motion;
+  bool enable_dual_filter;
+  bool enable_order_hint;
+  // If enable_order_hint is true, order_hint_bits is in the range [1, 8].
+  // If enable_order_hint is false, order_hint_bits is 0.
+  int8_t order_hint_bits;
+  // order_hint_shift_bits equals (32 - order_hint_bits) % 32.
+  // This is used frequently in GetRelativeDistance().
+  uint8_t order_hint_shift_bits;
+  bool enable_jnt_comp;
+  bool enable_ref_frame_mvs;
+  bool choose_screen_content_tools;
+  int8_t force_screen_content_tools;
+  bool choose_integer_mv;
+  int8_t force_integer_mv;
+  bool enable_superres;
+  bool enable_cdef;
+  bool enable_restoration;
+  ColorConfig color_config;
+  bool timing_info_present_flag;
+  TimingInfo timing_info;
+  bool decoder_model_info_present_flag;
+  DecoderModelInfo decoder_model_info;
+  bool decoder_model_present_for_operating_point[kMaxOperatingPoints];
+  bool initial_display_delay_present_flag;
+  uint8_t initial_display_delay[kMaxOperatingPoints];
+  bool film_grain_params_present;
+
+  // IMPORTANT: the operating_parameters member must be at the end of the
+  // struct so that ParametersChanged() can be implemented with a memcmp()
+  // call.
+  OperatingParameters operating_parameters;
+};
+// Verify it is safe to use offsetof with ObuSequenceHeader and to use memcmp
+// to compare two ObuSequenceHeader objects.
+static_assert(std::is_standard_layout<ObuSequenceHeader>::value, "");
+// Verify operating_parameters is the last member of ObuSequenceHeader. The
+// second assertion assumes that ObuSequenceHeader has no padding after the
+// operating_parameters field. The first assertion is a sufficient condition
+// for ObuSequenceHeader to have no padding after the operating_parameters
+// field.
+static_assert(alignof(ObuSequenceHeader) == alignof(OperatingParameters), "");
+static_assert(sizeof(ObuSequenceHeader) ==
+                  offsetof(ObuSequenceHeader, operating_parameters) +
+                      sizeof(OperatingParameters),
+              "");
+
+struct TileBuffer {
+  const uint8_t* data;
+  size_t size;
+};
+
+enum MetadataType : uint8_t {
+  // 0 is reserved for AOM use.
+  kMetadataTypeHdrContentLightLevel = 1,
+  kMetadataTypeHdrMasteringDisplayColorVolume = 2,
+  kMetadataTypeScalability = 3,
+  kMetadataTypeItutT35 = 4,
+  kMetadataTypeTimecode = 5,
+  // 6-31 are unregistered user private.
+  // 32 and greater are reserved for AOM use.
+};
+
+struct ObuMetadata {
+  // Maximum content light level.
+  uint16_t max_cll;
+  // Maximum frame-average light level.
+  uint16_t max_fall;
+  uint16_t primary_chromaticity_x[3];
+  uint16_t primary_chromaticity_y[3];
+  uint16_t white_point_chromaticity_x;
+  uint16_t white_point_chromaticity_y;
+  uint32_t luminance_max;
+  uint32_t luminance_min;
+  // ITU-T T.35.
+  uint8_t itu_t_t35_country_code;
+  uint8_t itu_t_t35_country_code_extension_byte;  // Valid if
+                                                  // itu_t_t35_country_code is
+                                                  // 0xFF.
+  std::unique_ptr<uint8_t[]> itu_t_t35_payload_bytes;
+  size_t itu_t_t35_payload_size;
+};
+
+class ObuParser : public Allocable {
+ public:
+  ObuParser(const uint8_t* const data, size_t size, int operating_point,
+            BufferPool* const buffer_pool, DecoderState* const decoder_state)
+      : data_(data),
+        size_(size),
+        operating_point_(operating_point),
+        buffer_pool_(buffer_pool),
+        decoder_state_(*decoder_state) {}
+
+  // Not copyable or movable.
+  ObuParser(const ObuParser& rhs) = delete;
+  ObuParser& operator=(const ObuParser& rhs) = delete;
+
+  // Returns true if there is more data that needs to be parsed.
+  bool HasData() const;
+
+  // Parses a sequence of Open Bitstream Units until a decodable frame is
+  // found (or until the end of stream is reached). A decodable frame is
+  // considered to be found when one of the following happens:
+  //   * A kObuFrame is seen.
+  //   * The kObuTileGroup containing the last tile is seen.
+  //   * A kObuFrameHeader with show_existing_frame = true is seen.
+  //
+  // If the parsing is successful, relevant fields will be populated. The
+  // fields are valid only if the return value is kStatusOk. Returns kStatusOk
+  // on success, an error status otherwise. On success, |current_frame| will
+  // be populated with a valid frame buffer.
+  StatusCode ParseOneFrame(RefCountedBufferPtr* current_frame);
+
+  // Getters. Only valid if ParseOneFrame() completes successfully.
+  const Vector<ObuHeader>& obu_headers() const { return obu_headers_; }
+  const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+  const ObuFrameHeader& frame_header() const { return frame_header_; }
+  const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
+  const ObuMetadata& metadata() const { return metadata_; }
+
+  // Setters.
+  void set_sequence_header(const ObuSequenceHeader& sequence_header) {
+    sequence_header_ = sequence_header;
+    has_sequence_header_ = true;
+  }
+
+  // Moves |tile_buffers_| into |tile_buffers|.
+  void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
+    *tile_buffers = std::move(tile_buffers_);
+  }
+
+ private:
+  // Initializes the bit reader. This is a function of its own to make unit
+  // testing of private functions simpler.
+  LIBGAV1_MUST_USE_RESULT bool InitBitReader(const uint8_t* data, size_t size);
+
+  // Parse helper functions.
+  bool ParseHeader();  // 5.3.2 and 5.3.3.
+  bool ParseColorConfig(ObuSequenceHeader* sequence_header);       // 5.5.2.
+  bool ParseTimingInfo(ObuSequenceHeader* sequence_header);        // 5.5.3.
+  bool ParseDecoderModelInfo(ObuSequenceHeader* sequence_header);  // 5.5.4.
+  bool ParseOperatingParameters(ObuSequenceHeader* sequence_header,
+                                int index);  // 5.5.5.
+  bool ParseSequenceHeader(bool seen_frame_header);  // 5.5.1.
+  bool ParseFrameParameters();         // 5.9.2, 5.9.7 and 5.9.10.
+  void MarkInvalidReferenceFrames();   // 5.9.4.
+  bool ParseFrameSizeAndRenderSize();  // 5.9.5 and 5.9.6.
+  bool ParseSuperResParametersAndComputeImageSize();  // 5.9.8 and 5.9.9.
+  // Checks the bitstream conformance requirement in Section 6.8.6.
+  bool ValidateInterFrameSize() const;
+  bool ParseReferenceOrderHint();
+  static int FindLatestBackwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindEarliestBackwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindLatestForwardReference(
+      const int current_frame_hint,
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints,
+      const std::array<bool, kNumReferenceFrameTypes>& used_frame);
+  static int FindReferenceWithSmallestOutputOrder(
+      const std::array<int, kNumReferenceFrameTypes>& shifted_order_hints);
+  bool SetFrameReferences(int8_t last_frame_idx,
+                          int8_t gold_frame_idx);  // 7.8.
+  bool ParseLoopFilterParameters();           // 5.9.11.
+  bool ParseDeltaQuantizer(int8_t* delta);    // 5.9.13.
+  bool ParseQuantizerParameters();            // 5.9.12.
+  bool ParseSegmentationParameters();         // 5.9.14.
+  bool ParseQuantizerIndexDeltaParameters();  // 5.9.17.
+  bool ParseLoopFilterDeltaParameters();      // 5.9.18.
+  void ComputeSegmentLosslessAndQIndex();
+  bool ParseCdefParameters();             // 5.9.19.
+  bool ParseLoopRestorationParameters();  // 5.9.20.
+  bool ParseTxModeSyntax();               // 5.9.21.
+  bool ParseFrameReferenceModeSyntax();   // 5.9.23.
+  // Returns whether skip mode is allowed. When it returns true, it also sets
+  // the frame_header_.skip_mode_frame array.
+  bool IsSkipModeAllowed();
+  bool ParseSkipModeParameters();  // 5.9.22.
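+
+  // Editor's note: an illustrative sketch (not part of the upstream API
+  // surface) of how a caller might drive this parser. Only the ObuParser
+  // members used below come from this header; |buffer_pool| and
+  // |decoder_state| are hypothetical objects set up elsewhere.
+  //
+  //   ObuParser parser(data, size, /*operating_point=*/0, &buffer_pool,
+  //                    &decoder_state);
+  //   while (parser.HasData()) {
+  //     RefCountedBufferPtr current_frame;
+  //     const StatusCode status = parser.ParseOneFrame(&current_frame);
+  //     if (status != kStatusOk) break;  // Bitstream/unimplemented error.
+  //     // parser.frame_header() and parser.tile_buffers() are now valid.
+  //   }
+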
+  bool ReadAllowWarpedMotion();
+  bool ParseGlobalParamSyntax(
+      int ref, int index,
+      const std::array<GlobalMotion, kNumReferenceFrameTypes>&
+          prev_global_motions);        // 5.9.25.
+  bool ParseGlobalMotionParameters();  // 5.9.24.
+  bool ParseFilmGrainParameters();     // 5.9.30.
+  bool ParseTileInfoSyntax();          // 5.9.15.
+  bool ParseFrameHeader();             // 5.9.
+  // |data| and |size| specify the payload data of the padding OBU.
+  // NOTE: Although the payload data is available in the bit_reader_ member,
+  // it is also passed to ParsePadding() as function parameters so that
+  // ParsePadding() can find the trailing bit of the OBU and skip over the
+  // payload data as an opaque chunk of data.
+  bool ParsePadding(const uint8_t* data, size_t size);  // 5.7.
+  bool ParseMetadataScalability();  // 5.8.5 and 5.8.6.
+  bool ParseMetadataTimecode();     // 5.8.7.
+  // |data| and |size| specify the payload data of the metadata OBU.
+  // NOTE: Although the payload data is available in the bit_reader_ member,
+  // it is also passed to ParseMetadata() as function parameters so that
+  // ParseMetadata() can find the trailing bit of the OBU and either extract
+  // or skip over the payload data as an opaque chunk of data.
+  bool ParseMetadata(const uint8_t* data, size_t size);  // 5.8.
+  // Adds and populates the TileBuffer for each tile in the tile group and
+  // updates |next_tile_group_start_|.
+  bool AddTileBuffers(int start, int end, size_t total_size,
+                      size_t tg_header_size, size_t bytes_consumed_so_far);
+  bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far);  // 5.11.1.
+
+  // Parser elements.
+  std::unique_ptr<RawBitReader> bit_reader_;
+  const uint8_t* data_;
+  size_t size_;
+  const int operating_point_;
+
+  // OBU elements. Only valid if ParseOneFrame() completes successfully.
+  Vector<ObuHeader> obu_headers_;
+  ObuSequenceHeader sequence_header_ = {};
+  ObuFrameHeader frame_header_ = {};
+  Vector<TileBuffer> tile_buffers_;
+  ObuMetadata metadata_ = {};
+  // The expected starting tile number of the next Tile Group.
+  int next_tile_group_start_ = 0;
+  // If true, the sequence_header_ field is valid.
+  bool has_sequence_header_ = false;
+  // If true, the obu_extension_flag syntax element in the OBU header must be
+  // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
+  bool extension_disallowed_ = false;
+
+  BufferPool* const buffer_pool_;
+  DecoderState& decoder_state_;
+  // Used by ParseOneFrame() to populate the current frame that is being
+  // decoded. The invariant maintained is that this variable will be nullptr
+  // at the beginning and at the end of each call to ParseOneFrame(). This
+  // ensures that the ObuParser is not holding on to any references to the
+  // current frame once the ParseOneFrame() call is complete.
+  RefCountedBufferPtr current_frame_;
+
+  // For unit testing private functions.
+  friend class ObuParserTest;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_OBU_PARSER_H_
diff --git a/src/post_filter.h b/src/post_filter.h
new file mode 100644
index 0000000..800d51d
--- /dev/null
+++ b/src/post_filter.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_POST_FILTER_H_
+#define LIBGAV1_SRC_POST_FILTER_H_
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include "src/dsp/common.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/threadpool.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// This class applies in-loop filtering for each frame after it is
+// reconstructed. The in-loop filtering contains all post processing filtering
+// for the reconstructed frame, including deblock filter, CDEF, superres,
+// and loop restoration.
+// Historically, for example in libaom, loop filter refers to deblock filter.
+// To avoid name conflicts, we call this class PostFilter (post processing).
+// The in-loop post filtering order is:
+// deblock --> CDEF --> super resolution --> loop restoration.
+// When CDEF and super resolution are not used, we can combine deblock
+// and restoration together to filter the frame buffer only once.
+class PostFilter {
+ public:
+  // This class does not take ownership of the masks/restoration_info, but it
+  // may change their values.
+  //
+  // The overall flow of data in this class (for both single and
+  // multi-threaded cases) is as follows:
+  //   -> Input: |frame_buffer_|.
+  //   -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
+  //      |loop_restoration_buffer_|.
+  //   -> Deblocking:
+  //      * Input: |source_buffer_|
+  //      * Output: |source_buffer_|
+  //   -> CDEF:
+  //      * Input: |source_buffer_|
+  //      * Output: |cdef_buffer_|
+  //   -> SuperRes:
+  //      * Input: |cdef_buffer_|
+  //      * Output: |superres_buffer_|
+  //   -> Loop Restoration:
+  //      * Input: |superres_buffer_|
+  //      * Output: |loop_restoration_buffer_|.
+  //   -> Now |frame_buffer_| contains the filtered frame.
+  PostFilter(const ObuFrameHeader& frame_header,
+             const ObuSequenceHeader& sequence_header,
+             FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer,
+             const dsp::Dsp* dsp, int do_post_filter_mask);
+
+  // Not copyable or movable.
+  PostFilter(const PostFilter&) = delete;
+  PostFilter& operator=(const PostFilter&) = delete;
+  PostFilter(PostFilter&&) = delete;
+  PostFilter& operator=(PostFilter&&) = delete;
+
+  // The overall function that applies all post processing filtering with
+  // multiple threads.
+  // * The filtering order is:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+  // * The output of each filter is the input for the following filter. A
+  //   special case is that loop restoration needs a few rows of the deblocked
+  //   frame and the entire cdef filtered frame:
+  //   deblock --> CDEF --> super resolution --> loop restoration.
+  //      |                                          ^
+  //      |                                          |
+  //      -----------> super resolution -------------
+  // * Any of these filters could be present or absent.
+  // * |frame_buffer_| points to the decoded frame buffer. When
+  //   ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each
+  //   of the filters as described below.
+  // Filter behavior (multi-threaded):
+  // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+  //            If cdef and loop restoration are both on, then 4 rows (as
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64
+  //            block are copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_|
+  //         as the input and the output is written into |cdef_buffer_| (which
+  //         is the same as |source_buffer_|).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+  //             |superres_line_buffer_| as the input and the output is
+  //             written into |superres_buffer_| (which is just |cdef_buffer_|
+  //             with a shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just
+  //                |superres_buffer_| with a shift to the left).
+  void ApplyFilteringThreaded();
+
+  // Does the overall post processing filter for one superblock row starting
+  // at |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, the
+  // deblocking filter will not be applied.
+  //
+  // Filter behavior (single-threaded):
+  // * Deblock: In-place filtering. The output is written to |source_buffer_|.
+  //            If cdef and loop restoration are both on, then 4 rows (as
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64
+  //            block are copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
+  //         (which is just |source_buffer_| with a shift to the top-left).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+  //             and the output is written into |superres_buffer_| (which is
+  //             just |cdef_buffer_| with a shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just
+  //                |superres_buffer_| with a shift to the left or top-left).
+  // Returns the index of the last row whose post processing is complete and
+  // can be used for referencing.
+  int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+                                        bool is_last_row, bool do_deblock);
+
+  // Applies the deblocking filter in one direction (specified by
+  // |loop_filter_type|) for the superblock row starting at |row4x4_start| for
+  // columns starting from |column4x4_start| in increments of 16 (or 8 for
+  // chroma with subsampling) until the smallest multiple of 16 that is >=
+  // |column4x4_end| or until |frame_header_.columns4x4|, whichever is lower.
+  // This function must be called only if |DoDeblock()| returns true.
+  void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start,
+                          int column4x4_start, int column4x4_end, int sb4x4);
+
+  static bool DoCdef(const ObuFrameHeader& frame_header,
+                     int do_post_filter_mask) {
+    return (frame_header.cdef.bits > 0 ||
+            frame_header.cdef.y_primary_strength[0] > 0 ||
+            frame_header.cdef.y_secondary_strength[0] > 0 ||
+            frame_header.cdef.uv_primary_strength[0] > 0 ||
+            frame_header.cdef.uv_secondary_strength[0] > 0) &&
+           (do_post_filter_mask & 0x02) != 0;
+  }
+  bool DoCdef() const { return DoCdef(frame_header_, do_post_filter_mask_); }
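+
+  // Editor's note: for reference, the |do_post_filter_mask| bits checked by
+  // the Do*() helpers in this class each enable one filter stage. A caller
+  // could, for example, enable only deblocking and CDEF as sketched below
+  // (the constant names are illustrative, not part of the source):
+  //
+  //   constexpr int kMaskDeblock = 0x01;      // checked by DoDeblock().
+  //   constexpr int kMaskCdef = 0x02;         // checked by DoCdef().
+  //   constexpr int kMaskSuperRes = 0x04;     // checked by DoSuperRes().
+  //   constexpr int kMaskRestoration = 0x08;  // checked by DoRestoration().
+  //   const int do_post_filter_mask = kMaskDeblock | kMaskCdef;  // == 0x03.
+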
+  // If the filter levels for the Y plane (0 for vertical, 1 for horizontal)
+  // are all zero, the deblock filter will not be applied.
+  static bool DoDeblock(const ObuFrameHeader& frame_header,
+                        uint8_t do_post_filter_mask) {
+    return (frame_header.loop_filter.level[0] > 0 ||
+            frame_header.loop_filter.level[1] > 0) &&
+           (do_post_filter_mask & 0x01) != 0;
+  }
+  bool DoDeblock() const {
+    return DoDeblock(frame_header_, do_post_filter_mask_);
+  }
+
+  uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
+                                         ReferenceFrameType type,
+                                         int mode_id) const {
+    return deblock_filter_levels_[segment_id][level_index][type][mode_id];
+  }
+  // Computes the deblock filter levels using |delta_lf| and stores them in
+  // |deblock_filter_levels|.
+  void ComputeDeblockFilterLevels(
+      const int8_t delta_lf[kFrameLfCount],
+      uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+                                   [kNumReferenceFrameTypes][2]) const;
+  // Returns true if loop restoration will be performed for the given
+  // parameters and mask.
+  static bool DoRestoration(const LoopRestoration& loop_restoration,
+                            uint8_t do_post_filter_mask, int num_planes) {
+    if (num_planes == kMaxPlanesMonochrome) {
+      return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone &&
+             (do_post_filter_mask & 0x08) != 0;
+    }
+    return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone ||
+            loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone ||
+            loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) &&
+           (do_post_filter_mask & 0x08) != 0;
+  }
+  bool DoRestoration() const {
+    return DoRestoration(loop_restoration_, do_post_filter_mask_, planes_);
+  }
+
+  // Returns a pointer to the unfiltered buffer. This is used by the Tile
+  // class to determine where to write the output of the tile decoding
+  // process, taking in-place filtering offsets into consideration.
+  uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; }
+  const YuvBuffer& frame_buffer() const { return frame_buffer_; }
+
+  // Returns true if SuperRes will be performed for the given frame header and
+  // mask.
+  static bool DoSuperRes(const ObuFrameHeader& frame_header,
+                         uint8_t do_post_filter_mask) {
+    return frame_header.width != frame_header.upscaled_width &&
+           (do_post_filter_mask & 0x04) != 0;
+  }
+  bool DoSuperRes() const {
+    return DoSuperRes(frame_header_, do_post_filter_mask_);
+  }
+  LoopRestorationInfo* restoration_info() const { return restoration_info_; }
+  uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
+                           int row, int column) const {
+    return base_buffer + (row >> subsampling_y_[plane]) * stride +
+           ((column >> subsampling_x_[plane]) << pixel_size_log2_);
+  }
+  uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+  uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+  uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(superres_buffer_[plane],
+                           frame_buffer_.stride(plane), plane,
+                           MultiplyBy4(row4x4), MultiplyBy4(column4x4));
+  }
+
+  template <typename Pixel>
+  static void ExtendFrame(Pixel* frame_start, int width, int height,
+                          ptrdiff_t stride, int left, int right, int top,
+                          int bottom);
+
+ private:
+  // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member
+  // functions.
+  using DeblockFilter = void (PostFilter::*)(int row4x4_start,
+                                             int column4x4_start);
+  // The lookup table for picking the deblock filter, according to deblock
+  // filter type.
+  const DeblockFilter deblock_filter_func_[2] = {
+      &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter};
+
+  // Functions common to all post filters.
+
+  // Extends the frame by setting the border pixel values to the one from its
+  // closest frame boundary.
+  void ExtendFrameBoundary(uint8_t* frame_start, int width, int height,
+                           ptrdiff_t stride, int left, int right, int top,
+                           int bottom) const;
+  // Extends the frame boundary for referencing if the frame will be saved as
+  // a reference frame.
+  void ExtendBordersForReferenceFrame();
+  // Copies the deblocked pixels needed for loop restoration.
+  void CopyDeblockedPixels(Plane plane, int row4x4);
+  // Copies the border for one superblock row. If |for_loop_restoration| is
+  // true, then it assumes that the border extension is being performed for
+  // the input of the loop restoration process. If |for_loop_restoration| is
+  // false, then it assumes that the border extension is being performed for
+  // using the current frame as a reference frame. In this case,
+  // |progress_row_| is also updated.
+  void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+                                      bool for_loop_restoration);
+  // Sets up the |loop_restoration_border_| for loop restoration.
+  // TODO(linfengz): Unify duplicates in the following two functions if
+  // possible.
+  // This is called when there is no CDEF filter. We copy rows from
+  // |superres_buffer_| and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start);
+  // This is called when there is a CDEF filter. We copy rows from
+  // |source_buffer_|, apply superres and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
+  // Returns true if we can perform border extension in loop (i.e. without
+  // waiting until the entire frame is decoded). If intra_block_copy is true,
+  // we do in-loop border extension only if the upscaled_width is the same as
+  // 4 * columns4x4. Otherwise, we cannot do in-loop border extension since
+  // those pixels may be used by intra block copy.
+  bool DoBorderExtensionInLoop() const {
+    return !frame_header_.allow_intrabc ||
+           frame_header_.upscaled_width ==
+               MultiplyBy4(frame_header_.columns4x4);
+  }
+  template <typename Pixel>
+  void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
+                 Pixel* dst, ptrdiff_t dst_stride) {
+    assert(height > 0);
+    do {
+      memcpy(dst, src, width * sizeof(Pixel));
+      src += src_stride;
+      dst += dst_stride;
+    } while (--height != 0);
+  }
+
+  // Worker function used for the multi-threaded implementation of Deblocking,
+  // CDEF and Loop Restoration.
+  using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+  // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+  // thread and returns once all the jobs are completed.
+  void RunJobs(WorkerFunction worker);
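+
+  // Editor's note: a sketch (not upstream code) of the row-claiming pattern
+  // the WorkerFunction signature suggests: each worker repeatedly claims the
+  // next superblock row (16 4x4 units) from the shared atomic until the rows
+  // run out. The member name here is hypothetical.
+  //
+  //   void PostFilter::ExampleWorker(std::atomic<int>* row4x4_atomic) {
+  //     int row4x4;
+  //     while ((row4x4 = row4x4_atomic->fetch_add(
+  //                 16, std::memory_order_relaxed)) < frame_header_.rows4x4) {
+  //       // Filter the superblock row starting at |row4x4| here.
+  //     }
+  //   }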
+
+  // Functions for the Deblocking filter.
+
+  static int GetIndex(int row4x4) { return DivideBy4(row4x4); }
+  static int GetShift(int row4x4, int column4x4) {
+    return ((row4x4 & 3) << 4) | column4x4;
+  }
+  int GetDeblockUnitId(int row_unit, int column_unit) const {
+    return row_unit * num_64x64_blocks_per_row_ + column_unit;
+  }
+  bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                          uint8_t* level, int* step,
+                                          int* filter_length) const;
+  void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4,
+                                            uint8_t* level_u, uint8_t* level_v,
+                                            int* step,
+                                            int* filter_length) const;
+  bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                        BlockParameters* const* bp_ptr,
+                                        uint8_t* level, int* step,
+                                        int* filter_length) const;
+  void GetVerticalDeblockFilterEdgeInfoUV(int column4x4,
+                                          BlockParameters* const* bp_ptr,
+                                          uint8_t* level_u, uint8_t* level_v,
+                                          int* step, int* filter_length) const;
+  void HorizontalDeblockFilter(int row4x4_start, int column4x4_start);
+  void VerticalDeblockFilter(int row4x4_start, int column4x4_start);
+  // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct
+  // signature.
+  static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter),
+                             DeblockFilter>::value,
+                "");
+  static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter),
+                             DeblockFilter>::value,
+                "");
+  // Applies deblock filtering for the superblock row starting at |row4x4|
+  // with a height of 4*|sb4x4|.
+  void ApplyDeblockFilterForOneSuperBlockRow(int row4x4, int sb4x4);
+  // Worker function used for multi-threaded deblocking.
+  template <LoopFilterType loop_filter_type>
+  void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+          WorkerFunction>::value,
+      "");
+  static_assert(
+      std::is_same<
+          decltype(
+              &PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+          WorkerFunction>::value,
+      "");
+
+  // Functions for the cdef filter.
+
+  // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+  // implementation into |cdef_border_|.
+  void SetupCdefBorder(int row4x4);
+  // This function prepares the input source block for cdef filtering. The
+  // input source block contains a 12x12 block, with the inner 8x8 as the
+  // desired filter region. It pads the block with a large value if the 12x12
+  // block includes out of frame pixels. This achieves the required behavior
+  // defined in section 5.11.52 of the spec.
+  template <typename Pixel>
+  void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
+                        int column4x4, uint16_t* cdef_source,
+                        ptrdiff_t cdef_stride, bool y_plane,
+                        const uint8_t border_columns[kMaxPlanes][256],
+                        bool use_border_columns);
+  // Applies cdef for one 64x64 block.
+  template <typename Pixel>
+  void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
+                           int block_height4x4, int row4x4_start,
+                           int column4x4_start,
+                           uint8_t border_columns[2][kMaxPlanes][256],
+                           bool use_border_columns[2][2]);
+  // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
+  // duplication.
+  void ApplyCdefForOneSuperBlockRowHelper(
+      uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+      int row4x4, int block_height4x4);
+  // Applies CDEF filtering for the superblock row starting at |row4x4| with a
+  // height of 4*|sb4x4|.
+  void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
+  // Worker function used for multi-threaded CDEF.
+  void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+                             WorkerFunction>::value,
+                "");
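+
+  // Editor's note: an illustrative recap (not upstream code) of how the cdef
+  // implementation in src/post_filter/cdef.cc indexes dsp_.cdef_filters. The
+  // first index selects the block width (1 for 8-pixel-wide blocks, which is
+  // what luma always uses; 0 for 4-pixel-wide chroma blocks) and the second
+  // encodes which strengths are non-zero:
+  //
+  //   // 0: primary and secondary, 1: primary only, 2: secondary only.
+  //   const int strength_index =
+  //       (static_cast<int>(primary_strength == 0) << 1) |
+  //       static_cast<int>(secondary_strength == 0);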
+
+  // Functions for the SuperRes filter.
+
+  // Applies super resolution for the |src| for |rows[plane]| rows of each
+  // plane. If |line_buffer_row| is larger than or equal to 0, one more row
+  // will be processed, and the line buffer indicated by |line_buffer_row|
+  // will be used as the source.
+  void ApplySuperRes(
+      const std::array<uint8_t*, kMaxPlanes>& src,
+      const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+      const std::array<uint8_t*, kMaxPlanes>& dst);  // Section 7.16.
+  // Applies SuperRes for the superblock row starting at |row4x4| with a
+  // height of 4*|sb4x4|.
+  void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
+                                        bool is_last_row);
+  void ApplySuperResThreaded();
+
+  // Functions for the Loop Restoration filter.
+
+  // Notes about Loop Restoration:
+  // (1). The loop restoration processing unit size defaults to 64x64.
+  //      Only when the remaining filtering area is smaller than 64x64 is the
+  //      processing unit size the actual area size.
+  //      For the U/V planes, it is (64 >> subsampling_x) x
+  //      (64 >> subsampling_y).
+  // (2). The loop restoration unit size can be 64x64, 128x128 or 256x256 for
+  //      the Y plane. The unit size for chroma can be the same or half,
+  //      depending on subsampling. If either subsampling_x or subsampling_y
+  //      is one, the unit size is halved on both the x and y sides.
+  //      All loop restoration units have the same size for one plane.
+  //      One loop restoration unit could contain multiple processing units,
+  //      but they share the same set of loop restoration parameters.
+  // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The
+  //      size of the first row of loop restoration units and processing units
+  //      is shrunk by the offset.
+  // (4). Loop restoration units wrap the bottom and the right of the frame
+  //      if the remaining area is small. The criterion is whether the number
+  //      of remaining rows/columns is smaller than half of the loop
+  //      restoration unit size.
+  //      For example, if the frame size is 140x140 and the loop restoration
+  //      unit size is 128x128, the size of the first loop restoration unit is
+  //      128x(128-8) = 128 columns x 120 rows.
+  //      Since 140 - 120 < 128/2, the remaining 20 rows will be folded into
+  //      the loop restoration unit. Similarly, the remaining 12 columns will
+  //      also be folded into the current loop restoration unit. So, even
+  //      though the frame size is 140x140, there is only one loop restoration
+  //      unit. Suppose the processing unit is 64x64; then the sizes of the
+  //      first row of processing units are 64x56, 64x56 and 12x56,
+  //      respectively. The second row is 64x64, 64x64, 12x64.
+  //      The third row is 64x20, 64x20, 12x20.
+
+  // |stride| is shared by |src_buffer| and |dst_buffer|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+                                     Plane plane, int plane_height,
+                                     int plane_width, int y, int unit_row,
+                                     int current_process_unit_height,
+                                     int plane_unit_size, Pixel* dst_buffer);
+  // Applies loop restoration for the superblock row starting at
+  // |row4x4_start| with a height of 4*|sb4x4|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+  // Helper function that calls the right variant of
+  // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+  void ApplyLoopRestoration(int row4x4_start, int sb4x4);
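+
+  // Editor's note: a small self-contained sketch (not upstream code) of the
+  // unit-count arithmetic described in note (4) above: a trailing remainder
+  // smaller than half a unit is folded into the previous unit.
+  //
+  //   int CountRestorationUnits(int frame_size, int unit_size) {
+  //     // e.g. frame_size = 140, unit_size = 128 -> 1 unit (12 < 64).
+  //     return std::max(1, (frame_size + (unit_size >> 1)) / unit_size);
+  //   }
+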
+  // Worker function used for multi-threaded Loop Restoration.
+  void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+                             WorkerFunction>::value,
+                "");
+
+  const ObuFrameHeader& frame_header_;
+  const LoopRestoration& loop_restoration_;
+  const dsp::Dsp& dsp_;
+  const int num_64x64_blocks_per_row_;
+  const int upscaled_width_;
+  const int width_;
+  const int height_;
+  const int8_t bitdepth_;
+  const int8_t subsampling_x_[kMaxPlanes];
+  const int8_t subsampling_y_[kMaxPlanes];
+  const int8_t planes_;
+  const int pixel_size_log2_;
+  const uint8_t* const inner_thresh_;
+  const uint8_t* const outer_thresh_;
+  const bool needs_chroma_deblock_;
+  // This stores the deblocking filter levels assuming that the delta is zero.
+  // This will be used by all superblocks whose delta is zero (without having
+  // to recompute them). The dimensions (in order) are: segment_id,
+  // level_index (based on plane and direction), reference_frame and mode_id.
+  uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+                                [kNumReferenceFrameTypes][2];
+  // Stores the SuperRes info for the frame.
+  struct {
+    int upscaled_width;
+    int initial_subpixel_x;
+    int step;
+  } super_res_info_[kMaxPlanes];
+  const Array2D<int16_t>& cdef_index_;
+  const Array2D<TransformSize>& inter_transform_sizes_;
+  LoopRestorationInfo* const restoration_info_;
+  uint8_t* const superres_coefficients_[kNumPlaneTypes];
+  // Line buffer used by multi-threaded ApplySuperRes().
+  // In the multi-threaded case, this buffer will store the last downscaled
+  // row input of each thread to avoid overwrites by the first upscaled row
+  // output of the thread below it.
+  YuvBuffer& superres_line_buffer_;
+  const BlockParametersHolder& block_parameters_;
+  // Frame buffer to hold the cdef filtered frame.
+  YuvBuffer cdef_filtered_buffer_;
+  // Input frame buffer.
+  YuvBuffer& frame_buffer_;
+  // A view into |frame_buffer_| that points to the input and output of the
+  // deblocking process.
+  uint8_t* source_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the output of the CDEF
+  // filtered planes (to facilitate in-place CDEF filtering).
+  uint8_t* cdef_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the planes after the SuperRes
+  // filter is applied (to facilitate in-place SuperRes).
+  uint8_t* superres_buffer_[kMaxPlanes];
+  // A view into |frame_buffer_| that points to the output of the Loop
+  // Restored planes (to facilitate in-place Loop Restoration).
+  uint8_t* loop_restoration_buffer_[kMaxPlanes];
+  YuvBuffer& cdef_border_;
+  // Buffer used to store the border pixels that are necessary for loop
+  // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
+  // for every 32x32 for chroma with subsampling). The indices of the rows
+  // that are stored are specified in |kLoopRestorationBorderRows|. The first
+  // 4 rows of this buffer are never populated and never used.
+  // This buffer is used only when both of the following conditions are true:
+  //   (1). Loop Restoration is on.
+  //   (2). Cdef is on, or multi-threading is enabled for the post filter.
+  YuvBuffer& loop_restoration_border_;
+  const uint8_t do_post_filter_mask_;
+  ThreadPool* const thread_pool_;
+
+  // Tracks the progress of the post filters.
+  int progress_row_ = -1;
+
+  // A block buffer to hold the input that is converted to uint16_t before
+  // cdef filtering. Only used in the single threaded case. The Y plane is
+  // processed separately. The U and V planes are processed together, so it is
+  // sufficient to have this buffer accommodate 2 planes at a time.
+  uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders *
+                       2];
+
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterSuperResTest;
+
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterHelperFuncTest;
+};
+
+extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start,
+                                                      int width, int height,
+                                                      ptrdiff_t stride,
+                                                      int left, int right,
+                                                      int top, int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
+                                                       int width, int height,
+                                                       ptrdiff_t stride,
+                                                       int left, int right,
+                                                       int top, int bottom);
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_POST_FILTER_H_
diff --git a/src/post_filter/cdef.cc b/src/post_filter/cdef.cc
new file mode 100644
index 0000000..994f448
--- /dev/null
+++ b/src/post_filter/cdef.cc
@@ -0,0 +1,660 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kStep64x64 = 16;  // =64/4.
+constexpr int kCdefSkip = 8;
+
+constexpr uint8_t kCdefUvDirection[2][2][8] = {
+    {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
+    {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
+template <typename Pixel>
+void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
+                    bool is_frame_left, bool is_frame_right,
+                    uint16_t* const dst, const Pixel* left_border = nullptr) {
+  if (sizeof(src[0]) == sizeof(dst[0])) {
+    if (is_frame_left) {
+      Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
+    } else if (left_border == nullptr) {
+      memcpy(dst - kCdefBorder, src - kCdefBorder,
+             kCdefBorder * sizeof(dst[0]));
+    } else {
+      memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
+    }
+    memcpy(dst, src, block_width * sizeof(dst[0]));
+    if (is_frame_right) {
+      Memset(dst + block_width, kCdefLargeValue,
+             unit_width + kCdefBorder - block_width);
+    } else {
+      memcpy(dst + block_width, src + block_width,
+             (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
+    }
+    return;
+  }
+  if (is_frame_left) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+    }
+  } else if (left_border == nullptr) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = src[x];
+    }
+  } else {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = left_border[x + kCdefBorder];
+    }
+  }
+  for (int x = 0; x < block_width; ++x) {
+    dst[x] = src[x];
+  }
+  for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
+    dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+  }
+}
+
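+// Editor's note: an illustrative call (not upstream code) showing the padding
+// semantics of CopyRowForCdef() above. With is_frame_right = true, the
+// destination pixels at and past |block_width| are filled with
+// kCdefLargeValue so the CDEF kernels can recognize them as out-of-frame:
+//
+//   uint16_t row[kCdefBorder + 8 + kCdefBorder];  // left border + unit.
+//   CopyRowForCdef(src_row, /*block_width=*/5, /*unit_width=*/8,
+//                  /*is_frame_left=*/false, /*is_frame_right=*/true,
+//                  row + kCdefBorder);  // row[5..] gets kCdefLargeValue.
+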
+// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
+// |dst|.
+void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
+                int dst_stride, int width, int height, size_t pixel_size) {
+  int y = height;
+  do {
+    memcpy(dst, src, width * pixel_size);
+    src += src_stride;
+    dst += dst_stride;
+  } while (--y != 0);
+}
+
+}  // namespace
+
+void PostFilter::SetupCdefBorder(int row4x4) {
+  assert(row4x4 >= 0);
+  assert(DoCdef());
+  int plane = kPlaneY;
+  do {
+    const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+    const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels = SubsampledValue(
+        MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(
+        MultiplyBy4(frame_header_.rows4x4), subsampling_y_[plane]);
+    for (int i = 0; i < 4; ++i) {
+      const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+      const int absolute_row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+      if (absolute_row >= plane_height) break;
+      const uint8_t* src =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+          row * src_stride;
+      uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+      memcpy(dst, src, row_width);
+    }
+  } while (++plane < planes_);
+}
+
+template <typename Pixel>
+void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
+                                  int row4x4, int column4x4,
+                                  uint16_t* cdef_source, ptrdiff_t cdef_stride,
+                                  const bool y_plane,
+                                  const uint8_t border_columns[kMaxPlanes][256],
+                                  bool use_border_columns) {
+  assert(y_plane || planes_ == kMaxPlanes);
+  const int max_planes = y_plane ? 1 : kMaxPlanes;
+  const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
+  const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
+  const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
+  const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
+  const int plane_width = SubsampledValue(width_, subsampling_x);
+  const int plane_height = SubsampledValue(height_, subsampling_y);
+  const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
+  const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
+  // unit_width, unit_height are the same as block_width, block_height unless
+  // it reaches the frame boundary, where block_width < 64 or
+  // block_height < 64. unit_width, unit_height guarantee we build blocks on
+  // a multiple of 8.
+  const int unit_width = Align(block_width, 8 >> subsampling_x);
+  const int unit_height = Align(block_height, 8 >> subsampling_y);
+  const bool is_frame_left = column4x4 == 0;
+  const bool is_frame_right = start_x + block_width >= plane_width;
+  const bool is_frame_top = row4x4 == 0;
+  const bool is_frame_bottom = start_y + block_height >= plane_height;
+  const int y_offset = is_frame_top ? 0 : kCdefBorder;
+  const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
+
+  for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
+    uint16_t* cdef_src =
+        cdef_source + static_cast<int>(plane == kPlaneV) *
+                          kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders;
+    const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+    const Pixel* src_buffer =
+        reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
+        (start_y - y_offset) * src_stride + start_x;
+    const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+    const Pixel* cdef_border =
+        (thread_pool_ == nullptr)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+                  cdef_border_row_offset * cdef_border_stride + start_x;
+
+    // All the copying code will use negative indices for populating the left
+    // border. So the starting point is set to kCdefBorder.
+    cdef_src += kCdefBorder;
+
+    // Copy the top 2 rows as follows:
+    // If is_frame_top is true, both the rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    if (is_frame_top) {
+      for (int y = 0; y < kCdefBorder; ++y) {
+        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+               unit_width + 2 * kCdefBorder);
+        cdef_src += cdef_stride;
+      }
+    } else {
+      const Pixel* top_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int top_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+      for (int y = 0; y < kCdefBorder; ++y) {
+        CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        top_border += top_border_stride;
+        cdef_src += cdef_stride;
+        // We need to increment |src_buffer| and |cdef_border| in this loop to
+        // set them up for the subsequent loops below.
+        src_buffer += src_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the body as follows:
+    // If multi-threaded filtering is off or if is_frame_bottom is true, all
+    // the rows are copied from |src_buffer|.
+    // Otherwise, the first |block_height|-kCdefBorder rows are copied from
+    // |src_buffer| and the last kCdefBorder rows are copied from
+    // |cdef_border|.
+    int y = block_height;
+    const int y_threshold =
+        (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+    const Pixel* left_border =
+        (thread_pool_ == nullptr || !use_border_columns)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(border_columns[plane]);
+    do {
+      CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+                     is_frame_right, cdef_src, left_border);
+      cdef_src += cdef_stride;
+      src_buffer += src_stride;
+      if (left_border != nullptr) left_border += kCdefBorder;
+    } while (--y != y_threshold);
+
+    if (y > 0) {
+      assert(y == kCdefBorder);
+      // |cdef_border| now points to the top 2 rows of the current block. For
+      // the next loop, we need it to point to the bottom 2 rows of the
+      // current block. So increment it by 2 rows.
+      cdef_border += MultiplyBy2(cdef_border_stride);
+      for (int i = 0; i < kCdefBorder; ++i) {
+        CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        cdef_src += cdef_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the bottom 2 rows as follows:
+    // If is_frame_bottom is true, both the rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    y = 0;
+    if (is_frame_bottom) {
+      do {
+        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
+               unit_width + 2 * kCdefBorder);
+        cdef_src += cdef_stride;
+      } while (++y < kCdefBorder + unit_height - block_height);
+    } else {
+      const Pixel* bottom_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int bottom_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
+      do {
+        CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        bottom_border += bottom_border_stride;
+        cdef_src += cdef_stride;
+      } while (++y < kCdefBorder + unit_height - block_height);
+    }
+  }
+}
+
+template <typename Pixel>
+void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
+                                     const int block_width4x4,
+                                     const int block_height4x4,
+                                     const int row4x4_start,
+                                     const int column4x4_start,
+                                     uint8_t border_columns[2][kMaxPlanes][256],
+                                     bool use_border_columns[2][2]) {
+  // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
+  static constexpr int kStep = 8;
+  static constexpr int kStep4x4 = 2;
+
+  int cdef_buffer_row_base_stride[kMaxPlanes];
+  uint8_t* cdef_buffer_row_base[kMaxPlanes];
+  int src_buffer_row_base_stride[kMaxPlanes];
+  const uint8_t* src_buffer_row_base[kMaxPlanes];
+  const uint16_t* cdef_src_row_base[kMaxPlanes];
+  int cdef_src_row_base_stride[kMaxPlanes];
+  int column_step[kMaxPlanes];
+  assert(planes_ >= 1);
+  int plane = kPlaneY;
+  do {
+    cdef_buffer_row_base[plane] = GetCdefBuffer(static_cast<Plane>(plane),
+                                                row4x4_start, column4x4_start);
+    cdef_buffer_row_base_stride[plane] =
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+                                                 row4x4_start,
+                                                 column4x4_start);
+    src_buffer_row_base_stride[plane] =
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    cdef_src_row_base[plane] =
+        cdef_block +
+        static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+            kCdefUnitSizeWithBorders +
+        kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+    cdef_src_row_base_stride[plane] =
+        kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
+    column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
+  } while (++plane < planes_);
+
+  // |border_columns| contains two buffers. In each call to this function, we
+  // will use one of them as the "destination" for the current call and the
+  // other one as the "source" for the current call (which would have been the
+  // "destination" of the previous call). We will use the src_index to
+  // populate the borders which were backed up in the previous call. We will
+  // use the dst_index to populate the borders to be used in the next call.
+  const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+  const int border_columns_dst_index = border_columns_src_index ^ 1;
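+
+  // Editor's note: an illustrative trace (not upstream code) of the ping-pong
+  // indexing above. Units are visited left to right in 64x64 steps, i.e.
+  // column4x4_start = 0, 16, 32, ...:
+  //
+  //   column4x4_start: 0   16  32  48
+  //   src_index:       0   1   0   1   // borders saved by the previous unit.
+  //   dst_index:       1   0   1   0   // borders saved for the next unit.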
+
+  if (index == -1) {
+    if (thread_pool_ == nullptr) {
+      int plane = kPlaneY;
+      do {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      } while (++plane < planes_);
+    }
+    use_border_columns[border_columns_dst_index][0] = false;
+    use_border_columns[border_columns_dst_index][1] = false;
+    return;
+  }
+
+  const bool is_frame_right =
+      MultiplyBy4(column4x4_start) + MultiplyBy4(block_width4x4) >= width_;
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    // Backup the last 2 columns for use in the next iteration.
+    use_border_columns[border_columns_dst_index][0] = true;
+    const uint8_t* src_line =
+        GetSourceBuffer(kPlaneY, row4x4_start,
+                        column4x4_start + block_width4x4) -
+        kCdefBorder * sizeof(Pixel);
+    CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+               border_columns[border_columns_dst_index][kPlaneY],
+               kCdefBorder * sizeof(Pixel), kCdefBorder,
+               MultiplyBy4(block_height4x4), sizeof(Pixel));
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, true,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][0]);
+
+  // Stored direction used during the u/v pass. If bit 3 is set, then the
+  // block is a skip.
+  uint8_t direction_y[8 * 8];
+  int y_index = 0;
+
+  const uint8_t y_primary_strength =
+      frame_header_.cdef.y_primary_strength[index];
+  const uint8_t y_secondary_strength =
+      frame_header_.cdef.y_secondary_strength[index];
+  // y_strength_index is 0 for both primary and secondary strengths being
+  // non-zero, 1 for primary only, 2 for secondary only. This will be updated
+  // with y_primary_strength after variance is applied.
+  int y_strength_index = static_cast<int>(y_secondary_strength == 0);
+
+  const bool compute_direction_and_variance =
+      (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) !=
+      0;
+  BlockParameters* const* bp_row0_base =
+      block_parameters_.Address(row4x4_start, column4x4_start);
+  BlockParameters* const* bp_row1_base =
+      bp_row0_base + block_parameters_.columns4x4();
+  const int bp_stride = MultiplyBy2(block_parameters_.columns4x4());
+  int row4x4 = row4x4_start;
+  do {
+    uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
+    const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+    const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
+    BlockParameters* const* bp0 = bp_row0_base;
+    BlockParameters* const* bp1 = bp_row1_base;
+    int column4x4 = column4x4_start;
+    do {
+      const int block_width = kStep;
+      const int block_height = kStep;
+      const int cdef_stride = frame_buffer_.stride(kPlaneY);
+      uint8_t* const cdef_buffer = cdef_buffer_base;
+      const uint16_t* const cdef_src = cdef_src_base;
+      const int src_stride = frame_buffer_.stride(kPlaneY);
+      const uint8_t* const src_buffer = src_buffer_base;
+
+      const bool skip = (*bp0)->skip && (*(bp0 + 1))->skip && (*bp1)->skip &&
+                        (*(bp1 + 1))->skip;
+
+      if (skip) {  // No cdef filtering.
+        direction_y[y_index] = kCdefSkip;
+        if (thread_pool_ == nullptr) {
+          CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                     block_width, block_height, sizeof(Pixel));
+        }
+      } else {
+        // Zero out residual skip flag.
+        direction_y[y_index] = 0;
+
+        int variance = 0;
+        if (compute_direction_and_variance) {
+          if (thread_pool_ == nullptr ||
+              row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+            dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+                                &variance);
+          } else if (sizeof(Pixel) == 2) {
+            dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+                                &direction_y[y_index], &variance);
+          } else {
+            // If we are in the last row4x4 for this unit, then the last two
+            // input rows have to come from |cdef_border_|. Since we already
+            // have |cdef_src| populated correctly, use that as the input
+            // for the direction process.
+            uint8_t direction_src[8][8];
+            const uint16_t* cdef_src_line = cdef_src;
+            for (auto& direction_src_line : direction_src) {
+              for (int i = 0; i < 8; ++i) {
+                direction_src_line[i] = cdef_src_line[i];
+              }
+              cdef_src_line += kCdefUnitSizeWithBorders;
+            }
+            dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+                                &variance);
+          }
+        }
+        const int direction =
+            (y_primary_strength == 0) ? 0 : direction_y[y_index];
+        const int variance_strength =
+            ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
+                                   : 0;
+        const uint8_t primary_strength =
+            (variance != 0)
+                ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
+                : 0;
+        if ((primary_strength | y_secondary_strength) == 0) {
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
+        } else {
+          const int strength_index =
+              y_strength_index |
+              (static_cast<int>(primary_strength == 0) << 1);
+          dsp_.cdef_filters[1][strength_index](
+              cdef_src, kCdefUnitSizeWithBorders, block_height,
+              primary_strength, y_secondary_strength,
+              frame_header_.cdef.damping, direction, cdef_buffer, cdef_stride);
+        }
+      }
+      cdef_buffer_base += column_step[kPlaneY];
+      src_buffer_base += column_step[kPlaneY];
+      cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
+
+      bp0 += kStep4x4;
+      bp1 += kStep4x4;
+      column4x4 += kStep4x4;
+      y_index++;
+    } while (column4x4 < column4x4_start + block_width4x4);
+
+    cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
+    src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+    cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
+    bp_row0_base += bp_stride;
+    bp_row1_base += bp_stride;
+    row4x4 += kStep4x4;
+  } while (row4x4 < row4x4_start + block_height4x4);
+
+  if (planes_ == kMaxPlanesMonochrome) {
+    return;
+  }
+
+  const uint8_t uv_primary_strength =
+      frame_header_.cdef.uv_primary_strength[index];
+  const uint8_t uv_secondary_strength =
+      frame_header_.cdef.uv_secondary_strength[index];
+
+  if ((uv_primary_strength | uv_secondary_strength) == 0) {
+    if (thread_pool_ == nullptr) {
+      for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      }
+    }
+    use_border_columns[border_columns_dst_index][1] = false;
+    return;
+  }
+
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    use_border_columns[border_columns_dst_index][1] = true;
+    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+      // Backup the last 2 columns for use in the next iteration.
+      const uint8_t* src_line =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+                          column4x4_start + block_width4x4) -
+          kCdefBorder * sizeof(Pixel);
+      CopyPixels(src_line, frame_buffer_.stride(plane),
+                 border_columns[border_columns_dst_index][plane],
+                 kCdefBorder * sizeof(Pixel), kCdefBorder,
+                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                 sizeof(Pixel));
+    }
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, false,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][1]);
+
+  // uv_strength_index is 0 for both primary and secondary strengths being
+  // non-zero, 1 for primary only, 2 for secondary only.
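+  // For example, uv_primary_strength = 3 and uv_secondary_strength = 0 give
+  // (0 << 1) | 1 = 1, selecting the primary-only filter variants below.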
+  const int uv_strength_index =
+      (static_cast<int>(uv_primary_strength == 0) << 1) |
+      static_cast<int>(uv_secondary_strength == 0);
+  for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+    const int8_t subsampling_x = subsampling_x_[plane];
+    const int8_t subsampling_y = subsampling_y_[plane];
+    const int block_width = kStep >> subsampling_x;
+    const int block_height = kStep >> subsampling_y;
+    int row4x4 = row4x4_start;
+
+    y_index = 0;
+    do {
+      uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
+      const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+      const uint16_t* cdef_src_base = cdef_src_row_base[plane];
+      int column4x4 = column4x4_start;
+      do {
+        const int cdef_stride = frame_buffer_.stride(plane);
+        uint8_t* const cdef_buffer = cdef_buffer_base;
+        const int src_stride = frame_buffer_.stride(plane);
+        const uint8_t* const src_buffer = src_buffer_base;
+        const uint16_t* const cdef_src = cdef_src_base;
+        const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
+        int dual_cdef = 0;
+
+        if (skip) {  // No cdef filtering.
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
+        } else {
+          // Make sure block pair is not out of bounds.
+          if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
+            // Enable dual processing if subsampling_x is 1.
+            dual_cdef = subsampling_x;
+          }
+
+          int direction = (uv_primary_strength == 0)
+                              ? 0
+                              : kCdefUvDirection[subsampling_x][subsampling_y]
+                                                [direction_y[y_index]];
+
+          if (dual_cdef != 0) {
+            if (uv_primary_strength &&
+                direction_y[y_index] != direction_y[y_index + 1]) {
+              // Disable dual processing if the second block of the pair does
+              // not have the same direction.
+              dual_cdef = 0;
+            }
+
+            // Disable dual processing if the second block of the pair is a
+            // skip.
+            if (direction_y[y_index + 1] == kCdefSkip) {
+              dual_cdef = 0;
+            }
+          }
+
+          // Block width is 8 if either dual_cdef is true or
+          // subsampling_x == 0.
+          const int width_index = dual_cdef | (subsampling_x ^ 1);
+          dsp_.cdef_filters[width_index][uv_strength_index](
+              cdef_src, kCdefUnitSizeWithBorders, block_height,
+              uv_primary_strength, uv_secondary_strength,
+              frame_header_.cdef.damping - 1, direction, cdef_buffer,
+              cdef_stride);
+        }
+        // When dual_cdef is set, the above cdef_filter() will process 2
+        // blocks, so adjust the pointers and indexes for 2 blocks.
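+        // (With subsampling_x == 1 each chroma block is only 4 pixels wide,
+        // so filtering a pair of blocks together restores an 8 pixel wide
+        // kernel; the shifts below double every per-block increment when
+        // dual_cdef is 1.)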
+        cdef_buffer_base += column_step[plane] << dual_cdef;
+        src_buffer_base += column_step[plane] << dual_cdef;
+        cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
+        column4x4 += kStep4x4 << dual_cdef;
+        y_index += 1 << dual_cdef;
+      } while (column4x4 < column4x4_start + block_width4x4);
+
+      cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
+      src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+      cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
+      row4x4 += kStep4x4;
+    } while (row4x4 < row4x4_start + block_height4x4);
+  }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+    uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+    int row4x4, int block_height4x4) {
+  bool use_border_columns[2][2] = {};
+  for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
+       column4x4 += kStep64x64) {
+    const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
+    const int block_width4x4 =
+        std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+                                    block_height4x4, row4x4, column4x4,
+                                    border_columns, use_border_columns);
+      continue;
+    }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+                                 block_height4x4, row4x4, column4x4,
+                                 border_columns, use_border_columns);
+  }
+}
+
+void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
+                                              bool is_last_row) {
+  assert(row4x4_start >= 0);
+  assert(DoCdef());
+  for (int y = 0; y < sb4x4; y += kStep64x64) {
+    const int row4x4 = row4x4_start + y;
+    if (row4x4 >= frame_header_.rows4x4) return;
+
+    // Apply cdef for the last 8 rows of the previous superblock row.
+    // One exception: If the superblock size is 128x128 and is_last_row is
+    // true, then we simply apply cdef for the entire superblock row without
+    // any lag. In that case, apply cdef for the previous superblock row only
+    // during the first iteration (y == 0).
+    if (row4x4 > 0 && (!is_last_row || y == 0)) {
+      assert(row4x4 >= 16);
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
+    }
+
+    // Apply cdef for the current superblock row. If this is the last
+    // superblock row we apply cdef for all the rows, otherwise we leave out
+    // the last 8 rows.
+    const int block_height4x4 =
+        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+    const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
+    if (height4x4 > 0) {
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+                                         height4x4);
+    }
+  }
+}
+
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+  // Each border_column buffer has to store 64 rows and 2 columns for each
+  // plane. For 10bit, that is 64*2*2 = 256 bytes.
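+  // (For 8-bit, only 64 * 2 * 1 = 128 of those 256 bytes are used.)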
+  alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+  while ((row4x4 = row4x4_atomic->fetch_add(
+              kStep64x64, std::memory_order_relaxed)) <
+         frame_header_.rows4x4) {
+    const int block_height4x4 =
+        std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
+    ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+                                       block_height4x4);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/deblock.cc b/src/post_filter/deblock.cc
new file mode 100644
index 0000000..9b5ed0f
--- /dev/null
+++ b/src/post_filter/deblock.cc
@@ -0,0 +1,523 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/post_filter.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t HevThresh(int level) { return DivideBy16(level); }
+
+// GetLoopFilterSize* functions depend on this exact ordering of the
+// LoopFilterSize enums.
+static_assert(dsp::kLoopFilterSize4 == 0, "");
+static_assert(dsp::kLoopFilterSize6 == 1, "");
+static_assert(dsp::kLoopFilterSize8 == 2, "");
+static_assert(dsp::kLoopFilterSize14 == 3, "");
+
+dsp::LoopFilterSize GetLoopFilterSizeY(int filter_length) {
+  // |filter_length| must be a power of 2.
+  assert((filter_length & (filter_length - 1)) == 0);
+  // This code is the branch free equivalent of:
+  //   if (filter_length == 4) return kLoopFilterSize4;
+  //   if (filter_length == 8) return kLoopFilterSize8;
+  //   return kLoopFilterSize14;
+  return static_cast<dsp::LoopFilterSize>(
+      MultiplyBy2(static_cast<int>(filter_length > 4)) +
+      static_cast<int>(filter_length > 8));
+}
+
+constexpr dsp::LoopFilterSize GetLoopFilterSizeUV(int filter_length) {
+  // For U & V planes, size is kLoopFilterSize4 if |filter_length| is 4,
+  // otherwise size is kLoopFilterSize6.
+  return static_cast<dsp::LoopFilterSize>(filter_length != 4);
+}
+
+bool NonBlockBorderNeedsFilter(const BlockParameters& bp, int filter_id,
+                               uint8_t* const level) {
+  if (bp.deblock_filter_level[filter_id] == 0 || (bp.skip && bp.is_inter)) {
+    return false;
+  }
+  *level = bp.deblock_filter_level[filter_id];
+  return true;
+}
+
+// 7.14.5.
+void ComputeDeblockFilterLevelsHelper(
+    const ObuFrameHeader& frame_header, int segment_id, int level_index,
+    const int8_t delta_lf[kFrameLfCount],
+    uint8_t deblock_filter_levels[kNumReferenceFrameTypes][2]) {
+  const int delta = delta_lf[frame_header.delta_lf.multi ? level_index : 0];
+  uint8_t level = Clip3(frame_header.loop_filter.level[level_index] + delta, 0,
+                        kMaxLoopFilterValue);
+  const auto feature = static_cast<SegmentFeature>(
+      kSegmentFeatureLoopFilterYVertical + level_index);
+  level =
+      Clip3(level + frame_header.segmentation.feature_data[segment_id][feature],
+            0, kMaxLoopFilterValue);
+  if (!frame_header.loop_filter.delta_enabled) {
+    static_assert(sizeof(deblock_filter_levels[0][0]) == 1, "");
+    memset(deblock_filter_levels, level, kNumReferenceFrameTypes * 2);
+    return;
+  }
+  assert(frame_header.loop_filter.delta_enabled);
+  const int shift = level >> 5;
+  deblock_filter_levels[kReferenceFrameIntra][0] = Clip3(
+      level +
+          LeftShift(frame_header.loop_filter.ref_deltas[kReferenceFrameIntra],
+                    shift),
+      0, kMaxLoopFilterValue);
+  // deblock_filter_levels[kReferenceFrameIntra][1] is never used. So it does
+  // not have to be populated.
+  for (int reference_frame = kReferenceFrameIntra + 1;
+       reference_frame < kNumReferenceFrameTypes; ++reference_frame) {
+    for (int mode_id = 0; mode_id < 2; ++mode_id) {
+      deblock_filter_levels[reference_frame][mode_id] = Clip3(
+          level +
+              LeftShift(frame_header.loop_filter.ref_deltas[reference_frame] +
+                            frame_header.loop_filter.mode_deltas[mode_id],
+                        shift),
+          0, kMaxLoopFilterValue);
+    }
+  }
+}
+
+}  // namespace
+
+void PostFilter::ComputeDeblockFilterLevels(
+    const int8_t delta_lf[kFrameLfCount],
+    uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount]
+                                 [kNumReferenceFrameTypes][2]) const {
+  if (!DoDeblock()) return;
+  for (int segment_id = 0;
+       segment_id < (frame_header_.segmentation.enabled ? kMaxSegments : 1);
+       ++segment_id) {
+    int level_index = 0;
+    for (; level_index < 2; ++level_index) {
+      ComputeDeblockFilterLevelsHelper(
+          frame_header_, segment_id, level_index, delta_lf,
+          deblock_filter_levels[segment_id][level_index]);
+    }
+    for (; level_index < kFrameLfCount; ++level_index) {
+      if (frame_header_.loop_filter.level[level_index] != 0) {
+        ComputeDeblockFilterLevelsHelper(
+            frame_header_, segment_id, level_index, delta_lf,
+            deblock_filter_levels[segment_id][level_index]);
+      }
+    }
+  }
+}
+
+bool PostFilter::GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4,
+                                                    uint8_t* level, int* step,
+                                                    int* filter_length) const {
+  *step = kTransformHeight[inter_transform_sizes_[row4x4][column4x4]];
+  if (row4x4 == 0) return false;
+
+  const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4);
+  const int row4x4_prev = row4x4 - 1;
+  assert(row4x4_prev >= 0);
+  const BlockParameters* bp_prev =
+      block_parameters_.Find(row4x4_prev, column4x4);
+
+  if (bp == bp_prev) {
+    // Not a border.
+ if (!NonBlockBorderNeedsFilter(*bp, 1, level)) return false; + } else { + const uint8_t level_this = bp->deblock_filter_level[1]; + *level = level_this; + if (level_this == 0) { + const uint8_t level_prev = bp_prev->deblock_filter_level[1]; + if (level_prev == 0) return false; + *level = level_prev; + } + } + const int step_prev = + kTransformHeight[inter_transform_sizes_[row4x4_prev][column4x4]]; + *filter_length = std::min(*step, step_prev); + return true; +} + +void PostFilter::GetHorizontalDeblockFilterEdgeInfoUV( + int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step, + int* filter_length) const { + const int subsampling_x = subsampling_x_[kPlaneU]; + const int subsampling_y = subsampling_y_[kPlaneU]; + row4x4 = GetDeblockPosition(row4x4, subsampling_y); + column4x4 = GetDeblockPosition(column4x4, subsampling_x); + const BlockParameters* bp = block_parameters_.Find(row4x4, column4x4); + *level_u = 0; + *level_v = 0; + *step = kTransformHeight[bp->uv_transform_size]; + if (row4x4 == subsampling_y) { + return; + } + + bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0; + bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0; + assert(need_filter_u || need_filter_v); + const int filter_id_u = + kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeHorizontal]; + const int filter_id_v = + kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeHorizontal]; + const int row4x4_prev = row4x4 - (1 << subsampling_y); + assert(row4x4_prev >= 0); + const BlockParameters* bp_prev = + block_parameters_.Find(row4x4_prev, column4x4); + + if (bp == bp_prev) { + // Not a border. + const bool skip = bp->skip && bp->is_inter; + need_filter_u = + need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip; + need_filter_v = + need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip; + if (!need_filter_u && !need_filter_v) return; + if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u]; + if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v]; + *filter_length = *step; + return; + } + + // It is a border. + if (need_filter_u) { + const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u]; + *level_u = level_u_this; + if (level_u_this == 0) { + *level_u = bp_prev->deblock_filter_level[filter_id_u]; + } + } + if (need_filter_v) { + const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v]; + *level_v = level_v_this; + if (level_v_this == 0) { + *level_v = bp_prev->deblock_filter_level[filter_id_v]; + } + } + const int step_prev = kTransformHeight[bp_prev->uv_transform_size]; + *filter_length = std::min(*step, step_prev); +} + +bool PostFilter::GetVerticalDeblockFilterEdgeInfo( + int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level, + int* step, int* filter_length) const { + const BlockParameters* bp = *bp_ptr; + *step = kTransformWidth[inter_transform_sizes_[row4x4][column4x4]]; + if (column4x4 == 0) return false; + + const int filter_id = 0; + const int column4x4_prev = column4x4 - 1; + assert(column4x4_prev >= 0); + const BlockParameters* bp_prev = *(bp_ptr - 1); + if (bp == bp_prev) { + // Not a border. + if (!NonBlockBorderNeedsFilter(*bp, filter_id, level)) return false; + } else { + // It is a border. 
+ const uint8_t level_this = bp->deblock_filter_level[filter_id]; + *level = level_this; + if (level_this == 0) { + const uint8_t level_prev = bp_prev->deblock_filter_level[filter_id]; + if (level_prev == 0) return false; + *level = level_prev; + } + } + const int step_prev = + kTransformWidth[inter_transform_sizes_[row4x4][column4x4_prev]]; + *filter_length = std::min(*step, step_prev); + return true; +} + +void PostFilter::GetVerticalDeblockFilterEdgeInfoUV( + int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u, + uint8_t* level_v, int* step, int* filter_length) const { + const int subsampling_x = subsampling_x_[kPlaneU]; + column4x4 = GetDeblockPosition(column4x4, subsampling_x); + const BlockParameters* bp = *bp_ptr; + *level_u = 0; + *level_v = 0; + *step = kTransformWidth[bp->uv_transform_size]; + if (column4x4 == subsampling_x) { + return; + } + + bool need_filter_u = frame_header_.loop_filter.level[kPlaneU + 1] != 0; + bool need_filter_v = frame_header_.loop_filter.level[kPlaneV + 1] != 0; + assert(need_filter_u || need_filter_v); + const int filter_id_u = + kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical]; + const int filter_id_v = + kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical]; + const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x)); + + if (bp == bp_prev) { + // Not a border. + const bool skip = bp->skip && bp->is_inter; + need_filter_u = + need_filter_u && bp->deblock_filter_level[filter_id_u] != 0 && !skip; + need_filter_v = + need_filter_v && bp->deblock_filter_level[filter_id_v] != 0 && !skip; + if (!need_filter_u && !need_filter_v) return; + if (need_filter_u) *level_u = bp->deblock_filter_level[filter_id_u]; + if (need_filter_v) *level_v = bp->deblock_filter_level[filter_id_v]; + *filter_length = *step; + return; + } + + // It is a border. + if (need_filter_u) { + const uint8_t level_u_this = bp->deblock_filter_level[filter_id_u]; + *level_u = level_u_this; + if (level_u_this == 0) { + *level_u = bp_prev->deblock_filter_level[filter_id_u]; + } + } + if (need_filter_v) { + const uint8_t level_v_this = bp->deblock_filter_level[filter_id_v]; + *level_v = level_v_this; + if (level_v_this == 0) { + *level_v = bp_prev->deblock_filter_level[filter_id_v]; + } + } + const int step_prev = kTransformWidth[bp_prev->uv_transform_size]; + *filter_length = std::min(*step, step_prev); +} + +void PostFilter::HorizontalDeblockFilter(int row4x4_start, + int column4x4_start) { + const int column_step = 1; + const int src_step = 4 << pixel_size_log2_; + const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY); + uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start); + int row_step; + uint8_t level; + int filter_length; + + for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(column4x4_start + column4x4) < width_; + column4x4 += column_step, src += src_step) { + uint8_t* src_row = src; + for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(row4x4_start + row4x4) < height_; + row4x4 += row_step) { + const bool need_filter = GetHorizontalDeblockFilterEdgeInfo( + row4x4_start + row4x4, column4x4_start + column4x4, &level, &row_step, + &filter_length); + if (need_filter) { + const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeHorizontal]( + src_row, src_stride, outer_thresh_[level], inner_thresh_[level], + HevThresh(level)); + } + // TODO(chengchen): use shifts instead of multiplication. 
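+        // (|row_step| comes from kTransformHeight, which is always a power
+        // of two between 4 and 64, so the multiply could be replaced with a
+        // shift by FloorLog2(row_step).)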
+ src_row += row_step * src_stride; + row_step = DivideBy4(row_step); + } + } + + if (needs_chroma_deblock_) { + const int8_t subsampling_x = subsampling_x_[kPlaneU]; + const int8_t subsampling_y = subsampling_y_[kPlaneU]; + const int column_step = 1 << subsampling_x; + const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU); + const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV); + uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start); + uint8_t* src_v = GetSourceBuffer(kPlaneV, row4x4_start, column4x4_start); + int row_step; + uint8_t level_u; + uint8_t level_v; + int filter_length; + + for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(column4x4_start + column4x4) < width_; + column4x4 += column_step, src_u += src_step, src_v += src_step) { + uint8_t* src_row_u = src_u; + uint8_t* src_row_v = src_v; + for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(row4x4_start + row4x4) < height_; + row4x4 += row_step) { + GetHorizontalDeblockFilterEdgeInfoUV( + row4x4_start + row4x4, column4x4_start + column4x4, &level_u, + &level_v, &row_step, &filter_length); + if (level_u != 0) { + const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeHorizontal]( + src_row_u, src_stride_u, outer_thresh_[level_u], + inner_thresh_[level_u], HevThresh(level_u)); + } + if (level_v != 0) { + const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeHorizontal]( + src_row_v, src_stride_v, outer_thresh_[level_v], + inner_thresh_[level_v], HevThresh(level_v)); + } + src_row_u += row_step * src_stride_u; + src_row_v += row_step * src_stride_v; + row_step = DivideBy4(row_step << subsampling_y); + } + } + } +} + +void PostFilter::VerticalDeblockFilter(int row4x4_start, int column4x4_start) { + const ptrdiff_t row_stride = MultiplyBy4(frame_buffer_.stride(kPlaneY)); + const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY); + uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start); + int column_step; + uint8_t level; + int filter_length; + + BlockParameters* const* bp_row_base = + block_parameters_.Address(row4x4_start, column4x4_start); + const int bp_stride = block_parameters_.columns4x4(); + const int column_step_shift = pixel_size_log2_; + for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(row4x4_start + row4x4) < height_; + ++row4x4, src += row_stride, bp_row_base += bp_stride) { + uint8_t* src_row = src; + BlockParameters* const* bp = bp_row_base; + for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(column4x4_start + column4x4) < width_; + column4x4 += column_step, bp += column_step) { + const bool need_filter = GetVerticalDeblockFilterEdgeInfo( + row4x4_start + row4x4, column4x4_start + column4x4, bp, &level, + &column_step, &filter_length); + if (need_filter) { + const dsp::LoopFilterSize size = GetLoopFilterSizeY(filter_length); + dsp_.loop_filters[size][kLoopFilterTypeVertical]( + src_row, src_stride, outer_thresh_[level], inner_thresh_[level], + HevThresh(level)); + } + src_row += column_step << column_step_shift; + column_step = DivideBy4(column_step); + } + } + + if (needs_chroma_deblock_) { + const int8_t subsampling_x = subsampling_x_[kPlaneU]; + const int8_t subsampling_y = subsampling_y_[kPlaneU]; + const int row_step = 1 << subsampling_y; + uint8_t* src_u = GetSourceBuffer(kPlaneU, row4x4_start, column4x4_start); + uint8_t* src_v = GetSourceBuffer(kPlaneV, 
row4x4_start, column4x4_start); + const ptrdiff_t src_stride_u = frame_buffer_.stride(kPlaneU); + const ptrdiff_t src_stride_v = frame_buffer_.stride(kPlaneV); + const ptrdiff_t row_stride_u = MultiplyBy4(frame_buffer_.stride(kPlaneU)); + const ptrdiff_t row_stride_v = MultiplyBy4(frame_buffer_.stride(kPlaneV)); + const LoopFilterType type = kLoopFilterTypeVertical; + int column_step; + uint8_t level_u; + uint8_t level_v; + int filter_length; + + BlockParameters* const* bp_row_base = block_parameters_.Address( + GetDeblockPosition(row4x4_start, subsampling_y), + GetDeblockPosition(column4x4_start, subsampling_x)); + const int bp_stride = block_parameters_.columns4x4() << subsampling_y; + for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(row4x4_start + row4x4) < height_; + row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v, + bp_row_base += bp_stride) { + uint8_t* src_row_u = src_u; + uint8_t* src_row_v = src_v; + BlockParameters* const* bp = bp_row_base; + for (int column4x4 = 0; column4x4 < kNum4x4InLoopFilterUnit && + MultiplyBy4(column4x4_start + column4x4) < width_; + column4x4 += column_step, bp += column_step) { + GetVerticalDeblockFilterEdgeInfoUV(column4x4_start + column4x4, bp, + &level_u, &level_v, &column_step, + &filter_length); + if (level_u != 0) { + const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length); + dsp_.loop_filters[size][type]( + src_row_u, src_stride_u, outer_thresh_[level_u], + inner_thresh_[level_u], HevThresh(level_u)); + } + if (level_v != 0) { + const dsp::LoopFilterSize size = GetLoopFilterSizeUV(filter_length); + dsp_.loop_filters[size][type]( + src_row_v, src_stride_v, outer_thresh_[level_v], + inner_thresh_[level_v], HevThresh(level_v)); + } + src_row_u += column_step << column_step_shift; + src_row_v += column_step << column_step_shift; + column_step = DivideBy4(column_step << subsampling_x); + } + } + } +} + +void PostFilter::ApplyDeblockFilterForOneSuperBlockRow(int row4x4_start, + int sb4x4) { + assert(row4x4_start >= 0); + assert(DoDeblock()); + for (int y = 0; y < sb4x4; y += 16) { + const int row4x4 = row4x4_start + y; + if (row4x4 >= frame_header_.rows4x4) break; + int column4x4; + for (column4x4 = 0; column4x4 < frame_header_.columns4x4; + column4x4 += kNum4x4InLoopFilterUnit) { + // First apply vertical filtering + VerticalDeblockFilter(row4x4, column4x4); + + // Delay one superblock to apply horizontal filtering. + if (column4x4 != 0) { + HorizontalDeblockFilter(row4x4, column4x4 - kNum4x4InLoopFilterUnit); + } + } + // Horizontal filtering for the last 64x64 block. 
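+    // (Delaying the horizontal pass by one superblock guarantees that the
+    // vertical pass of the following superblock, which modifies pixels near
+    // their shared edge, has already run on every column the horizontal
+    // filter reads.)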
+    HorizontalDeblockFilter(row4x4, column4x4 - kNum4x4InLoopFilterUnit);
+  }
+}
+
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopFilterUnit,
+                                            std::memory_order_relaxed)) <
+         frame_header_.rows4x4) {
+    for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
+         column4x4 += kNum4x4InLoopFilterUnit) {
+      (this->*deblock_filter_func_[loop_filter_type])(row4x4, column4x4);
+    }
+  }
+}
+
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+    std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+    std::atomic<int>* row4x4_atomic);
+
+void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
+                                    int row4x4_start, int column4x4_start,
+                                    int column4x4_end, int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoDeblock());
+
+  column4x4_end = std::min(column4x4_end, frame_header_.columns4x4);
+  if (column4x4_start >= column4x4_end) return;
+
+  const DeblockFilter deblock_filter = deblock_filter_func_[loop_filter_type];
+  const int sb_height4x4 =
+      std::min(sb4x4, frame_header_.rows4x4 - row4x4_start);
+  for (int y = 0; y < sb_height4x4; y += kNum4x4InLoopFilterUnit) {
+    const int row4x4 = row4x4_start + y;
+    for (int column4x4 = column4x4_start; column4x4 < column4x4_end;
+         column4x4 += kNum4x4InLoopFilterUnit) {
+      (this->*deblock_filter)(row4x4, column4x4);
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/deblock_thresholds.inc b/src/post_filter/deblock_thresholds.inc
new file mode 100644
index 0000000..ca12aaa
--- /dev/null
+++ b/src/post_filter/deblock_thresholds.inc
@@ -0,0 +1,85 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Thresholds for the deblocking filter. Precomputed values of part of Section
+// 7.14.4 for all possible values of sharpness.
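+//
+// The entries match the spec formulas: with
+//   shift = (sharpness + 3) >> 2,
+//   inner = Clip3(level >> shift, 1, (sharpness > 0) ? 9 - sharpness : 63),
+//   outer = 2 * (level + 2) + inner.
+// For example, sharpness 0 and level 32 give inner = 32 and
+// outer = 2 * 34 + 32 = 100, matching kInnerThresh[0][32] and
+// kOuterThresh[0][32] below.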
+ +constexpr uint8_t kInnerThresh[8][kMaxLoopFilterValue + 1] = { + {1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, + {1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; + +constexpr uint8_t kOuterThresh[8][kMaxLoopFilterValue + 1] = { + {5, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, + 121, 124, 127, 130, 133, 136, 139, 142, 145, 148, 151, 154, 157, + 160, 163, 166, 169, 172, 175, 178, 181, 184, 187, 190, 193}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, + 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, + 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, + 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, + 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 34, + 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136}, + {5, 7, 9, 11, 14, 16, 19, 21, 24, 26, 29, 31, 33, + 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, + 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, + 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, + 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 31, + 33, 35, 37, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, + 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, + 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, + 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134}, + {5, 7, 9, 11, 13, 15, 17, 19, 22, 24, 26, 28, 
31,
+     33,  35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,
+     59,  61,  63,  65,  67,  69,  71,  73,  75,  77,  79,  81,  83,
+     85,  87,  89,  91,  93,  95,  97,  99,  101, 103, 105, 107, 109,
+     111, 113, 115, 117, 119, 121, 123, 125, 127, 129, 131, 133},
+    {5,   7,   9,   11,  13,  15,  17,  19,  22,  24,  26,  28,  30,
+     32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,
+     58,  60,  62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,
+     84,  86,  88,  90,  92,  94,  96,  98,  100, 102, 104, 106, 108,
+     110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132}};
diff --git a/src/post_filter/loop_restoration.cc b/src/post_filter/loop_restoration.cc
new file mode 100644
index 0000000..3d5da90
--- /dev/null
+++ b/src/post_filter/loop_restoration.cc
@@ -0,0 +1,172 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneRow(
+    const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+    const int plane_height, const int plane_width, const int unit_y,
+    const int unit_row, const int current_process_unit_height,
+    const int plane_unit_size, Pixel* dst_buffer) {
+  const int num_horizontal_units =
+      restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
+  const RestorationUnitInfo* const restoration_info =
+      restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
+                                               unit_row * num_horizontal_units);
+  const bool in_place = DoCdef() || thread_pool_ != nullptr;
+  const Pixel* border = nullptr;
+  src_buffer += unit_y * stride;
+  if (in_place) {
+    assert(loop_restoration_border_.stride(plane) ==
+           static_cast<int>(sizeof(Pixel) * stride));
+    const int border_unit_y = std::max(
+        RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+    border =
+        reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+        border_unit_y * stride;
+  }
+  int unit_column = 0;
+  int column = 0;
+  do {
+    const int current_process_unit_width =
+        std::min(plane_unit_size, plane_width - column);
+    const Pixel* src = src_buffer + column;
+    unit_column = std::min(unit_column, num_horizontal_units - 1);
+    if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
+      Pixel* dst = dst_buffer + column;
+      if (in_place) {
+        int k = current_process_unit_height;
+        do {
+          memmove(dst, src, current_process_unit_width * sizeof(Pixel));
+          src += stride;
+          dst += stride;
+        } while (--k != 0);
+      } else {
+        CopyPlane(src, stride, current_process_unit_width,
+                  current_process_unit_height, dst, stride);
+      }
+    } else {
+      const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+      const Pixel* bottom_border = src + current_process_unit_height * stride;
+      const bool frame_bottom_border =
+          (unit_y + current_process_unit_height >= plane_height);
+      if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+        const Pixel* loop_restoration_border = border + column;
+        if (unit_y != 0) {
+          top_border = loop_restoration_border;
+          loop_restoration_border += 4 * stride;
+        }
+        if (!frame_bottom_border) {
+          bottom_border =
+              loop_restoration_border + kRestorationVerticalBorder * stride;
+        }
+      }
+      RestorationBuffer restoration_buffer;
+      const LoopRestorationType type = restoration_info[unit_column].type;
+      assert(type == kLoopRestorationTypeSgrProj ||
+             type == kLoopRestorationTypeWiener);
+      const dsp::LoopRestorationFunc restoration_func =
+          dsp_.loop_restorations[type - 2];
+      restoration_func(restoration_info[unit_column], src, top_border,
+                       bottom_border, stride, current_process_unit_width,
+                       current_process_unit_height, &restoration_buffer,
+                       dst_buffer + column);
+    }
+    ++unit_column;
+    column += plane_unit_size;
+  } while (column < plane_width);
+}
+
+template <typename Pixel>
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+                                                         const int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoRestoration());
+  int plane = kPlaneY;
+  do {
+    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
+    const int unit_height_offset =
+        kRestorationUnitOffset >> subsampling_y_[plane];
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+    const int plane_width =
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
+    const int plane_process_unit_height =
+        kRestorationUnitHeight >> subsampling_y_[plane];
+    int y = (row4x4_start == 0)
+                ? 0
+                : (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) -
+                      unit_height_offset;
+    int expected_height = plane_process_unit_height -
+                          ((row4x4_start == 0) ? unit_height_offset : 0);
+    int current_process_unit_height;
+    for (int sb_y = 0; sb_y < sb4x4;
+         sb_y += 16, y += current_process_unit_height) {
+      if (y >= plane_height) break;
+      const int unit_row = std::min(
+          (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+          restoration_info_->num_vertical_units(static_cast<Plane>(plane)) -
+              1);
+      current_process_unit_height =
+          std::min(expected_height, plane_height - y);
+      expected_height = plane_process_unit_height;
+      ApplyLoopRestorationForOneRow<Pixel>(
+          reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+          static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+          current_process_unit_height, plane_unit_size,
+          reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
+              y * stride);
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::ApplyLoopRestoration(const int row4x4_start,
+                                      const int sb4x4) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth_ >= 10) {
+    ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
+    return;
+  }
+#endif
+  ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
+}
+
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  // Loop Restoration operates with a lag of 8 rows (4 for chroma with
+  // subsampling) and hence we need to make sure to cover the last 8 rows of
+  // the last superblock row. So we run this loop for an extra iteration to
+  // accomplish that.
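+  // (For example, with rows4x4 = 128 the workers claim row4x4 values 0, 16,
+  // ..., 112 and then one extra value, 128, so that the lagging rows of the
+  // final superblock row are restored as well.)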
+  const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+  while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+                                            std::memory_order_relaxed)) <
+         row4x4_end) {
+    CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+                                   /*for_loop_restoration=*/true);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+          row4x4, kNum4x4InLoopRestorationUnit);
+      continue;
+    }
+#endif
+    ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+        row4x4, kNum4x4InLoopRestorationUnit);
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/post_filter.cc b/src/post_filter/post_filter.cc
new file mode 100644
index 0000000..0eacf34
--- /dev/null
+++ b/src/post_filter/post_filter.cc
@@ -0,0 +1,601 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/post_filter.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/post_filter/deblock_thresholds.inc"
+
+// Row indices of loop restoration border. This is used to populate the
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The dimension is subsampling_y.
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
+
+}  // namespace
+
+// The following example illustrates how ExtendFrame() extends a frame.
+// Suppose the frame width is 8 and height is 4, and left, right, top, and
+// bottom are all equal to 3.
+//
+// Before:
+//
+// ABCDEFGH
+// IJKLMNOP
+// QRSTUVWX
+// YZabcdef
+//
+// After:
+//
+// AAA|ABCDEFGH|HHH  [3]
+// AAA|ABCDEFGH|HHH
+// AAA|ABCDEFGH|HHH
+// ---+--------+---
+// AAA|ABCDEFGH|HHH  [1]
+// III|IJKLMNOP|PPP
+// QQQ|QRSTUVWX|XXX
+// YYY|YZabcdef|fff
+// ---+--------+---
+// YYY|YZabcdef|fff  [2]
+// YYY|YZabcdef|fff
+// YYY|YZabcdef|fff
+//
+// ExtendFrame() first extends the rows to the left and to the right[1]. Then
+// it copies the extended last row to the bottom borders[2]. Finally it copies
+// the extended first row to the top borders[3].
+// static
+template <typename Pixel>
+void PostFilter::ExtendFrame(Pixel* const frame_start, const int width,
+                             const int height, const ptrdiff_t stride,
+                             const int left, const int right, const int top,
+                             const int bottom) {
+  Pixel* src = frame_start;
+  // Copy to left and right borders.
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src, width, left, right);
+    src += stride;
+  } while (--y != 0);
+  // Copy to bottom borders. For performance we copy |stride| pixels
+  // (including some padding pixels potentially) in each row, ending at the
+  // bottom right border pixel. In the diagram the asterisks indicate padding
+  // pixels.
+  //
+  // |<--- stride --->|
+  // **YYY|YZabcdef|fff <-- Copy from the extended last row.
+  // -----+--------+---
+  // **YYY|YZabcdef|fff
+  // **YYY|YZabcdef|fff
+  // **YYY|YZabcdef|fff <-- bottom right border pixel
+  assert(src == frame_start + height * stride);
+  Pixel* dst = src - left;
+  src = dst - stride;
+  for (int y = 0; y < bottom; ++y) {
+    memcpy(dst, src, sizeof(Pixel) * stride);
+    dst += stride;
+  }
+  // Copy to top borders. For performance we copy |stride| pixels (including
+  // some padding pixels potentially) in each row, starting from the top left
+  // border pixel. In the diagram the asterisks indicate padding pixels.
+  //
+  // +-- top left border pixel
+  // |
+  // v
+  // AAA|ABCDEFGH|HHH**
+  // AAA|ABCDEFGH|HHH**
+  // AAA|ABCDEFGH|HHH**
+  // ---+--------+-----
+  // AAA|ABCDEFGH|HHH** <-- Copy from the extended first row.
+  // |<--- stride --->|
+  src = frame_start - left;
+  dst = frame_start - left - top * stride;
+  for (int y = 0; y < top; ++y) {
+    memcpy(dst, src, sizeof(Pixel) * stride);
+    dst += stride;
+  }
+}
+
+template void PostFilter::ExtendFrame<uint8_t>(
+    uint8_t* const frame_start, const int width, const int height,
+    const ptrdiff_t stride, const int left, const int right, const int top,
+    const int bottom);
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void PostFilter::ExtendFrame<uint16_t>(
+    uint16_t* const frame_start, const int width, const int height,
+    const ptrdiff_t stride, const int left, const int right, const int top,
+    const int bottom);
+#endif
+
+PostFilter::PostFilter(const ObuFrameHeader& frame_header,
+                       const ObuSequenceHeader& sequence_header,
+                       FrameScratchBuffer* const frame_scratch_buffer,
+                       YuvBuffer* const frame_buffer, const dsp::Dsp* dsp,
+                       int do_post_filter_mask)
+    : frame_header_(frame_header),
+      loop_restoration_(frame_header.loop_restoration),
+      dsp_(*dsp),
+      // Deblocking filter always uses 64x64 as step size.
+      num_64x64_blocks_per_row_(DivideBy64(frame_header.width + 63)),
+      upscaled_width_(frame_header.upscaled_width),
+      width_(frame_header.width),
+      height_(frame_header.height),
+      bitdepth_(sequence_header.color_config.bitdepth),
+      subsampling_x_{0, sequence_header.color_config.subsampling_x,
+                     sequence_header.color_config.subsampling_x},
+      subsampling_y_{0, sequence_header.color_config.subsampling_y,
+                     sequence_header.color_config.subsampling_y},
+      planes_(sequence_header.color_config.is_monochrome
+                  ? kMaxPlanesMonochrome
+                  : kMaxPlanes),
+      pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+                                                         : sizeof(uint16_t)) -
+                       1),
+      inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
+      outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
+      needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
+                            frame_header.loop_filter.level[kPlaneV + 1] != 0),
+      cdef_index_(frame_scratch_buffer->cdef_index),
+      inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+      restoration_info_(&frame_scratch_buffer->loop_restoration_info),
+      superres_coefficients_{
+          frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+          frame_scratch_buffer
+              ->superres_coefficients
+                  [(sequence_header.color_config.is_monochrome ||
+                    sequence_header.color_config.subsampling_x == 0)
+                       ? kPlaneTypeY
+                       : kPlaneTypeUV]
+              .get()},
+      superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
+      block_parameters_(frame_scratch_buffer->block_parameters_holder),
+      frame_buffer_(*frame_buffer),
+      cdef_border_(frame_scratch_buffer->cdef_border),
+      loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
+      do_post_filter_mask_(do_post_filter_mask),
+      thread_pool_(
+          frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
+  const int8_t zero_delta_lf[kFrameLfCount] = {};
+  ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
+  if (DoSuperRes()) {
+    int plane = kPlaneY;
+    do {
+      const int downscaled_width =
+          SubsampledValue(width_, subsampling_x_[plane]);
+      const int upscaled_width =
+          SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+      const int superres_width = downscaled_width << kSuperResScaleBits;
+      super_res_info_[plane].step =
+          (superres_width + upscaled_width / 2) / upscaled_width;
+      const int error =
+          super_res_info_[plane].step * upscaled_width - superres_width;
+      super_res_info_[plane].initial_subpixel_x =
+          ((-((upscaled_width - downscaled_width) << (kSuperResScaleBits - 1)) +
+            DivideBy2(upscaled_width)) /
+               upscaled_width +
+           (1 << (kSuperResExtraBits - 1)) - error / 2) &
+          kSuperResScaleMask;
+      super_res_info_[plane].upscaled_width = upscaled_width;
+    } while (++plane < planes_);
+    if (dsp->super_res_coefficients != nullptr) {
+      int plane = kPlaneY;
+      const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+                                superres_coefficients_[kPlaneTypeUV])
+                                   ? kMaxPlanesMonochrome
+                                   : static_cast<int>(kNumPlaneTypes);
+      do {
+        dsp->super_res_coefficients(
+            SubsampledValue(upscaled_width_, subsampling_x_[plane]),
+            super_res_info_[plane].initial_subpixel_x,
+            super_res_info_[plane].step, superres_coefficients_[plane]);
+      } while (++plane < number_loops);
+    }
+  }
+  int plane = kPlaneY;
+  do {
+    loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
+    cdef_buffer_[plane] = frame_buffer_.data(plane);
+    superres_buffer_[plane] = frame_buffer_.data(plane);
+    source_buffer_[plane] = frame_buffer_.data(plane);
+  } while (++plane < planes_);
+  if (DoCdef() || DoRestoration() || DoSuperRes()) {
+    plane = kPlaneY;
+    const int pixel_size_log2 = pixel_size_log2_;
+    do {
+      int horizontal_shift = 0;
+      int vertical_shift = 0;
+      if (DoRestoration() &&
+          loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
+        horizontal_shift += frame_buffer_.alignment();
+        if (!DoCdef() && thread_pool_ == nullptr) {
+          vertical_shift += kRestorationVerticalBorder;
+        }
+        superres_buffer_[plane] +=
+            vertical_shift * frame_buffer_.stride(plane) +
+            (horizontal_shift << pixel_size_log2);
+      }
+      if (DoSuperRes()) {
+        vertical_shift += kSuperResVerticalBorder;
+      }
+      cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+                             (horizontal_shift << pixel_size_log2);
+      if (DoCdef() && thread_pool_ == nullptr) {
+        horizontal_shift += frame_buffer_.alignment();
+        vertical_shift += kCdefBorder;
+      }
+      assert(horizontal_shift <= frame_buffer_.right_border(plane));
+      assert(vertical_shift <= frame_buffer_.bottom_border(plane));
+      source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
+                               (horizontal_shift << pixel_size_log2);
+    } while (++plane < planes_);
+  }
+}
+
+void PostFilter::ExtendFrameBoundary(uint8_t* const frame_start,
+                                     const int width, const int height,
+                                     const ptrdiff_t stride, const int left,
+                                     const int right, const int top,
+                                     const int bottom) const {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (bitdepth_ >= 10) {
+    ExtendFrame<uint16_t>(reinterpret_cast<uint16_t*>(frame_start), width,
+                          height, stride / sizeof(uint16_t), left, right, top,
+                          bottom);
+    return;
+  }
+#endif
+  ExtendFrame<uint8_t>(frame_start, width, height, stride, left, right, top,
+                       bottom);
+}
+
+void PostFilter::ExtendBordersForReferenceFrame() {
+  if (frame_header_.refresh_frame_flags == 0) return;
+  int plane = kPlaneY;
+  do {
+    const int plane_width =
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+    assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
+           frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
+           frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
+           frame_buffer_.bottom_border(plane) >= kMinBottomBorderPixels);
+    // plane  subsampling_x_  left_border
+    // Y      N/A             64, 48
+    // U,V    0               64, 48
+    // U,V    1               32, 16
+    assert(frame_buffer_.left_border(plane) >= 16);
+    // The |left| argument to ExtendFrameBoundary() must be at least
+    // kMinLeftBorderPixels (13) for warp.
+    static_assert(16 >= kMinLeftBorderPixels, "");
+    ExtendFrameBoundary(
+        frame_buffer_.data(plane), plane_width, plane_height,
+        frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
+        frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
+        frame_buffer_.bottom_border(plane));
+  } while (++plane < planes_);
+}
+
+void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
+  assert(frame_buffer_.stride(plane) ==
+         loop_restoration_border_.stride(plane));
+  const ptrdiff_t stride = frame_buffer_.stride(plane);
+  const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
+  const int row_offset = DivideBy4(row4x4);
+  uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+  const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
+                                         subsampling_x_[plane]);
+  const int row_width = num_pixels << pixel_size_log2_;
+  int last_valid_row = -1;
+  const int plane_height =
+      SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+  int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+  const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+  for (int i = 0; i < 4; ++i, ++row) {
+    if (absolute_row + i >= plane_height) {
+      if (last_valid_row == -1) break;
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      row = last_valid_row;
+    }
+    memcpy(dst, src + row * stride, row_width);
+    last_valid_row = row;
+    dst += stride;
+  }
+}
+
+void PostFilter::CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
+                                                bool for_loop_restoration) {
+  // Number of rows to be subtracted from the start position described by
+  // row4x4. We always lag by 8 rows (to account for in-loop post filters).
+  const int row_offset = (row4x4 == 0) ? 0 : 8;
+  // Number of rows to be subtracted from the height described by sb4x4.
+  const int height_offset = (row4x4 == 0) ? 8 : 0;
+  // If cdef is off and post filter multithreading is off, then loop
+  // restoration needs 2 extra rows for the bottom border in each plane.
+  const int extra_rows =
+      (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+  int plane = kPlaneY;
+  do {
+    const int plane_width =
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+    const int row =
+        (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
+    assert(row >= 0);
+    if (row >= plane_height) break;
+    const int num_rows =
+        std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+                                 subsampling_y_[plane]) +
+                     extra_rows,
+                 plane_height - row);
+    // We only need to track the progress of the Y plane since the progress of
+    // the U and V planes will be inferred from the progress of the Y plane.
+    if (!for_loop_restoration && plane == kPlaneY) {
+      progress_row_ = row + num_rows;
+    }
+    const bool copy_bottom = row + num_rows == plane_height;
+    const int stride = frame_buffer_.stride(plane);
+    uint8_t* const start = (for_loop_restoration ? superres_buffer_[plane]
+                                                 : frame_buffer_.data(plane)) +
+                           row * stride;
+    const int left_border = for_loop_restoration
+                                ? kRestorationHorizontalBorder
+                                : frame_buffer_.left_border(plane);
+    const int right_border = for_loop_restoration
+                                 ? kRestorationHorizontalBorder
+                                 : frame_buffer_.right_border(plane);
+    const int top_border =
+        (row == 0) ? (for_loop_restoration ? kRestorationVerticalBorder
+                                           : frame_buffer_.top_border(plane))
+                   : 0;
+    const int bottom_border =
+        copy_bottom
+            ? (for_loop_restoration ? kRestorationVerticalBorder
+                                    : frame_buffer_.bottom_border(plane))
+            : 0;
+    ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
+                        right_border, top_border, bottom_border);
+  } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+  assert(row4x4 >= 0);
+  assert(!DoCdef());
+  assert(DoRestoration());
+  int plane = kPlaneY;
+  do {
+    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    assert(frame_buffer_.stride(plane) ==
+           loop_restoration_border_.stride(plane));
+    const ptrdiff_t stride = frame_buffer_.stride(plane);
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels =
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+    const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+    const int absolute_row =
+        (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+    const uint8_t* src =
+        GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) + row * stride;
+    uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * stride;
+    for (int i = 0; i < 4; ++i) {
+      memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      if (bitdepth_ >= 10) {
+        ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+                             kRestorationHorizontalBorder);
+      } else  // NOLINT.
+#endif
+        ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+                            kRestorationHorizontalBorder);
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      if (absolute_row + i < plane_height - 1) src += stride;
+      dst += stride;
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoCdef());
+  assert(DoRestoration());
+  for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+    const int row4x4 = row4x4_start + sb_y;
+    const int row_offset_start = DivideBy4(row4x4);
+    std::array<uint8_t*, kMaxPlanes> dst = {
+        loop_restoration_border_.data(kPlaneY) +
+            row_offset_start * loop_restoration_border_.stride(kPlaneY),
+        loop_restoration_border_.data(kPlaneU) +
+            row_offset_start * loop_restoration_border_.stride(kPlaneU),
+        loop_restoration_border_.data(kPlaneV) +
+            row_offset_start * loop_restoration_border_.stride(kPlaneV)};
+    // If SuperRes is enabled, then we apply SuperRes for the rows to be
+    // copied directly with |loop_restoration_border_| as the destination.
+    // Otherwise, we simply copy the rows.
+    if (DoSuperRes()) {
+      std::array<uint8_t*, kMaxPlanes> src;
+      std::array<int, kMaxPlanes> rows;
+      int plane = kPlaneY;
+      do {
+        if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+          rows[plane] = 0;
+          continue;
+        }
+        const int plane_height =
+            SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+        const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+        const int absolute_row =
+            (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+        src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+                     row * frame_buffer_.stride(plane);
+        rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+      } while (++plane < planes_);
+      ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      plane = kPlaneY;
+      do {
+        if (rows[plane] == 0 || rows[plane] >= 4) continue;
+        const ptrdiff_t stride = frame_buffer_.stride(plane);
+        uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+        const uint8_t* const src_line = dst_line - stride;
+        const int upscaled_width = super_res_info_[plane].upscaled_width
+                                   << pixel_size_log2_;
+        for (int i = rows[plane]; i < 4; ++i) {
+          memcpy(dst_line, src_line, upscaled_width);
+          dst_line += stride;
+        }
+      } while (++plane < planes_);
+    } else {
+      int plane = kPlaneY;
+      do {
+        CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+      } while (++plane < planes_);
+    }
+    // Extend the left and right boundaries needed for loop restoration.
+    int plane = kPlaneY;
+    do {
+      if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+        continue;
+      }
+      uint8_t* dst_line = dst[plane];
+      const int plane_width =
+          SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+      for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+        if (bitdepth_ >= 10) {
+          ExtendLine<uint16_t>(dst_line, plane_width,
+                               kRestorationHorizontalBorder,
+                               kRestorationHorizontalBorder);
+        } else  // NOLINT.
+#endif
+        {
+          ExtendLine<uint8_t>(dst_line, plane_width,
+                              kRestorationHorizontalBorder,
+                              kRestorationHorizontalBorder);
+        }
+        dst_line += loop_restoration_border_.stride(plane);
+      }
+    } while (++plane < planes_);
+  }
+}
+
+void PostFilter::RunJobs(WorkerFunction worker) {
+  std::atomic<int> row4x4(0);
+  const int num_workers = thread_pool_->num_threads();
+  BlockingCounter pending_workers(num_workers);
+  for (int i = 0; i < num_workers; ++i) {
+    thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+      (this->*worker)(&row4x4);
+      pending_workers.Decrement();
+    });
+  }
+  // Run the jobs on the current thread.
+  (this->*worker)(&row4x4);
+  // Wait for the threadpool jobs to finish.
+  pending_workers.Wait();
+}
+
+void PostFilter::ApplyFilteringThreaded() {
+  if (DoDeblock()) {
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+  }
+  if (DoCdef() && DoRestoration()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
+    }
+  }
+  if (DoCdef()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupCdefBorder(row4x4);
+    }
+    RunJobs(&PostFilter::ApplyCdefWorker);
+  }
+  if (DoSuperRes()) ApplySuperResThreaded();
+  if (DoRestoration()) {
+    if (!DoCdef()) {
+      int row4x4 = 0;
+      do {
+        SetupLoopRestorationBorder(row4x4);
+        row4x4 += kNum4x4InLoopFilterUnit;
+      } while (row4x4 < frame_header_.rows4x4);
+    }
+    RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+  }
+  ExtendBordersForReferenceFrame();
+}
+
+int PostFilter::ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4,
+                                                  bool is_last_row,
+                                                  bool do_deblock) {
+  if (row4x4 < 0) return -1;
+  if (DoDeblock() && do_deblock) {
+    ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4);
+  }
+  if (DoRestoration() && DoCdef()) {
+    SetupLoopRestorationBorder(row4x4, sb4x4);
+  }
+  if (DoCdef()) {
+    ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+  }
+  if (DoSuperRes()) {
+    ApplySuperResForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
+  }
+  if (DoRestoration()) {
+    CopyBordersForOneSuperBlockRow(row4x4, sb4x4, true);
+    ApplyLoopRestoration(row4x4, sb4x4);
+    if (is_last_row) {
+      // Loop restoration operates with a lag of 8 rows. So make sure to cover
+      // all the rows of the last superblock row.
+      CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, true);
+      ApplyLoopRestoration(row4x4 + sb4x4, 16);
+    }
+  }
+  if (frame_header_.refresh_frame_flags != 0 && DoBorderExtensionInLoop()) {
+    CopyBordersForOneSuperBlockRow(row4x4, sb4x4, false);
+    if (is_last_row) {
+      CopyBordersForOneSuperBlockRow(row4x4 + sb4x4, 16, false);
+    }
+  }
+  if (is_last_row && !DoBorderExtensionInLoop()) {
+    ExtendBordersForReferenceFrame();
+  }
+  return is_last_row ? height_ : progress_row_;
+}
+
+}  // namespace libgav1
diff --git a/src/post_filter/super_res.cc b/src/post_filter/super_res.cc
new file mode 100644
index 0000000..a70e4ed
--- /dev/null
+++ b/src/post_filter/super_res.cc
@@ -0,0 +1,199 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
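+
+// Implementation of the SuperRes pass of the post filter: rows are upscaled
+// horizontally to the upscaled frame width, either one superblock row at a
+// time (the in-loop, single-threaded path) or with the frame's rows split
+// across the thread pool (ApplySuperResThreaded()).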
+#include "src/post_filter.h"
+#include "src/utils/blocking_counter.h"
+
+namespace libgav1 {
+
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
+                               const std::array<int, kMaxPlanes>& rows,
+                               const int line_buffer_row,
+                               const std::array<uint8_t*, kMaxPlanes>& dst) {
+  int plane = kPlaneY;
+  do {
+    const int plane_width =
+        MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ >= 10) {
+      auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+      auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+      const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(uint16_t);
+      if (rows[plane] > 0) {
+        dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                       input, stride, rows[plane], plane_width,
+                       super_res_info_[plane].upscaled_width,
+                       super_res_info_[plane].initial_subpixel_x,
+                       super_res_info_[plane].step, output);
+      }
+      // In the multi-threaded case, the |superres_line_buffer_| holds the last
+      // input row. Apply SuperRes for that row.
+      if (line_buffer_row >= 0) {
+        auto* const line_buffer_start =
+            reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+            line_buffer_row * superres_line_buffer_.stride(plane) /
+                sizeof(uint16_t) +
+            kSuperResHorizontalBorder;
+        dsp_.super_res(
+            superres_coefficients_[static_cast<int>(plane != 0)],
+            line_buffer_start, /*stride=*/0,
+            /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+            super_res_info_[plane].initial_subpixel_x,
+            super_res_info_[plane].step, output + rows[plane] * stride);
+      }
+      continue;
+    }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    uint8_t* input = src[plane];
+    uint8_t* output = dst[plane];
+    if (rows[plane] > 0) {
+      dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                     input, frame_buffer_.stride(plane), rows[plane],
+                     plane_width, super_res_info_[plane].upscaled_width,
+                     super_res_info_[plane].initial_subpixel_x,
+                     super_res_info_[plane].step, output);
+    }
+    // In the multi-threaded case, the |superres_line_buffer_| holds the last
+    // input row. Apply SuperRes for that row.
+    if (line_buffer_row >= 0) {
+      uint8_t* const line_buffer_start =
+          superres_line_buffer_.data(plane) +
+          line_buffer_row * superres_line_buffer_.stride(plane) +
+          kSuperResHorizontalBorder;
+      dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                     line_buffer_start, /*stride=*/0,
+                     /*height=*/1, plane_width,
+                     super_res_info_[plane].upscaled_width,
+                     super_res_info_[plane].initial_subpixel_x,
+                     super_res_info_[plane].step,
+                     output + rows[plane] * frame_buffer_.stride(plane));
+    }
+  } while (++plane < planes_);
+}
+
+void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
+                                                  bool is_last_row) {
+  assert(row4x4_start >= 0);
+  assert(DoSuperRes());
+  // If not doing cdef, then LR needs two rows of border with superres applied.
+  const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
+  std::array<uint8_t*, kMaxPlanes> src;
+  std::array<uint8_t*, kMaxPlanes> dst;
+  std::array<int, kMaxPlanes> rows;
+  const int num_rows4x4 =
+      std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
+      (is_last_row ? 0 : 2);
+  if (row4x4_start > 0) {
+    const int row4x4 = row4x4_start - 2;
+    int plane = kPlaneY;
+    do {
+      const int row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+      const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+      src[plane] = cdef_buffer_[plane] + row_offset;
+      dst[plane] = superres_buffer_[plane] + row_offset;
+      // Note that the |num_rows_extra| subtraction is done after the value is
+      // subsampled since we always need to work on |num_rows_extra| extra rows
+      // irrespective of the plane subsampling.
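+      // For example (illustrative values): with 4:2:0 chroma subsampling
+      // (subsampling_y_ == 1) and |num_rows_extra| == 2, the assignment below
+      // yields (8 >> 1) - 2 = 2 rows for a chroma plane but
+      // (8 >> 0) - 2 = 6 rows for the luma plane.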
+      // Apply superres for the last 8-|num_rows_extra| rows of the previous
+      // superblock.
+      rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+      // Apply superres for the current superblock row (except for the last
+      // 8-|num_rows_extra| rows).
+      rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+                     (is_last_row ? 0 : num_rows_extra);
+    } while (++plane < planes_);
+  } else {
+    // Apply superres for the current superblock row (except for the last
+    // 8-|num_rows_extra| rows).
+    int plane = kPlaneY;
+    do {
+      const ptrdiff_t row_offset =
+          (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+          frame_buffer_.stride(plane);
+      src[plane] = cdef_buffer_[plane] + row_offset;
+      dst[plane] = superres_buffer_[plane] + row_offset;
+      // Note that the |num_rows_extra| addition is done after the value is
+      // subsampled since we always need to work on |num_rows_extra| extra rows
+      // irrespective of the plane subsampling.
+      rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+                    (is_last_row ? 0 : num_rows_extra);
+    } while (++plane < planes_);
+  }
+  ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
+}
+
+void PostFilter::ApplySuperResThreaded() {
+  int num_threads = thread_pool_->num_threads() + 1;
+  // The number of rows that will be processed by each thread in the thread
+  // pool (other than the current thread).
+  int thread_pool_rows = height_ / num_threads;
+  thread_pool_rows = std::max(thread_pool_rows, 1);
+  // Make rows of Y plane even when there is subsampling for the other planes.
+  if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+    ++thread_pool_rows;
+  }
+  // Adjust the number of threads to what we really need.
+  num_threads = Clip3(height_ / thread_pool_rows, 1, num_threads);
+  // For the current thread, we round up to process all the remaining rows.
+  int current_thread_rows = height_ - thread_pool_rows * (num_threads - 1);
+  // Make rows of Y plane even when there is subsampling for the other planes.
+  if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+    ++current_thread_rows;
+  }
+  assert(current_thread_rows > 0);
+  BlockingCounter pending_workers(num_threads - 1);
+  for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+       ++line_buffer_row, row_start += thread_pool_rows) {
+    std::array<uint8_t*, kMaxPlanes> src;
+    std::array<uint8_t*, kMaxPlanes> dst;
+    std::array<int, kMaxPlanes> rows;
+    int plane = kPlaneY;
+    const int pixel_size_log2 = pixel_size_log2_;
+    do {
+      src[plane] =
+          GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                          static_cast<Plane>(plane), row_start, 0);
+      dst[plane] =
+          GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+                          static_cast<Plane>(plane), row_start, 0);
+      rows[plane] =
+          (((line_buffer_row < num_threads - 1) ?
thread_pool_rows + : current_thread_rows) >> + subsampling_y_[plane]) - + 1; + const int plane_width = + MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane]; + uint8_t* const input = + src[plane] + rows[plane] * frame_buffer_.stride(plane); + uint8_t* const line_buffer_start = + superres_line_buffer_.data(plane) + + line_buffer_row * superres_line_buffer_.stride(plane) + + (kSuperResHorizontalBorder << pixel_size_log2); + memcpy(line_buffer_start, input, plane_width << pixel_size_log2); + } while (++plane < planes_); + if (line_buffer_row < num_threads - 1) { + thread_pool_->Schedule( + [this, src, rows, line_buffer_row, dst, &pending_workers]() { + ApplySuperRes(src, rows, line_buffer_row, dst); + pending_workers.Decrement(); + }); + } else { + ApplySuperRes(src, rows, line_buffer_row, dst); + } + } + // Wait for the threadpool jobs to finish. + pending_workers.Wait(); +} + +} // namespace libgav1 diff --git a/src/prediction_mask.cc b/src/prediction_mask.cc new file mode 100644 index 0000000..ab4d849 --- /dev/null +++ b/src/prediction_mask.cc @@ -0,0 +1,236 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/prediction_mask.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/utils/array_2d.h" +#include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace { + +constexpr int kWedgeDirectionTypes = 16; + +enum kWedgeDirection : uint8_t { + kWedgeHorizontal, + kWedgeVertical, + kWedgeOblique27, + kWedgeOblique63, + kWedgeOblique117, + kWedgeOblique153, +}; + +constexpr uint8_t kWedgeCodebook[3][16][3] = {{{kWedgeOblique27, 4, 4}, + {kWedgeOblique63, 4, 4}, + {kWedgeOblique117, 4, 4}, + {kWedgeOblique153, 4, 4}, + {kWedgeHorizontal, 4, 2}, + {kWedgeHorizontal, 4, 4}, + {kWedgeHorizontal, 4, 6}, + {kWedgeVertical, 4, 4}, + {kWedgeOblique27, 4, 2}, + {kWedgeOblique27, 4, 6}, + {kWedgeOblique153, 4, 2}, + {kWedgeOblique153, 4, 6}, + {kWedgeOblique63, 2, 4}, + {kWedgeOblique63, 6, 4}, + {kWedgeOblique117, 2, 4}, + {kWedgeOblique117, 6, 4}}, + {{kWedgeOblique27, 4, 4}, + {kWedgeOblique63, 4, 4}, + {kWedgeOblique117, 4, 4}, + {kWedgeOblique153, 4, 4}, + {kWedgeVertical, 2, 4}, + {kWedgeVertical, 4, 4}, + {kWedgeVertical, 6, 4}, + {kWedgeHorizontal, 4, 4}, + {kWedgeOblique27, 4, 2}, + {kWedgeOblique27, 4, 6}, + {kWedgeOblique153, 4, 2}, + {kWedgeOblique153, 4, 6}, + {kWedgeOblique63, 2, 4}, + {kWedgeOblique63, 6, 4}, + {kWedgeOblique117, 2, 4}, + {kWedgeOblique117, 6, 4}}, + {{kWedgeOblique27, 4, 4}, + {kWedgeOblique63, 4, 4}, + {kWedgeOblique117, 4, 4}, + {kWedgeOblique153, 4, 4}, + {kWedgeHorizontal, 4, 2}, + {kWedgeHorizontal, 4, 6}, + {kWedgeVertical, 2, 4}, + {kWedgeVertical, 6, 4}, + {kWedgeOblique27, 4, 2}, + {kWedgeOblique27, 4, 6}, + {kWedgeOblique153, 4, 2}, + {kWedgeOblique153, 4, 6}, + {kWedgeOblique63, 2, 4}, + {kWedgeOblique63, 6, 
4}, + {kWedgeOblique117, 2, 4}, + {kWedgeOblique117, 6, 4}}}; + +constexpr BitMaskSet kWedgeFlipSignMasks[9] = { + BitMaskSet(0xBBFF), // kBlock8x8 + BitMaskSet(0xBBEF), // kBlock8x16 + BitMaskSet(0xBAEF), // kBlock8x32 + BitMaskSet(0xBBEF), // kBlock16x8 + BitMaskSet(0xBBFF), // kBlock16x16 + BitMaskSet(0xBBEF), // kBlock16x32 + BitMaskSet(0xABEF), // kBlock32x8 + BitMaskSet(0xBBEF), // kBlock32x16 + BitMaskSet(0xBBFF) // kBlock32x32 +}; + +// This table (and the one below) contains a few leading zeros and trailing 64s +// to avoid some additional memcpys where it is actually used. +constexpr uint8_t kWedgeMasterObliqueOdd[kWedgeMaskMasterSize * 3 / 2] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37, + 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; + +constexpr uint8_t kWedgeMasterObliqueEven[kWedgeMaskMasterSize * 3 / 2] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, + 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; + +constexpr uint8_t kWedgeMasterVertical[kWedgeMaskMasterSize] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, + 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}; + +int BlockShape(BlockSize block_size) { + const int width = kNum4x4BlocksWide[block_size]; + const int height = kNum4x4BlocksHigh[block_size]; + if (height > width) return 0; + if (height < width) return 1; + return 2; +} + +uint8_t GetWedgeDirection(BlockSize block_size, int index) { + return kWedgeCodebook[BlockShape(block_size)][index][0]; +} + +uint8_t GetWedgeOffsetX(BlockSize block_size, int index) { + return kWedgeCodebook[BlockShape(block_size)][index][1]; +} + +uint8_t GetWedgeOffsetY(BlockSize block_size, int index) { + return kWedgeCodebook[BlockShape(block_size)][index][2]; +} + +} // namespace + +bool GenerateWedgeMask(WedgeMaskArray* const wedge_masks) { + // Generate master masks. + uint8_t master_mask[6][kWedgeMaskMasterSize][kWedgeMaskMasterSize]; + for (int y = 0; y < kWedgeMaskMasterSize; ++y) { + memcpy(master_mask[kWedgeVertical][y], kWedgeMasterVertical, + kWedgeMaskMasterSize); + } + + for (int y = 0, shift = 0; y < kWedgeMaskMasterSize; y += 2, ++shift) { + memcpy(master_mask[kWedgeOblique63][y], kWedgeMasterObliqueEven + shift, + kWedgeMaskMasterSize); + memcpy(master_mask[kWedgeOblique63][y + 1], kWedgeMasterObliqueOdd + shift, + kWedgeMaskMasterSize); + } + + for (int y = 0; y < kWedgeMaskMasterSize; ++y) { + for (int x = 0; x < kWedgeMaskMasterSize; ++x) { + const uint8_t mask_value = master_mask[kWedgeOblique63][y][x]; + master_mask[kWedgeHorizontal][x][y] = master_mask[kWedgeVertical][y][x]; + master_mask[kWedgeOblique27][x][y] = mask_value; + master_mask[kWedgeOblique117][y][kWedgeMaskMasterSize - 1 - x] = + 64 - mask_value; + master_mask[kWedgeOblique153][(kWedgeMaskMasterSize - 1 - x)][y] = + 64 - mask_value; + } + } + + // Generate wedge masks. 
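+  // Each wedge mask is a window into one of the 64x64 master masks, chosen so
+  // that the wedge boundary passes through the signaled offset. For example
+  // (illustrative values), a 16x8 block with GetWedgeOffsetX() == 4 starts
+  // the copy at column 32 - ((4 * 16) >> 3) = 24 of the master mask.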
+  int block_size_index = 0;
+  for (int size = kBlock8x8; size <= kBlock32x32; ++size) {
+    if (!kIsWedgeCompoundModeAllowed.Contains(size)) continue;
+
+    const int width = kBlockWidthPixels[size];
+    const int height = kBlockHeightPixels[size];
+    assert(width >= 8);
+    assert(width <= 32);
+    assert(height >= 8);
+    assert(height <= 32);
+
+    const auto block_size = static_cast<BlockSize>(size);
+    for (int wedge_index = 0; wedge_index < kWedgeDirectionTypes;
+         ++wedge_index) {
+      const uint8_t direction = GetWedgeDirection(block_size, wedge_index);
+      const uint8_t offset_x =
+          DivideBy2(kWedgeMaskMasterSize) -
+          ((GetWedgeOffsetX(block_size, wedge_index) * width) >> 3);
+      const uint8_t offset_y =
+          DivideBy2(kWedgeMaskMasterSize) -
+          ((GetWedgeOffsetY(block_size, wedge_index) * height) >> 3);
+
+      // Allocate the 2d array.
+      for (int flip_sign = 0; flip_sign < 2; ++flip_sign) {
+        if (!((*wedge_masks)[block_size_index][flip_sign][wedge_index].Reset(
+                height, width, /*zero_initialize=*/false))) {
+          LIBGAV1_DLOG(ERROR, "Failed to allocate memory for wedge masks.");
+          return false;
+        }
+      }
+
+      const auto flip_sign = static_cast<bool>(
+          kWedgeFlipSignMasks[block_size_index].Contains(wedge_index));
+      uint8_t* wedge_masks_row =
+          (*wedge_masks)[block_size_index][flip_sign][wedge_index][0];
+      uint8_t* wedge_masks_row_flip =
+          (*wedge_masks)[block_size_index][1 - flip_sign][wedge_index][0];
+      uint8_t* master_mask_row = &master_mask[direction][offset_y][offset_x];
+      for (int y = 0; y < height; ++y) {
+        memcpy(wedge_masks_row, master_mask_row, width);
+        for (int x = 0; x < width; ++x) {
+          wedge_masks_row_flip[x] = 64 - wedge_masks_row[x];
+        }
+        wedge_masks_row += width;
+        wedge_masks_row_flip += width;
+        master_mask_row += kWedgeMaskMasterSize;
+      }
+    }
+
+    block_size_index++;
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/prediction_mask.h b/src/prediction_mask.h
new file mode 100644
index 0000000..0134a0d
--- /dev/null
+++ b/src/prediction_mask.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_PREDICTION_MASK_H_
+#define LIBGAV1_SRC_PREDICTION_MASK_H_
+
+#include
+#include
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+constexpr BitMaskSet kIsWedgeCompoundModeAllowed(kBlock8x8, kBlock8x16,
+                                                 kBlock8x32, kBlock16x8,
+                                                 kBlock16x16, kBlock16x32,
+                                                 kBlock32x8, kBlock32x16,
+                                                 kBlock32x32);
+
+// This function generates wedge masks. It should be called only once for the
+// decoder. If the video is key frame only, we don't have to call this
+// function. Returns true on success, false on allocation failure.
+// 7.11.3.11.
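+// A minimal usage sketch (illustrative only; error handling depends on the
+// caller):
+//   WedgeMaskArray wedge_masks;
+//   if (!GenerateWedgeMask(&wedge_masks)) {
+//     // Allocation failed; surface an out-of-memory error to the caller.
+//   }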
+bool GenerateWedgeMask(WedgeMaskArray* wedge_masks); + +} // namespace libgav1 +#endif // LIBGAV1_SRC_PREDICTION_MASK_H_ diff --git a/src/quantizer.cc b/src/quantizer.cc new file mode 100644 index 0000000..cd720d6 --- /dev/null +++ b/src/quantizer.cc @@ -0,0 +1,269 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/quantizer.h" + +#include +#include + +#include "src/utils/common.h" +#include "src/utils/constants.h" + +#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10 +#error LIBGAV1_MAX_BITDEPTH must be 8 or 10 +#endif + +namespace libgav1 { +namespace { + +// Import all the constants in the anonymous namespace. +#include "src/quantizer_tables.inc" + +// Format the kDcLookup and kAcLookup arrays manually for easier comparison +// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2. + +// clang-format off +constexpr int16_t kDcLookup[][256] = { + // Lookup table for 8 bit. + { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, + 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, + 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57, + 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, + 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146, + 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208, + 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, + 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406, + 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590, + 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336 + }, +#if LIBGAV1_MAX_BITDEPTH >= 10 + // Lookup table for 10 bit. 
+ { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, + 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, + 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, + 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166, + 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212, + 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255, + 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297, + 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337, + 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412, + 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484, + 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584, + 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698, + 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, + 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, + 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, + 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, + 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, + 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363, + 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, + 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559, + 4737, 4929, 5130, 5347 + }, +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +}; + +constexpr int16_t kAcLookup[][256] = { + // Lookup table for 8 bit. + { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, + 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185, + 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, + 440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676, + 689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066, + 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, + 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692, + 1725, 1759, 1793, 1828 + }, +#if LIBGAV1_MAX_BITDEPTH >= 10 + // Lookup table for 10 bit. 
+    {
+        4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37,
+        40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83,
+        88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136,
+        140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190,
+        195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244,
+        249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297,
+        302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349,
+        354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401,
+        409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498,
+        506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596,
+        604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737,
+        749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905,
+        922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+        1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+        1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727,
+        1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159,
+        2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703,
+        2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391,
+        3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264,
+        4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372,
+        5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768,
+        6900, 7036, 7172, 7312
+    },
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+};
+// clang-format on
+
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+               int src_height) {
+  const int dst_width = src_height;
+  const int dst_height = src_width;
+  Array2DView<const uint8_t> source(src_height, src_width, src);
+  Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+  for (int y = 0; y < dst_height; ++y) {
+    for (int x = 0; x < dst_width; ++x) {
+      dest[y][x] = source[x][y];
+    }
+  }
+}
+
+// Copies the lower triangle and fills the upper triangle of |dst| using |src|
+// as the source.
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+  Array2DView<uint8_t> dest(size, size, dst);
+  int k = 0;
+  for (int y = 0; y < size; ++y) {
+    for (int x = 0; x <= y; ++x) {
+      dest[y][x] = dest[x][y] = src[k++];
+    }
+  }
+}
+
+}  // namespace
+
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+  for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+    for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+         ++plane_type) {
+      auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+      // Notes about how these matrices are populated:
+      // * For square transforms, we store only the lower left triangle (it is
+      //   symmetric about the main diagonal), so when populating the matrix we
+      //   will have to fill in the upper right triangle.
+      // * For rectangular transforms, the matrices are transposes when the
+      //   width and height are reversed. So when populating we populate it
+      //   with memcpy when w < h and populate it by transposing when w > h.
+      // * There is a special case for 16x16 where the matrix is the same as
+      //   32x32 with some offsets.
+      // * We use the "adjusted transform size" when using these matrices, so
+      //   we won't have to populate them for transform sizes with one of the
+      //   dimensions equal to 64.
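+      // For example, the 8x4 matrix is produced by transposing the stored 4x8
+      // matrix: entry (row, col) of the 8x4 matrix equals entry (col, row) of
+      // the 4x8 one, so only one matrix of each transposed pair is stored.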
+      for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+        if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+          continue;
+        }
+        const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+        if (!quantizer_matrix[tx_size].Resize(size)) {
+          return false;
+        }
+      }
+#define QUANTIZER_MEMCPY(W, H)                            \
+  memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+         kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H)                            \
+  Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+            kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE)                                 \
+  FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(),  \
+                    kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+      QUANTIZER_FILL_UPPER_TRIANGLE(4);   // 4x4
+      QUANTIZER_MEMCPY(4, 8);             // 4x8
+      QUANTIZER_MEMCPY(4, 16);            // 4x16
+      QUANTIZER_TRANSPOSE(8, 4);          // 8x4
+      QUANTIZER_FILL_UPPER_TRIANGLE(8);   // 8x8
+      QUANTIZER_MEMCPY(8, 16);            // 8x16
+      QUANTIZER_MEMCPY(8, 32);            // 8x32
+      QUANTIZER_TRANSPOSE(16, 4);         // 16x4
+      QUANTIZER_TRANSPOSE(16, 8);         // 16x8
+      QUANTIZER_MEMCPY(16, 32);           // 16x32
+      QUANTIZER_TRANSPOSE(32, 8);         // 32x8
+      QUANTIZER_TRANSPOSE(32, 16);        // 32x16
+      QUANTIZER_FILL_UPPER_TRIANGLE(32);  // 32x32
+      // 16x16.
+      Array2DView<uint8_t> dst16x16(
+          16, 16, quantizer_matrix[kTransformSize16x16].get());
+      Array2DView<const uint8_t> src32x32(
+          32, 32, quantizer_matrix[kTransformSize32x32].get());
+      for (int y = 0; y < 16; ++y) {
+        for (int x = 0; x < 16; ++x) {
+          dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+        }
+      }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+    }
+  }
+  return true;
+}
+
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
+  if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
+    const int segment_qindex =
+        base_qindex +
+        segmentation.feature_data[index][kSegmentFeatureQuantizer];
+    return Clip3(segment_qindex, kMinQuantizer, kMaxQuantizer);
+  }
+  return base_qindex;
+}
+
+Quantizer::Quantizer(int bitdepth, const QuantizerParameters* params)
+    : params_(*params) {
+  assert(bitdepth >= 8 && bitdepth <= LIBGAV1_MAX_BITDEPTH);
+  const int index = BitdepthToArrayIndex(bitdepth);
+  dc_lookup_ = kDcLookup[index];
+  ac_lookup_ = kAcLookup[index];
+}
+
+int Quantizer::GetDcValue(Plane plane, int qindex) const {
+  return dc_lookup_[Clip3(qindex + params_.delta_dc[plane], kMinQuantizer,
+                          kMaxQuantizer)];
+}
+
+int Quantizer::GetAcValue(Plane plane, int qindex) const {
+  return ac_lookup_[Clip3(qindex + params_.delta_ac[plane], kMinQuantizer,
+                          kMaxQuantizer)];
+}
+
+}  // namespace libgav1
diff --git a/src/quantizer.h b/src/quantizer.h
new file mode 100644
index 0000000..00c53ab
--- /dev/null
+++ b/src/quantizer.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_QUANTIZER_H_
+#define LIBGAV1_SRC_QUANTIZER_H_
+
+#include
+
+#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+using QuantizerMatrix = std::array<
+    std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+               kNumPlaneTypes>,
+    kNumQuantizerLevelsForQuantizerMatrix>;
+
+// Implements the dequantization functions of Section 7.12.2.
+class Quantizer {
+ public:
+  Quantizer(int bitdepth, const QuantizerParameters* params);
+
+  // Returns the quantizer value for the dc coefficient for the given plane.
+  // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+  // the |base_qindex| argument, and pass the return value as the |qindex|
+  // argument to this method.
+  int GetDcValue(Plane plane, int qindex) const;
+
+  // Returns the quantizer value for the ac coefficient for the given plane.
+  // The caller should call GetQIndex() with Tile::current_quantizer_index_ as
+  // the |base_qindex| argument, and pass the return value as the |qindex|
+  // argument to this method.
+  int GetAcValue(Plane plane, int qindex) const;
+
+ private:
+  const QuantizerParameters& params_;
+  const int16_t* dc_lookup_;
+  const int16_t* ac_lookup_;
+};
+
+// Initialize the quantizer matrix.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
+// Get the quantizer index for the |index|th segment.
+//
+// This function has two use cases. What should be passed as the |base_qindex|
+// argument depends on the use case.
+// 1. While parsing the uncompressed header or transform type, pass
+//    Quantizer::base_index.
+//    Note: In this use case, the caller only cares about whether the return
+//    value is zero.
+// 2. To generate the |qindex| argument to Quantizer::GetDcValue() or
+//    Quantizer::GetAcValue(), pass Tile::current_quantizer_index_.
+int GetQIndex(const Segmentation& segmentation, int index, int base_qindex);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_QUANTIZER_H_
diff --git a/src/quantizer_tables.inc b/src/quantizer_tables.inc
new file mode 100644
index 0000000..34342c4
--- /dev/null
+++ b/src/quantizer_tables.inc
@@ -0,0 +1,3080 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is just a convenience to separate out all the quantizer table
+// definitions from the quantizer functions.
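+
+// Each constant below is indexed as [level][plane_type][i], where |i| walks
+// the W*H coefficients of the corresponding transform size (for example,
+// kQuantizerMatrix4x8 stores 32 values per level and plane type).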
+ +constexpr uint8_t kQuantizerMatrix4x8 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = { + {{32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, + 91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97, + 142, 152, 91, 100, 145, 178, 104, 112, 146, 190}, + {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, + 54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}}, + {{32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, + 88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92, + 130, 144, 86, 95, 136, 167, 98, 105, 136, 177}, + {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, + 52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}}, + {{32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, + 85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84, + 118, 136, 82, 89, 129, 157, 92, 98, 127, 165}, + {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, + 50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}}, + {{32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, + 82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73, + 108, 129, 75, 81, 117, 148, 86, 92, 119, 154}, + {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, + 49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}}, + {{32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, + 48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144}, + {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, + 47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}}, + {{32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, + 45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136}, + {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, + 46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}}, + {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, + 41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111}, + {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, + 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}}, + {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, + 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97}, + {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, + 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}}, + {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, + 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83}, + {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, + 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}}, + {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, + 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67}, + {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, + 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}}, + {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, + 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56}, + {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, + 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}}, + {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, + 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48}, + {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, + 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}}, + {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36}, + {31, 31, 
35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, + 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}}, + {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33}, + {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}}, + {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix4x16 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = { + {{31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, + 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67, + 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133, + 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173, + 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197}, + {31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, + 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, + 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, + 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}}, + {{31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, + 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66, + 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119, + 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163, + 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183}, + {31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, + 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, + 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, + 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}}, + {{31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, + 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58, + 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105, + 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154, + 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170}, + {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, + 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, + 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, + 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}}, + {{31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, + 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51, + 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95, + 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, + 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159}, + {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, + 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, + 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, + 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}}, + {{31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, + 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50, + 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82, + 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, + 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148}, + {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, + 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, + 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, + 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}}, + {{31, 
36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, + 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, + 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, + 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136}, + {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, + 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, + 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, + 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}}, + {{31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, + 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, + 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, + 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118}, + {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, + 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, + 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, + 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}}, + {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, + 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, + 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, + 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97}, + {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, + 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, + 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, + 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}}, + {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, + 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, + 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, + 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87}, + {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, + 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, + 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, + 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}}, + {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, + 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, + 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, + 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67}, + {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, + 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, + 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, + 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}}, + {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, + 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, + 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, + 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58}, + {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, + 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, + 38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, + 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}}, + {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, + 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, + 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48}, + {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, + 31, 32, 40, 
46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, + 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, + 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}}, + {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, + 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37}, + {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, + 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, + 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, + 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}}, + {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33}, + {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, + 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, + 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}}, + {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix8x16 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = { + {{32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, + 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37, + 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86, + 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, + 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, + 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141, + 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88, + 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171, + 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138, + 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203}, + {32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, + 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, + 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, + 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, + 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84, + 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94, + 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104, + 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}}, + {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, + 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35, + 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83, + 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, + 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, + 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125, + 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84, + 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161, + 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129, + 151, 171, 
181, 110, 100, 98, 111, 127, 147, 169, 188}, + {32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, + 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, + 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, + 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, + 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82, + 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91, + 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101, + 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}}, + {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, + 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34, + 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81, + 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, + 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, + 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118, + 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80, + 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151, + 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, + 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175}, + {32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, + 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, + 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, + 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, + 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81, + 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89, + 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98, + 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}}, + {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, + 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, + 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60, + 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 53, 66, + 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61, + 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67, + 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87, + 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157, + 97, 88, 86, 97, 111, 128, 147, 163}, + {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, + 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, + 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, + 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, + 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79, + 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87, + 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95, + 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}}, + {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, + 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, + 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58, + 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59, + 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53, + 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60, + 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, + 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, + 91, 82, 80, 90, 103, 119, 137, 151}, + {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, + 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, + 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, + 48, 46, 49, 54, 57, 
60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, + 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77, + 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85, + 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92, + 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}}, + {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, + 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, + 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, + 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, + 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, + 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, + 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136}, + {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, + 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, + 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, + 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, + 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, + 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, + 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}}, + {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, + 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, + 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, + 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, + 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86, + 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99, + 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112, + 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127}, + {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, + 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, + 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, + 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, + 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66, + 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73, + 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79, + 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}}, + {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, + 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, + 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, + 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, + 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76, + 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85, + 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, + 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105}, + {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, + 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, + 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, + 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, + 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61, + 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67, + 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, + 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}}, + {{32, 31, 32, 32, 36, 44, 47, 53, 
31, 32, 32, 33, 35, 42, 45, 51, + 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, + 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, + 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, + 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, + 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71, + 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77, + 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87}, + {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, + 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, + 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, + 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, + 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, + 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61, + 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64, + 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}}, + {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, + 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, + 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, + 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, + 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55, + 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60, + 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, + 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70}, + {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, + 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, + 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, + 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, + 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53, + 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56, + 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, + 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}}, + {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, + 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, + 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48, + 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54, + 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63}, + {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, + 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, + 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, + 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, + 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49, + 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53, + 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}}, + {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, + 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41, + 34, 
34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, + 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48}, + {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, + 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, + 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49, + 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, + 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38}, + {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, + 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, + 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, + 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46, + 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}}, + {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34}, + {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, + 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, + 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix8x32 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = { + {{32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, + 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, + 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35, + 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, + 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, + 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, + 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46, + 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107, + 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120, + 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97, + 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, + 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, + 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, + 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, + 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, + 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191, + 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159, + 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119, + 136, 156, 179, 204}, + {32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, + 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, + 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, + 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, + 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64, + 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67, + 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71, + 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76, + 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, + 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, + 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, + 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, + 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106, + 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109, + 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108}}, + {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, + 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, + 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35, + 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85, + 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, + 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, + 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45, + 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102, + 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110, + 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, + 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, + 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, + 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, + 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101, + 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 
171, 98, 88, + 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179, + 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149, + 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111, + 127, 145, 166, 190}, + {32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, + 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, + 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, + 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, + 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63, + 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66, + 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70, + 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74, + 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, + 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, + 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, + 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, + 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, + 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103, + 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105, + 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105}}, + {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, + 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, + 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34, + 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82, + 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, + 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, + 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42, + 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98, + 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103, + 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, + 91, 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, + 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, + 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135, + 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96, + 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83, + 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168, + 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139, + 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104, + 118, 135, 155, 176}, + {32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, + 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, + 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, + 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, + 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62, + 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65, + 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68, + 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73, + 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, + 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, + 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, + 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, + 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, + 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100, + 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102, + 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}}, + {{32, 31, 35, 44, 53, 65, 82, 90, 
31, 32, 35, 42, 51, 62, + 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, + 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33, + 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79, + 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, + 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, + 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40, + 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94, + 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, + 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, + 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, + 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124, + 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113, + 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92, + 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78, + 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157, + 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, + 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, + 110, 126, 144, 163}, + {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, + 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, + 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, + 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, + 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61, + 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64, + 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67, + 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71, + 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, + 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, + 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, + 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, + 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, + 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97, + 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99, + 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}}, + {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, + 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, + 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, + 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77, + 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, + 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, + 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38, + 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90, + 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, + 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, + 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, + 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118, + 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106, + 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81, + 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73, + 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, + 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, + 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, + 103, 117, 134, 152}, + {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, + 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, + 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, + 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, + 41, 43, 48, 49, 49, 52, 57, 
59, 42, 43, 48, 49, 50, 53, 57, 60, + 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63, + 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66, + 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70, + 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, + 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, + 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, + 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, + 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94, + 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96, + 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}}, + {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, + 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, + 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, + 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, + 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, + 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78, + 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, + 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, + 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, + 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, + 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, + 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, + 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, + 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, + 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141}, + {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, + 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, + 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, + 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, + 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, + 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60, + 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, + 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, + 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, + 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, + 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, + 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, + 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, + 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90, + 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}}, + {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, + 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, + 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, + 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, + 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64, + 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69, + 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74, + 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79, + 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 
67, 76, 83, 91, + 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, + 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, + 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, + 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114, + 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119, + 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127}, + {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, + 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, + 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, + 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, + 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53, + 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56, + 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59, + 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62, + 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, + 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, + 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, + 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, + 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80, + 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83, + 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}}, + {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, + 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, + 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, + 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, + 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, + 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62, + 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, 46, 50, 56, 65, + 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68, + 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, + 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, + 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, + 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, + 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98, + 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105, + 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109}, + {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, + 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, + 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, + 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, + 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, + 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52, + 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54, + 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57, + 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, + 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, + 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, + 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, + 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73, + 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76, + 57, 52, 
50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}}, + {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, + 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, + 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, + 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, + 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49, + 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, + 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, + 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57, + 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, + 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, + 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, + 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, + 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81, + 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83, + 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87}, + {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, + 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, + 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, + 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, + 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47, + 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, + 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, + 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52, + 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, + 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, + 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, + 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, + 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, + 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67, + 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}}, + {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, + 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, + 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, + 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, + 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, + 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45, + 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, + 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, + 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, + 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, + 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, + 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, + 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67, + 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70, + 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73}, + {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, + 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, + 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, + 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, + 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 
47, 46, 46, + 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46, + 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, + 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, + 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, + 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, + 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, + 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, + 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60, + 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, + 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}}, + {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, + 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, + 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, + 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41, + 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, + 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42, + 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, + 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, + 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, + 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, + 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60, + 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63}, + {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, + 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, + 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, + 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, + 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45, + 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, + 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45, + 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, + 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, + 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, + 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, + 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57, + 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}}, + {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, + 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35, + 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, + 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, + 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 
32, 33, 33, 34, 36, 36, 39, 40, + 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, + 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, + 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47, + 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, + 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49}, + {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, + 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, + 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, + 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46, + 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, + 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, + 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, + 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, + 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, + 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52, + 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, + 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, + 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34, + 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, + 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37, + 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38}, + {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, + 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, + 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40, + 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, + 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, + 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42, + 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, + 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47, + 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}}, + {{32, 31, 
31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34}, + {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, + 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, + 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, + 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, + 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, + 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, + 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}}; +constexpr uint8_t kQuantizerMatrix16x32 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = { + {{32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, + 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, + 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72, + 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54, + 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, + 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, + 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34, + 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92, + 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, + 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, + 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, + 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75, + 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65, + 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, + 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, + 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, + 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111, + 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118, + 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118, + 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103, + 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90, + 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, + 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, + 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, + 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, + 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, + 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, + 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123, + 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100, + 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90, + 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97, + 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198, + 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188, + 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167, + 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151, + 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136, + 136, 156, 156, 178, 179, 203, 204, 217}, + {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, + 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, + 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, + 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, + 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65, + 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63, + 42, 43, 47, 48, 50, 49, 50, 
52, 53, 57, 60, 58, 59, 60, 62, 63, + 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61, + 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, + 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, + 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, + 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, + 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, + 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72, + 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73, + 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77, + 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77, + 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, + 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, + 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, + 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, + 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, + 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91, + 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96, + 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97, + 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101, + 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102, + 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107, + 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107, + 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, + 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, + 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113}}, + {{32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, + 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, + 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67, + 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, 44, 49, + 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, + 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, + 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33, + 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89, + 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, + 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, + 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, + 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65, + 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63, + 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, + 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, + 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106, + 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107, + 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114, + 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107, + 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, + 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, + 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, + 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, + 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136, + 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144, + 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147, + 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130, + 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116, + 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95, + 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 
98, 89, 88, 85, + 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, + 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, + 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, + 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, + 173, 177, 188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, + 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126, + 127, 145, 145, 166, 166, 189, 190, 201}, + {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, + 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, + 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, + 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, + 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64, + 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62, + 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62, + 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60, + 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, + 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, + 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, + 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, + 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, + 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71, + 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71, + 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75, + 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75, + 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, + 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, + 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, + 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, + 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, + 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89, + 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94, + 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94, + 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98, + 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99, + 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, + 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, + 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, + 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, + 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109}}, + {{32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, + 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, + 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60, + 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49, + 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, + 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, + 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33, + 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86, + 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79, + 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, + 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, + 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65, + 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55, + 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, + 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, + 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102, + 49, 46, 46, 
47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102, + 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105, + 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, + 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, + 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, + 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, + 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65, + 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129, + 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136, + 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137, + 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, + 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, + 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, + 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, + 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87, + 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173, + 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164, + 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 146, + 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131, + 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118, + 118, 135, 135, 154, 155, 175, 176, 187}, + {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, + 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, + 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, + 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, + 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63, + 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61, + 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61, + 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59, + 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60, + 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, + 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, + 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, + 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, + 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70, + 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70, + 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73, + 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74, + 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, + 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, + 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, + 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, + 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, + 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87, + 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91, + 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91, + 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95, + 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, + 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, + 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, + 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, + 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, + 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}}, + {{32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, + 90, 93, 31, 32, 32, 32, 35, 
35, 42, 45, 51, 59, 62, 75, + 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58, + 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44, + 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, + 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, + 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33, + 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83, + 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, + 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, + 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, + 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56, + 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50, + 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, + 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, + 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97, + 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98, + 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, + 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, + 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, + 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63, + 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57, + 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60, + 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122, + 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129, + 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, + 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, + 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, + 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81, + 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76, + 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82, + 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161, + 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153, + 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, + 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, + 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, + 110, 126, 126, 144, 144, 163, 163, 173}, + {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, + 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, + 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, + 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, + 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, + 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, + 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, + 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, + 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53, + 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56, + 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60, + 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65, + 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68, + 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70, + 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71, + 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72, + 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54, + 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51, + 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, + 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 
54, 52, + 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, + 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, + 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, + 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, + 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, + 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, + 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86, + 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88, + 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92, + 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98, + 98, 102}}, + {{32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, + 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, + 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, + 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41, + 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, + 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, + 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32, + 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80, + 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, + 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, + 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, + 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54, + 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48, + 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, + 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, + 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93, + 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91, + 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, + 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, + 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, + 78, 80, 90, 91, 101, 103, 105, 107, 53, 49, 49, 50, 53, 60, + 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53, + 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54, + 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, + 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, + 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, + 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97, + 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86, + 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79, + 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71, + 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, + 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, + 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, + 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, + 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114, + 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102, + 103, 117, 117, 134, 134, 151, 152, 160}, + {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, + 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, + 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, + 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, + 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, + 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, + 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, + 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, + 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 
47, 48, 50, 49, 49, 50, + 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54, + 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57, + 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60, + 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65, + 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68, + 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70, + 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71, + 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52, + 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, + 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, + 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, + 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, + 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, + 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, + 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, + 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, + 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, + 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84, + 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85, + 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89, + 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95, + 95, 98}}, + {{32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, + 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, + 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, + 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41, + 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, + 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, + 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32, + 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, + 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, + 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, + 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, + 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48, + 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48, + 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, + 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, + 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89, + 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, + 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, + 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, + 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, + 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56, + 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50, + 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, + 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, + 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, + 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, + 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, + 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79, + 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, + 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, + 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72, + 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141, + 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, + 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121, + 121, 136, 136, 144, 
82, 75, 75, 72, 72, 81, 81, 92, 92, 106, + 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, + 96, 109, 109, 124, 124, 141, 141, 149}, + {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, + 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, + 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, + 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, + 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, + 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, + 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, + 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, + 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49, + 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, + 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, + 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57, + 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, + 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64, + 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, + 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69, + 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50, + 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, + 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, + 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, + 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, + 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, + 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, + 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, + 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, + 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, + 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, + 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, + 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83, + 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92, + 92, 95}}, + {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, + 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, + 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, + 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, + 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72, + 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71, + 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71, + 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70, + 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, + 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, + 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, + 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, + 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, + 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78, + 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81, + 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84, + 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86, + 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, + 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, + 48, 45, 45, 46, 46, 51, 56, 59, 67, 
67, 76, 76, 83, 85, 91, 96, + 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, + 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, + 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103, + 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108, + 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110, + 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114, + 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118, + 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120, + 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125, + 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133}, + {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, + 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, + 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, + 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, + 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, + 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, + 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, + 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, + 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46, + 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, + 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50, + 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53, + 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56, + 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58, + 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62, + 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65, + 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49, + 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, + 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, + 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, + 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, + 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, + 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, + 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, + 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, + 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, + 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72, + 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73, + 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, + 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82, + 86, 89}}, + {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, + 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, + 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, + 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, + 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, + 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, + 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59, + 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59, + 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 
53, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, + 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, + 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, + 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65, + 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68, + 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68, + 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72, + 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, + 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, + 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, + 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, + 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, + 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85, + 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88, + 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92, + 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92, + 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, + 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, + 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, + 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, + 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, + 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109}, + {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, + 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, + 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, + 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, + 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, + 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, + 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, + 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, + 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47, + 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, + 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48, + 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50, + 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52, + 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, + 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57, + 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59, + 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48, + 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, + 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, + 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, + 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, + 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, + 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, + 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, + 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, + 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, + 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63, + 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, + 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 
64, 64, 69, 70, + 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74, + 78, 78}}, + {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, + 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, + 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, + 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, + 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, + 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, + 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, + 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, + 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37, + 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, + 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42, + 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44, + 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, + 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54, + 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57, + 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, + 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38, + 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38, + 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, + 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, + 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, + 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, + 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, + 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, + 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, + 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, + 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71, + 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72, + 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, + 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87, + 87, 92}, + {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, + 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, + 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, + 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, + 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, + 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, + 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, + 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, + 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47, + 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, + 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45, + 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47, + 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, + 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50, + 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52, + 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, + 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48, + 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, + 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, + 46, 46, 
47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, + 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, + 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, + 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, + 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, + 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, + 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, + 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, + 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61, + 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, + 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68, + 68, 71}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, + 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, + 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, + 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, + 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36, + 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, + 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37, + 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, + 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42, + 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44, + 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, + 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54, + 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35, + 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, + 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, + 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, + 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, + 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, + 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, + 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, + 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, + 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, + 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56, + 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, + 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67, + 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69, + 73, 79}, + {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, + 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, + 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, + 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, + 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, + 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, + 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, + 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, + 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, + 47, 47, 46, 46, 46, 46, 47, 33, 35, 
36, 36, 38, 43, 43, 44, 46, 47, + 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47, + 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45, + 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, + 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50, + 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47, + 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, + 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, + 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, + 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, + 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, + 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, + 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, + 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, + 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, + 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, + 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, + 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, + 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60, + 62, 65}}, + {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, + 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, + 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, + 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37, + 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, + 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, + 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34, + 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, + 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, + 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, + 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, + 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, + 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, + 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, + 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, + 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, + 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, + 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52, + 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, + 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58, + 63, 
63}, + {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, + 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, + 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, + 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, + 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, + 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, + 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, + 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, + 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41, + 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, + 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, + 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, + 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47, + 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, + 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45, + 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42, + 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, + 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, + 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, + 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, + 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, + 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, + 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, + 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, + 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, + 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, + 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53, + 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, + 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56, + 58, 58}}, + {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, + 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, + 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32, + 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, + 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, + 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, + 33, 33, 34, 35, 36, 36, 36, 37, 
39, 40, 40, 40, 33, 33, 33, 33, 33, + 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, + 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, + 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, + 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, + 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, + 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, + 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, + 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49, + 49, 49}, + {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, + 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, + 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, + 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, + 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, + 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, + 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, + 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, + 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, + 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, + 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41, + 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44, + 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, + 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, + 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47, + 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47, + 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37, + 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, + 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, + 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, + 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, + 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, + 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, + 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, + 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, + 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, + 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, + 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, + 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, + 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53, + 53, 53}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 33, + 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, + 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, + 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, + 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, + 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, + 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, + 38, 39}, + {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, + 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, + 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, + 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, + 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34, + 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, + 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, + 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, + 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, + 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40, + 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42, + 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44, + 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, + 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, + 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, + 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, + 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, + 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, + 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, + 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, + 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, + 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, + 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47, + 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47, + 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, + 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 48, 48}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, + 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, + 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36, + 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, + 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, + 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, + 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 
33, 34, 34, + 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, + 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, + 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, + 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, + 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, + 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, + 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40, + 42, 44}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32}}}; +constexpr uint8_t + kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes] + [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200}, + {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}}, + {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184}, + {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}}, + {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169}, + {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}}, + {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156}, + {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}}, + {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140}, + {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}}, + {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134}, + {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}}, + {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108}, + {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}}, + {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92}, + {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}}, + {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81}, + {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}}, + {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65}, + {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}}, + {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54}, + {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}}, + {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46}, + {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}}, + {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35}, + {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}}, + {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33}, + {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}}, + {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}}; +constexpr uint8_t kQuantizerMatrix8x8 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = { + {{32, 32, 35, 38, 40, 54, 51, 49, 65, 82, 68, 63, + 78, 97, 117, 84, 76, 91, 111, 134, 152, 95, 89, 98, + 113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220}, + {31, 38, 47, 47, 46, 54, 50, 47, 57, 66, 57, 52, + 61, 72, 82, 63, 57, 66, 77, 88, 96, 67, 62, 67, + 75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}}, + {{32, 32, 35, 37, 39, 51, 47, 46, 60, 73, 62, 58, + 71, 87, 105, 78, 72, 84, 100, 121, 140, 90, 84, 93, + 106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201}, + {31, 38, 47, 47, 47, 53, 48, 46, 55, 62, 54, 50, + 58, 67, 76, 61, 55, 63, 72, 83, 91, 66, 61, 65, + 73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}}, + {{32, 32, 34, 35, 37, 48, 46, 45, 56, 70, 57, 54, + 64, 80, 
93, 76, 70, 79, 96, 111, 134, 85, 79, 87, + 100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184}, + {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49, + 55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63, + 71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}}, + {{32, 32, 33, 35, 36, 46, 42, 42, 52, 63, 53, 51, + 60, 73, 86, 68, 64, 72, 84, 100, 117, 78, 74, 80, + 92, 109, 128, 140, 90, 84, 87, 98, 114, 133, 155, 168}, + {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48, + 54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61, + 68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}}, + {{32, 32, 33, 34, 35, 39, 39, 40, 46, 56, 50, 48, + 53, 65, 78, 62, 59, 63, 75, 90, 105, 76, 71, 74, + 86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153}, + {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47, + 50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57, + 65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}}, + {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54, 46, 45, + 51, 61, 71, 56, 54, 58, 69, 80, 92, 68, 64, 68, + 78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140}, + {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46, + 50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55, + 61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}}, + {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42, 41, + 43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56, 56, + 66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119}, + {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45, + 45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49, + 56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}}, + {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37, + 40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49, + 56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105}, + {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45, + 46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47, + 52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}}, + {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34, + 37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45, + 51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85}, + {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46, + 47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46, + 50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}}, + {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34, + 35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42, + 42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71}, + {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43, + 45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45, + 45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}}, + {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33, + 34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37, + 38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58}, + {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41, + 42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46, + 47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}}, + {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32, + 32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33, + 35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48}, + {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37, + 39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42, + 45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}}, + {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, + 32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38}, + {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34, + 35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39, + 40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}}, + {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, + 31, 
31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}}, + {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32}, + {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}}; +constexpr uint8_t kQuantizerMatrix32x32 + [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = { + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33, + 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39, + 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38, + 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44, + 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43, + 44, 44, 49, 52, 55, 59, 65, 67, 48, 46, 44, 45, 45, 46, + 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54, + 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58, + 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55, + 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59, + 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71, + 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, + 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, + 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70, + 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, + 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, + 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80, + 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, + 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, + 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85, + 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, + 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89, + 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, + 166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, + 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, + 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, + 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, + 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, + 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200, + 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, + 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, + 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, + 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, + 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, + 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, + 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107, + 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, + 177, 190, 191, 204, 206, 222, 224, 230, 232, 242}, + {32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37, + 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48, + 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48, + 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49, + 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46, + 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46, + 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50, + 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52, + 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49, + 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51, + 51, 50, 53, 55, 
56, 60, 63, 65, 67, 70, 73, 75, 76, 60, + 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, + 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, + 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55, + 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, + 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, + 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60, + 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, + 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, + 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62, + 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, + 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63, + 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, + 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, + 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, + 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, + 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106, + 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, + 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109, + 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, + 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, + 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, + 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, + 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, + 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, + 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66, + 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, + 97, 102, 102, 107, 107, 112, 113, 115, 115, 118}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36, + 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38, + 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39, + 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 41, 41, + 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45, + 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48, + 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51, + 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53, + 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56, + 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65, + 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, + 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, + 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66, + 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127, + 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, + 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73, + 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, + 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, + 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81, + 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128, + 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85, + 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, + 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, + 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, + 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, + 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176, + 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, + 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184, + 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112, + 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 
180, 184, 186, + 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, + 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, + 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, + 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, + 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99, + 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, + 162, 175, 176, 187, 188, 203, 204, 210, 211, 219}, + {32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39, + 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42, + 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46, + 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53, + 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, + 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, + 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, + 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, + 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, + 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58, + 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55, + 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, + 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, + 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, + 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63, + 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, + 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63, + 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, + 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, + 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61, + 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, + 91, 92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 62, 60, 61, 63, + 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, + 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, + 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, + 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, + 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, + 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, + 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, + 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, + 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, + 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, + 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, + 110, 110, 113}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35, + 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36, + 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39, + 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38, + 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42, + 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45, + 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46, + 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49, + 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53, + 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59, + 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, + 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, + 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62, + 61, 66, 68, 71, 76, 78, 83, 
88, 90, 96, 100, 102, 109, 113, + 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, + 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69, + 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, + 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, + 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75, + 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, + 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80, + 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, + 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, + 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, + 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, + 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163, + 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, + 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, + 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, + 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, + 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, + 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, + 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, + 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, + 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93, + 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, + 149, 161, 161, 172, 172, 185, 186, 191, 192, 199}, + {32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39, + 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42, + 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46, + 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51, + 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, + 58, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, + 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, + 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, + 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, + 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, + 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54, + 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53, + 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, + 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, + 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59, + 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 64, 61, + 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, + 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59, + 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, + 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, + 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60, + 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, + 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61, + 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, + 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, + 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, + 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, + 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103, + 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, + 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, + 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, + 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 
101, 102, 104, 104, + 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, + 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, + 106, 106, 108}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35, + 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35, + 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36, + 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37, + 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40, + 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42, + 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45, + 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45, + 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49, + 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56, + 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, + 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, + 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58, + 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, + 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, + 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63, + 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, + 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, + 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72, + 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, + 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73, + 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, + 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, + 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, + 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, + 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150, + 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, + 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, + 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, + 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, + 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, + 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, + 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, + 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, + 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86, + 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, + 137, 147, 148, 157, 158, 169, 170, 174, 175, 181}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37, + 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40, + 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47, + 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49, + 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, + 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, + 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, + 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, + 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, + 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52, + 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51, + 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, + 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, + 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55, + 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58, + 57, 
55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, + 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57, + 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, + 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, + 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57, + 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, + 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60, + 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, + 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, + 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, + 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, + 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99, + 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, + 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100, + 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, + 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, + 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, + 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, + 101, 101, 104}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35, + 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36, + 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34, + 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39, + 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40, + 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42, + 42, 46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44, + 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44, + 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52, + 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, + 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, + 60, 65, 67, 71, 74, 76, 80, 82, 58, 56, 55, 53, 53, 53, + 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91, + 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, + 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58, + 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, + 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, + 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65, + 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, + 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65, + 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, + 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, + 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, + 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, + 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137, + 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, + 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140, + 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, + 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, + 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, + 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, + 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, + 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, + 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80, + 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, + 126, 135, 136, 144, 144, 155, 155, 159, 159, 164}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34, + 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 
38, 39, 40, 41, 43, + 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45, + 46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47, + 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46, + 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, + 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, + 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, + 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61, + 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, + 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, + 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, + 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, + 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50, + 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55, + 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, + 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, + 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54, + 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, + 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, + 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57, + 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, + 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, + 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63, + 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, + 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58, + 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, + 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, + 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, + 96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, + 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, + 99}}, + {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34, + 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34, + 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34, + 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39, + 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, + 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42, + 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41, + 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48, + 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, + 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, + 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50, + 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, + 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54, + 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, + 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, + 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59, + 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, + 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58, + 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, + 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, + 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, + 117, 71, 68, 68, 65, 65, 64, 64, 
63, 63, 68, 68, 73, 73, + 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, + 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, + 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, + 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, + 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, + 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, + 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, + 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, + 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, + 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75, + 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116, + 116, 124, 124, 132, 132, 141, 141, 144, 144, 149}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34, + 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43, + 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45, + 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47, + 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47, + 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, + 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, + 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, + 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, + 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, + 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, + 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49, + 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52, + 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, + 68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, + 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, + 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, + 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53, + 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, + 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, + 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60, + 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, + 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57, + 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, + 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, + 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, + 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, + 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, + 95}}, + {{32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34, + 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33, + 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35, + 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36, + 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38, + 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38, + 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39, + 38, 39, 40, 40, 41, 41, 
46, 46, 50, 52, 54, 55, 57, 44, + 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, + 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, + 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45, + 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, + 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, + 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49, + 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, + 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, + 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53, + 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, + 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54, + 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, + 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, + 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, + 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, + 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, + 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, + 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108, + 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, + 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, + 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, + 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, + 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, + 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, + 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70, + 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, + 104, 108, 110, 114, 118, 120, 125, 126, 134, 134}, + {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32, + 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38, + 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42, + 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42, + 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47, + 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, + 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, + 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55, + 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, + 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, + 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, + 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, + 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47, + 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50, + 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, + 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, + 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49, + 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, + 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, + 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51, + 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, + 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, + 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57, + 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, + 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54, + 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, + 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 
57, 57, 56, 55, 54, 55, 57, + 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, + 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, + 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, + 89}}, + {{32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32, + 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35, + 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, + 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, + 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, + 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, + 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, + 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40, + 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41, + 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, + 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, + 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45, + 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47, + 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, + 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47, + 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, + 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, + 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49, + 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, + 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53, + 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, + 86, 86, 90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, + 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91, + 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, + 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97, + 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, + 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, + 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, + 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, + 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, + 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, + 109, 109, 114}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31, + 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35, + 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40, + 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41, + 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44, + 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, + 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, + 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, + 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, + 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, + 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, + 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, + 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46, + 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49, + 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 
56, 56, 58, + 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, + 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, + 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, + 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49, + 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, + 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, + 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53, + 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, + 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51, + 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, + 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, + 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, + 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, + 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, + 80}}, + {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33, + 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, + 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, + 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, + 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, + 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, + 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, + 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, + 45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, + 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40, + 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43, + 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, + 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, + 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44, + 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, + 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, + 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45, + 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, + 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, + 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52, + 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, + 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50, + 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, + 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, + 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, + 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, + 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, + 92}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31, + 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, + 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36, + 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38, + 38, 38, 39, 40, 40, 
43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41, + 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, + 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, + 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, + 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, + 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, + 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, + 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, + 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, + 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45, + 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47, + 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, + 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, + 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, + 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, + 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46, + 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, + 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, + 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50, + 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, + 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48, + 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, + 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, + 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, + 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, + 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, + 71}}, + {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, + 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, + 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, + 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, + 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, + 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, + 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, + 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36, + 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39, + 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, + 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, + 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, + 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, + 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41, + 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, + 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, + 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 
46, + 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, + 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44, + 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, + 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, + 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, + 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, + 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, + 77}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, + 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35, + 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36, + 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40, + 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, + 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, + 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, + 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, + 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, + 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, + 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, + 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, + 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47, + 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, + 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, + 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, + 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, + 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, + 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49, + 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, + 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, + 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, + 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, + 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, + 64}}, + {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, + 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, + 37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, + 37, 37, 37, 38, 39, 39, 34, 
34, 34, 34, 34, 34, 33, 33, 33, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, + 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, + 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, + 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, + 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38, + 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, + 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, + 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39, + 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, + 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39, + 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, + 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, + 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, + 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, + 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, + 63}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31, + 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35, + 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, + 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, + 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, + 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47, + 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, + 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, + 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, + 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, + 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48, + 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, + 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, + 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47, + 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, + 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, + 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, + 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, + 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, + 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48, + 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, + 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45, + 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, + 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, + 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, + 58}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 
31, + 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, + 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, + 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, + 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, + 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, + 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35, + 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, + 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35, + 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, + 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, + 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, + 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, + 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, + 50}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, + 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31, + 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, + 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, + 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42, + 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, + 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, + 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, + 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, + 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41, + 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, + 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, + 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, + 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, + 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 
48, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, + 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48, + 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, + 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47, + 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, + 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, + 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, + 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, + 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, + 53}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, + 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, + 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, + 39}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 33, 
34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, + 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, + 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35, + 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36, + 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, + 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, + 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, + 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, + 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, + 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, + 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39, + 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, + 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41, + 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, + 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, + 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, + 48}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 
33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, + 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, + 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, + 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, + 40}}, + {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32}, + {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32}}}; diff --git a/src/reconstruction.cc b/src/reconstruction.cc new file mode 100644 index 0000000..1aa1233 --- /dev/null +++ b/src/reconstruction.cc @@ -0,0 +1,190 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/reconstruction.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/utils/common.h"
+
+namespace libgav1 {
+namespace {
+
+// Maps TransformType to dsp::Transform1D for the row transforms.
+constexpr dsp::Transform1D kRowTransform[kNumTransformTypes] = {
+    dsp::k1DTransformDct,      dsp::k1DTransformAdst,
+    dsp::k1DTransformDct,      dsp::k1DTransformAdst,
+    dsp::k1DTransformAdst,     dsp::k1DTransformDct,
+    dsp::k1DTransformAdst,     dsp::k1DTransformAdst,
+    dsp::k1DTransformAdst,     dsp::k1DTransformIdentity,
+    dsp::k1DTransformIdentity, dsp::k1DTransformDct,
+    dsp::k1DTransformIdentity, dsp::k1DTransformAdst,
+    dsp::k1DTransformIdentity, dsp::k1DTransformAdst};
+
+// Maps TransformType to dsp::Transform1D for the column transforms.
+constexpr dsp::Transform1D kColumnTransform[kNumTransformTypes] = {
+    dsp::k1DTransformDct,      dsp::k1DTransformDct,
+    dsp::k1DTransformAdst,     dsp::k1DTransformAdst,
+    dsp::k1DTransformDct,      dsp::k1DTransformAdst,
+    dsp::k1DTransformAdst,     dsp::k1DTransformAdst,
+    dsp::k1DTransformAdst,     dsp::k1DTransformIdentity,
+    dsp::k1DTransformDct,      dsp::k1DTransformIdentity,
+    dsp::k1DTransformAdst,     dsp::k1DTransformIdentity,
+    dsp::k1DTransformAdst,     dsp::k1DTransformIdentity};
+
+dsp::TransformSize1D Get1DTransformSize(int size_log2) {
+  return static_cast<dsp::TransformSize1D>(size_log2 - 2);
+}
+
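[Editor's note] The two tables above are indexed in parallel: a single TransformType picks one 1-D kernel for the row pass and one for the column pass, while Get1DTransformSize() turns a log2 width or height into the 1-D size enum by subtracting 2 (4-point transforms are the smallest). A minimal self-contained sketch of that indexing pattern, using stand-in enums rather than the real dsp:: types:

    #include <cassert>

    // Stand-in enums for illustration only; the upstream code uses
    // dsp::Transform1D and dsp::TransformSize1D.
    enum Transform1D { kDct, kAdst, kIdentity, kWht };
    enum TransformSize1D { kSize4, kSize8, kSize16, kSize32, kSize64 };

    TransformSize1D Get1DTransformSize(int size_log2) {
      // log2(4) == 2 maps to kSize4, log2(8) == 3 to kSize8, and so on.
      return static_cast<TransformSize1D>(size_log2 - 2);
    }

    int main() {
      // Entry 0 of both tables is the DCT, so that transform type runs the
      // DCT in both passes; a hypothetical entry 1 mixes ADST rows with DCT
      // columns, matching the parallel-table layout above.
      const Transform1D row_transform[] = {kDct, kAdst};
      const Transform1D column_transform[] = {kDct, kDct};
      const int tx_type = 1;
      assert(row_transform[tx_type] == kAdst);
      assert(column_transform[tx_type] == kDct);
      assert(Get1DTransformSize(5) == kSize32);  // 2^5 = 32-point transform.
    }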
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 or a multiple of 8 rows. Use the
+// TransformClass derived from |tx_type| to determine the scan order.
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height,
+               int non_zero_coeff_count) {
+  const TransformClass tx_class = GetTransformClass(tx_type);
+
+  switch (tx_class) {
+    case kTransformClass2D:
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 13) return 4;
+        if (non_zero_coeff_count <= 29) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+        if (non_zero_coeff_count <= 43) return 8;
+        if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+        if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+      }
+      break;
+
+    case kTransformClassHorizontal:
+      if (non_zero_coeff_count <= 4) return 4;
+      if (non_zero_coeff_count <= 8) return 8;
+      if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+      if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+      break;
+
+    default:
+      assert(tx_class == kTransformClassVertical);
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 16) return 4;
+        if (non_zero_coeff_count <= 32) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 32) return 4;
+        if (non_zero_coeff_count <= 64) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 8x8: 63, 8x16: 127.
+        if (non_zero_coeff_count <= 128) return 16;
+        if (non_zero_coeff_count <= 192) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 64) return 4;
+        if (non_zero_coeff_count <= 128) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 16x8: 127, 16x16: 255.
+        if (non_zero_coeff_count <= 256) return 16;
+        if (non_zero_coeff_count <= 384) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 128) return 4;
+        if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+      }
+      break;
+  }
+  return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
+}  // namespace
+
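[Editor's note] Worked through numerically: in the kTransformClass2D branch with tx_width == 32, a 32x32 block with 100 nonzero coefficients satisfies non_zero_coeff_count <= 136 with tx_height > 16, so only the first 16 rows are transformed; with 500 coefficients every threshold fails and the fallback returns min(tx_height, 32). A self-contained sketch of just that branch (NumRows32 is a name invented here; the bitwise & on bool operands mirrors the upstream style):

    #include <algorithm>
    #include <cassert>

    // Reduction of the kTransformClass2D / tx_width == 32 case above, with
    // the thresholds copied verbatim.
    int NumRows32(int tx_height, int non_zero_coeff_count) {
      if (non_zero_coeff_count <= 10) return 4;
      if (non_zero_coeff_count <= 36) return 8;
      if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
      if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
      return std::min(tx_height, 32);
    }

    int main() {
      assert(NumRows32(32, 100) == 16);  // Sparse block: 16 rows suffice.
      assert(NumRows32(32, 500) == 32);  // Dense block: all 32 rows.
    }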
+
+}  // namespace
+
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                 TransformSize tx_size, bool lossless, Residual* const buffer,
+                 int start_x, int start_y, Array2DView<Pixel>* frame,
+                 int non_zero_coeff_count) {
+  static_assert(sizeof(Residual) == 2 || sizeof(Residual) == 4, "");
+  const int tx_width_log2 = kTransformWidthLog2[tx_size];
+  const int tx_height_log2 = kTransformHeightLog2[tx_size];
+
+  int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+  if (tx_height > 4) {
+    static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+                                          int non_zero_coeff_count) = {
+        &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+        &GetNumRows<32>};
+    tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+                                               non_zero_coeff_count);
+  }
+  assert(tx_height <= 32);
+
+  // Row transform.
+  const dsp::TransformSize1D row_transform_size =
+      Get1DTransformSize(tx_width_log2);
+  const dsp::Transform1D row_transform =
+      lossless ? dsp::k1DTransformWht : kRowTransform[tx_type];
+  const dsp::InverseTransformAddFunc row_transform_func =
+      dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
+  assert(row_transform_func != nullptr);
+
+  row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                     frame);
+
+  // Column transform.
+  const dsp::TransformSize1D column_transform_size =
+      Get1DTransformSize(tx_height_log2);
+  const dsp::Transform1D column_transform =
+      lossless ? dsp::k1DTransformWht : kColumnTransform[tx_type];
+  const dsp::InverseTransformAddFunc column_transform_func =
+      dsp.inverse_transforms[column_transform][column_transform_size]
+                            [dsp::kColumn];
+  assert(column_transform_func != nullptr);
+
+  column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                        frame);
+}
+
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                          TransformSize tx_size, bool lossless,
+                          int16_t* buffer, int start_x, int start_y,
+                          Array2DView<uint8_t>* frame,
+                          int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                          TransformSize tx_size, bool lossless,
+                          int32_t* buffer, int start_x, int start_y,
+                          Array2DView<uint16_t>* frame,
+                          int non_zero_coeff_count);
+#endif
+
+}  // namespace libgav1
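+
+// A minimal caller sketch (illustrative, not part of the upstream file;
+// |residual|, |frame_view| and |non_zero_coeff_count| are hypothetical
+// variables, and a populated dsp table is assumed):
+//   const dsp::Dsp& dsp = *dsp::GetDspTable(/*bitdepth=*/8);
+//   Reconstruct(dsp, kTransformTypeDctDct, kTransformSize4x4,
+//               /*lossless=*/false, residual, /*start_x=*/0, /*start_y=*/0,
+//               &frame_view, non_zero_coeff_count);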
diff --git a/src/reconstruction.h b/src/reconstruction.h
new file mode 100644
index 0000000..6d5b115
--- /dev/null
+++ b/src/reconstruction.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RECONSTRUCTION_H_
+#define LIBGAV1_SRC_RECONSTRUCTION_H_
+
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Steps 2 and 3 of section 7.12.3 (contains the implementation of section
+// 7.13.3).
+// Apply the inverse transforms and add the residual to the frame for the
+// transform block size |tx_size| starting at position |start_x| and
+// |start_y|.
+template <typename Residual, typename Pixel>
+void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                 TransformSize tx_size, bool lossless, Residual* buffer,
+                 int start_x, int start_y, Array2DView<Pixel>* frame,
+                 int non_zero_coeff_count);
+
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                                 TransformSize tx_size, bool lossless,
+                                 int16_t* buffer, int start_x, int start_y,
+                                 Array2DView<uint8_t>* frame,
+                                 int non_zero_coeff_count);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+extern template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
+                                 TransformSize tx_size, bool lossless,
+                                 int32_t* buffer, int start_x, int start_y,
+                                 Array2DView<uint16_t>* frame,
+                                 int non_zero_coeff_count);
+#endif
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_RECONSTRUCTION_H_
diff --git a/src/residual_buffer_pool.cc b/src/residual_buffer_pool.cc
new file mode 100644
index 0000000..e166392
--- /dev/null
+++ b/src/residual_buffer_pool.cc
@@ -0,0 +1,142 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/residual_buffer_pool.h"
+
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <utility>
+
+namespace libgav1 {
+namespace {
+
+// The maximum queue size is derived using the following formula:
+//   ((sb_size * sb_size) / 16) + (2 * (((sb_size / x) * (sb_size / y)) / 16)).
+// Where:
+//   sb_size is the superblock size (64 or 128).
+//   16 is 4*4 which is kMinTransformWidth * kMinTransformHeight.
+//   x is subsampling_x + 1.
+//   y is subsampling_y + 1.
+// The first component is for the Y plane and the second component is for the
+// U and V planes.
+// For example, for 128x128 superblocks with 422 subsampling the size is:
+//   ((128 * 128) / 16) + (2 * (((128 / 2) * (128 / 1)) / 16)) = 2048.
+//
+// First dimension: use_128x128_superblock.
+// Second dimension: subsampling_x.
+// Third dimension: subsampling_y.
+constexpr int kMaxQueueSize[2][2][2] = {
+    // 64x64 superblocks.
+    {
+        {768, 512},
+        {512, 384},
+    },
+    // 128x128 superblocks.
+    {
+        {3072, 2048},
+        {2048, 1536},
+    },
+};
+
+}  // namespace
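+
+// Illustrative spot-check of the formula above (an editorial sketch, not part
+// of the upstream file): 64x64 superblocks with 4:2:0 subsampling (x = y = 2
+// in the formula) give
+//   ((64 * 64) / 16) + (2 * (((64 / 2) * (64 / 2)) / 16)) = 256 + 128 = 384.
+// static_assert(kMaxQueueSize[0][1][1] == 384, "matches the 4:2:0 entry");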
+
+ResidualBufferStack::~ResidualBufferStack() {
+  while (top_ != nullptr) {
+    ResidualBuffer* top = top_;
+    top_ = top_->next_;
+    delete top;
+  }
+}
+
+void ResidualBufferStack::Push(std::unique_ptr<ResidualBuffer> buffer) {
+  buffer->next_ = top_;
+  top_ = buffer.release();
+  ++num_buffers_;
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferStack::Pop() {
+  std::unique_ptr<ResidualBuffer> top;
+  if (top_ != nullptr) {
+    top.reset(top_);
+    top_ = top_->next_;
+    top->next_ = nullptr;
+    --num_buffers_;
+  }
+  return top;
+}
+
+void ResidualBufferStack::Swap(ResidualBufferStack* other) {
+  std::swap(top_, other->top_);
+  std::swap(num_buffers_, other->num_buffers_);
+}
+
+ResidualBufferPool::ResidualBufferPool(bool use_128x128_superblock,
+                                       int subsampling_x, int subsampling_y,
+                                       size_t residual_size)
+    : buffer_size_(GetResidualBufferSize(
+          use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+          subsampling_x, subsampling_y, residual_size)),
+      queue_size_(kMaxQueueSize[static_cast<int>(use_128x128_superblock)]
+                               [subsampling_x][subsampling_y]) {}
+
+void ResidualBufferPool::Reset(bool use_128x128_superblock, int subsampling_x,
+                               int subsampling_y, size_t residual_size) {
+  const size_t buffer_size = GetResidualBufferSize(
+      use_128x128_superblock ? 128 : 64, use_128x128_superblock ? 128 : 64,
+      subsampling_x, subsampling_y, residual_size);
+  const int queue_size =
+      kMaxQueueSize[static_cast<int>(use_128x128_superblock)][subsampling_x]
+                   [subsampling_y];
+  if (buffer_size == buffer_size_ && queue_size == queue_size_) {
+    // The existing buffers (if any) are still valid, so don't do anything.
+    return;
+  }
+  buffer_size_ = buffer_size;
+  queue_size_ = queue_size;
+  // The existing buffers (if any) are no longer valid since the buffer size
+  // or the queue size has changed. Clear the stack.
+  ResidualBufferStack buffers;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    // Move the buffers in the stack to the local variable |buffers| and clear
+    // the stack.
+    buffers.Swap(&buffers_);
+    // Release mutex_ before freeing the buffers.
+  }
+  // As the local variable |buffers| goes out of scope, its destructor frees
+  // the buffers that were in the stack.
+}
+
+std::unique_ptr<ResidualBuffer> ResidualBufferPool::Get() {
+  std::unique_ptr<ResidualBuffer> buffer = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffer = buffers_.Pop();
+  }
+  if (buffer == nullptr) {
+    buffer = ResidualBuffer::Create(buffer_size_, queue_size_);
+  }
+  return buffer;
+}
+
+void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
+  buffer->transform_parameters()->Reset();
+  std::lock_guard<std::mutex> lock(mutex_);
+  buffers_.Push(std::move(buffer));
+}
+
+size_t ResidualBufferPool::Size() const {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return buffers_.Size();
+}
+
+}  // namespace libgav1
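+
+// Typical multi-threaded flow (illustrative, not part of the upstream file;
+// |pool| is a hypothetical ResidualBufferPool*): a worker borrows one buffer
+// per superblock and returns it when the parse and decode steps are done.
+//   std::unique_ptr<ResidualBuffer> buffer = pool->Get();
+//   if (buffer == nullptr) return false;  // Allocation failure.
+//   // ... parse residuals into buffer->buffer(), push parameters ...
+//   pool->Release(std::move(buffer));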
diff --git a/src/residual_buffer_pool.h b/src/residual_buffer_pool.h
new file mode 100644
index 0000000..f7bc75d
--- /dev/null
+++ b/src/residual_buffer_pool.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+#define LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <new>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// A simple fixed size queue implementation to hold the transform parameters
+// when |Tile::split_parse_and_decode_| is true. We don't have to do any
+// boundary checks since we always push data into the queue before accessing
+// it.
+class TransformParameterQueue {
+ public:
+  TransformParameterQueue() = default;
+
+  // Move only.
+  TransformParameterQueue(TransformParameterQueue&& other) = default;
+  TransformParameterQueue& operator=(TransformParameterQueue&& other) =
+      default;
+
+  LIBGAV1_MUST_USE_RESULT bool Init(int max_size) {
+    max_size_ = max_size;
+    // No initialization is necessary since the data will always be written
+    // before being read.
+    non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]);
+    tx_type_.reset(new (std::nothrow) TransformType[max_size_]);
+    return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr;
+  }
+
+  // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the
+  // queue.
+  void Push(int non_zero_coeff_count, TransformType tx_type) {
+    assert(back_ < max_size_);
+    non_zero_coeff_count_[back_] = non_zero_coeff_count;
+    tx_type_[back_++] = tx_type;
+  }
+
+  // Returns the non_zero_coeff_count at the front of the queue.
+  int16_t NonZeroCoeffCount() const {
+    assert(front_ != back_);
+    return non_zero_coeff_count_[front_];
+  }
+
+  // Returns the tx_type at the front of the queue.
+  TransformType Type() const {
+    assert(front_ != back_);
+    return tx_type_[front_];
+  }
+
+  // Removes the |non_zero_coeff_count| and the |tx_type| from the front of
+  // the queue.
+  void Pop() {
+    assert(front_ != back_);
+    ++front_;
+  }
+
+  // Clears the queue.
+  void Reset() {
+    front_ = 0;
+    back_ = 0;
+  }
+
+  // Used only in the tests. Returns the number of elements in the queue.
+  int Size() const { return back_ - front_; }
+
+ private:
+  int max_size_ = 0;
+  std::unique_ptr<int16_t[]> non_zero_coeff_count_;
+  std::unique_ptr<TransformType[]> tx_type_;
+  int front_ = 0;
+  int back_ = 0;
+};
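+
+// Illustrative parse/decode hand-off (an editorial sketch, not part of the
+// upstream file; |queue| is a hypothetical TransformParameterQueue*):
+//   queue->Push(non_zero_coeff_count, tx_type);        // Parse step.
+//   ...
+//   const int16_t count = queue->NonZeroCoeffCount();  // Decode step.
+//   const TransformType type = queue->Type();
+//   queue->Pop();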
+
+// This class is used for parsing and decoding a superblock. Members of this
+// class are populated in the "parse" step and consumed in the "decode" step.
+class ResidualBuffer : public Allocable {
+ public:
+  static std::unique_ptr<ResidualBuffer> Create(size_t buffer_size,
+                                                int queue_size) {
+    std::unique_ptr<ResidualBuffer> buffer(new (std::nothrow) ResidualBuffer);
+    if (buffer != nullptr) {
+      buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
+      if (buffer->buffer_ == nullptr ||
+          !buffer->transform_parameters_.Init(queue_size)) {
+        buffer = nullptr;
+      }
+    }
+    return buffer;
+  }
+
+  // Move only.
+  ResidualBuffer(ResidualBuffer&& other) = default;
+  ResidualBuffer& operator=(ResidualBuffer&& other) = default;
+
+  // Buffer used to store the residual values.
+  uint8_t* buffer() { return buffer_.get(); }
+  // Queue used to store the transform parameters.
+  TransformParameterQueue* transform_parameters() {
+    return &transform_parameters_;
+  }
+
+ private:
+  friend class ResidualBufferStack;
+
+  ResidualBuffer() = default;
+
+  AlignedUniquePtr<uint8_t> buffer_;
+  TransformParameterQueue transform_parameters_;
+  // Used by ResidualBufferStack to form a chain of ResidualBuffers.
+  ResidualBuffer* next_ = nullptr;
+};
+
+// A LIFO stack of ResidualBuffers. Owns the buffers in the stack.
+class ResidualBufferStack {
+ public:
+  ResidualBufferStack() = default;
+
+  // Not copyable or movable.
+  ResidualBufferStack(const ResidualBufferStack&) = delete;
+  ResidualBufferStack& operator=(const ResidualBufferStack&) = delete;
+
+  ~ResidualBufferStack();
+
+  // Pushes |buffer| to the top of the stack.
+  void Push(std::unique_ptr<ResidualBuffer> buffer);
+
+  // If the stack is non-empty, returns the buffer at the top of the stack
+  // and removes it from the stack. If the stack is empty, returns nullptr.
+  std::unique_ptr<ResidualBuffer> Pop();
+
+  // Swaps the contents of this stack and |other|.
+  void Swap(ResidualBufferStack* other);
+
+  // Returns the number of buffers in the stack.
+  size_t Size() const { return num_buffers_; }
+
+ private:
+  // A singly-linked list of ResidualBuffers, chained together using the
+  // next_ field of ResidualBuffer.
+  ResidualBuffer* top_ = nullptr;
+  size_t num_buffers_ = 0;
+};
+
+// Utility class used to manage the residual buffers (and the transform
+// parameters) used for multi-threaded decoding. This class uses a stack to
+// store the buffers for better cache locality, since buffers used more
+// recently are more likely to be in the cache. All functions in this class
+// are thread-safe.
+class ResidualBufferPool : public Allocable {
+ public:
+  ResidualBufferPool(bool use_128x128_superblock, int subsampling_x,
+                     int subsampling_y, size_t residual_size);
+
+  // Recomputes |buffer_size_| and invalidates the existing buffers if
+  // necessary.
+  void Reset(bool use_128x128_superblock, int subsampling_x, int subsampling_y,
+             size_t residual_size);
+  // Gets a residual buffer. The buffer is guaranteed to be large enough to
+  // store the residual values for one superblock whose parameters are the
+  // same as the constructor or the last call to Reset(). If there are free
+  // buffers in the stack, it returns one from the stack, otherwise a new
+  // buffer is allocated.
+  std::unique_ptr<ResidualBuffer> Get();
+  // Returns the |buffer| back to the pool (by appending it to the stack).
+  // Subsequent calls to Get() may re-use this buffer.
+  void Release(std::unique_ptr<ResidualBuffer> buffer);
+
+  // Used only in the tests. Returns the number of buffers in the stack.
+  size_t Size() const;
+
+ private:
+  mutable std::mutex mutex_;
+  ResidualBufferStack buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  size_t buffer_size_;
+  int queue_size_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_RESIDUAL_BUFFER_POOL_H_
diff --git a/src/scan_tables.inc b/src/scan_tables.inc
new file mode 100644
index 0000000..f7c9231
--- /dev/null
+++ b/src/scan_tables.inc
@@ -0,0 +1,440 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains all the scan order tables.
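+
+// How to read these tables (an illustrative sketch, not part of the upstream
+// file; DecodeNextCoefficient is a hypothetical helper): entry i of a scan
+// table is the raster position of the i-th coefficient read from the
+// bitstream. For example, kDefaultScan4x4[2] == 4, i.e. row 1, column 0.
+//   for (int i = 0; i < non_zero_coeff_count; ++i) {
+//     residual[kDefaultScan4x4[i]] = DecodeNextCoefficient();
+//   }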
+ +constexpr uint16_t kDefaultScan4x4[16] = {0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15}; + +constexpr uint16_t kColumnScan4x4[16] = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +constexpr uint16_t kRowScan4x4[16] = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + +constexpr uint16_t kDefaultScan4x8[32] = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31}; + +constexpr uint16_t kColumnScan4x8[32] = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31}; + +constexpr uint16_t kRowScan4x8[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +constexpr uint16_t kDefaultScan8x4[32] = { + 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, + 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31}; + +constexpr uint16_t kColumnScan8x4[32] = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31}; + +constexpr uint16_t kRowScan8x4[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +constexpr uint16_t kDefaultScan8x8[64] = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63}; + +constexpr uint16_t kColumnScan8x8[64] = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63}; + +constexpr uint16_t kRowScan8x8[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + +constexpr uint16_t kDefaultScan8x16[128] = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, + 117, 124, 111, 118, 125, 119, 126, 127}; + +constexpr uint16_t kColumnScan8x16[128] = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127}; + +constexpr uint16_t kRowScan8x16[128] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127}; + +constexpr uint16_t kDefaultScan16x8[128] = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127}; + +constexpr uint16_t kColumnScan16x8[128] = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127}; + +constexpr uint16_t kRowScan16x8[128] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127}; + +constexpr uint16_t kDefaultScan16x16[256] = { + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, + 255}; + +constexpr uint16_t kColumnScan16x16[256] = { + 0, 16, 32, 48, 64, 80, 96, 112, 
128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255}; + +constexpr uint16_t kRowScan16x16[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255}; + +constexpr uint16_t kDefaultScan16x32[512] = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, + 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, + 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, + 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, + 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, + 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, + 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 261, 276, 291, 306, 321, 336, 
127, 142, 157, 172, 187, 202, 217, 232, + 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, + 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, + 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, + 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, + 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, + 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, + 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, + 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, + 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, + 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, + 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, + 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, + 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, + 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, + 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, + 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, + 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, + 510, 511}; + +constexpr uint16_t kDefaultScan32x16[512] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 
437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511}; + +constexpr uint16_t kDefaultScan32x32[1024] = { + 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, + 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, + 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194, + 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, + 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, + 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, + 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, + 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, + 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, + 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, + 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, + 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, + 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, + 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, + 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143, + 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, + 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, + 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83, + 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, + 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518, + 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, + 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, + 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, + 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, + 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, + 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, + 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, + 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, + 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, + 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, + 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, + 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, + 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, + 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, + 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, + 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, + 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, + 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, + 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, + 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, + 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, + 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, + 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, + 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, + 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344, + 313, 
282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, + 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, + 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904, + 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, + 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285, + 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, + 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, + 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, + 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380, + 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783, + 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, + 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, + 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, + 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, + 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, + 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, + 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, + 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, + 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, + 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, + 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791, + 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, + 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, + 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949, + 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, + 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920, + 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, + 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, + 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, + 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, + 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023}; + +constexpr uint16_t kDefaultScan4x16[64] = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, + 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, + 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63}; + +constexpr uint16_t kColumnScan4x16[64] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63}; + +constexpr uint16_t kRowScan4x16[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + +constexpr uint16_t kDefaultScan16x4[64] = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, + 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, + 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, + 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63}; + +constexpr uint16_t kColumnScan16x4[64] = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 
10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63}; + +constexpr uint16_t kRowScan16x4[64] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + +constexpr uint16_t kDefaultScan8x32[256] = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, + 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, + 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, + 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, + 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, + 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, + 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, + 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, + 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, + 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, + 255}; + +constexpr uint16_t kDefaultScan32x8[256] = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, + 255}; + +// 5.11.41 (implemented as a simple look up of transform class and transform +// size). 
+const uint16_t* kScan[3][kNumTransformSizes] = { + // kTransformClass2D + {kDefaultScan4x4, kDefaultScan4x8, kDefaultScan4x16, kDefaultScan8x4, + kDefaultScan8x8, kDefaultScan8x16, kDefaultScan8x32, kDefaultScan16x4, + kDefaultScan16x8, kDefaultScan16x16, kDefaultScan16x32, kDefaultScan16x32, + kDefaultScan32x8, kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32, + kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32}, + // kTransformClassHorizontal + {kColumnScan4x4, kColumnScan4x8, kColumnScan4x16, kColumnScan8x4, + kColumnScan8x8, kColumnScan8x16, kColumnScan16x4, kColumnScan16x4, + kColumnScan16x8, kColumnScan16x16, kColumnScan16x4, kDefaultScan16x32, + kColumnScan16x4, kColumnScan16x4, kColumnScan16x4, kDefaultScan32x32, + kDefaultScan32x16, kDefaultScan32x32, kDefaultScan32x32}, + // kTransformClassVertical + {kRowScan4x4, kRowScan4x8, kRowScan4x16, kRowScan8x4, kRowScan8x8, + kRowScan8x16, kRowScan16x4, kRowScan16x4, kRowScan16x8, kRowScan16x16, + kRowScan16x4, kDefaultScan16x32, kRowScan16x4, kRowScan16x4, kRowScan16x4, + kDefaultScan32x32, kDefaultScan32x16, kDefaultScan32x32, + kDefaultScan32x32}}; diff --git a/src/status_code.cc b/src/status_code.cc new file mode 100644 index 0000000..34def08 --- /dev/null +++ b/src/status_code.cc @@ -0,0 +1,57 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/gav1/status_code.h" + +extern "C" { + +const char* Libgav1GetErrorString(Libgav1StatusCode status) { + switch (status) { + case kLibgav1StatusOk: + return "Success."; + case kLibgav1StatusUnknownError: + return "Unknown error."; + case kLibgav1StatusInvalidArgument: + return "Invalid function argument."; + case kLibgav1StatusOutOfMemory: + return "Memory allocation failure."; + case kLibgav1StatusResourceExhausted: + return "Ran out of a resource (other than memory)."; + case kLibgav1StatusNotInitialized: + return "The object is not initialized."; + case kLibgav1StatusAlready: + return "An operation that can only be performed once has already been " + "performed."; + case kLibgav1StatusUnimplemented: + return "Not implemented."; + case kLibgav1StatusInternalError: + return "Internal error in libgav1."; + case kLibgav1StatusBitstreamError: + return "The bitstream is not encoded correctly or violates a bitstream " + "conformance requirement."; + case kLibgav1StatusTryAgain: + return "The operation is not allowed at the moment. Try again later."; + case kLibgav1StatusNothingToDequeue: + return "There are no enqueued frames, so there is nothing to dequeue. " + "Try enqueuing a frame before trying to dequeue again."; + // This switch statement does not have a default case. This way the compiler + // will warn if we neglect to update this function after adding a new value + // to the Libgav1StatusCode enum type. 
+    case kLibgav1StatusReservedForFutureExpansionUseDefaultInSwitchInstead_:
+      break;
+  }
+  return "Unrecognized status code.";
+}
+
+}  // extern "C"
diff --git a/src/symbol_decoder_context.cc b/src/symbol_decoder_context.cc
new file mode 100644
index 0000000..26a281e
--- /dev/null
+++ b/src/symbol_decoder_context.cc
@@ -0,0 +1,322 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/symbol_decoder_context.h"
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/symbol_decoder_context_cdfs.inc"
+
+uint8_t GetQuantizerContext(int base_quantizer_index) {
+  if (base_quantizer_index <= 20) return 0;
+  if (base_quantizer_index <= 60) return 1;
+  if (base_quantizer_index <= 120) return 2;
+  return 3;
+}
+
+// Reset*Counters() are helper functions to reset the CDF arrays where the
+// counters are not in the last element of the innermost dimension.
+
+void ResetPartitionCounters(SymbolDecoderContext* const context) {
+  int block_size_log2 = k4x4WidthLog2[kBlock8x8];
+  for (auto& d1 : context->partition_cdf) {
+    const int cdf_size =
+        SymbolDecoderContext::PartitionCdfSize(block_size_log2++);
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetPaletteColorIndexCounters(SymbolDecoderContext* const context) {
+  for (auto& d1 : context->palette_color_index_cdf) {
+    int cdf_size = kMinPaletteSize;
+    for (auto& d2 : d1) {
+      for (auto& d3 : d2) {
+        d3[cdf_size] = 0;
+      }
+      ++cdf_size;
+    }
+  }
+}
+
+void ResetTxTypeCounters(SymbolDecoderContext* const context) {
+  int set_index = kTransformSetIntra1;
+  for (auto& d1 : context->intra_tx_type_cdf) {
+    const int cdf_size = kNumTransformTypesInSet[set_index++];
+    for (auto& d2 : d1) {
+      for (auto& d3 : d2) {
+        d3[cdf_size] = 0;
+      }
+    }
+  }
+  for (auto& d1 : context->inter_tx_type_cdf) {
+    const int cdf_size = kNumTransformTypesInSet[set_index++];
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetTxDepthCounters(SymbolDecoderContext* const context) {
+  int delta = 1;
+  for (auto& d1 : context->tx_depth_cdf) {
+    const int cdf_size = kMaxTxDepthSymbolCount - delta;
+    delta = 0;
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+  }
+}
+
+void ResetUVModeCounters(SymbolDecoderContext* const context) {
+  int cdf_size = kIntraPredictionModesUV - 1;
+  for (auto& d1 : context->uv_mode_cdf) {
+    for (auto& d2 : d1) {
+      d2[cdf_size] = 0;
+    }
+    ++cdf_size;
+  }
+}
+
+}  // namespace
+
+#define CDF_COPY(source, destination)                       \
+  static_assert(sizeof(source) == sizeof(destination), ""); \
+  memcpy(destination, source, sizeof(source))
+
+void SymbolDecoderContext::Initialize(int base_quantizer_index) {
+  CDF_COPY(kDefaultPartitionCdf, partition_cdf);
+  CDF_COPY(kDefaultSkipCdf, skip_cdf);
+  CDF_COPY(kDefaultSkipModeCdf, skip_mode_cdf);
+  CDF_COPY(kDefaultSegmentIdCdf, segment_id_cdf);
+  CDF_COPY(kDefaultUsePredictedSegmentIdCdf, use_predicted_segment_id_cdf);
+  CDF_COPY(kDefaultDeltaQCdf,
delta_q_cdf); + CDF_COPY(kDefaultDeltaQCdf, delta_lf_cdf); + for (auto& delta_lf_multi_cdf_entry : delta_lf_multi_cdf) { + CDF_COPY(kDefaultDeltaQCdf, delta_lf_multi_cdf_entry); + } + CDF_COPY(kDefaultIntraBlockCopyCdf, intra_block_copy_cdf); + CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf); + CDF_COPY(kDefaultYModeCdf, y_mode_cdf); + CDF_COPY(kDefaultAngleDeltaCdf, angle_delta_cdf); + CDF_COPY(kDefaultUVModeCdf, uv_mode_cdf); + CDF_COPY(kDefaultCflAlphaSignsCdf, cfl_alpha_signs_cdf); + CDF_COPY(kDefaultCflAlphaCdf, cfl_alpha_cdf); + CDF_COPY(kDefaultUseFilterIntraCdf, use_filter_intra_cdf); + CDF_COPY(kDefaultFilterIntraModeCdf, filter_intra_mode_cdf); + CDF_COPY(kDefaultTxDepthCdf, tx_depth_cdf); + CDF_COPY(kDefaultTxSplitCdf, tx_split_cdf); + CDF_COPY(kDefaultInterTxTypeCdf, inter_tx_type_cdf); + CDF_COPY(kDefaultIntraTxTypeCdf, intra_tx_type_cdf); + CDF_COPY(kDefaultRestorationTypeCdf, restoration_type_cdf); + CDF_COPY(kDefaultUseWienerCdf, use_wiener_cdf); + CDF_COPY(kDefaultUseSgrProjCdf, use_sgrproj_cdf); + CDF_COPY(kDefaultHasPaletteYCdf, has_palette_y_cdf); + CDF_COPY(kDefaultPaletteYSizeCdf, palette_y_size_cdf); + CDF_COPY(kDefaultHasPaletteUVCdf, has_palette_uv_cdf); + CDF_COPY(kDefaultPaletteUVSizeCdf, palette_uv_size_cdf); + CDF_COPY(kDefaultPaletteColorIndexCdf, palette_color_index_cdf); + CDF_COPY(kDefaultIsInterCdf, is_inter_cdf); + CDF_COPY(kDefaultUseCompoundReferenceCdf, use_compound_reference_cdf); + CDF_COPY(kDefaultCompoundReferenceTypeCdf, compound_reference_type_cdf); + CDF_COPY(kDefaultCompoundReferenceCdf, compound_reference_cdf); + CDF_COPY(kDefaultCompoundBackwardReferenceCdf, + compound_backward_reference_cdf); + CDF_COPY(kDefaultSingleReferenceCdf, single_reference_cdf); + CDF_COPY(kDefaultCompoundPredictionModeCdf, compound_prediction_mode_cdf); + CDF_COPY(kDefaultNewMvCdf, new_mv_cdf); + CDF_COPY(kDefaultZeroMvCdf, zero_mv_cdf); + CDF_COPY(kDefaultReferenceMvCdf, reference_mv_cdf); + CDF_COPY(kDefaultRefMvIndexCdf, ref_mv_index_cdf); + CDF_COPY(kDefaultIsInterIntraCdf, is_inter_intra_cdf); + CDF_COPY(kDefaultInterIntraModeCdf, inter_intra_mode_cdf); + CDF_COPY(kDefaultIsWedgeInterIntraCdf, is_wedge_inter_intra_cdf); + CDF_COPY(kDefaultWedgeIndexCdf, wedge_index_cdf); + CDF_COPY(kDefaultUseObmcCdf, use_obmc_cdf); + CDF_COPY(kDefaultMotionModeCdf, motion_mode_cdf); + CDF_COPY(kDefaultIsExplicitCompoundTypeCdf, is_explicit_compound_type_cdf); + CDF_COPY(kDefaultIsCompoundTypeAverageCdf, is_compound_type_average_cdf); + CDF_COPY(kDefaultCompoundTypeCdf, compound_type_cdf); + CDF_COPY(kDefaultInterpolationFilterCdf, interpolation_filter_cdf); + for (int i = 0; i < kMvContexts; ++i) { + CDF_COPY(kDefaultMvJointCdf, mv_joint_cdf[i]); + for (int j = 0; j < kNumMvComponents; ++j) { + CDF_COPY(kDefaultMvSignCdf, mv_sign_cdf[i][j]); + CDF_COPY(kDefaultMvClassCdf, mv_class_cdf[i][j]); + CDF_COPY(kDefaultMvClass0BitCdf, mv_class0_bit_cdf[i][j]); + CDF_COPY(kDefaultMvClass0FractionCdf, mv_class0_fraction_cdf[i][j]); + CDF_COPY(kDefaultMvClass0HighPrecisionCdf, + mv_class0_high_precision_cdf[i][j]); + CDF_COPY(kDefaultMvBitCdf, mv_bit_cdf[i][j]); + CDF_COPY(kDefaultMvFractionCdf, mv_fraction_cdf[i][j]); + CDF_COPY(kDefaultMvHighPrecisionCdf, mv_high_precision_cdf[i][j]); + } + } + const int quantizer_context = GetQuantizerContext(base_quantizer_index); + CDF_COPY(kDefaultAllZeroCdf[quantizer_context], all_zero_cdf); + CDF_COPY(kDefaultEobPt16Cdf[quantizer_context], eob_pt_16_cdf); + CDF_COPY(kDefaultEobPt32Cdf[quantizer_context], eob_pt_32_cdf); + 
CDF_COPY(kDefaultEobPt64Cdf[quantizer_context], eob_pt_64_cdf);
+  CDF_COPY(kDefaultEobPt128Cdf[quantizer_context], eob_pt_128_cdf);
+  CDF_COPY(kDefaultEobPt256Cdf[quantizer_context], eob_pt_256_cdf);
+  CDF_COPY(kDefaultEobPt512Cdf[quantizer_context], eob_pt_512_cdf);
+  CDF_COPY(kDefaultEobPt1024Cdf[quantizer_context], eob_pt_1024_cdf);
+  CDF_COPY(kDefaultEobExtraCdf[quantizer_context], eob_extra_cdf);
+  CDF_COPY(kDefaultCoeffBaseEobCdf[quantizer_context], coeff_base_eob_cdf);
+  CDF_COPY(kDefaultCoeffBaseCdf[quantizer_context], coeff_base_cdf);
+  CDF_COPY(kDefaultCoeffBaseRangeCdf[quantizer_context],
+           coeff_base_range_cdf);
+  CDF_COPY(kDefaultDcSignCdf[quantizer_context], dc_sign_cdf);
+}
+
+void SymbolDecoderContext::ResetIntraFrameYModeCdf() {
+  CDF_COPY(kDefaultIntraFrameYModeCdf, intra_frame_y_mode_cdf);
+}
+
+#undef CDF_COPY
+
+// These macros set the last element in the innermost dimension of the array
+// to zero.
+#define RESET_COUNTER_1D(array)                              \
+  do {                                                       \
+    (array)[std::extent<decltype(array), 0>::value - 1] = 0; \
+  } while (false)
+
+#define RESET_COUNTER_2D(array)                             \
+  do {                                                      \
+    for (auto& d1 : (array)) {                              \
+      d1[std::extent<decltype(array), 1>::value - 1] = 0;   \
+    }                                                       \
+  } while (false)
+
+#define RESET_COUNTER_3D(array)                             \
+  do {                                                      \
+    for (auto& d1 : (array)) {                              \
+      for (auto& d2 : d1) {                                 \
+        d2[std::extent<decltype(array), 2>::value - 1] = 0; \
+      }                                                     \
+    }                                                       \
+  } while (false)
+
+#define RESET_COUNTER_4D(array)                               \
+  do {                                                        \
+    for (auto& d1 : (array)) {                                \
+      for (auto& d2 : d1) {                                   \
+        for (auto& d3 : d2) {                                 \
+          d3[std::extent<decltype(array), 3>::value - 1] = 0; \
+        }                                                     \
+      }                                                       \
+    }                                                         \
+  } while (false)
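+
+// For illustration (an editorial sketch, not part of the upstream file),
+// applied to skip_cdf[kSkipContexts][kBooleanFieldCdfSize], RESET_COUNTER_2D
+// expands to:
+//   for (auto& d1 : skip_cdf) {
+//     d1[kBooleanFieldCdfSize - 1] = 0;  // Clear the adaptation counter.
+//   }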
+
+void SymbolDecoderContext::ResetCounters() {
+  ResetPartitionCounters(this);
+  RESET_COUNTER_2D(segment_id_cdf);
+  RESET_COUNTER_2D(use_predicted_segment_id_cdf);
+  RESET_COUNTER_2D(skip_cdf);
+  RESET_COUNTER_2D(skip_mode_cdf);
+  RESET_COUNTER_1D(delta_q_cdf);
+  RESET_COUNTER_1D(delta_lf_cdf);
+  RESET_COUNTER_2D(delta_lf_multi_cdf);
+  RESET_COUNTER_1D(intra_block_copy_cdf);
+  RESET_COUNTER_3D(intra_frame_y_mode_cdf);
+  RESET_COUNTER_2D(y_mode_cdf);
+  RESET_COUNTER_2D(angle_delta_cdf);
+  ResetUVModeCounters(this);
+  RESET_COUNTER_1D(cfl_alpha_signs_cdf);
+  RESET_COUNTER_2D(cfl_alpha_cdf);
+  RESET_COUNTER_2D(use_filter_intra_cdf);
+  RESET_COUNTER_1D(filter_intra_mode_cdf);
+  ResetTxDepthCounters(this);
+  RESET_COUNTER_2D(tx_split_cdf);
+  RESET_COUNTER_3D(all_zero_cdf);
+  ResetTxTypeCounters(this);
+  RESET_COUNTER_3D(eob_pt_16_cdf);
+  RESET_COUNTER_3D(eob_pt_32_cdf);
+  RESET_COUNTER_3D(eob_pt_64_cdf);
+  RESET_COUNTER_3D(eob_pt_128_cdf);
+  RESET_COUNTER_3D(eob_pt_256_cdf);
+  RESET_COUNTER_2D(eob_pt_512_cdf);
+  RESET_COUNTER_2D(eob_pt_1024_cdf);
+  RESET_COUNTER_4D(eob_extra_cdf);
+  RESET_COUNTER_4D(coeff_base_eob_cdf);
+  RESET_COUNTER_4D(coeff_base_cdf);
+  RESET_COUNTER_4D(coeff_base_range_cdf);
+  RESET_COUNTER_3D(dc_sign_cdf);
+  RESET_COUNTER_1D(restoration_type_cdf);
+  RESET_COUNTER_1D(use_wiener_cdf);
+  RESET_COUNTER_1D(use_sgrproj_cdf);
+  RESET_COUNTER_3D(has_palette_y_cdf);
+  RESET_COUNTER_2D(palette_y_size_cdf);
+  RESET_COUNTER_2D(has_palette_uv_cdf);
+  RESET_COUNTER_2D(palette_uv_size_cdf);
+  ResetPaletteColorIndexCounters(this);
+  RESET_COUNTER_2D(is_inter_cdf);
+  RESET_COUNTER_2D(use_compound_reference_cdf);
+  RESET_COUNTER_2D(compound_reference_type_cdf);
+  RESET_COUNTER_4D(compound_reference_cdf);
+  RESET_COUNTER_3D(compound_backward_reference_cdf);
+  RESET_COUNTER_3D(single_reference_cdf);
+  RESET_COUNTER_2D(compound_prediction_mode_cdf);
+  RESET_COUNTER_2D(new_mv_cdf);
+  RESET_COUNTER_2D(zero_mv_cdf);
+  RESET_COUNTER_2D(reference_mv_cdf);
+  RESET_COUNTER_2D(ref_mv_index_cdf);
+  RESET_COUNTER_2D(is_inter_intra_cdf);
+  RESET_COUNTER_2D(inter_intra_mode_cdf);
+  RESET_COUNTER_2D(is_wedge_inter_intra_cdf);
+  RESET_COUNTER_2D(wedge_index_cdf);
+  RESET_COUNTER_2D(use_obmc_cdf);
+  RESET_COUNTER_2D(motion_mode_cdf);
+  RESET_COUNTER_2D(is_explicit_compound_type_cdf);
+  RESET_COUNTER_2D(is_compound_type_average_cdf);
+  RESET_COUNTER_2D(compound_type_cdf);
+  RESET_COUNTER_2D(interpolation_filter_cdf);
+  RESET_COUNTER_2D(mv_joint_cdf);
+  RESET_COUNTER_3D(mv_sign_cdf);
+  RESET_COUNTER_3D(mv_class_cdf);
+  RESET_COUNTER_3D(mv_class0_bit_cdf);
+  RESET_COUNTER_4D(mv_class0_fraction_cdf);
+  RESET_COUNTER_3D(mv_class0_high_precision_cdf);
+  RESET_COUNTER_4D(mv_bit_cdf);
+  RESET_COUNTER_3D(mv_fraction_cdf);
+  RESET_COUNTER_3D(mv_high_precision_cdf);
+}
+
+#undef RESET_COUNTER_1D
+#undef RESET_COUNTER_2D
+#undef RESET_COUNTER_3D
+#undef RESET_COUNTER_4D
+
+int SymbolDecoderContext::PartitionCdfSize(int block_size_log2) {
+  assert(block_size_log2 > 0);
+  assert(block_size_log2 < 6);
+
+  switch (block_size_log2) {
+    case 1:
+      return kPartitionSplit + 1;
+    case 5:
+      return kPartitionVerticalWithRightSplit + 1;
+    default:
+      return kMaxPartitionTypes;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/symbol_decoder_context.h b/src/symbol_decoder_context.h
new file mode 100644
index 0000000..1bea76c
--- /dev/null
+++ b/src/symbol_decoder_context.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/constants.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+enum {
+  kPartitionContexts = 4,
+  kSegmentIdContexts = 3,
+  kUsePredictedSegmentIdContexts = 3,
+  kSkipContexts = 3,
+  kSkipModeContexts = 3,
+  kBooleanFieldCdfSize = 3,
+  kDeltaSymbolCount = 4,  // Used for both delta_q and delta_lf.
+  kIntraModeContexts = 5,
+  kYModeContexts = 4,
+  kAngleDeltaSymbolCount = 2 * kMaxAngleDelta + 1,
+  kCflAlphaSignsSymbolCount = 8,
+  kCflAlphaContexts = 6,
+  kCflAlphaSymbolCount = 16,
+  kTxDepthContexts = 3,
+  kMaxTxDepthSymbolCount = 3,
+  kTxSplitContexts = 21,
+  kCoefficientQuantizerContexts = 4,
+  kNumSquareTransformSizes = 5,
+  kAllZeroContexts = 13,
+  kNumExtendedTransformSizes = 4,
+  kEobPtContexts = 2,
+  kEobPt16SymbolCount = 5,
+  kEobPt32SymbolCount = 6,
+  kEobPt64SymbolCount = 7,
+  kEobPt128SymbolCount = 8,
+  kEobPt256SymbolCount = 9,
+  kEobPt512SymbolCount = 10,
+  kEobPt1024SymbolCount = 11,
+  kEobExtraContexts = 9,
+  kCoeffBaseEobContexts = 4,
+  kCoeffBaseEobSymbolCount = 3,
+  kCoeffBaseContexts = 42,
+  kCoeffBaseSymbolCount = 4,
+  kCoeffBaseRangeContexts = 21,
+  kCoeffBaseRangeSymbolCount = 4,
+  kDcSignContexts = 3,
+  kPaletteBlockSizeContexts = 7,
+  kPaletteYModeContexts = 3,
+  kPaletteUVModeContexts = 2,
+  kPaletteSizeSymbolCount = 7,
+  kPaletteColorIndexContexts = 5,
+  kPaletteColorIndexSymbolCount = 8,
+  kIsInterContexts = 4,
+  kUseCompoundReferenceContexts = 5,
+  kCompoundReferenceTypeContexts = 5,
+  kReferenceContexts = 3,
+  kCompoundPredictionModeContexts = 8,
+  kNewMvContexts = 6,
+  kZeroMvContexts = 2,
+  kReferenceMvContexts = 6,
+  kRefMvIndexContexts = 3,
+  kInterIntraContexts = 3,
+  kWedgeIndexSymbolCount = 16,
+  kIsExplicitCompoundTypeContexts = 6,
+  kIsCompoundTypeAverageContexts = 6,
+  kInterpolationFilterContexts = 16,
+  kMvContexts = 2,
+  kMvClassSymbolCount = 11,
+  kMvFractionSymbolCount = 4,
+  kMvBitSymbolCount = 10,
+  kNumMvComponents = 2,
+};  // anonymous enum
+
+struct SymbolDecoderContext {
+  SymbolDecoderContext() = default;
+  explicit SymbolDecoderContext(int base_quantizer_index) {
+    Initialize(base_quantizer_index);
+  }
+
+  void Initialize(int base_quantizer_index);
+
+  // Partition related variables and functions.
+  static int PartitionCdfSize(int block_size_log2);
+
+  // Returns the cdf array index for inter_tx_type or intra_tx_type based on
+  // |tx_set|.
+  static int TxTypeIndex(TransformSet tx_set) {
+    assert(tx_set != kTransformSetDctOnly);
+    switch (tx_set) {
+      case kTransformSetInter1:
+      case kTransformSetIntra1:
+        return 0;
+      case kTransformSetInter2:
+      case kTransformSetIntra2:
+        return 1;
+      case kTransformSetInter3:
+        return 2;
+      default:
+        return -1;
+    }
+  }
+
+  // Resets the intra_frame_y_mode_cdf array to the default.
+  void ResetIntraFrameYModeCdf();
+
+  // Resets the symbol counters of all the CDF arrays to zero. The symbol
+  // counter is the last used element in the innermost dimension of each CDF
+  // array.
+  void ResetCounters();
+
+  // Note kMaxAlignment allows for aligned instructions to be used in the
+  // copies done in Initialize().
+ alignas(kMaxAlignment) uint16_t + partition_cdf[kBlockWidthCount][kPartitionContexts] + [kMaxPartitionTypes + 1]; + alignas(kMaxAlignment) uint16_t + segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1]; + alignas(kMaxAlignment) uint16_t + use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts] + [kIntraPredictionModesY + 1]; + alignas(kMaxAlignment) uint16_t + y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1]; + alignas(kMaxAlignment) uint16_t + angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY] + [kIntraPredictionModesUV + 1]; + alignas(kMaxAlignment) uint16_t + cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + filter_intra_mode_cdf[kNumFilterIntraPredictors + 1]; + alignas(kMaxAlignment) uint16_t + tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1]; + alignas(kMaxAlignment) uint16_t + intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY] + [kNumTransformTypes + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseRangeContexts] + [kCoeffBaseRangeSymbolCount + 1]; + alignas(kMaxAlignment) 
uint16_t + dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + restoration_type_cdf[kRestorationTypeSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + palette_y_size_cdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + palette_uv_size_cdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount] + [kPaletteColorIndexContexts] + [kPaletteColorIndexSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + use_compound_reference_cdf[kUseCompoundReferenceContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_reference_type_cdf[kCompoundReferenceTypeContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_backward_reference_cdf[kReferenceContexts][2] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_prediction_mode_cdf[kCompoundPredictionModeContexts] + [kNumCompoundInterPredictionModes + 1]; + alignas(kMaxAlignment) uint16_t + new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1]; + alignas(kMaxAlignment) uint16_t + is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1]; + alignas(kMaxAlignment) uint16_t + is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + is_compound_type_average_cdf[kIsCompoundTypeAverageContexts] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + compound_type_cdf[kMaxBlockSizes] + [kNumExplicitCompoundPredictionTypes + 1]; + alignas(kMaxAlignment) uint16_t + interpolation_filter_cdf[kInterpolationFilterContexts] + [kNumExplicitInterpolationFilters + 1]; + alignas(kMaxAlignment) uint16_t + mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1]; + alignas(kMaxAlignment) uint16_t + mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + 
mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount] + [kMvFractionSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t + mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount] + [kBooleanFieldCdfSize]; + alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents] + [kMvFractionSymbolCount + 1]; + alignas(kMaxAlignment) uint16_t + mv_high_precision_cdf[kMvContexts][kNumMvComponents] + [kBooleanFieldCdfSize]; +}; + +} // namespace libgav1 +#endif // LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_ diff --git a/src/symbol_decoder_context_cdfs.inc b/src/symbol_decoder_context_cdfs.inc new file mode 100644 index 0000000..509286f --- /dev/null +++ b/src/symbol_decoder_context_cdfs.inc @@ -0,0 +1,2509 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is just a convenience to separate out all the CDF constant +// definitions from the symbol decoder context functions. + +alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf + [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = { + // width 8 + {{13636, 7258, 2376, 0, 0}, + {18840, 12913, 4228, 0, 0}, + {20246, 9089, 4139, 0, 0}, + {22872, 13985, 6915, 0, 0}}, + // width 16 + {{17171, 11839, 8197, 6062, 5104, 3947, 3167, 2197, 866, 0, 0}, + {24843, 21725, 15983, 10298, 8797, 7725, 6117, 4067, 2934, 0, 0}, + {27354, 19499, 17657, 12280, 10408, 8268, 7231, 6432, 651, 0, 0}, + {30106, 26406, 24154, 11908, 9715, 7990, 6332, 4939, 1597, 0, 0}}, + // width 32 + {{14306, 11848, 9644, 5121, 4541, 3719, 3249, 2590, 1224, 0, 0}, + {25079, 23708, 20712, 7776, 7108, 6586, 5817, 4727, 3716, 0, 0}, + {26753, 23759, 22706, 8224, 7359, 6223, 5697, 5242, 721, 0, 0}, + {31374, 30560, 29972, 4154, 3707, 3302, 2928, 2583, 869, 0, 0}}, + // width 64 + {{12631, 11221, 9690, 3202, 2931, 2507, 2244, 1876, 1044, 0, 0}, + {26036, 25278, 23271, 4824, 4518, 4253, 3799, 3138, 2664, 0, 0}, + {26823, 25105, 24420, 4085, 3651, 3019, 2704, 2470, 530, 0, 0}, + {31898, 31556, 31281, 1570, 1374, 1194, 1025, 887, 436, 0, 0}}, + // width 128 + {{4869, 4549, 4239, 284, 229, 149, 129, 0, 0}, + {26161, 25778, 24500, 708, 549, 430, 397, 0, 0}, + {27339, 26092, 25646, 741, 541, 237, 186, 0, 0}, + {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = { + {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0}, + {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0}, + {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts] + [kBooleanFieldCdfSize] = {{16384, 0, 0}, + {16384, 0, 0}, + {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + 
kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = { + {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = { + {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}}; + +// This constant is also used for DeltaLf and DeltaLfMulti. +alignas(kMaxAlignment) constexpr uint16_t + kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts] + [kIntraPredictionModesY + 1] = { + {{17180, 15741, 13430, 12550, 12086, 11658, + 10943, 9524, 8579, 4603, 3675, 2302, 0, 0}, + {20752, 14702, 13252, 12465, 12049, 11324, + 10880, 9736, 8334, 4110, 2596, 1359, 0, 0}, + {22716, 21997, 10472, 9980, 9713, 9529, 8635, + 7148, 6608, 3432, 2839, 1201, 0, 0}, + {18677, 17362, 16326, 13960, 13632, 13222, + 12770, 10672, 8022, 3183, 1810, 306, 0, 0}, + {20646, 19503, 17165, 16267, 14159, 12735, + 10377, 7185, 6331, 2507, 1695, 293, 0, 0}}, + {{22745, 13183, 11920, 11328, 10936, 10008, + 9679, 8745, 7387, 3754, 2286, 1332, 0, 0}, + {26785, 8669, 8208, 7882, 7702, 6973, 6855, + 6345, 5158, 2863, 1492, 974, 0, 0}, + {25324, 19987, 12591, 12040, 11691, 11161, + 10598, 9363, 8299, 4853, 3678, 2276, 0, 0}, + {24231, 18079, 17336, 15681, 15360, 14596, + 14360, 12943, 8119, 3615, 1672, 558, 0, 0}, + {25225, 18537, 17272, 16573, 14863, 12051, + 10784, 8252, 6767, 3093, 1787, 774, 0, 0}}, + {{20155, 19177, 11385, 10764, 10456, 10191, + 9367, 7713, 7039, 3230, 2463, 691, 0, 0}, + {23081, 19298, 14262, 13538, 13164, 12621, + 12073, 10706, 9549, 5025, 3557, 1861, 0, 0}, + {26585, 26263, 6744, 6516, 6402, 6334, 5686, + 4414, 4213, 2301, 1974, 682, 0, 0}, + {22050, 21034, 17814, 15544, 15203, 14844, + 14207, 11245, 8890, 3793, 2481, 516, 0, 0}, + {23574, 22910, 16267, 15505, 14344, 13597, + 11205, 6807, 6207, 2696, 2031, 305, 0, 0}}, + {{20166, 18369, 17280, 14387, 13990, 13453, + 13044, 11349, 7708, 3072, 1851, 359, 0, 0}, + {24565, 18947, 18244, 15663, 15329, 14637, + 14364, 13300, 7543, 3283, 1610, 426, 0, 0}, + {24317, 23037, 17764, 15125, 14756, 14343, + 13698, 11230, 8163, 3650, 2690, 750, 0, 0}, + {25054, 23720, 23252, 16101, 15951, 15774, + 15615, 14001, 6025, 2379, 1232, 240, 0, 0}, + {23925, 22488, 21272, 17451, 16116, 14825, + 13660, 10050, 6999, 2815, 1785, 283, 0, 0}}, + {{20190, 19097, 16789, 15934, 13693, 11855, + 9779, 7319, 6549, 2554, 1618, 291, 0, 0}, + {23205, 19142, 17688, 16876, 15012, 11905, + 10561, 8532, 7388, 3115, 1625, 491, 0, 0}, + {24412, 23867, 15152, 14512, 13418, 12662, + 10170, 6821, 6302, 2868, 2245, 507, 0, 0}, + {21933, 20953, 19644, 16726, 15750, 14729, + 13821, 10015, 8153, 3279, 1885, 286, 0, 0}, + {25150, 24480, 22909, 22259, 17382, 14111, + 9865, 3992, 3588, 1413, 966, 175, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = { + {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419, + 0, 0}, + {14095, 12923, 10137, 9450, 8818, 8119, 7241, 5404, 4616, 3067, 2784, + 1916, 0, 0}, + {12998, 11789, 9372, 8829, 8527, 8114, 7632, 5695, 4938, 3408, 3038, + 2109, 0, 0}, + {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752, + 4719, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] = + {{30588, 27736, 
25201, 9992, 5779, 2551, 0, 0}, + {30467, 27160, 23967, 9281, 5794, 2438, 0, 0}, + {28988, 21750, 19069, 13414, 9685, 1482, 0, 0}, + {28187, 21542, 17621, 15630, 10934, 4371, 0, 0}, + {31031, 21841, 18259, 13180, 10023, 3945, 0, 0}, + {30104, 22592, 20283, 15118, 11168, 2273, 0, 0}, + {30528, 21672, 17315, 12427, 10207, 3851, 0, 0}, + {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY] + [kIntraPredictionModesUV + 1] = { + // CFL not allowed. + {{10137, 8616, 7390, 7107, 6782, 6248, 5713, 4845, + 4524, 2709, 1827, 807, 0, 0}, + {23255, 5887, 5795, 5722, 5650, 5104, 5029, 4944, + 4409, 3263, 2968, 972, 0, 0}, + {22923, 22853, 4105, 4064, 4011, 3988, 3570, 2946, + 2914, 2004, 991, 739, 0, 0}, + {19129, 18871, 18597, 7437, 7162, 7041, 6815, 5620, + 4191, 2156, 1413, 275, 0, 0}, + {23004, 22933, 22838, 22814, 7382, 5715, 4810, 4620, + 4525, 1667, 1024, 405, 0, 0}, + {20943, 19179, 19091, 19048, 17720, 3555, 3467, 3310, + 3057, 1607, 1327, 218, 0, 0}, + {18593, 18369, 16160, 15947, 15050, 14993, 4217, 2568, + 2523, 931, 426, 101, 0, 0}, + {19883, 19730, 17790, 17178, 17095, 17020, 16592, + 3640, 3501, 2125, 807, 307, 0, 0}, + {20742, 19107, 18894, 17463, 17278, 17042, 16773, + 16495, 4325, 2380, 2001, 352, 0, 0}, + {13716, 12928, 12189, 11852, 11618, 11301, 10883, + 10049, 9594, 3907, 2389, 593, 0, 0}, + {14141, 13119, 11794, 11549, 11276, 10952, 10569, + 9649, 9241, 5715, 1371, 620, 0, 0}, + {15742, 13764, 12771, 12429, 12182, 11665, 11419, + 10861, 10286, 6872, 6227, 949, 0, 0}, + {20644, 19009, 17809, 17776, 17761, 17717, 17690, + 17602, 17513, 17015, 16729, 16162, 0, 0}}, + // CFL allowed. + {{22361, 21560, 19868, 19587, 18945, 18593, 17869, + 17112, 16782, 12682, 11773, 10313, 8556, 0, 0}, + {28236, 12988, 12711, 12553, 12340, 11697, 11569, + 11317, 10669, 8540, 8075, 5736, 3296, 0, 0}, + {27495, 27389, 12591, 12498, 12383, 12329, 11819, + 11073, 10994, 9630, 8512, 8065, 6089, 0, 0}, + {26028, 25601, 25106, 18616, 18232, 17983, 17734, + 16027, 14397, 11248, 10562, 9379, 8586, 0, 0}, + {27781, 27400, 26840, 26700, 13654, 12453, 10911, + 10515, 10357, 7857, 7388, 6741, 6392, 0, 0}, + {27398, 25879, 25521, 25375, 23270, 11654, 11366, + 11015, 10787, 7988, 7382, 6251, 5592, 0, 0}, + {27952, 27807, 25564, 25442, 24003, 23838, 12599, + 12086, 11965, 9580, 9005, 8313, 7828, 0, 0}, + {26160, 26028, 24239, 23719, 23511, 23412, 23033, + 13941, 13709, 10432, 9564, 8804, 7975, 0, 0}, + {26770, 25349, 24987, 23835, 23513, 23219, 23015, + 22351, 13870, 10274, 9629, 8004, 6779, 0, 0}, + {22108, 21470, 20218, 19811, 19446, 19144, 18728, + 17764, 17234, 12054, 10979, 9325, 7907, 0, 0}, + {22246, 21238, 20216, 19805, 19390, 18989, 18523, + 17533, 16866, 12666, 10072, 8994, 6930, 0, 0}, + {22669, 22077, 20129, 19719, 19382, 19103, 18643, + 17605, 17132, 13092, 12294, 9249, 7560, 0, 0}, + {29624, 27681, 25386, 25264, 25175, 25078, 24967, + 24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = { + 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = { + {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44, + 0, 0}, + {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88, + 84, 0, 0}, + {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75, + 
71, 0, 0}, + {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0, + 0}, + {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192, + 175, 146, 0, 0}, + {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174, + 146, 112, 108, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = { + {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0}, + {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0}, + {23374, 0, 0}, {20360, 0, 0}, {18467, 0, 0}, {16384, 0, 0}, + {14667, 0, 0}, {20012, 0, 0}, {10425, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = { + 23819, 19992, 15557, 3210, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = { + {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}}, + {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}}, + {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}}, + {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = { + {4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0}, + {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0}, + {21763, 0, 0}, {5589, 0, 0}, {12764, 0, 0}, {21487, 0, 0}, + {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0}, + {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0}, + {16680, 0, 0}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts] + [kNumSquareTransformSizes][kAllZeroContexts] + [kBooleanFieldCdfSize] = { + { + {{919, 0, 0}, {26876, 0, 0}, {20656, 0, 0}, {10833, 0, 0}, {12479, 0, 0}, + {5295, 0, 0}, {281, 0, 0}, {25114, 0, 0}, {13295, 0, 0}, {2784, 0, 0}, + {22807, 0, 0}, {2526, 0, 0}, {651, 0, 0}}, + {{1220, 0, 0}, {31219, 0, 0}, {22638, 0, 0}, {16112, 0, 0}, {14177, 0, 0}, + {6460, 0, 0}, {231, 0, 0}, {27365, 0, 0}, {14672, 0, 0}, {2765, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{2811, 0, 0}, {27377, 0, 0}, {14729, 0, 0}, {9202, 0, 0}, {10337, 0, 0}, + {6946, 0, 0}, {571, 0, 0}, {28990, 0, 0}, {17432, 0, 0}, {3787, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{14848, 0, 0}, {30950, 0, 0}, {25486, 0, 0}, {7495, 0, 0}, {21845, 0, 0}, + {1214, 0, 0}, {144, 0, 0}, {31402, 0, 0}, {17140, 0, 0}, {2306, 0, 0}, + {32622, 0, 0}, {27636, 0, 0}, {1111, 0, 0}}, + {{26460, 0, 0}, {32651, 0, 0}, {31130, 0, 0}, {30607, 0, 0}, {16384, 0, 0}, + {21845, 0, 0}, {2521, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + }, + { + {{2397, 0, 0}, {25198, 0, 0}, {19613, 0, 0}, {12017, 0, 0}, {11799, 0, 0}, + {5701, 0, 0}, {755, 0, 0}, {27273, 0, 0}, {14826, 0, 0}, {4488, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{986, 0, 0}, {30932, 0, 0}, {22079, 0, 0}, {15164, 0, 0}, {11146, 0, 0}, + {5250, 0, 0}, {369, 0, 0}, {28349, 0, 0}, {16474, 0, 0}, {4423, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{867, 0, 0}, {22457, 0, 0}, {14721, 0, 0}, {7962, 0, 0}, {9480, 0, 0}, + {4854, 0, 0}, {472, 0, 0}, {28553, 0, 0}, {17012, 0, 0}, {4427, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{6042, 0, 0}, {31723, 0, 0}, {21065, 0, 0}, {12178, 0, 0}, {14214, 0, 0}, + 
{6798, 0, 0}, {830, 0, 0}, {27185, 0, 0}, {11455, 0, 0}, {3378, 0, 0}, + {32127, 0, 0}, {10503, 0, 0}, {1316, 0, 0}}, + {{6184, 0, 0}, {32580, 0, 0}, {23921, 0, 0}, {8249, 0, 0}, {9830, 0, 0}, + {2185, 0, 0}, {160, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + }, + { + {{3154, 0, 0}, {23700, 0, 0}, {19844, 0, 0}, {13230, 0, 0}, {15031, 0, 0}, + {8149, 0, 0}, {2126, 0, 0}, {28649, 0, 0}, {16742, 0, 0}, {7111, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{811, 0, 0}, {29538, 0, 0}, {21615, 0, 0}, {14645, 0, 0}, {12625, 0, 0}, + {6232, 0, 0}, {782, 0, 0}, {29718, 0, 0}, {18165, 0, 0}, {7613, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{405, 0, 0}, {22076, 0, 0}, {13678, 0, 0}, {8411, 0, 0}, {8326, 0, 0}, + {4456, 0, 0}, {599, 0, 0}, {29120, 0, 0}, {17078, 0, 0}, {5953, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{2099, 0, 0}, {28936, 0, 0}, {21105, 0, 0}, {13879, 0, 0}, {12986, 0, 0}, + {9455, 0, 0}, {1438, 0, 0}, {27644, 0, 0}, {14049, 0, 0}, {4300, 0, 0}, + {29686, 0, 0}, {11786, 0, 0}, {3325, 0, 0}}, + {{4195, 0, 0}, {29585, 0, 0}, {14966, 0, 0}, {6791, 0, 0}, {6091, 0, 0}, + {4936, 0, 0}, {381, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + }, + { + {{5881, 0, 0}, {26039, 0, 0}, {22407, 0, 0}, {15326, 0, 0}, {17723, 0, 0}, + {10290, 0, 0}, {3696, 0, 0}, {30055, 0, 0}, {20907, 0, 0}, {11995, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{865, 0, 0}, {30724, 0, 0}, {25240, 0, 0}, {18150, 0, 0}, {16586, 0, 0}, + {8600, 0, 0}, {1731, 0, 0}, {29982, 0, 0}, {21574, 0, 0}, {12613, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{258, 0, 0}, {24338, 0, 0}, {15450, 0, 0}, {8614, 0, 0}, {9094, 0, 0}, + {3979, 0, 0}, {629, 0, 0}, {29328, 0, 0}, {19651, 0, 0}, {10066, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}}, + {{1097, 0, 0}, {30712, 0, 0}, {21022, 0, 0}, {15916, 0, 0}, {14133, 0, 0}, + {8053, 0, 0}, {1284, 0, 0}, {28112, 0, 0}, {16694, 0, 0}, {8064, 0, 0}, + {30962, 0, 0}, {18123, 0, 0}, {7432, 0, 0}}, + {{1229, 0, 0}, {24335, 0, 0}, {12192, 0, 0}, {4864, 0, 0}, {4916, 0, 0}, + {2742, 0, 0}, {327, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}} + } +}; +/* clang-format on */ + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes + + 1] = { + {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920, + 8834, 7294, 5041, 3853, 2137, 0, 0}, + {31123, 30195, 27990, 27057, 24961, 24146, 22246, 17411, 15094, 12360, + 10251, 7758, 5652, 3912, 2019, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}}, + {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + // Only 16x16 is used in this case. 
+ {31998, 30347, 27543, 19861, 16949, 13841, 11207, 8679, 6173, 4242, + 2239, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf + [2][kNumExtendedTransformSizes][kIntraPredictionModesY] + [kNumTransformTypes + 1] = { + {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0}, + {32204, 29433, 23059, 21898, 14625, 4674, 0, 0}, + {32096, 29521, 29092, 20786, 13353, 9641, 0, 0}, + {27489, 18883, 17281, 14724, 9241, 2516, 0, 0}, + {28345, 26694, 24783, 22352, 7075, 3470, 0, 0}, + {31282, 28527, 23308, 22106, 16312, 5074, 0, 0}, + {32329, 29930, 29246, 26031, 14710, 9014, 0, 0}, + {31578, 28535, 27913, 21098, 12487, 8391, 0, 0}, + {31723, 28456, 24121, 22609, 14124, 3433, 0, 0}, + {32566, 29034, 28021, 25470, 15641, 8752, 0, 0}, + {32321, 28456, 25949, 23884, 16758, 8910, 0, 0}, + {32491, 28399, 27513, 23863, 16303, 10497, 0, 0}, + {29359, 27332, 22169, 17169, 13081, 8728, 0, 0}}, + {{30898, 19026, 18238, 16270, 8998, 5070, 0, 0}, + {32442, 23972, 18136, 17689, 13496, 5282, 0, 0}, + {32284, 25192, 25056, 18325, 13609, 10177, 0, 0}, + {31642, 17428, 16873, 15745, 11872, 2489, 0, 0}, + {32113, 27914, 27519, 26855, 10669, 5630, 0, 0}, + {31469, 26310, 23883, 23478, 17917, 7271, 0, 0}, + {32457, 27473, 27216, 25883, 16661, 10096, 0, 0}, + {31885, 24709, 24498, 21510, 15479, 11219, 0, 0}, + {32027, 25188, 23450, 22423, 16080, 3722, 0, 0}, + {32658, 25362, 24853, 23573, 16727, 9439, 0, 0}, + {32405, 24794, 23411, 22095, 17139, 8294, 0, 0}, + {32615, 25121, 24656, 22832, 17461, 12772, 0, 0}, + {29257, 26436, 21603, 17433, 13445, 9174, 0, 0}}}, + {{{26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}}, + {{26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}, + {26214, 19661, 13107, 6554, 0, 0}}, + {{31641, 19954, 9996, 5285, 0, 0}, + {32623, 26007, 20788, 6101, 0, 0}, + {32406, 26881, 21090, 16043, 0, 0}, + {32383, 17555, 14181, 2075, 0, 0}, + {32743, 29854, 9634, 4865, 0, 0}, + {32708, 28298, 21019, 8777, 0, 0}, + {32731, 29436, 18257, 11320, 0, 0}, + {32611, 26448, 19732, 15329, 0, 0}, + {32649, 26049, 19862, 3372, 0, 0}, + {32721, 27231, 20192, 11269, 0, 0}, + {32499, 26692, 21510, 9653, 0, 0}, + {32685, 27153, 20767, 15540, 0, 0}, + {30800, 27212, 20745, 14221, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPtContexts][kEobPt16SymbolCount + 1] = { + {{{31928, 31729, 30788, 27873, 0, 0}, + {32398, 32097, 30885, 28297, 0, 0}}, + {{29521, 27818, 23080, 18205, 0, 0}, + {30864, 29414, 25005, 18121, 0, 0}}}, + {{{30643, 30217, 27603, 23822, 0, 0}, + {32255, 32003, 30909, 26429, 0, 0}}, + {{25131, 23270, 
18509, 13660, 0, 0}, + {30271, 28672, 23902, 15775, 0, 0}}}, + {{{28752, 27871, 23887, 17800, 0, 0}, + {32052, 31663, 30122, 22712, 0, 0}}, + {{21629, 19498, 14527, 9202, 0, 0}, + {29576, 27736, 22471, 13013, 0, 0}}}, + {{{26060, 23810, 18022, 10635, 0, 0}, + {31546, 30694, 27985, 17358, 0, 0}}, + {{13193, 11002, 6724, 3059, 0, 0}, + {25471, 22001, 13495, 4574, 0, 0}}}}; +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPtContexts][kEobPt32SymbolCount + 1] = { + {{{32368, 32248, 31791, 30666, 26226, 0, 0}, + {32558, 32363, 31453, 29442, 25231, 0, 0}}, + {{30132, 28495, 25180, 20974, 12367, 0, 0}, + {30982, 29589, 25866, 21411, 13714, 0, 0}}}, + {{{31779, 31519, 30749, 28617, 21983, 0, 0}, + {32455, 32327, 31669, 29851, 24206, 0, 0}}, + {{24374, 22416, 18836, 13913, 6754, 0, 0}, + {30190, 28644, 24587, 19098, 8534, 0, 0}}}, + {{{30253, 29765, 28316, 24606, 16727, 0, 0}, + {32194, 31947, 30932, 27679, 19640, 0, 0}}, + {{19300, 16465, 12407, 7663, 3487, 0, 0}, + {29226, 27266, 22353, 16008, 7124, 0, 0}}}, + {{{28151, 27059, 24322, 19184, 9633, 0, 0}, + {31612, 31066, 29093, 23494, 12229, 0, 0}}, + {{10682, 8486, 5758, 2998, 1025, 0, 0}, + {25069, 21871, 11877, 5842, 1140, 0, 0}}}}; +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPtContexts][kEobPt64SymbolCount + 1] = { + {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0}, + {32433, 32038, 31309, 27274, 24013, 19771, 0, 0}}, + {{29263, 27464, 22682, 18954, 15084, 9398, 0, 0}, + {31205, 30068, 27892, 21857, 18062, 10288, 0, 0}}}, + {{{31508, 31322, 30515, 29056, 26116, 19399, 0, 0}, + {32367, 32163, 31739, 30205, 26923, 20142, 0, 0}}, + {{24159, 22156, 18144, 14054, 10154, 3744, 0, 0}, + {30845, 29641, 26901, 23065, 18491, 5668, 0, 0}}}, + {{{30394, 29996, 28185, 25492, 20480, 13062, 0, 0}, + {32271, 31958, 31453, 29768, 25764, 17127, 0, 0}}, + {{17718, 15642, 11358, 7882, 4612, 2042, 0, 0}, + {28734, 26478, 22533, 17786, 11554, 4277, 0, 0}}}, + {{{26461, 25227, 20708, 16410, 10215, 4903, 0, 0}, + {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}}, + {{8556, 7060, 4500, 2733, 1461, 719, 0, 0}, + {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}}; +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf + [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts] + [kEobPt128SymbolCount + 1] = { + {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0}, + {32397, 32069, 31514, 27938, 23289, 20206, 15271, 0, 0}}, + {{27523, 25312, 19888, 16916, 12735, 8836, 5160, 0, 0}, + {30714, 29296, 26899, 18536, 14526, 12178, 6016, 0, 0}}}, + {{{32083, 31835, 31280, 30054, 28002, 24206, 13514, 0, 0}, + {32551, 32416, 32150, 30465, 27507, 22799, 15296, 0, 0}}, + {{24723, 21568, 17271, 13173, 8820, 5360, 1830, 0, 0}, + {30458, 28608, 25297, 17771, 14837, 12000, 2528, 0, 0}}}, + {{{31402, 31030, 30241, 27752, 23413, 16971, 8125, 0, 0}, + {32414, 32210, 31824, 30008, 25481, 18731, 10989, 0, 0}}, + {{19141, 16522, 12595, 8339, 4820, 2353, 905, 0, 0}, + {26493, 22879, 17999, 9604, 4780, 2275, 496, 0, 0}}}, + {{{29296, 27883, 25279, 20287, 14251, 8232, 3133, 0, 0}, + {31882, 31037, 29497, 24299, 17199, 10642, 4385, 0, 0}}, + {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0}, + {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf + [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts] + [kEobPt256SymbolCount + 1] = { + {{{32458, 32184, 30881, 29179, 
26600, 24157, 21416, 17116, 0, 0}, + {31770, 30918, 29770, 27164, 15427, 12880, 9869, 7185, 0, 0}}, + {{30248, 29528, 26816, 23898, 20191, 15210, 12814, 8600, 0, 0}, + {30565, 28638, 25333, 22029, 12116, 9087, 7159, 5507, 0, 0}}}, + {{{31320, 30659, 28617, 26505, 23439, 19508, 14824, 9468, 0, 0}, + {32369, 31749, 31019, 29730, 22324, 17222, 10029, 5474, 0, 0}}, + {{26366, 24620, 20145, 17696, 14040, 9921, 6321, 3391, 0, 0}, + {31094, 29516, 27034, 22609, 10371, 8966, 7947, 1828, 0, 0}}}, + {{{29679, 28848, 26730, 23308, 18502, 12887, 7002, 3592, 0, 0}, + {31684, 30410, 29280, 27646, 21285, 14665, 6745, 2969, 0, 0}}, + {{21254, 18974, 15288, 12014, 8407, 5390, 3276, 1491, 0, 0}, + {26197, 23158, 17252, 10942, 3676, 1939, 926, 60, 0, 0}}}, + {{{27420, 25655, 20948, 16844, 10662, 5991, 2434, 1011, 0, 0}, + {30315, 28294, 26461, 23991, 16294, 9793, 3768, 1221, 0, 0}}, + {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0}, + {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf + [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] = + {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0}, + {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}}, + {{31538, 30490, 27733, 24992, 20897, 17422, 13178, 8184, 4019, 0, 0}, + {25503, 22789, 16949, 13518, 10988, 8922, 6290, 4372, 957, 0, 0}}, + {{30144, 28832, 26288, 23082, 18789, 15042, 9501, 4358, 1690, 0, 0}, + {20753, 17999, 13180, 10716, 8546, 6956, 5468, 3549, 654, 0, 0}}, + {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0}, + {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kEobPt1024SymbolCount + 1] = { + {{32375, 32347, 32017, 31145, 29608, 26416, 19423, + 14721, 10197, 6938, 0, 0}, + {30903, 30780, 29838, 28526, 22235, 16230, 11414, + 5513, 4222, 984, 0, 0}}, + {{32072, 31820, 29623, 27066, 23062, 19551, 14917, + 10912, 7076, 4734, 0, 0}, + {30096, 29177, 23438, 15684, 10043, 8484, 6241, + 4741, 4391, 1892, 0, 0}}, + {{29984, 28937, 25727, 22247, 17921, 13924, 9613, + 6086, 3539, 1723, 0, 0}, + {23191, 20302, 15029, 12018, 10707, 9553, 8167, + 7285, 6925, 712, 0, 0}}, + {{26070, 24434, 20807, 17006, 12582, 8906, 5334, + 3442, 1686, 718, 0, 0}, + {12199, 10342, 7199, 5909, 4715, 3855, 3282, 3044, + 2961, 198, 0, 0}}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts] + [kNumSquareTransformSizes][kNumPlaneTypes] + [kEobExtraContexts][kBooleanFieldCdfSize] = { + { + { + {{15807, 0, 0}, {15545, 0, 0}, {25147, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{13699, 0, 0}, {10243, 0, 0}, {19391, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12367, 0, 0}, {15743, 0, 0}, {19923, 0, 0}, {19895, 0, 0}, + {18674, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12087, 0, 0}, {12067, 0, 0}, {17518, 0, 0}, {17751, 0, 0}, + {17840, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{8863, 0, 0}, {15574, 0, 0}, {16598, 0, 0}, {15073, 0, 0}, + {18942, 0, 0}, {16958, 0, 0}, {20732, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{8809, 0, 0}, {11969, 0, 0}, {13747, 0, 0}, {16565, 0, 0}, + {14882, 0, 0}, {18624, 0, 0}, {20758, 0, 0}, {16384, 0, 0}, + 
{16384, 0, 0}} + }, + { + {{5369, 0, 0}, {16441, 0, 0}, {14697, 0, 0}, {13184, 0, 0}, + {12047, 0, 0}, {14336, 0, 0}, {13208, 0, 0}, {22618, 0, 0}, + {23963, 0, 0}}, + {{7836, 0, 0}, {11935, 0, 0}, {20741, 0, 0}, {16098, 0, 0}, + {12854, 0, 0}, {17662, 0, 0}, {15106, 0, 0}, {18985, 0, 0}, + {4012, 0, 0}} + }, + { + {{9362, 0, 0}, {10923, 0, 0}, {14336, 0, 0}, {16384, 0, 0}, + {15672, 0, 0}, {20207, 0, 0}, {15448, 0, 0}, {10373, 0, 0}, + {11398, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + }, + { + { + {{15297, 0, 0}, {12545, 0, 0}, {21411, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12433, 0, 0}, {11101, 0, 0}, {17950, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12338, 0, 0}, {12106, 0, 0}, {17401, 0, 0}, {15798, 0, 0}, + {18111, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{10651, 0, 0}, {10740, 0, 0}, {14118, 0, 0}, {16726, 0, 0}, + {16883, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{10359, 0, 0}, {11756, 0, 0}, {17118, 0, 0}, {15373, 0, 0}, + {17299, 0, 0}, {12563, 0, 0}, {13257, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{8548, 0, 0}, {10288, 0, 0}, {15031, 0, 0}, {13852, 0, 0}, + {13500, 0, 0}, {14356, 0, 0}, {13924, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{6777, 0, 0}, {12454, 0, 0}, {15037, 0, 0}, {13090, 0, 0}, + {14119, 0, 0}, {15461, 0, 0}, {10970, 0, 0}, {15219, 0, 0}, + {17138, 0, 0}}, + {{6183, 0, 0}, {11299, 0, 0}, {12336, 0, 0}, {15033, 0, 0}, + {13488, 0, 0}, {17533, 0, 0}, {12471, 0, 0}, {10297, 0, 0}, + {3771, 0, 0}} + }, + { + {{6163, 0, 0}, {21464, 0, 0}, {16042, 0, 0}, {16208, 0, 0}, + {11902, 0, 0}, {9244, 0, 0}, {12890, 0, 0}, {19299, 0, 0}, + {9684, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + }, + { + { + {{13785, 0, 0}, {12256, 0, 0}, {17883, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12678, 0, 0}, {13324, 0, 0}, {15482, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{13629, 0, 0}, {11281, 0, 0}, {13809, 0, 0}, {11858, 0, 0}, + {13679, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12232, 0, 0}, {12104, 0, 0}, {12143, 0, 0}, {13645, 0, 0}, + {17906, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12935, 0, 0}, {11266, 0, 0}, {15283, 0, 0}, {12501, 0, 0}, + {14415, 0, 0}, {9439, 0, 0}, {11290, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{10727, 0, 0}, {9334, 0, 0}, {12767, 0, 0}, {12214, 0, 0}, + {11817, 0, 0}, {12623, 0, 0}, {17206, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{9456, 0, 0}, {11161, 0, 0}, {16242, 0, 0}, {13811, 0, 0}, + {14734, 0, 0}, {13834, 0, 0}, {8521, 0, 0}, {15847, 0, 0}, + {15688, 0, 0}}, + {{6189, 0, 0}, {7858, 0, 0}, {14131, 0, 0}, {12968, 0, 0}, + {12380, 0, 0}, {22881, 0, 0}, {17126, 0, 0}, {2570, 0, 0}, + {8047, 0, 0}} + }, + { + {{5770, 0, 0}, {16031, 0, 0}, {14930, 0, 0}, {13846, 0, 0}, + {13253, 0, 0}, {14132, 0, 0}, {15435, 0, 0}, {16992, 0, 0}, + {10110, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + }, + { + 
{ + {{12591, 0, 0}, {11979, 0, 0}, {12506, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{11352, 0, 0}, {11913, 0, 0}, {9358, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12530, 0, 0}, {11711, 0, 0}, {13609, 0, 0}, {10431, 0, 0}, + {12609, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{12643, 0, 0}, {12209, 0, 0}, {11061, 0, 0}, {10472, 0, 0}, + {15435, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{12827, 0, 0}, {12241, 0, 0}, {11298, 0, 0}, {10281, 0, 0}, + {13210, 0, 0}, {10414, 0, 0}, {12437, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}, + {{10016, 0, 0}, {7762, 0, 0}, {10693, 0, 0}, {11192, 0, 0}, + {15028, 0, 0}, {11078, 0, 0}, {13557, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + }, + { + {{11326, 0, 0}, {10410, 0, 0}, {14265, 0, 0}, {12477, 0, 0}, + {12823, 0, 0}, {11474, 0, 0}, {11590, 0, 0}, {13368, 0, 0}, + {22212, 0, 0}}, + {{8120, 0, 0}, {7819, 0, 0}, {12060, 0, 0}, {8863, 0, 0}, + {12267, 0, 0}, {23210, 0, 0}, {23345, 0, 0}, {2403, 0, 0}, + {13515, 0, 0}} + }, + { + {{6704, 0, 0}, {10670, 0, 0}, {13155, 0, 0}, {12243, 0, 0}, + {15173, 0, 0}, {16150, 0, 0}, {12271, 0, 0}, {13779, 0, 0}, + {17255, 0, 0}}, + {{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}} + } + } +}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts] + [kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseEobContexts] + [kCoeffBaseEobSymbolCount + 1] = { + { + { + {{14931, 3713, 0, 0}, {3168, 1322, 0, 0}, {1924, 890, 0, 0}, + {7842, 3820, 0, 0}}, + {{11403, 2742, 0, 0}, {2256, 345, 0, 0}, {1110, 147, 0, 0}, + {3138, 887, 0, 0}} + }, + { + {{27051, 6291, 0, 0}, {2277, 1065, 0, 0}, {1218, 610, 0, 0}, + {3120, 1277, 0, 0}}, + {{20160, 4948, 0, 0}, {2088, 543, 0, 0}, {1959, 433, 0, 0}, + {1469, 345, 0, 0}} + }, + { + {{30982, 20156, 0, 0}, {2105, 1143, 0, 0}, {429, 300, 0, 0}, + {1620, 935, 0, 0}}, + {{13911, 8903, 0, 0}, {1340, 340, 0, 0}, {1024, 395, 0, 0}, + {993, 242, 0, 0}} + }, + { + {{30981, 30236, 0, 0}, {1936, 1106, 0, 0}, {944, 86, 0, 0}, + {635, 199, 0, 0}}, + {{19017, 10533, 0, 0}, {679, 359, 0, 0}, {5684, 4848, 0, 0}, + {3477, 174, 0, 0}} + }, + { + {{31043, 29319, 0, 0}, {1666, 833, 0, 0}, {311, 155, 0, 0}, + {356, 119, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + }, + { + { + {{15208, 2880, 0, 0}, {3097, 1219, 0, 0}, {1761, 712, 0, 0}, + {5482, 2762, 0, 0}}, + {{6174, 1556, 0, 0}, {1560, 186, 0, 0}, {933, 131, 0, 0}, + {2173, 562, 0, 0}} + }, + { + {{17529, 2836, 0, 0}, {1453, 673, 0, 0}, {638, 334, 0, 0}, + {1904, 772, 0, 0}}, + {{6489, 1800, 0, 0}, {1626, 273, 0, 0}, {1055, 228, 0, 0}, + {839, 174, 0, 0}} + }, + { + {{30124, 7570, 0, 0}, {730, 317, 0, 0}, {129, 73, 0, 0}, + {602, 250, 0, 0}}, + {{15581, 5100, 0, 0}, {1054, 218, 0, 0}, {485, 90, 0, 0}, + {838, 205, 0, 0}} + }, + { + {{31724, 30511, 0, 0}, {2013, 845, 0, 0}, {560, 75, 0, 0}, + {524, 153, 0, 0}}, + {{11451, 6561, 0, 0}, {3635, 1900, 0, 0}, {3457, 1537, 0, 0}, + {3111, 1681, 0, 0}} + }, + { + {{32290, 30934, 0, 0}, {1763, 781, 0, 0}, {451, 44, 0, 0}, + {1903, 120, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + }, + { + { + {{12676, 1994, 0, 0}, {2073, 748, 0, 0}, {1637, 665, 0, 0}, + {4102, 1898, 0, 
0}}, + {{5510, 1673, 0, 0}, {964, 145, 0, 0}, {1005, 240, 0, 0}, + {1330, 262, 0, 0}} + }, + { + {{14719, 2279, 0, 0}, {1062, 482, 0, 0}, {605, 295, 0, 0}, + {1218, 584, 0, 0}}, + {{5652, 1926, 0, 0}, {797, 170, 0, 0}, {680, 192, 0, 0}, + {701, 104, 0, 0}} + }, + { + {{19914, 3675, 0, 0}, {496, 210, 0, 0}, {101, 39, 0, 0}, + {462, 183, 0, 0}}, + {{7292, 2402, 0, 0}, {599, 81, 0, 0}, {289, 79, 0, 0}, + {1095, 134, 0, 0}} + }, + { + {{29959, 13467, 0, 0}, {563, 146, 0, 0}, {430, 38, 0, 0}, + {982, 152, 0, 0}}, + {{10031, 3663, 0, 0}, {1958, 406, 0, 0}, {2754, 141, 0, 0}, + {2240, 194, 0, 0}} + }, + { + {{31833, 29386, 0, 0}, {1979, 859, 0, 0}, {302, 12, 0, 0}, + {1908, 255, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + }, + { + { + {{10271, 1570, 0, 0}, {1053, 273, 0, 0}, {1162, 431, 0, 0}, + {2380, 778, 0, 0}}, + {{4891, 1184, 0, 0}, {598, 40, 0, 0}, {613, 80, 0, 0}, + {549, 66, 0, 0}} + }, + { + {{11311, 1725, 0, 0}, {817, 285, 0, 0}, {615, 206, 0, 0}, + {1295, 553, 0, 0}}, + {{5210, 1617, 0, 0}, {748, 128, 0, 0}, {671, 193, 0, 0}, + {526, 49, 0, 0}} + }, + { + {{12788, 2177, 0, 0}, {549, 171, 0, 0}, {187, 62, 0, 0}, + {965, 481, 0, 0}}, + {{6295, 2261, 0, 0}, {337, 45, 0, 0}, {572, 157, 0, 0}, + {1180, 240, 0, 0}} + }, + { + {{8121, 2305, 0, 0}, {356, 73, 0, 0}, {300, 48, 0, 0}, + {1499, 245, 0, 0}}, + {{4286, 1263, 0, 0}, {616, 67, 0, 0}, {1036, 170, 0, 0}, + {1001, 56, 0, 0}} + }, + { + {{20410, 7791, 0, 0}, {1437, 383, 0, 0}, {134, 12, 0, 0}, + {2357, 220, 0, 0}}, + {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}} + } + } +}; +/* clang-format on */ + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf + [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = { + {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0}, + {20172, 6644, 2275, 0, 0}, {23322, 11650, 5763, 0, 0}, + {26460, 17627, 11489, 0, 0}, {30305, 26411, 22985, 0, 0}, + {12101, 2222, 839, 0, 0}, {19725, 6645, 2634, 0, 0}, + {24617, 14011, 7990, 0, 0}, {27513, 19929, 14136, 0, 0}, + {29948, 25562, 21607, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {17032, 5215, 2164, 0, 0}, + {21558, 8974, 3981, 0, 0}, {26821, 18894, 13067, 0, 0}, + {28553, 23445, 18877, 0, 0}, {29935, 26306, 22709, 0, 0}, + {13163, 2375, 1186, 0, 0}, {19245, 6516, 2520, 0, 0}, + {24322, 14146, 8256, 0, 0}, {28950, 22425, 16794, 0, 0}, + {31287, 28651, 25972, 0, 0}, {10119, 1466, 578, 0, 0}, + {17939, 5641, 2319, 0, 0}, {24455, 15066, 9464, 0, 0}, + {29746, 24467, 19982, 0, 0}, {31232, 28356, 25584, 0, 0}, + {10414, 2994, 1396, 0, 0}, {18045, 7296, 3554, 0, 0}, + {26095, 19023, 14106, 0, 0}, {30700, 27002, 23446, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{26466, 16324, 11007, 0, 0}, {9728, 1230, 293, 0, 0}, + {17572, 4316, 1272, 0, 0}, {22748, 9822, 4254, 0, 0}, + {26235, 15906, 9267, 0, 0}, {29230, 22952, 17692, 0, 0}, + {8324, 893, 243, 0, 0}, {16887, 3844, 1133, 0, 0}, + {22846, 9895, 4302, 0, 0}, {26241, 15802, 9077, 0, 0}, + {28654, 21465, 15548, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 
0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {12567, 1998, 559, 0, 0}, + {18014, 4697, 1510, 0, 0}, {24390, 12582, 6251, 0, 0}, + {26852, 17469, 10790, 0, 0}, {28500, 21185, 14867, 0, 0}, + {8407, 743, 187, 0, 0}, {14095, 2663, 825, 0, 0}, + {22572, 10524, 5192, 0, 0}, {27273, 18419, 12351, 0, 0}, + {30092, 25353, 21270, 0, 0}, {8090, 810, 183, 0, 0}, + {14139, 2862, 937, 0, 0}, {23404, 12044, 6453, 0, 0}, + {28127, 20450, 14674, 0, 0}, {30010, 25381, 21189, 0, 0}, + {7335, 926, 299, 0, 0}, {13973, 3479, 1357, 0, 0}, + {25124, 15184, 9176, 0, 0}, {29360, 23754, 17721, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{28232, 22696, 18767, 0, 0}, {7309, 1352, 562, 0, 0}, + {16163, 4720, 1950, 0, 0}, {21760, 9911, 5049, 0, 0}, + {25853, 16500, 10453, 0, 0}, {30143, 25956, 22231, 0, 0}, + {8511, 980, 269, 0, 0}, {15888, 3314, 889, 0, 0}, + {20810, 7714, 2990, 0, 0}, {24852, 14050, 7684, 0, 0}, + {29385, 23991, 19322, 0, 0}, {10048, 1165, 375, 0, 0}, + {17808, 4643, 1433, 0, 0}, {23037, 10558, 4840, 0, 0}, + {26464, 16936, 10491, 0, 0}, {29858, 24950, 20602, 0, 0}, + {12393, 2141, 637, 0, 0}, {18864, 5484, 1881, 0, 0}, + {23400, 11210, 5624, 0, 0}, {26831, 17802, 11649, 0, 0}, + {30101, 25543, 21449, 0, 0}, {8798, 1298, 390, 0, 0}, + {15595, 3034, 750, 0, 0}, {19973, 7327, 2803, 0, 0}, + {23787, 13088, 6875, 0, 0}, {28040, 21396, 15866, 0, 0}, + {8481, 971, 329, 0, 0}, {16065, 3623, 1072, 0, 0}, + {21935, 9214, 4043, 0, 0}, {26300, 16202, 9711, 0, 0}, + {30353, 26206, 22490, 0, 0}, {6158, 373, 109, 0, 0}, + {14178, 2270, 651, 0, 0}, {20348, 7012, 2818, 0, 0}, + {25129, 14022, 8058, 0, 0}, {29767, 24682, 20421, 0, 0}, + {7692, 704, 188, 0, 0}, {14822, 2640, 740, 0, 0}, + {20744, 7783, 3390, 0, 0}, {25251, 14378, 8464, 0, 0}, + {29525, 23987, 19437, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{26731, 15997, 10811, 0, 0}, {7994, 1064, 342, 0, 0}, + {15938, 4179, 1712, 0, 0}, {22166, 9940, 5008, 0, 0}, + {26035, 15939, 9697, 0, 0}, {29518, 23854, 19212, 0, 0}, + {7186, 548, 100, 0, 0}, {14109, 2426, 545, 0, 0}, + {20222, 6619, 2253, 0, 0}, {24348, 12317, 5967, 0, 0}, + {28132, 20348, 14424, 0, 0}, {5187, 406, 129, 0, 0}, + {13781, 2685, 790, 0, 0}, {21441, 8520, 3684, 0, 0}, + {25504, 15049, 8648, 0, 0}, {28773, 22000, 16599, 0, 0}, + {6875, 937, 281, 0, 0}, {16191, 4181, 1389, 0, 0}, + {22579, 10020, 4586, 0, 0}, {25936, 15674, 9212, 0, 0}, + {29060, 22658, 17434, 0, 0}, {6864, 486, 112, 0, 0}, + {13047, 1976, 492, 0, 0}, {19949, 6525, 2357, 0, 0}, + {24196, 12154, 5877, 0, 0}, {27404, 18709, 12301, 0, 0}, + {6188, 330, 91, 0, 0}, {11916, 1543, 428, 0, 0}, + {20333, 7068, 2801, 0, 0}, {24077, 11943, 5792, 0, 0}, + {28322, 20559, 15499, 0, 0}, {5418, 339, 72, 0, 0}, + {11396, 1791, 496, 0, 0}, {20095, 7498, 2915, 0, 0}, + {23560, 11843, 6128, 0, 0}, {27750, 19417, 14036, 0, 0}, + {5417, 289, 55, 0, 0}, {11370, 1559, 381, 0, 0}, + {20606, 7721, 2926, 0, 0}, {24872, 14077, 7449, 0, 0}, + {28098, 19886, 13887, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{27281, 22308, 19060, 0, 0}, {11171, 4465, 2094, 0, 0}, + {21731, 10815, 6292, 0, 0}, {24621, 14806, 9816, 0, 0}, + {27526, 19707, 14236, 0, 0}, {30879, 27560, 24586, 0, 0}, + {5994, 635, 178, 0, 0}, {14924, 3204, 1001, 0, 0}, + {21078, 8330, 3597, 0, 0}, {25226, 14553, 8309, 0, 0}, + {29775, 24718, 20449, 0, 0}, {4745, 440, 177, 0, 0}, + {14117, 2642, 814, 0, 0}, {20604, 7622, 3179, 0, 0}, + {25006, 14238, 7997, 0, 0}, {29276, 23585, 
18848, 0, 0}, + {5177, 760, 277, 0, 0}, {15619, 3915, 1258, 0, 0}, + {21283, 8765, 3908, 0, 0}, {25071, 14682, 8558, 0, 0}, + {29693, 24769, 20550, 0, 0}, {4500, 286, 114, 0, 0}, + {13137, 1717, 364, 0, 0}, {18908, 5508, 1748, 0, 0}, + {23163, 11155, 5174, 0, 0}, {27892, 20606, 14860, 0, 0}, + {5520, 452, 192, 0, 0}, {13813, 2311, 693, 0, 0}, + {20944, 8771, 3973, 0, 0}, {25422, 14572, 8121, 0, 0}, + {29365, 23521, 18657, 0, 0}, {3057, 113, 33, 0, 0}, + {11599, 1374, 351, 0, 0}, {19281, 5570, 1811, 0, 0}, + {23940, 11085, 5154, 0, 0}, {28498, 21317, 15730, 0, 0}, + {4060, 190, 37, 0, 0}, {12648, 1527, 286, 0, 0}, + {19076, 5218, 1447, 0, 0}, {23350, 10254, 4329, 0, 0}, + {27769, 19485, 13306, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{27095, 18466, 13057, 0, 0}, {6517, 2067, 934, 0, 0}, + {19986, 8985, 4965, 0, 0}, {23641, 12111, 6960, 0, 0}, + {26400, 16560, 11306, 0, 0}, {30303, 25591, 21946, 0, 0}, + {2807, 205, 49, 0, 0}, {14450, 2877, 819, 0, 0}, + {21407, 8254, 3411, 0, 0}, {24868, 13165, 7161, 0, 0}, + {28766, 22178, 17222, 0, 0}, {3131, 458, 173, 0, 0}, + {14472, 2855, 959, 0, 0}, {22624, 11253, 5897, 0, 0}, + {27410, 18446, 12374, 0, 0}, {29701, 24406, 19422, 0, 0}, + {4116, 298, 92, 0, 0}, {15230, 1997, 559, 0, 0}, + {18844, 5886, 2274, 0, 0}, {22272, 9931, 4899, 0, 0}, + {25532, 16372, 11147, 0, 0}, {2025, 81, 22, 0, 0}, + {9762, 1092, 279, 0, 0}, {18274, 4940, 1648, 0, 0}, + {22594, 9967, 4416, 0, 0}, {26526, 17487, 11725, 0, 0}, + {6951, 525, 48, 0, 0}, {14150, 1401, 443, 0, 0}, + {18771, 4450, 890, 0, 0}, {20513, 6234, 1385, 0, 0}, + {23207, 11180, 4318, 0, 0}, {4580, 133, 44, 0, 0}, + {10708, 403, 40, 0, 0}, {14666, 2078, 240, 0, 0}, + {18572, 3904, 769, 0, 0}, {20506, 6976, 1903, 0, 0}, + {8592, 659, 140, 0, 0}, {14488, 3087, 805, 0, 0}, + {22563, 9065, 3104, 0, 0}, {24879, 12743, 5092, 0, 0}, + {26708, 16025, 8798, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{27627, 25672, 24508, 0, 0}, {5582, 3746, 2979, 0, 0}, + {26100, 20200, 17086, 0, 0}, {30596, 26587, 24130, 0, 0}, + {31642, 29389, 28237, 0, 0}, {32325, 31407, 30514, 0, 0}, + {6685, 1615, 332, 0, 0}, {19282, 8165, 4285, 0, 0}, + {26260, 17928, 12858, 0, 0}, {29382, 23968, 19482, 0, 0}, + {31238, 28446, 25714, 0, 0}, {3129, 688, 220, 0, 0}, + {16871, 5216, 2478, 0, 0}, {24180, 12721, 7385, 0, 0}, + {27879, 19429, 13499, 0, 0}, {30528, 25897, 22270, 0, 0}, + {4603, 571, 251, 0, 0}, {12033, 2341, 1200, 0, 0}, + {18443, 8097, 5076, 0, 0}, {27649, 20214, 14963, 0, 0}, + {30958, 27327, 24507, 0, 0}, {1556, 44, 20, 0, 0}, + {9416, 1002, 223, 0, 0}, {18099, 5198, 1709, 0, 0}, + {24276, 11874, 5496, 0, 0}, {29124, 22574, 17564, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{30307, 25755, 23397, 0, 0}, {8019, 3168, 1782, 0, 0}, + {23302, 13731, 10351, 0, 0}, {29184, 23488, 18368, 0, 0}, + {31263, 28839, 27335, 0, 0}, {32091, 31268, 30032, 0, 0}, + {8781, 2066, 651, 0, 0}, {19214, 8197, 3505, 0, 0}, + {26557, 18212, 11613, 0, 0}, {29633, 21796, 17143, 0, 0}, + {30333, 25641, 21341, 0, 0}, {1468, 236, 218, 0, 0}, + {18011, 2403, 814, 0, 0}, {28363, 21156, 14215, 0, 0}, + {32188, 28636, 25446, 0, 0}, {31073, 22599, 
18644, 0, 0}, + {2760, 486, 177, 0, 0}, {13524, 2660, 1020, 0, 0}, + {21588, 8610, 3213, 0, 0}, {27118, 17796, 13559, 0, 0}, + {30654, 27659, 24312, 0, 0}, {912, 52, 20, 0, 0}, + {9756, 1104, 196, 0, 0}, {19074, 6112, 2132, 0, 0}, + {24626, 13260, 6675, 0, 0}, {28515, 21813, 16044, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{32167, 31785, 31457, 0, 0}, {14043, 9362, 4681, 0, 0}, + {27307, 24576, 21845, 0, 0}, {28987, 17644, 11343, 0, 0}, + {30181, 25007, 20696, 0, 0}, {32662, 32310, 31958, 0, 0}, + {10486, 3058, 874, 0, 0}, {24260, 11842, 6784, 0, 0}, + {29042, 20055, 14685, 0, 0}, {31148, 25656, 21875, 0, 0}, + {32039, 30532, 29273, 0, 0}, {2605, 294, 84, 0, 0}, + {14464, 2304, 768, 0, 0}, {21325, 6242, 3121, 0, 0}, + {26761, 17476, 11469, 0, 0}, {30534, 26065, 23831, 0, 0}, + {1814, 591, 197, 0, 0}, {15405, 3206, 1692, 0, 0}, + {23082, 10304, 5358, 0, 0}, {24576, 16384, 11378, 0, 0}, + {31013, 24722, 21504, 0, 0}, {1600, 34, 20, 0, 0}, + {10282, 1327, 297, 0, 0}, {19935, 7141, 3030, 0, 0}, + {25788, 15389, 9646, 0, 0}, {29657, 23881, 19289, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}, + {{{{26727, 20914, 16841, 0, 0}, {12442, 1863, 517, 0, 0}, + {18604, 5937, 2043, 0, 0}, {23008, 12121, 6183, 0, 0}, + {26352, 17815, 11549, 0, 0}, {29802, 25617, 21877, 0, 0}, + {9201, 1394, 514, 0, 0}, {17790, 5352, 1822, 0, 0}, + {23334, 12543, 6514, 0, 0}, {26110, 18210, 12233, 0, 0}, + {28852, 24091, 19779, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 
8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {14680, 3223, 1181, 0, 0}, + {19706, 6925, 2695, 0, 0}, {23828, 15941, 10517, 0, 0}, + {25114, 19548, 14795, 0, 0}, {27035, 22452, 18312, 0, 0}, + {9889, 1380, 654, 0, 0}, {17553, 4775, 1813, 0, 0}, + {23371, 13323, 7790, 0, 0}, {29326, 22955, 17424, 0, 0}, + {31400, 28832, 26236, 0, 0}, {7274, 735, 362, 0, 0}, + {15996, 4805, 2050, 0, 0}, {23349, 14603, 9508, 0, 0}, + {30091, 25267, 20971, 0, 0}, {31252, 28424, 25598, 0, 0}, + {6212, 1314, 667, 0, 0}, {15640, 5733, 2660, 0, 0}, + {24444, 17424, 12519, 0, 0}, {30865, 27072, 23299, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24313, 13765, 8400, 0, 0}, {9205, 747, 164, 0, 0}, + {16531, 3322, 833, 0, 0}, {22044, 8769, 3410, 0, 0}, + {26043, 15240, 8352, 0, 0}, {28841, 21841, 15943, 0, 0}, + {6455, 480, 134, 0, 0}, {15338, 2673, 673, 0, 0}, + {21652, 8162, 3089, 0, 0}, {25573, 14384, 7499, 0, 0}, + {28042, 19916, 13453, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {9946, 1120, 285, 0, 0}, + {16044, 3135, 839, 0, 0}, {22507, 9735, 4043, 0, 0}, + {25739, 14928, 8240, 0, 0}, {27901, 18882, 11266, 0, 0}, + {7470, 876, 277, 0, 0}, {14959, 3438, 1256, 0, 0}, + {23100, 11439, 6189, 0, 0}, {27994, 19812, 13792, 0, 0}, + {30446, 25738, 21228, 0, 0}, {7296, 848, 225, 0, 0}, + {14811, 3381, 1136, 0, 0}, {23572, 12175, 6368, 0, 0}, + {28088, 20063, 13566, 0, 0}, {29851, 24312, 19332, 0, 0}, + {6297, 709, 194, 0, 0}, {14310, 2985, 859, 0, 0}, + {24368, 13304, 6812, 0, 0}, {28956, 21795, 15562, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{25989, 19025, 15090, 0, 0}, {7962, 971, 311, 0, 0}, + {15152, 3721, 1396, 0, 0}, {21705, 9593, 4765, 0, 0}, + {26247, 16658, 10444, 0, 0}, {30004, 25264, 21114, 0, 0}, + {7502, 401, 131, 0, 0}, {13714, 2215, 593, 0, 0}, + {20629, 7556, 2961, 0, 0}, {25457, 14606, 8064, 0, 0}, + {29371, 23604, 18694, 0, 0}, {6780, 560, 246, 0, 0}, + {16515, 3856, 1242, 0, 0}, {23617, 11381, 5396, 0, 0}, + {27080, 17853, 11272, 0, 0}, {30051, 25141, 20764, 0, 0}, + {9624, 913, 325, 0, 0}, {16698, 4277, 1443, 0, 0}, + {24066, 12301, 6251, 0, 0}, {27525, 18812, 12401, 0, 0}, + {30147, 25433, 21201, 0, 0}, {6132, 428, 138, 0, 0}, + {12778, 1718, 427, 0, 0}, {19525, 6663, 2453, 0, 0}, + {24180, 13247, 6850, 0, 0}, {28051, 21183, 15464, 0, 0}, + {6924, 476, 186, 0, 0}, {13678, 2133, 671, 0, 0}, + {20805, 8222, 3829, 0, 0}, {26550, 16681, 10414, 0, 0}, + {30428, 26160, 22342, 0, 0}, {4722, 192, 74, 0, 0}, + {11590, 1455, 472, 0, 0}, {19282, 6584, 2898, 0, 0}, + {25619, 14897, 9045, 0, 0}, {29935, 24810, 20509, 0, 0}, + {5058, 240, 82, 0, 0}, {12094, 1692, 500, 0, 0}, + {20355, 7813, 3525, 0, 0}, {26092, 15841, 9671, 0, 0}, + {29802, 24435, 19849, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24129, 13429, 8339, 0, 0}, {8364, 931, 243, 0, 0}, + {15771, 3343, 984, 0, 0}, {21515, 8534, 3619, 0, 0}, + {26017, 15374, 8740, 0, 0}, {29278, 22938, 17577, 0, 0}, + {6485, 297, 54, 0, 0}, {13169, 1600, 326, 0, 0}, + {19622, 5814, 1875, 0, 0}, {24554, 12180, 5878, 0, 0}, + {28069, 19687, 13468, 0, 0}, {4556, 310, 99, 0, 0}, + 
{14174, 2452, 668, 0, 0}, {21549, 8360, 3534, 0, 0}, + {25903, 15112, 8619, 0, 0}, {29090, 22406, 16762, 0, 0}, + {6943, 632, 152, 0, 0}, {15455, 2915, 747, 0, 0}, + {21571, 8297, 3296, 0, 0}, {25821, 14987, 8363, 0, 0}, + {29000, 22108, 16507, 0, 0}, {5416, 268, 62, 0, 0}, + {11918, 1300, 299, 0, 0}, {18747, 5061, 1635, 0, 0}, + {23804, 11020, 4930, 0, 0}, {27331, 18103, 11581, 0, 0}, + {6464, 276, 70, 0, 0}, {12359, 1388, 383, 0, 0}, + {19086, 5546, 2136, 0, 0}, {23794, 11532, 6083, 0, 0}, + {28534, 21103, 15834, 0, 0}, {6495, 411, 57, 0, 0}, + {12096, 1526, 327, 0, 0}, {18596, 5514, 1866, 0, 0}, + {22898, 10870, 5493, 0, 0}, {27604, 19262, 13498, 0, 0}, + {6043, 309, 40, 0, 0}, {11777, 1326, 241, 0, 0}, + {19697, 6334, 1957, 0, 0}, {24584, 12678, 6026, 0, 0}, + {27965, 19513, 12873, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{25213, 17826, 14267, 0, 0}, {8358, 1590, 481, 0, 0}, + {18374, 6030, 2515, 0, 0}, {24355, 13214, 7573, 0, 0}, + {28002, 19844, 13983, 0, 0}, {30739, 26962, 23561, 0, 0}, + {5992, 404, 105, 0, 0}, {14036, 2801, 837, 0, 0}, + {21763, 8982, 3916, 0, 0}, {26302, 15859, 9258, 0, 0}, + {29724, 24130, 19349, 0, 0}, {3560, 186, 64, 0, 0}, + {12700, 1911, 560, 0, 0}, {20765, 7683, 3173, 0, 0}, + {25821, 15018, 8579, 0, 0}, {29523, 23665, 18761, 0, 0}, + {5409, 303, 99, 0, 0}, {13347, 2154, 594, 0, 0}, + {20853, 7758, 3189, 0, 0}, {25818, 15092, 8694, 0, 0}, + {29761, 24295, 19672, 0, 0}, {3766, 92, 33, 0, 0}, + {10666, 919, 192, 0, 0}, {18360, 4759, 1363, 0, 0}, + {23741, 11089, 4837, 0, 0}, {28074, 20090, 14020, 0, 0}, + {4552, 240, 86, 0, 0}, {11919, 1504, 450, 0, 0}, + {20012, 6953, 3017, 0, 0}, {25203, 13967, 7845, 0, 0}, + {29259, 23235, 18291, 0, 0}, {2635, 81, 29, 0, 0}, + {9705, 858, 253, 0, 0}, {18180, 4717, 1636, 0, 0}, + {23683, 11119, 5311, 0, 0}, {28507, 21114, 15504, 0, 0}, + {3250, 77, 20, 0, 0}, {10317, 809, 155, 0, 0}, + {17904, 4046, 1068, 0, 0}, {23073, 9804, 4052, 0, 0}, + {27836, 19410, 13266, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{26303, 15810, 11080, 0, 0}, {7569, 1254, 408, 0, 0}, + {17994, 5619, 2161, 0, 0}, {23511, 11330, 5796, 0, 0}, + {27045, 17585, 10886, 0, 0}, {29618, 23889, 19037, 0, 0}, + {5779, 506, 86, 0, 0}, {15372, 2831, 683, 0, 0}, + {21381, 7867, 2984, 0, 0}, {25479, 13947, 7220, 0, 0}, + {29034, 22191, 16682, 0, 0}, {3040, 267, 73, 0, 0}, + {15337, 3067, 865, 0, 0}, {22847, 9942, 4468, 0, 0}, + {26872, 17334, 10700, 0, 0}, {29338, 23122, 18011, 0, 0}, + {4154, 257, 63, 0, 0}, {13404, 2130, 505, 0, 0}, + {19639, 6514, 2366, 0, 0}, {24014, 12284, 6328, 0, 0}, + {28390, 21161, 15658, 0, 0}, {2476, 97, 24, 0, 0}, + {10988, 1165, 267, 0, 0}, {18454, 4939, 1477, 0, 0}, + {23157, 10441, 4505, 0, 0}, {27878, 19681, 13703, 0, 0}, + {6906, 201, 35, 0, 0}, {11974, 718, 201, 0, 0}, + {15525, 2143, 514, 0, 0}, {19485, 5140, 1294, 0, 0}, + {23099, 10236, 3850, 0, 0}, {5333, 71, 20, 0, 0}, + {7846, 378, 54, 0, 0}, {11319, 1264, 232, 0, 0}, + {16376, 3039, 936, 0, 0}, {21076, 7884, 3692, 0, 0}, + {8575, 478, 33, 0, 0}, {13859, 1664, 205, 0, 0}, + {20532, 5927, 1365, 0, 0}, {24597, 10928, 3686, 0, 0}, + {25544, 15488, 7493, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{29690, 25929, 22878, 0, 0}, {18931, 12318, 8289, 0, 0}, + {26854, 18546, 13440, 0, 0}, {28902, 22501, 18006, 0, 0}, + {30156, 25560, 21726, 0, 0}, {31701, 29777, 27992, 0, 0}, + {6951, 1122, 239, 0, 0}, {19060, 6430, 2383, 0, 0}, + {25440, 14183, 7898, 0, 0}, {28077, 19688, 13492, 0, 0}, + {30943, 27515, 24416, 0, 0}, {3382, 453, 144, 0, 0}, + {15608, 3767, 1408, 0, 0}, {23166, 10906, 5372, 
0, 0}, + {26853, 16996, 10620, 0, 0}, {29982, 24989, 20721, 0, 0}, + {3522, 318, 105, 0, 0}, {14072, 2839, 950, 0, 0}, + {22258, 9399, 4208, 0, 0}, {26539, 16269, 9643, 0, 0}, + {30160, 25320, 21063, 0, 0}, {2015, 58, 20, 0, 0}, + {11130, 1281, 265, 0, 0}, {19831, 5914, 1898, 0, 0}, + {24586, 12172, 5798, 0, 0}, {29131, 22499, 17271, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{27524, 20618, 15862, 0, 0}, {12282, 5910, 3067, 0, 0}, + {25012, 14451, 9033, 0, 0}, {29316, 23512, 19622, 0, 0}, + {30748, 27562, 24539, 0, 0}, {30967, 27775, 24865, 0, 0}, + {5717, 910, 237, 0, 0}, {16780, 5237, 2149, 0, 0}, + {23580, 11284, 6049, 0, 0}, {26495, 15582, 8968, 0, 0}, + {29660, 23413, 18004, 0, 0}, {1692, 248, 88, 0, 0}, + {14649, 2731, 918, 0, 0}, {22524, 9799, 5296, 0, 0}, + {28076, 18691, 13495, 0, 0}, {29074, 21091, 15212, 0, 0}, + {2708, 187, 48, 0, 0}, {11757, 1993, 648, 0, 0}, + {20837, 7948, 3479, 0, 0}, {25649, 15106, 8412, 0, 0}, + {28935, 22062, 16464, 0, 0}, {814, 37, 20, 0, 0}, + {8855, 1044, 279, 0, 0}, {17248, 4708, 1482, 0, 0}, + {21251, 9760, 4197, 0, 0}, {26575, 18260, 12139, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{31733, 29961, 28612, 0, 0}, {19606, 14630, 11829, 0, 0}, + {30072, 26135, 24013, 0, 0}, {31395, 28607, 25915, 0, 0}, + {31669, 30022, 28052, 0, 0}, {32428, 31747, 31169, 0, 0}, + {9942, 2349, 633, 0, 0}, {22373, 11006, 5826, 0, 0}, + {28042, 20361, 15407, 0, 0}, {30321, 25688, 22175, 0, 0}, + {31541, 29051, 26757, 0, 0}, {4612, 1344, 834, 0, 0}, + {15853, 5014, 2395, 0, 0}, {23620, 11778, 6337, 0, 0}, + {26818, 17253, 11620, 0, 0}, {30276, 25441, 21242, 0, 0}, + {2166, 291, 98, 0, 0}, {12742, 2813, 1200, 0, 0}, + {21548, 9140, 4663, 0, 0}, {26116, 15749, 9795, 0, 0}, + {29704, 24232, 19725, 0, 0}, {999, 44, 20, 0, 0}, + {10538, 1881, 395, 0, 0}, {20534, 7689, 3037, 0, 0}, + {25442, 13952, 7415, 0, 0}, {28835, 21861, 16152, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 
8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}, + {{{{23872, 16541, 12138, 0, 0}, {9139, 986, 241, 0, 0}, + {17595, 5013, 1447, 0, 0}, {22610, 11535, 5386, 0, 0}, + {26348, 17911, 11210, 0, 0}, {29499, 24613, 20122, 0, 0}, + {7933, 759, 272, 0, 0}, {16259, 4347, 1189, 0, 0}, + {21811, 11254, 5350, 0, 0}, {24887, 16838, 10672, 0, 0}, + {27380, 21808, 16850, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {12023, 1995, 675, 0, 0}, + {17568, 5547, 1907, 0, 0}, {19736, 11895, 7101, 0, 0}, + {20483, 14105, 9274, 0, 0}, {21205, 15287, 11279, 0, 0}, + {6508, 786, 448, 0, 0}, {17371, 4685, 1668, 0, 0}, + {23026, 13551, 7944, 0, 0}, {29507, 23139, 17406, 0, 0}, + {31288, 28446, 25269, 0, 0}, {5169, 512, 308, 0, 0}, + {15911, 5109, 1994, 0, 0}, {23217, 14478, 9020, 0, 0}, + {29716, 23835, 18665, 0, 0}, {30747, 26858, 22981, 0, 0}, + {3763, 753, 376, 0, 0}, {15091, 5074, 1905, 0, 0}, + {23564, 15412, 9549, 0, 0}, {30365, 25252, 19954, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{21960, 10712, 5872, 0, 0}, {7029, 455, 92, 0, 0}, + {15480, 2565, 547, 0, 0}, {21409, 7890, 2872, 0, 0}, + {25819, 15001, 7875, 0, 0}, {28481, 20972, 14697, 0, 0}, + {4888, 247, 63, 0, 0}, {13730, 1764, 354, 0, 0}, + {20204, 6423, 2000, 0, 0}, {24499, 12821, 5989, 0, 0}, + {27094, 18111, 11094, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {7026, 449, 97, 0, 0}, + {13211, 1604, 314, 0, 0}, {19387, 6387, 2013, 0, 0}, + {22667, 11302, 6046, 0, 0}, {23559, 13118, 5943, 0, 0}, + {5661, 851, 336, 0, 0}, {14712, 3875, 1565, 0, 0}, + {22568, 11334, 6004, 0, 0}, {28108, 19855, 13266, 0, 0}, + {30400, 25838, 20264, 0, 0}, {5808, 610, 155, 0, 0}, + {14140, 2763, 737, 0, 0}, {22535, 10326, 4536, 0, 0}, + {27297, 18138, 11252, 0, 0}, {29533, 22001, 15659, 0, 0}, + {5072, 328, 76, 0, 0}, {12736, 1601, 330, 0, 0}, + {24068, 11427, 4326, 0, 0}, {27106, 17937, 10973, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{23064, 15474, 11636, 0, 0}, {6006, 490, 135, 0, 0}, + {14386, 3148, 949, 0, 0}, {21877, 9293, 4045, 0, 0}, + {26410, 16185, 9459, 0, 0}, {29520, 23650, 18627, 0, 0}, + {5564, 195, 69, 0, 0}, {12950, 1944, 439, 0, 0}, + {20996, 7648, 2727, 0, 0}, {25773, 14735, 7729, 0, 
0}, + {29016, 22326, 16670, 0, 0}, {5546, 512, 209, 0, 0}, + {17412, 4369, 1293, 0, 0}, {23947, 12133, 5711, 0, 0}, + {27257, 18364, 11529, 0, 0}, {29833, 24546, 19717, 0, 0}, + {7893, 648, 239, 0, 0}, {17535, 4503, 1323, 0, 0}, + {24163, 12198, 5836, 0, 0}, {27337, 18355, 11572, 0, 0}, + {29774, 24427, 19545, 0, 0}, {4567, 164, 68, 0, 0}, + {11727, 1322, 312, 0, 0}, {19547, 6555, 2293, 0, 0}, + {24513, 13383, 6731, 0, 0}, {27838, 20183, 13938, 0, 0}, + {4000, 320, 141, 0, 0}, {13063, 2207, 747, 0, 0}, + {21196, 9179, 4548, 0, 0}, {27236, 17734, 11322, 0, 0}, + {30308, 25618, 21312, 0, 0}, {2894, 149, 69, 0, 0}, + {11147, 1697, 567, 0, 0}, {20257, 8021, 3776, 0, 0}, + {26487, 16373, 10020, 0, 0}, {29522, 23490, 18271, 0, 0}, + {3053, 143, 56, 0, 0}, {11810, 1757, 485, 0, 0}, + {21535, 9097, 3962, 0, 0}, {26756, 16640, 9900, 0, 0}, + {29341, 22917, 17354, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{21752, 10657, 5974, 0, 0}, {6822, 411, 91, 0, 0}, + {14878, 2316, 516, 0, 0}, {21090, 7626, 2952, 0, 0}, + {26048, 15234, 8184, 0, 0}, {28538, 21103, 14948, 0, 0}, + {4368, 145, 21, 0, 0}, {11604, 1100, 193, 0, 0}, + {19196, 5380, 1586, 0, 0}, {24534, 12018, 5410, 0, 0}, + {27703, 18713, 11871, 0, 0}, {3787, 221, 63, 0, 0}, + {14087, 2225, 529, 0, 0}, {21849, 8693, 3482, 0, 0}, + {26337, 15569, 8691, 0, 0}, {28949, 22304, 16150, 0, 0}, + {5898, 301, 75, 0, 0}, {13727, 1937, 421, 0, 0}, + {20974, 7557, 2752, 0, 0}, {25880, 14749, 7798, 0, 0}, + {28398, 20405, 13776, 0, 0}, {3190, 98, 24, 0, 0}, + {9609, 761, 155, 0, 0}, {17453, 4099, 1092, 0, 0}, + {23470, 10161, 3986, 0, 0}, {26624, 16855, 9800, 0, 0}, + {4658, 269, 99, 0, 0}, {11194, 1831, 753, 0, 0}, + {20009, 7950, 4041, 0, 0}, {26223, 16007, 9726, 0, 0}, + {29119, 22171, 15935, 0, 0}, {4605, 216, 40, 0, 0}, + {10667, 1299, 304, 0, 0}, {19608, 7296, 2625, 0, 0}, + {25465, 14084, 7300, 0, 0}, {27527, 18793, 11813, 0, 0}, + {4368, 137, 24, 0, 0}, {10664, 975, 165, 0, 0}, + {19211, 6197, 1922, 0, 0}, {25019, 12907, 6093, 0, 0}, + {27895, 18738, 11534, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{22968, 15133, 11695, 0, 0}, {6615, 883, 241, 0, 0}, + {17730, 4916, 1762, 0, 0}, {24050, 12204, 6282, 0, 0}, + {27640, 18692, 12254, 0, 0}, {30132, 25202, 20843, 0, 0}, + {5217, 264, 67, 0, 0}, {14458, 2714, 668, 0, 0}, + {22557, 9348, 3686, 0, 0}, {26546, 15892, 8852, 0, 0}, + {29306, 22814, 17270, 0, 0}, {2777, 135, 47, 0, 0}, + {12885, 2017, 567, 0, 0}, {21627, 8584, 3483, 0, 0}, + {26348, 15828, 8994, 0, 0}, {29376, 23015, 17650, 0, 0}, + {4303, 152, 56, 0, 0}, {12918, 2066, 524, 0, 0}, + {21785, 8744, 3545, 0, 0}, {26474, 15998, 9186, 0, 0}, + {29524, 23485, 18259, 0, 0}, {2745, 51, 20, 0, 0}, + {9828, 736, 142, 0, 0}, {18486, 4840, 1295, 0, 0}, + {24206, 11441, 4854, 0, 0}, {27922, 19375, 12849, 0, 0}, + {2787, 178, 73, 0, 0}, {12303, 1805, 602, 0, 0}, + {21289, 9189, 4573, 0, 0}, {26852, 17120, 10695, 0, 0}, + {29737, 24163, 19370, 0, 0}, {1622, 77, 29, 0, 0}, + {9662, 1044, 324, 0, 0}, {18985, 6030, 2329, 0, 0}, + {24916, 13300, 6961, 0, 0}, {28908, 21644, 15915, 0, 0}, + {1754, 44, 20, 0, 0}, {9139, 659, 140, 0, 0}, + {18021, 4653, 1365, 0, 0}, {24223, 11526, 5290, 0, 0}, + {28194, 19987, 13701, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{23583, 13074, 8080, 0, 0}, {6687, 783, 147, 0, 0}, + {16753, 3768, 981, 0, 0}, {22226, 9078, 3562, 0, 0}, + {26036, 14823, 8091, 0, 0}, {28852, 21729, 16046, 0, 0}, + {4544, 202, 24, 0, 0}, {13668, 1630, 283, 0, 0}, + {20240, 6148, 1889, 0, 0}, {25027, 12491, 5883, 0, 0}, + {28202, 19923, 13778, 0, 0}, {2835, 175, 50, 0, 
0}, + {15098, 2435, 613, 0, 0}, {22383, 9168, 3859, 0, 0}, + {26525, 16532, 10361, 0, 0}, {28792, 22379, 16751, 0, 0}, + {4391, 207, 30, 0, 0}, {13402, 1593, 286, 0, 0}, + {19441, 5593, 1674, 0, 0}, {24510, 11999, 5625, 0, 0}, + {28065, 19570, 13241, 0, 0}, {1682, 62, 20, 0, 0}, + {9915, 866, 185, 0, 0}, {18009, 4582, 1349, 0, 0}, + {23484, 10386, 4420, 0, 0}, {27183, 17576, 10900, 0, 0}, + {4477, 116, 22, 0, 0}, {12919, 661, 197, 0, 0}, + {17934, 5950, 3554, 0, 0}, {22462, 10174, 4096, 0, 0}, + {26153, 15384, 9384, 0, 0}, {3821, 164, 23, 0, 0}, + {7143, 479, 122, 0, 0}, {14010, 4096, 1365, 0, 0}, + {22751, 9338, 4245, 0, 0}, {25906, 17499, 10637, 0, 0}, + {8835, 259, 29, 0, 0}, {12841, 1273, 137, 0, 0}, + {20865, 6745, 2147, 0, 0}, {25742, 12674, 5516, 0, 0}, + {26770, 14662, 8331, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{28312, 21494, 17235, 0, 0}, {11549, 3689, 1152, 0, 0}, + {21595, 8994, 4201, 0, 0}, {25486, 14475, 8505, 0, 0}, + {27878, 19482, 13653, 0, 0}, {30878, 27260, 24109, 0, 0}, + {6117, 632, 121, 0, 0}, {18138, 4514, 1313, 0, 0}, + {24052, 11481, 5373, 0, 0}, {27153, 17437, 10760, 0, 0}, + {30093, 25068, 20618, 0, 0}, {2814, 242, 78, 0, 0}, + {16642, 3786, 1135, 0, 0}, {23738, 11407, 5416, 0, 0}, + {27357, 17975, 11497, 0, 0}, {29825, 24346, 19605, 0, 0}, + {3229, 167, 38, 0, 0}, {14643, 2383, 567, 0, 0}, + {22346, 8678, 3300, 0, 0}, {26300, 15281, 8330, 0, 0}, + {29798, 24115, 19237, 0, 0}, {1856, 53, 20, 0, 0}, + {12102, 1395, 271, 0, 0}, {20259, 6128, 1851, 0, 0}, + {24710, 12139, 5478, 0, 0}, {28537, 20762, 14716, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{22566, 12135, 7284, 0, 0}, {5432, 1323, 416, 0, 0}, + {20348, 8384, 4216, 0, 0}, {25120, 14653, 8912, 0, 0}, + {27106, 18427, 12866, 0, 0}, {29157, 22440, 17378, 0, 0}, + {1823, 152, 32, 0, 0}, {14086, 2263, 515, 0, 0}, + {21255, 7432, 2565, 0, 0}, {25319, 13316, 6620, 0, 0}, + {28286, 19717, 13882, 0, 0}, {746, 78, 21, 0, 0}, + {14190, 2267, 622, 0, 0}, {21519, 9400, 4137, 0, 0}, + {27123, 15810, 10610, 0, 0}, {27759, 21324, 16131, 0, 0}, + {1411, 58, 20, 0, 0}, {11216, 1274, 264, 0, 0}, + {18877, 5091, 1428, 0, 0}, {23717, 10670, 4596, 0, 0}, + {27578, 19391, 13282, 0, 0}, {404, 28, 20, 0, 0}, + {7929, 861, 217, 0, 0}, {15608, 3989, 1072, 0, 0}, + {20316, 8631, 3166, 0, 0}, {26603, 17379, 10291, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{30193, 25487, 21691, 0, 0}, {18766, 11902, 7366, 0, 0}, + {26425, 17712, 13110, 0, 0}, {28294, 20910, 15727, 0, 0}, + {29903, 24469, 20234, 0, 0}, {31424, 28819, 26377, 0, 0}, + {8048, 1529, 309, 0, 0}, {20183, 7412, 2800, 0, 0}, + {25587, 14522, 8324, 0, 0}, {27743, 19101, 12883, 0, 0}, + {30247, 25464, 21163, 0, 0}, {2860, 516, 
184, 0, 0}, + {15347, 3612, 1193, 0, 0}, {22879, 10580, 4986, 0, 0}, + {26890, 17121, 10645, 0, 0}, {29954, 24103, 19445, 0, 0}, + {2585, 200, 55, 0, 0}, {14240, 2573, 719, 0, 0}, + {21786, 8162, 3111, 0, 0}, {25811, 14603, 7537, 0, 0}, + {29260, 22650, 17300, 0, 0}, {1007, 32, 20, 0, 0}, + {11727, 1440, 222, 0, 0}, {20200, 6036, 1602, 0, 0}, + {24716, 12048, 5035, 0, 0}, {28432, 20576, 14372, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}, + {{{{25706, 16296, 10449, 0, 0}, {8230, 507, 94, 0, 0}, + {19093, 4727, 989, 0, 0}, {24178, 12094, 5137, 0, 0}, + {27083, 18093, 10755, 0, 0}, {29113, 22870, 17037, 0, 0}, + {6275, 350, 110, 0, 0}, {16392, 3426, 678, 0, 0}, + {22174, 10119, 3798, 0, 0}, {24592, 15598, 8465, 0, 0}, + {27163, 20074, 13629, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {8880, 866, 226, 0, 0}, + {14156, 3081, 781, 0, 0}, {16523, 7916, 3519, 0, 0}, + {17003, 10160, 5209, 0, 0}, {12873, 8069, 5258, 0, 0}, + {4367, 556, 311, 0, 0}, {17494, 4943, 1788, 0, 0}, + {23404, 14640, 8436, 0, 0}, {30485, 24575, 17686, 0, 0}, + {31540, 28796, 24887, 0, 0}, {3313, 299, 148, 0, 0}, + {14787, 4523, 1380, 0, 0}, {21847, 12670, 6528, 0, 0}, + {29025, 20939, 14111, 0, 0}, {30394, 23175, 17053, 0, 0}, + {1700, 302, 133, 0, 0}, {12447, 3196, 797, 0, 0}, + {21997, 12513, 5649, 0, 0}, {29973, 22358, 15407, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{23448, 10666, 4928, 0, 0}, {5711, 304, 44, 0, 0}, + {16437, 2500, 459, 0, 0}, {22449, 8833, 3048, 0, 0}, + {26579, 16320, 8662, 0, 0}, {29179, 21884, 13960, 0, 0}, + {3742, 144, 20, 0, 0}, {13542, 1261, 181, 0, 0}, + {20076, 5847, 1565, 0, 0}, {25719, 13236, 5133, 
0, 0}, + {25041, 17099, 9516, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {4712, 143, 20, 0, 0}, + {10385, 693, 99, 0, 0}, {17351, 5670, 1019, 0, 0}, + {14641, 6275, 5578, 0, 0}, {27307, 16384, 10923, 0, 0}, + {4786, 677, 184, 0, 0}, {13723, 2900, 796, 0, 0}, + {22371, 10502, 4836, 0, 0}, {26778, 19071, 11268, 0, 0}, + {30976, 25856, 17664, 0, 0}, {4570, 267, 50, 0, 0}, + {11234, 1247, 199, 0, 0}, {21659, 7551, 2751, 0, 0}, + {27097, 17644, 6617, 0, 0}, {28087, 18725, 14043, 0, 0}, + {4080, 188, 27, 0, 0}, {10192, 689, 107, 0, 0}, + {22141, 10627, 4428, 0, 0}, {23406, 18725, 4681, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{25014, 15820, 10626, 0, 0}, {7098, 438, 77, 0, 0}, + {17105, 3543, 774, 0, 0}, {22890, 9480, 3610, 0, 0}, + {26349, 15680, 8432, 0, 0}, {28909, 21765, 15729, 0, 0}, + {5206, 173, 43, 0, 0}, {15193, 2180, 369, 0, 0}, + {21949, 7930, 2459, 0, 0}, {25644, 14082, 6852, 0, 0}, + {28289, 20080, 13428, 0, 0}, {4383, 292, 95, 0, 0}, + {17462, 3763, 830, 0, 0}, {23831, 11153, 4446, 0, 0}, + {26786, 17165, 9982, 0, 0}, {29148, 22501, 16632, 0, 0}, + {5488, 304, 101, 0, 0}, {17161, 3608, 764, 0, 0}, + {23677, 10633, 4028, 0, 0}, {26536, 16136, 8748, 0, 0}, + {28721, 21391, 15096, 0, 0}, {3548, 138, 50, 0, 0}, + {13118, 1548, 306, 0, 0}, {19718, 6456, 1941, 0, 0}, + {23540, 11898, 5300, 0, 0}, {26622, 17619, 10797, 0, 0}, + {2599, 287, 145, 0, 0}, {15556, 3457, 1214, 0, 0}, + {22857, 11457, 5886, 0, 0}, {28281, 19454, 12396, 0, 0}, + {30198, 24996, 19879, 0, 0}, {1844, 155, 60, 0, 0}, + {13278, 2562, 661, 0, 0}, {21536, 8770, 3492, 0, 0}, + {25999, 14813, 7733, 0, 0}, {28370, 20145, 13554, 0, 0}, + {2159, 141, 46, 0, 0}, {13398, 2186, 481, 0, 0}, + {22311, 9149, 3359, 0, 0}, {26325, 15131, 7934, 0, 0}, + {28123, 19532, 12662, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24142, 12497, 6552, 0, 0}, {6061, 362, 57, 0, 0}, + {15769, 2439, 482, 0, 0}, {21323, 7645, 2482, 0, 0}, + {26357, 13940, 7167, 0, 0}, {25967, 20310, 12520, 0, 0}, + {2850, 86, 20, 0, 0}, {12119, 1029, 150, 0, 0}, + {19889, 4995, 1187, 0, 0}, {24872, 11017, 4524, 0, 0}, + {27508, 17898, 9070, 0, 0}, {3516, 175, 37, 0, 0}, + {15696, 2308, 474, 0, 0}, {22115, 8625, 3403, 0, 0}, + {26232, 15278, 8785, 0, 0}, {27839, 19598, 12683, 0, 0}, + {4631, 250, 53, 0, 0}, {14597, 1984, 361, 0, 0}, + {21331, 7332, 2309, 0, 0}, {25516, 14234, 6592, 0, 0}, + {28642, 19415, 11790, 0, 0}, {1606, 42, 20, 0, 0}, + {9751, 546, 67, 0, 0}, {17139, 3535, 722, 0, 0}, + {23381, 10147, 3288, 0, 0}, {25846, 15152, 7758, 0, 0}, + {3930, 503, 154, 0, 0}, {13067, 2562, 848, 0, 0}, + {21554, 10358, 4835, 0, 0}, {27448, 18591, 9734, 0, 0}, + {27719, 19887, 14941, 0, 0}, {5284, 297, 34, 0, 0}, + {11692, 1242, 207, 0, 0}, {20061, 6465, 1557, 0, 0}, + {24599, 11046, 4549, 0, 0}, {26723, 13362, 5726, 0, 0}, + {5015, 196, 23, 0, 0}, {11936, 890, 115, 0, 0}, + {19518, 5412, 1094, 0, 0}, {25050, 11260, 2910, 0, 0}, + {25559, 14418, 7209, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{24892, 15867, 11027, 0, 0}, {8767, 870, 143, 0, 0}, + {18239, 4809, 1317, 0, 0}, {24495, 11950, 5510, 0, 0}, + {27490, 18095, 11258, 0, 0}, {29785, 23925, 18729, 0, 0}, + {4752, 194, 36, 0, 0}, {15297, 2462, 467, 0, 0}, + {22544, 8705, 3040, 0, 0}, {26166, 14814, 7716, 0, 0}, + {28766, 21183, 15009, 0, 0}, 
{2578, 134, 29, 0, 0}, + {15271, 2486, 498, 0, 0}, {22539, 9039, 3230, 0, 0}, + {26424, 15557, 8328, 0, 0}, {28919, 21579, 15660, 0, 0}, + {4198, 185, 42, 0, 0}, {15247, 2607, 530, 0, 0}, + {22615, 9203, 3390, 0, 0}, {26313, 15427, 8325, 0, 0}, + {28861, 21726, 15744, 0, 0}, {2079, 53, 20, 0, 0}, + {11222, 928, 158, 0, 0}, {19221, 5187, 1309, 0, 0}, + {23856, 11011, 4459, 0, 0}, {27220, 17688, 10722, 0, 0}, + {1985, 228, 83, 0, 0}, {15228, 3240, 1100, 0, 0}, + {22608, 11300, 5985, 0, 0}, {28044, 19375, 12714, 0, 0}, + {30066, 24594, 19666, 0, 0}, {1120, 82, 26, 0, 0}, + {11814, 1674, 431, 0, 0}, {20348, 7070, 2589, 0, 0}, + {25464, 13448, 6520, 0, 0}, {28402, 20507, 13904, 0, 0}, + {1187, 45, 20, 0, 0}, {11395, 1182, 243, 0, 0}, + {20024, 6143, 1883, 0, 0}, {25337, 12446, 5818, 0, 0}, + {28076, 19445, 12657, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24935, 14399, 8673, 0, 0}, {6118, 495, 66, 0, 0}, + {16397, 2807, 577, 0, 0}, {21713, 8686, 3139, 0, 0}, + {25876, 14124, 7368, 0, 0}, {27762, 19711, 13528, 0, 0}, + {2934, 102, 20, 0, 0}, {13191, 1433, 198, 0, 0}, + {20515, 6259, 1646, 0, 0}, {24777, 11996, 5057, 0, 0}, + {27091, 16858, 9709, 0, 0}, {2659, 236, 48, 0, 0}, + {16021, 2602, 516, 0, 0}, {22634, 9226, 3584, 0, 0}, + {26977, 16592, 9212, 0, 0}, {28406, 22354, 15484, 0, 0}, + {3276, 142, 20, 0, 0}, {12874, 1366, 243, 0, 0}, + {19826, 5697, 1899, 0, 0}, {24422, 11552, 5363, 0, 0}, + {26196, 15681, 8909, 0, 0}, {733, 33, 20, 0, 0}, + {9811, 930, 150, 0, 0}, {18044, 4196, 996, 0, 0}, + {22404, 8769, 3215, 0, 0}, {25764, 14335, 7113, 0, 0}, + {5240, 491, 87, 0, 0}, {15809, 1597, 672, 0, 0}, + {22282, 9175, 4806, 0, 0}, {24576, 16384, 9557, 0, 0}, + {23831, 14895, 11916, 0, 0}, {5053, 766, 153, 0, 0}, + {17695, 3277, 1092, 0, 0}, {21504, 8192, 4096, 0, 0}, + {30427, 14043, 9362, 0, 0}, {25486, 14564, 7282, 0, 0}, + {4221, 555, 111, 0, 0}, {11980, 2995, 529, 0, 0}, + {25988, 11299, 2260, 0, 0}, {26810, 17873, 8937, 0, 0}, + {16384, 10923, 5461, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{26776, 18464, 13003, 0, 0}, {10156, 1530, 312, 0, 0}, + {19312, 5606, 1681, 0, 0}, {24767, 12706, 6264, 0, 0}, + {27600, 18663, 12004, 0, 0}, {30136, 24997, 20383, 0, 0}, + {5734, 424, 59, 0, 0}, {16918, 3353, 771, 0, 0}, + {23274, 9992, 3927, 0, 0}, {26617, 15938, 8799, 0, 0}, + {29307, 22729, 17046, 0, 0}, {2634, 199, 37, 0, 0}, + {17130, 3346, 823, 0, 0}, {23618, 10903, 4550, 0, 0}, + {27121, 17049, 10092, 0, 0}, {29366, 22996, 17291, 0, 0}, + {4238, 182, 33, 0, 0}, {15629, 2470, 476, 0, 0}, + {22568, 8729, 3083, 0, 0}, {26349, 15094, 7982, 0, 0}, + {29224, 22543, 16944, 0, 0}, {1435, 42, 20, 0, 0}, + {12150, 1281, 224, 0, 0}, {19867, 5551, 1536, 0, 0}, + {24144, 11034, 4597, 0, 0}, {27664, 18577, 12020, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{21562, 11678, 6207, 0, 0}, {4009, 489, 97, 0, 0}, + {18597, 4816, 1199, 0, 0}, {23025, 9861, 3627, 0, 0}, + {25897, 14882, 7900, 0, 0}, {27808, 19616, 13453, 0, 0}, + {1691, 107, 20, 0, 0}, {13368, 1573, 253, 0, 0}, + {20016, 5910, 1728, 0, 0}, {24398, 10670, 4177, 0, 0}, + {27311, 17395, 10470, 0, 0}, {1071, 62, 20, 0, 0}, + {14908, 2111, 
435, 0, 0}, {20258, 7956, 3507, 0, 0}, + {26588, 13644, 8046, 0, 0}, {27727, 19220, 14809, 0, 0}, + {1216, 52, 20, 0, 0}, {10860, 999, 145, 0, 0}, + {18298, 4567, 1203, 0, 0}, {23275, 9786, 4160, 0, 0}, + {25910, 15528, 8631, 0, 0}, {225, 16, 12, 0, 0}, + {8482, 671, 102, 0, 0}, {16810, 3551, 744, 0, 0}, + {22561, 8534, 2810, 0, 0}, {25839, 14463, 7116, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}, + {{{28631, 21921, 17086, 0, 0}, {14944, 5767, 2710, 0, 0}, + {22564, 9972, 4477, 0, 0}, {26692, 16833, 10643, 0, 0}, + {28916, 21831, 15952, 0, 0}, {30516, 26444, 22637, 0, 0}, + {6928, 752, 106, 0, 0}, {17659, 4500, 1237, 0, 0}, + {23383, 10537, 4428, 0, 0}, {26686, 16096, 9289, 0, 0}, + {29450, 23341, 18087, 0, 0}, {2174, 194, 50, 0, 0}, + {15932, 3216, 909, 0, 0}, {23212, 10226, 4412, 0, 0}, + {26463, 16043, 9228, 0, 0}, {29392, 22873, 17584, 0, 0}, + {3385, 151, 23, 0, 0}, {13877, 1959, 367, 0, 0}, + {21080, 6826, 2081, 0, 0}, {25300, 13299, 6117, 0, 0}, + {28859, 21410, 15756, 0, 0}, {1204, 32, 20, 0, 0}, + {11862, 1157, 168, 0, 0}, {19577, 5147, 1231, 0, 0}, + {24000, 10739, 4092, 0, 0}, {27689, 18659, 11862, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf + [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes] + [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = { + {{{{18470, 12050, 8594, 0, 0}, {20232, 13167, 8979, 0, 0}, + {24056, 17717, 13265, 0, 0}, 
{26598, 21441, 17334, 0, 0}, + {28026, 23842, 20230, 0, 0}, {28965, 25451, 22222, 0, 0}, + {31072, 29451, 27897, 0, 0}, {18376, 12817, 10012, 0, 0}, + {16790, 9550, 5950, 0, 0}, {20581, 13294, 8879, 0, 0}, + {23592, 17128, 12509, 0, 0}, {25700, 20113, 15740, 0, 0}, + {27112, 22326, 18296, 0, 0}, {30188, 27776, 25524, 0, 0}, + {20632, 14719, 11342, 0, 0}, {18984, 12047, 8287, 0, 0}, + {21932, 15147, 10868, 0, 0}, {24396, 18324, 13921, 0, 0}, + {26245, 20989, 16768, 0, 0}, {27431, 22870, 19008, 0, 0}, + {29734, 26908, 24306, 0, 0}}, + {{16801, 9863, 6482, 0, 0}, {19234, 12114, 8189, 0, 0}, + {23264, 16676, 12233, 0, 0}, {25793, 20200, 15865, 0, 0}, + {27404, 22677, 18748, 0, 0}, {28411, 24398, 20911, 0, 0}, + {30262, 27834, 25550, 0, 0}, {9736, 3953, 1832, 0, 0}, + {13228, 6064, 3049, 0, 0}, {17610, 9799, 5671, 0, 0}, + {21360, 13903, 9118, 0, 0}, {23883, 17320, 12518, 0, 0}, + {25660, 19915, 15352, 0, 0}, {28537, 24727, 21288, 0, 0}, + {12945, 6278, 3612, 0, 0}, {13878, 6839, 3836, 0, 0}, + {17108, 9277, 5335, 0, 0}, {20621, 12992, 8280, 0, 0}, + {23040, 15994, 11119, 0, 0}, {24849, 18491, 13702, 0, 0}, + {27328, 22598, 18583, 0, 0}}}, + {{{18362, 11906, 8354, 0, 0}, {20944, 13861, 9659, 0, 0}, + {24511, 18375, 13965, 0, 0}, {26908, 22021, 17990, 0, 0}, + {28293, 24282, 20784, 0, 0}, {29162, 25814, 22725, 0, 0}, + {31032, 29358, 27720, 0, 0}, {18338, 12722, 9886, 0, 0}, + {17175, 9869, 6059, 0, 0}, {20666, 13400, 8957, 0, 0}, + {23709, 17184, 12506, 0, 0}, {25769, 20165, 15720, 0, 0}, + {27084, 22271, 18215, 0, 0}, {29946, 27330, 24906, 0, 0}, + {16983, 11183, 8409, 0, 0}, {14421, 7539, 4502, 0, 0}, + {17794, 10281, 6379, 0, 0}, {21345, 14087, 9497, 0, 0}, + {23905, 17418, 12760, 0, 0}, {25615, 19916, 15490, 0, 0}, + {29061, 25732, 22786, 0, 0}}, + {{17308, 11072, 7299, 0, 0}, {20598, 13519, 9577, 0, 0}, + {24045, 17741, 13436, 0, 0}, {26340, 21064, 16894, 0, 0}, + {27846, 23476, 19716, 0, 0}, {28629, 25073, 21758, 0, 0}, + {30477, 28260, 26170, 0, 0}, {12912, 5848, 2940, 0, 0}, + {14845, 7479, 3976, 0, 0}, {18490, 10800, 6471, 0, 0}, + {21858, 14632, 9818, 0, 0}, {24345, 17953, 13141, 0, 0}, + {25997, 20485, 15994, 0, 0}, {28694, 25018, 21687, 0, 0}, + {12916, 6694, 4096, 0, 0}, {13397, 6658, 3779, 0, 0}, + {16503, 8895, 5105, 0, 0}, {20010, 12390, 7816, 0, 0}, + {22673, 15670, 10807, 0, 0}, {24518, 18140, 13317, 0, 0}, + {27563, 23023, 19146, 0, 0}}}, + {{{22205, 16535, 13005, 0, 0}, {22974, 16746, 12964, 0, 0}, + {26018, 20823, 17009, 0, 0}, {27805, 23582, 20016, 0, 0}, + {28923, 25333, 22141, 0, 0}, {29717, 26683, 23934, 0, 0}, + {31457, 30172, 28938, 0, 0}, {21522, 16364, 13079, 0, 0}, + {20453, 13857, 10037, 0, 0}, {22211, 15673, 11479, 0, 0}, + {24632, 18762, 14519, 0, 0}, {26420, 21294, 17203, 0, 0}, + {27572, 23113, 19368, 0, 0}, {30419, 28242, 26181, 0, 0}, + {19431, 14038, 11199, 0, 0}, {13462, 6697, 3886, 0, 0}, + {16816, 9228, 5514, 0, 0}, {20359, 12834, 8338, 0, 0}, + {23008, 16062, 11379, 0, 0}, {24764, 18548, 13950, 0, 0}, + {28630, 24974, 21807, 0, 0}}, + {{21898, 16084, 11819, 0, 0}, {23104, 17538, 14088, 0, 0}, + {25882, 20659, 17360, 0, 0}, {27943, 23868, 20463, 0, 0}, + {29138, 25606, 22454, 0, 0}, {29732, 26339, 23381, 0, 0}, + {31097, 29472, 27828, 0, 0}, {18949, 13609, 9742, 0, 0}, + {20784, 13660, 9648, 0, 0}, {22078, 15558, 11105, 0, 0}, + {24784, 18614, 14435, 0, 0}, {25900, 20474, 16644, 0, 0}, + {27494, 23774, 19900, 0, 0}, {29780, 26997, 24344, 0, 0}, + {13032, 6121, 3627, 0, 0}, {13835, 6698, 3784, 0, 0}, + {16989, 9720, 5568, 0, 0}, {20130, 12707, 8236, 0, 
0}, + {22076, 15223, 10548, 0, 0}, {23551, 17517, 12714, 0, 0}, + {27690, 23484, 20174, 0, 0}}}, + {{{30437, 29106, 27524, 0, 0}, {29877, 27997, 26623, 0, 0}, + {28170, 25145, 23039, 0, 0}, {29248, 25923, 23569, 0, 0}, + {29351, 26649, 23444, 0, 0}, {30167, 27356, 25383, 0, 0}, + {32168, 31595, 31024, 0, 0}, {25096, 19482, 15299, 0, 0}, + {28536, 24976, 21975, 0, 0}, {29853, 27451, 25371, 0, 0}, + {30450, 28412, 26616, 0, 0}, {30641, 28768, 27214, 0, 0}, + {30918, 29290, 27493, 0, 0}, {31791, 30835, 29925, 0, 0}, + {14488, 8381, 4779, 0, 0}, {16916, 10097, 6583, 0, 0}, + {18923, 11817, 7979, 0, 0}, {21713, 14802, 10639, 0, 0}, + {23630, 17346, 12967, 0, 0}, {25314, 19623, 15312, 0, 0}, + {29398, 26375, 23755, 0, 0}}, + {{26926, 23539, 21930, 0, 0}, {30455, 29277, 28492, 0, 0}, + {29770, 26664, 25272, 0, 0}, {30348, 25321, 22900, 0, 0}, + {29734, 24273, 21845, 0, 0}, {28692, 23831, 21793, 0, 0}, + {31682, 30398, 29469, 0, 0}, {23054, 15514, 12324, 0, 0}, + {24225, 19070, 15645, 0, 0}, {27850, 23761, 20858, 0, 0}, + {28639, 25236, 22215, 0, 0}, {30404, 27235, 24710, 0, 0}, + {30934, 29222, 27205, 0, 0}, {31295, 29860, 28635, 0, 0}, + {17363, 11575, 7149, 0, 0}, {17077, 10816, 6207, 0, 0}, + {19806, 13574, 8603, 0, 0}, {22496, 14913, 10639, 0, 0}, + {24180, 17498, 12050, 0, 0}, {24086, 18099, 13268, 0, 0}, + {27898, 23132, 19563, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}, + {{{{17773, 11427, 8019, 0, 0}, {19610, 12479, 8167, 0, 0}, + {23827, 17442, 12892, 0, 0}, {26471, 21227, 16961, 0, 0}, + {27951, 23739, 19992, 0, 0}, {29037, 25495, 22141, 0, 0}, + {30921, 29151, 27414, 0, 0}, {18296, 13109, 10425, 0, 0}, + {15962, 8606, 5235, 0, 0}, {19868, 12364, 8055, 0, 0}, + {23357, 16656, 11971, 0, 0}, {25712, 20071, 15620, 0, 0}, + {27224, 22429, 18308, 0, 0}, {29814, 27064, 24449, 0, 0}, + {20304, 14697, 11414, 0, 0}, {17286, 10240, 6734, 0, 0}, + {20698, 13499, 9144, 0, 0}, {23815, 17362, 12662, 0, 0}, + {25741, 20038, 15548, 0, 0}, {26881, 21855, 17628, 0, 0}, + {28975, 25490, 22321, 0, 0}}, + {{17197, 10536, 7019, 0, 0}, {18262, 11193, 7394, 0, 0}, + {22579, 15679, 11199, 0, 0}, {25452, 19467, 14853, 0, 0}, + {26985, 21856, 17578, 0, 0}, {28008, 23613, 19680, 0, 0}, + {29775, 26802, 23994, 0, 0}, {9344, 3865, 1990, 0, 0}, + {11993, 5102, 2478, 0, 0}, {16294, 8358, 4469, 0, 0}, + {20297, 12588, 7781, 0, 0}, {23358, 16281, 
11329, 0, 0}, + {25232, 19154, 14239, 0, 0}, {27720, 23182, 19219, 0, 0}, + {11678, 5478, 3012, 0, 0}, {11972, 5366, 2742, 0, 0}, + {14949, 7283, 3799, 0, 0}, {18908, 10859, 6306, 0, 0}, + {21766, 14274, 9239, 0, 0}, {23815, 16839, 11871, 0, 0}, + {26320, 20850, 16314, 0, 0}}}, + {{{16769, 10560, 7319, 0, 0}, {19718, 12780, 8646, 0, 0}, + {24174, 17904, 13390, 0, 0}, {26735, 21689, 17530, 0, 0}, + {28214, 24085, 20421, 0, 0}, {29096, 25629, 22431, 0, 0}, + {30868, 28997, 27192, 0, 0}, {16980, 11428, 8819, 0, 0}, + {15943, 8533, 5010, 0, 0}, {19895, 12366, 7958, 0, 0}, + {23178, 16405, 11674, 0, 0}, {25416, 19559, 15035, 0, 0}, + {26808, 21779, 17584, 0, 0}, {29536, 26534, 23761, 0, 0}, + {17007, 12052, 9544, 0, 0}, {13450, 6779, 4009, 0, 0}, + {17239, 9674, 5839, 0, 0}, {21106, 13779, 9127, 0, 0}, + {23813, 17200, 12402, 0, 0}, {25487, 19662, 15060, 0, 0}, + {28520, 24709, 21328, 0, 0}}, + {{17869, 11551, 8265, 0, 0}, {19249, 12485, 8721, 0, 0}, + {23339, 16802, 12403, 0, 0}, {26068, 20413, 16116, 0, 0}, + {27680, 23064, 19052, 0, 0}, {28525, 24614, 21037, 0, 0}, + {30066, 27404, 24907, 0, 0}, {10023, 4380, 2314, 0, 0}, + {12533, 5622, 2846, 0, 0}, {16872, 9053, 5131, 0, 0}, + {20928, 13418, 8637, 0, 0}, {23646, 16836, 11888, 0, 0}, + {25280, 19187, 14406, 0, 0}, {27654, 23200, 19398, 0, 0}, + {11923, 6215, 3836, 0, 0}, {11787, 5396, 2884, 0, 0}, + {14987, 7433, 3983, 0, 0}, {19008, 11060, 6471, 0, 0}, + {21793, 14353, 9403, 0, 0}, {23723, 16979, 12082, 0, 0}, + {26638, 21569, 17345, 0, 0}}}, + {{{19219, 13044, 9610, 0, 0}, {20924, 14386, 10522, 0, 0}, + {24849, 19149, 14995, 0, 0}, {27282, 22625, 18822, 0, 0}, + {28602, 24785, 21444, 0, 0}, {29404, 26262, 23341, 0, 0}, + {31170, 29608, 28094, 0, 0}, {17487, 11789, 8987, 0, 0}, + {17829, 10649, 6816, 0, 0}, {21405, 14361, 9956, 0, 0}, + {24159, 17911, 13398, 0, 0}, {26031, 20584, 16288, 0, 0}, + {27262, 22505, 18506, 0, 0}, {29778, 26982, 24388, 0, 0}, + {12519, 7515, 5351, 0, 0}, {11698, 5250, 2767, 0, 0}, + {15914, 8299, 4694, 0, 0}, {19904, 12282, 7768, 0, 0}, + {22806, 15790, 10990, 0, 0}, {24694, 18430, 13720, 0, 0}, + {28274, 24289, 20862, 0, 0}}, + {{18808, 13151, 9939, 0, 0}, {21618, 15427, 11540, 0, 0}, + {25618, 19804, 15578, 0, 0}, {27437, 22766, 18901, 0, 0}, + {28601, 25024, 21711, 0, 0}, {29288, 26139, 23122, 0, 0}, + {30885, 28984, 27082, 0, 0}, {14016, 7108, 3856, 0, 0}, + {15800, 8182, 4738, 0, 0}, {19248, 11713, 7455, 0, 0}, + {22315, 15142, 10488, 0, 0}, {24382, 18263, 13652, 0, 0}, + {26026, 20173, 15760, 0, 0}, {28495, 24628, 21269, 0, 0}, + {10648, 4941, 2535, 0, 0}, {12205, 5410, 2873, 0, 0}, + {15692, 8124, 4615, 0, 0}, {19406, 11826, 7459, 0, 0}, + {21974, 14803, 10073, 0, 0}, {23754, 17116, 12449, 0, 0}, + {27060, 22256, 18271, 0, 0}}}, + {{{27063, 21838, 17043, 0, 0}, {24822, 20003, 16653, 0, 0}, + {25967, 20645, 16542, 0, 0}, {27306, 22633, 18568, 0, 0}, + {28579, 24757, 21261, 0, 0}, {29577, 26539, 23360, 0, 0}, + {31711, 30631, 29556, 0, 0}, {22750, 15701, 11277, 0, 0}, + {25388, 20186, 16315, 0, 0}, {26700, 21923, 18429, 0, 0}, + {27670, 23570, 20213, 0, 0}, {28456, 24758, 21649, 0, 0}, + {29068, 25802, 22987, 0, 0}, {31075, 29442, 27881, 0, 0}, + {14011, 7838, 4994, 0, 0}, {15120, 8172, 4951, 0, 0}, + {18061, 10716, 6742, 0, 0}, {21048, 13916, 9476, 0, 0}, + {23411, 16816, 12243, 0, 0}, {24958, 19015, 14558, 0, 0}, + {28889, 25435, 22440, 0, 0}}, + {{24490, 19526, 16846, 0, 0}, {22221, 16901, 13849, 0, 0}, + {23662, 16926, 12159, 0, 0}, {25935, 19761, 15550, 0, 0}, + {27957, 23056, 18845, 0, 0}, {28783, 
25416, 21640, 0, 0}, + {31080, 29310, 27506, 0, 0}, {19817, 10907, 6258, 0, 0}, + {22980, 16724, 12492, 0, 0}, {26459, 21524, 17898, 0, 0}, + {27585, 23419, 20202, 0, 0}, {28379, 24539, 21276, 0, 0}, + {29135, 25823, 22148, 0, 0}, {29168, 25921, 22861, 0, 0}, + {11020, 4631, 2513, 0, 0}, {13332, 6187, 3208, 0, 0}, + {16409, 8567, 4815, 0, 0}, {18807, 11075, 6897, 0, 0}, + {21224, 14082, 9446, 0, 0}, {23396, 16306, 11816, 0, 0}, + {26630, 21558, 17378, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}, + {{{{16630, 10545, 7259, 0, 0}, {17421, 10338, 6436, 0, 0}, + {23154, 16032, 11436, 0, 0}, {26168, 20493, 15861, 0, 0}, + {27957, 23344, 19221, 0, 0}, {29020, 24959, 21348, 0, 0}, + {30514, 28181, 25878, 0, 0}, {17572, 12484, 9591, 0, 0}, + {14451, 7299, 4317, 0, 0}, {18850, 11117, 6926, 0, 0}, + {22716, 15618, 10773, 0, 0}, {25269, 19138, 14181, 0, 0}, + {26610, 21351, 16765, 0, 0}, {28754, 24983, 21516, 0, 0}, + {17720, 11701, 8384, 0, 0}, {14566, 7422, 4215, 0, 0}, + {18466, 10749, 6412, 0, 0}, {21929, 14629, 9602, 0, 0}, + {24053, 17024, 11962, 0, 0}, {25232, 19192, 14224, 0, 0}, + {27355, 22433, 18270, 0, 0}}, + {{15374, 8267, 4873, 0, 0}, {16879, 9348, 5583, 0, 0}, + {21207, 13635, 8898, 0, 0}, {24483, 17956, 12924, 0, 0}, + {26272, 20725, 16218, 0, 0}, {27997, 23194, 19091, 0, 0}, + {29165, 25938, 22624, 0, 0}, {11112, 5064, 2568, 0, 0}, + {11444, 4853, 2257, 0, 0}, {15441, 7432, 3771, 0, 0}, + {19351, 11387, 6735, 0, 0}, {22636, 15343, 10430, 0, 0}, + {24188, 17752, 13135, 0, 0}, {27074, 21291, 16357, 0, 0}, + {8652, 2988, 1318, 0, 0}, {8915, 3073, 1177, 0, 0}, + {12683, 5154, 2340, 0, 0}, {17442, 8433, 4193, 0, 0}, + {20954, 13296, 7958, 0, 0}, {22547, 14157, 8001, 0, 0}, + {25079, 18210, 12447, 0, 0}}}, + {{{16554, 10388, 6998, 0, 0}, {18555, 11464, 7473, 0, 0}, + {23555, 16945, 12313, 0, 0}, {26373, 21010, 16629, 0, 0}, + {27989, 23581, 19702, 0, 0}, {28947, 25267, 21815, 0, 0}, + {30475, 28201, 25973, 0, 0}, {16909, 11485, 8948, 0, 0}, + {14364, 7166, 4042, 0, 0}, {18443, 10788, 6562, 0, 0}, + {22099, 14831, 10048, 0, 0}, {24471, 18126, 13321, 0, 0}, + {26022, 20379, 15875, 0, 0}, {28444, 24517, 20998, 0, 0}, + {16236, 11137, 8293, 0, 0}, {12101, 5618, 3100, 0, 0}, + {16040, 8258, 4593, 0, 0}, {19907, 12123, 7436, 0, 0}, + {22692, 15407, 10351, 0, 0}, {24373, 17828, 12805, 0, 0}, + {27037, 22085, 17856, 0, 0}}, + 
{{18335, 11613, 7830, 0, 0}, {18110, 11052, 7223, 0, 0}, + {22845, 15944, 11211, 0, 0}, {25786, 19716, 15047, 0, 0}, + {27349, 22265, 17718, 0, 0}, {27916, 23606, 19754, 0, 0}, + {29497, 26373, 23138, 0, 0}, {10558, 4935, 2659, 0, 0}, + {12018, 5400, 2947, 0, 0}, {15874, 7940, 4195, 0, 0}, + {19521, 11492, 7011, 0, 0}, {22730, 15503, 10205, 0, 0}, + {24181, 17821, 12441, 0, 0}, {27123, 21397, 17516, 0, 0}, + {10741, 5242, 3054, 0, 0}, {9670, 3622, 1547, 0, 0}, + {12882, 5427, 2496, 0, 0}, {17159, 9021, 4722, 0, 0}, + {20775, 12703, 7829, 0, 0}, {23131, 14501, 9097, 0, 0}, + {25143, 18967, 13624, 0, 0}}}, + {{{18330, 11970, 8679, 0, 0}, {20147, 13565, 9671, 0, 0}, + {24591, 18643, 14366, 0, 0}, {27094, 22267, 18312, 0, 0}, + {28532, 24529, 21035, 0, 0}, {29321, 26018, 22962, 0, 0}, + {30782, 28818, 26904, 0, 0}, {16560, 10669, 7838, 0, 0}, + {16231, 8743, 5183, 0, 0}, {19988, 12387, 7901, 0, 0}, + {23001, 16156, 11352, 0, 0}, {25082, 19030, 14370, 0, 0}, + {26435, 21154, 16804, 0, 0}, {28827, 25197, 21932, 0, 0}, + {9949, 5346, 3566, 0, 0}, {10544, 4254, 2047, 0, 0}, + {15108, 7335, 3855, 0, 0}, {19194, 11286, 6766, 0, 0}, + {22139, 14791, 9830, 0, 0}, {24156, 17470, 12503, 0, 0}, + {27161, 22277, 18172, 0, 0}}, + {{19199, 12968, 9562, 0, 0}, {19640, 12844, 8899, 0, 0}, + {24439, 17927, 13365, 0, 0}, {26638, 21792, 17711, 0, 0}, + {28086, 23929, 20250, 0, 0}, {29112, 25359, 22180, 0, 0}, + {30191, 27669, 25356, 0, 0}, {10341, 4084, 2183, 0, 0}, + {11855, 5018, 2629, 0, 0}, {16928, 8659, 4934, 0, 0}, + {20460, 12739, 8199, 0, 0}, {22552, 15983, 11310, 0, 0}, + {24459, 18565, 13655, 0, 0}, {26725, 21600, 17461, 0, 0}, + {9602, 3867, 1770, 0, 0}, {10869, 4363, 2017, 0, 0}, + {14355, 6677, 3325, 0, 0}, {17535, 9654, 5416, 0, 0}, + {20085, 12296, 7480, 0, 0}, {22066, 14509, 9359, 0, 0}, + {24643, 18304, 13542, 0, 0}}}, + {{{23728, 17982, 14408, 0, 0}, {22789, 17050, 13353, 0, 0}, + {24855, 18850, 14457, 0, 0}, {26909, 21879, 17584, 0, 0}, + {28175, 24091, 20258, 0, 0}, {28948, 25372, 21977, 0, 0}, + {31038, 29297, 27576, 0, 0}, {20965, 14403, 10059, 0, 0}, + {21349, 14710, 10543, 0, 0}, {23350, 16994, 12525, 0, 0}, + {25229, 19443, 15111, 0, 0}, {26535, 21451, 17384, 0, 0}, + {27631, 23112, 19223, 0, 0}, {29791, 26994, 24419, 0, 0}, + {11561, 5522, 3128, 0, 0}, {13221, 6190, 3271, 0, 0}, + {16599, 8897, 5078, 0, 0}, {19948, 12310, 7750, 0, 0}, + {22544, 15436, 10554, 0, 0}, {24242, 17720, 12884, 0, 0}, + {27731, 23358, 19650, 0, 0}}, + {{20429, 15439, 12628, 0, 0}, {19263, 12873, 9543, 0, 0}, + {22921, 15824, 11204, 0, 0}, {25488, 19512, 14420, 0, 0}, + {28056, 22759, 18314, 0, 0}, {28407, 24854, 20291, 0, 0}, + {29898, 27140, 24773, 0, 0}, {12707, 7264, 4242, 0, 0}, + {17533, 9890, 6623, 0, 0}, {19783, 12810, 8613, 0, 0}, + {22986, 16127, 11365, 0, 0}, {23312, 16408, 12008, 0, 0}, + {25913, 19828, 14211, 0, 0}, {27107, 22204, 17766, 0, 0}, + {7112, 2166, 874, 0, 0}, {10198, 3661, 1676, 0, 0}, + {13851, 6345, 3227, 0, 0}, {16828, 9119, 5014, 0, 0}, + {19965, 12187, 7549, 0, 0}, {21686, 14073, 9392, 0, 0}, + {24829, 18395, 13763, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, 
+ {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}, + {{{{14453, 8479, 5217, 0, 0}, {15914, 8700, 4933, 0, 0}, + {22628, 14841, 9595, 0, 0}, {26046, 19786, 14501, 0, 0}, + {28107, 22942, 18062, 0, 0}, {28936, 24603, 20474, 0, 0}, + {29973, 26670, 23523, 0, 0}, {15623, 9442, 6096, 0, 0}, + {12035, 5088, 2460, 0, 0}, {16736, 8307, 4222, 0, 0}, + {21115, 12675, 7687, 0, 0}, {23478, 16339, 10682, 0, 0}, + {24972, 18170, 12786, 0, 0}, {26266, 20390, 15327, 0, 0}, + {11087, 5036, 2448, 0, 0}, {10379, 3724, 1507, 0, 0}, + {13741, 6037, 2681, 0, 0}, {18029, 9013, 4144, 0, 0}, + {21410, 11990, 7257, 0, 0}, {21773, 14695, 8578, 0, 0}, + {23606, 17778, 12151, 0, 0}}, + {{11343, 4816, 2380, 0, 0}, {14706, 6930, 3734, 0, 0}, + {20812, 12887, 7960, 0, 0}, {25050, 17768, 11788, 0, 0}, + {27066, 21514, 16625, 0, 0}, {27870, 23680, 15904, 0, 0}, + {29089, 25992, 20861, 0, 0}, {9474, 2608, 1105, 0, 0}, + {8371, 2872, 932, 0, 0}, {13523, 5640, 2175, 0, 0}, + {19566, 12943, 6364, 0, 0}, {21190, 13471, 8811, 0, 0}, + {24695, 19471, 11398, 0, 0}, {27307, 21845, 13023, 0, 0}, + {5401, 2247, 834, 0, 0}, {7864, 2097, 828, 0, 0}, + {9693, 4308, 1469, 0, 0}, {18368, 9110, 2351, 0, 0}, + {18883, 8886, 4443, 0, 0}, {18022, 9830, 4915, 0, 0}, + {27307, 16384, 5461, 0, 0}}}, + {{{14494, 7955, 4878, 0, 0}, {17231, 9619, 5765, 0, 0}, + {23319, 16028, 10941, 0, 0}, {26068, 20270, 15507, 0, 0}, + {27780, 22902, 18570, 0, 0}, {28532, 24621, 20866, 0, 0}, + {29901, 26908, 24114, 0, 0}, {15644, 9597, 6667, 0, 0}, + {12372, 5291, 2620, 0, 0}, {16195, 8139, 4276, 0, 0}, + {20019, 11922, 7094, 0, 0}, {22535, 14890, 9950, 0, 0}, + {24243, 17436, 12405, 0, 0}, {26485, 21136, 16513, 0, 0}, + {12302, 6257, 3482, 0, 0}, {9709, 3594, 1577, 0, 0}, + {13287, 5505, 2527, 0, 0}, {17310, 9137, 4631, 0, 0}, + {20352, 12160, 7075, 0, 0}, {22507, 14757, 9507, 0, 0}, + {24752, 18113, 13102, 0, 0}}, + {{15152, 8182, 4656, 0, 0}, {16959, 9469, 5613, 0, 0}, + {22001, 13878, 8975, 0, 0}, {25041, 18513, 13903, 0, 0}, + {26639, 20842, 15886, 0, 0}, {28286, 23064, 17907, 0, 0}, + {29491, 25316, 21246, 0, 0}, {9812, 4217, 2038, 0, 0}, + {10044, 3831, 1807, 0, 0}, {14301, 6444, 3188, 0, 0}, + {19534, 12055, 7119, 0, 0}, {21587, 15176, 10287, 0, 0}, + {24477, 14410, 8192, 0, 0}, {25200, 20887, 17784, 0, 0}, + {7820, 3767, 1621, 0, 0}, {7094, 2149, 617, 0, 0}, + {11927, 5975, 3165, 0, 0}, {18099, 8412, 4102, 0, 0}, + {21434, 9175, 4549, 0, 0}, {23846, 18006, 9895, 0, 0}, + {24467, 19224, 12233, 0, 0}}}, + {{{15655, 9035, 5687, 0, 0}, {18629, 11362, 7316, 0, 0}, + {24216, 17766, 12992, 0, 0}, {26897, 21648, 17390, 0, 0}, + {28313, 24152, 20515, 0, 0}, {29299, 25858, 22382, 0, 0}, + {30513, 28215, 25986, 0, 0}, {14544, 8392, 5715, 0, 0}, + {13478, 6058, 3154, 0, 0}, {17832, 9777, 5584, 0, 0}, + {21530, 13817, 9006, 0, 0}, {23982, 17151, 12180, 0, 0}, + {25451, 
19540, 14765, 0, 0}, {27667, 23256, 19275, 0, 0}, + {10129, 4546, 2558, 0, 0}, {9552, 3437, 1461, 0, 0}, + {13693, 6006, 2873, 0, 0}, {17754, 9655, 5311, 0, 0}, + {20830, 12911, 8016, 0, 0}, {22826, 15488, 10486, 0, 0}, + {25601, 19624, 15016, 0, 0}}, + {{16948, 10030, 6280, 0, 0}, {19238, 11883, 7552, 0, 0}, + {24373, 17238, 12316, 0, 0}, {26194, 20447, 16388, 0, 0}, + {27415, 22349, 18200, 0, 0}, {28155, 24322, 20387, 0, 0}, + {29328, 25610, 22865, 0, 0}, {8521, 3717, 1544, 0, 0}, + {10650, 4710, 2399, 0, 0}, {16270, 8000, 4379, 0, 0}, + {19848, 11593, 6631, 0, 0}, {22038, 14149, 7416, 0, 0}, + {22581, 16489, 9977, 0, 0}, {23458, 18137, 10641, 0, 0}, + {7798, 2210, 711, 0, 0}, {7967, 2826, 1070, 0, 0}, + {10336, 4315, 1913, 0, 0}, {13714, 7088, 3188, 0, 0}, + {18376, 9732, 4659, 0, 0}, {20273, 11821, 6118, 0, 0}, + {20326, 12442, 6554, 0, 0}}}, + {{{20606, 13983, 10120, 0, 0}, {20019, 13071, 8962, 0, 0}, + {24188, 17471, 12422, 0, 0}, {26599, 21019, 16225, 0, 0}, + {27932, 23377, 19320, 0, 0}, {28947, 25057, 21155, 0, 0}, + {30540, 28167, 25698, 0, 0}, {16449, 8043, 4488, 0, 0}, + {17070, 9491, 5600, 0, 0}, {20042, 12400, 7721, 0, 0}, + {22856, 15753, 10792, 0, 0}, {24880, 18548, 13589, 0, 0}, + {25991, 20484, 15750, 0, 0}, {28276, 24178, 20516, 0, 0}, + {9519, 3864, 1821, 0, 0}, {11718, 4860, 2256, 0, 0}, + {15328, 7428, 3819, 0, 0}, {18709, 10750, 6227, 0, 0}, + {21480, 13865, 8870, 0, 0}, {23357, 16426, 11340, 0, 0}, + {26490, 21180, 16824, 0, 0}}, + {{18787, 12701, 9542, 0, 0}, {15846, 9188, 5985, 0, 0}, + {21763, 13729, 8281, 0, 0}, {25379, 18550, 12970, 0, 0}, + {27170, 21263, 15562, 0, 0}, {26678, 21555, 17109, 0, 0}, + {28948, 25397, 22649, 0, 0}, {11686, 5843, 3093, 0, 0}, + {11506, 4141, 1640, 0, 0}, {14376, 6314, 2331, 0, 0}, + {17898, 9858, 5672, 0, 0}, {20148, 13284, 7860, 0, 0}, + {23478, 16215, 9966, 0, 0}, {26100, 18480, 12764, 0, 0}, + {5064, 1713, 819, 0, 0}, {8059, 2790, 980, 0, 0}, + {11100, 3504, 1111, 0, 0}, {14473, 5800, 2694, 0, 0}, + {16369, 8346, 3455, 0, 0}, {18421, 9742, 4664, 0, 0}, + {20398, 12962, 8291, 0, 0}}}, + {{{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}, + {{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}, + {24576, 16384, 8192, 0, 0}}}}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes] + [kDcSignContexts][kBooleanFieldCdfSize] = { + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, 
{19840, 0, 0}, + {15488, 0, 0}}}, + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0}, + {15488, 0, 0}}}, + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0}, + {15488, 0, 0}}}, + {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0}, + {15488, 0, 0}}} +}; +/* clang-format on */ +alignas(kMaxAlignment) constexpr uint16_t + kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187, + 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts] + [kBooleanFieldCdfSize] = { + {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}}, + {{856, 0, 0}, {29909, 0, 0}, {31788, 0, 0}}, + {{945, 0, 0}, {29368, 0, 0}, {31987, 0, 0}}, + {{738, 0, 0}, {29207, 0, 0}, {31864, 0, 0}}, + {{459, 0, 0}, {25431, 0, 0}, {31306, 0, 0}}, + {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}}, + {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1] = { + {24816, 19768, 14619, 11290, 7241, 3527, 0, 0}, + {25629, 21347, 16573, 13224, 9102, 4695, 0, 0}, + {24980, 20027, 15443, 12268, 8453, 4238, 0, 0}, + {24497, 18704, 14522, 11204, 7697, 4235, 0, 0}, + {20043, 13588, 10905, 7929, 5233, 2648, 0, 0}, + {23057, 17880, 15845, 11716, 7107, 4893, 0, 0}, + {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = { + {307, 0, 0}, {11280, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts] + [kPaletteSizeSymbolCount + 1] = { + {24055, 12789, 5640, 3159, 1437, 496, 0, 0}, + {26929, 17195, 9187, 5821, 2920, 1068, 0, 0}, + {28342, 21508, 14769, 11285, 6905, 3338, 0, 0}, + {29540, 23304, 17775, 14679, 10245, 5348, 0, 0}, + {29000, 23882, 19677, 14916, 10273, 5561, 0, 0}, + {30304, 24317, 19907, 11136, 7243, 4213, 0, 0}, + {31499, 27333, 22335, 13805, 11068, 6903, 0, + 0}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf + [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts] + [kPaletteColorIndexSymbolCount + 1] = { + {{{4058, 0, 0}, + {16384, 0, 0}, + {22215, 0, 0}, + {5732, 0, 0}, + {1165, 0, 0}}, + {{4891, 2278, 0, 0}, + {21236, 7071, 0, 0}, + {26224, 2534, 0, 0}, + {9750, 4696, 0, 0}, + {853, 383, 0, 0}}, + {{7196, 4722, 2723, 0, 0}, + {23290, 11178, 5512, 0, 0}, + {25520, 5931, 2944, 0, 0}, + {13601, 8282, 4419, 0, 0}, + {1368, 943, 518, 0, 0}}, + {{7989, 5813, 4192, 2486, 0, 0}, + {24099, 12404, 8695, 4675, 0, 0}, + {28513, 5203, 3391, 1701, 0, 0}, + {12904, 9094, 6052, 3238, 0, 0}, + {1122, 875, 621, 342, 0, 0}}, + {{9636, 7361, 5798, 4333, 2695, 0, 0}, + {25325, 15526, 12051, 8006, 4786, 0, 0}, + {26468, 7906, 5824, 3984, 2097, 0, 0}, + {13852, 9873, 7501, 5333, 3116, 0, 0}, + {1498, 1218, 960, 709, 415, 0, 0}}, + {{9663, 7569, 6304, 5084, 3837, 2450, 0, 0}, + {25818, 17321, 13816, 10087, 7201, 4205, 0, 0}, + {25208, 9294, 7278, 5565, 3847, 2060, 0, 0}, + {14224, 10395, 8311, 6573, 4649, 2723, 0, 0}, + {1570, 1317, 1098, 886, 645, 377, 0, 0}}, + {{11079, 8885, 7605, 6416, 5262, 3941, 2573, 0, 0}, + {25876, 17383, 14928, 11162, 8481, 6015, 3564, 0, 
0}, + {27117, 9586, 7726, 6250, 4786, 3376, 1868, 0, 0}, + {13419, 10190, 8350, 6774, 5244, 3737, 2320, 0, 0}, + {1740, 1498, 1264, 1063, 841, 615, 376, 0, 0}}}, + {{{3679, 0, 0}, + {16384, 0, 0}, + {24055, 0, 0}, + {3511, 0, 0}, + {1158, 0, 0}}, + {{7511, 3623, 0, 0}, + {20481, 5475, 0, 0}, + {25735, 4808, 0, 0}, + {12623, 7363, 0, 0}, + {2160, 1129, 0, 0}}, + {{8558, 5593, 2865, 0, 0}, + {22880, 10382, 5554, 0, 0}, + {26867, 6715, 3475, 0, 0}, + {14450, 10616, 4435, 0, 0}, + {2309, 1632, 842, 0, 0}}, + {{9788, 7289, 4987, 2782, 0, 0}, + {24355, 11360, 7909, 3894, 0, 0}, + {30511, 3319, 2174, 1170, 0, 0}, + {13579, 11566, 6853, 4148, 0, 0}, + {924, 724, 487, 250, 0, 0}}, + {{10551, 8201, 6131, 4085, 2220, 0, 0}, + {25461, 16362, 13132, 8136, 4344, 0, 0}, + {28327, 7704, 5889, 3826, 1849, 0, 0}, + {15558, 12240, 9449, 6018, 3186, 0, 0}, + {2094, 1815, 1372, 1033, 561, 0, 0}}, + {{11529, 9600, 7724, 5806, 4063, 2262, 0, 0}, + {26223, 17756, 14764, 10951, 7265, 4067, 0, 0}, + {29320, 6473, 5331, 4064, 2642, 1326, 0, 0}, + {16879, 14445, 11064, 8070, 5792, 3078, 0, 0}, + {1780, 1564, 1289, 1034, 785, 443, 0, 0}}, + {{11326, 9480, 8010, 6522, 5119, 3788, 2205, 0, 0}, + {26905, 17835, 15216, 12100, 9085, 6357, 3495, 0, 0}, + {29353, 6958, 5891, 4778, 3545, 2374, 1150, 0, 0}, + {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0}, + {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = { + {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts] + [kBooleanFieldCdfSize] = {{5940, 0, 0}, + {8733, 0, 0}, + {20737, 0, 0}, + {22128, 0, 0}, + {29867, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts] + [kBooleanFieldCdfSize] = {{31570, 0, 0}, + {30698, 0, 0}, + {23602, 0, 0}, + {25269, 0, 0}, + {10293, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf + [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] = + {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}}, + {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}}, + {{994, 0, 0}, {7648, 0, 0}, {6058, 0, 0}}}, + {{{27822, 0, 0}, {23300, 0, 0}, {31265, 0, 0}}, + {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}}, + {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2] + [kBooleanFieldCdfSize] = { + {{30533, 0, 0}, {31345, 0, 0}}, + {{15586, 0, 0}, {17593, 0, 0}}, + {{2162, 0, 0}, {2279, 0, 0}}}; + +/* clang-format off */ +alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6] + [kBooleanFieldCdfSize] = { + {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0}, + {31324, 0, 0}}, + {{15795, 0, 0}, {16017, 0, 0}, {13121, 0, 0}, {7995, 0, 0}, {21754, 0, 0}, + {17681, 0, 0}}, + {{3024, 0, 0}, {2489, 0, 0}, {1574, 0, 0}, {873, 0, 0}, {5893, 0, 0}, + {2464, 0, 0}}}; +/* clang-format on */ + +alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf + [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = { + {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0}, + {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0}, + {22104, 12547, 11180, 9862, 8473, 7381, 4332, 0, 0}, + {19470, 15784, 12297, 8586, 7701, 7032, 6346, 0, 0}, + {13864, 9443, 7526, 
5336, 4870, 4510, 2010, 0, 0}, + {22043, 15314, 12644, 9948, 8573, 7600, 6722, 0, 0}, + {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0}, + {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = { + {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0}, + {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0}, + {31714, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = { + {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0}, + {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}}; + +// This is called drl_mode in the spec where DRL stands for Dynamic Reference +// List. +alignas(kMaxAlignment) constexpr uint16_t + kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = { + {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = { + {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = { + {30893, 21686, 5436, 0, 0}, + {30295, 22772, 6380, 0, 0}, + {28530, 21231, 6842, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = { + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {12732, 0, 0}, {7811, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {6064, 0, 0}, {5238, 0, 0}, {3204, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {3324, 0, 0}, {5896, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = { + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30330, 28328, 26169, 24105, 21763, 19894, 17017, 14674, 12409, 10406, + 8641, 7066, 5016, 3318, 1597, 0, 0}, + {31962, 29502, 26763, 26030, 25550, 25401, 24997, 18180, 16445, 15401, + 14316, 13346, 9929, 6641, 3139, 0, 0}, + {32614, 31781, 30843, 30717, 30680, 30657, 30617, 9735, 9065, 8484, + 7783, 7084, 5509, 3885, 1857, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {29989, 29030, 28085, 25555, 24993, 24751, 24113, 18411, 14829, 11436, + 8248, 5298, 3312, 2239, 1112, 0, 0}, + {31084, 29143, 27093, 25660, 23466, 21494, 18339, 15624, 13605, 11807, + 9884, 8297, 6049, 4054, 1891, 0, 0}, + {31626, 29277, 26491, 25454, 24679, 24413, 23745, 19144, 17399, 16038, + 14654, 13455, 10247, 6756, 3218, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {31633, 31446, 31275, 30133, 30072, 30031, 29998, 11752, 9833, 7711, + 5517, 3595, 2679, 1808, 835, 0, 0}, + {30026, 28573, 27041, 24733, 23788, 23432, 22622, 18644, 15498, 12235, + 9334, 6796, 4824, 3198, 1352, 0, 0}, + {31041, 28820, 26667, 24972, 22927, 
20424, 17002, 13824, 12130, 10730, + 8805, 7457, 5780, 4002, 1756, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}, + {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288, + 10240, 8192, 6144, 4096, 2048, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = { + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0}, + {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0}, + {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0}, {9945, 0, 0}, + {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0}, + {1208, 0, 0}, {130, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = { + {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, + {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0}, + {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0}, + {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0}, + {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0}, + {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0}, + {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0}, + {261, 210, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts] + [kBooleanFieldCdfSize] = { + {6161, 0, 0}, {9877, 0, 0}, + {13928, 0, 0}, {8174, 0, 0}, + {12834, 0, 0}, {10094, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts] + [kBooleanFieldCdfSize] = { + {14524, 0, 0}, {19903, 0, 0}, + {25715, 0, 0}, {19509, 0, 0}, + {23434, 0, 0}, {28124, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultCompoundTypeCdf[kMaxBlockSizes] + [kNumExplicitCompoundPredictionTypes + 1] = { + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {9337, 0, 0}, {19597, 0, 0}, + {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0}, + {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0}, + {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, + {16384, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf + [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = { + {833, 48, 0, 0}, {27200, 49, 0, 0}, {32346, 29830, 0, 0}, + {4524, 160, 0, 0}, {1562, 815, 0, 0}, {27906, 647, 0, 0}, + {31998, 31616, 0, 0}, {11879, 7131, 0, 0}, {858, 44, 0, 0}, + {28648, 56, 0, 0}, {32463, 30521, 0, 0}, {5365, 132, 0, 0}, + {1746, 759, 0, 0}, {29805, 675, 0, 0}, {32167, 31825, 0, 0}, + {17799, 11370, 0, 0}}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0}; + +alignas(kMaxAlignment) constexpr uint16_t + kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0}; + 
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+        4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+                                                     1] = {
+        {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+        {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+        {10240, 0, 0}, {8192, 0, 0},  {4096, 0, 0},  {2816, 0, 0},
+        {2816, 0, 0},  {2048, 0, 0}};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520,
+                                                         0, 0};
+
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc
new file mode 100644
index 0000000..cd4d576
--- /dev/null
+++ b/src/threading_strategy.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/threading_strategy.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/vector.h"
+
+namespace libgav1 {
+namespace {
+
+#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
+constexpr int kFrameParallelThresholdMultiplier = 3;
+#else
+constexpr int kFrameParallelThresholdMultiplier =
+    LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
+#endif
+
+// Computes the number of frame threads to be used based on the following
+// heuristic:
+//   * If |thread_count| == 1, return 0.
+//   * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
+//     return 0.
+//   * Otherwise, return the largest value of i which satisfies the following
+//     condition: i + i * tile_columns <= thread_count. This ensures that there
+//     are at least |tile_columns| worker threads for each frame thread.
+//   * This function will never return 1 or a value > |thread_count|.
+//
+// This heuristic is based on empirical performance data. The in-frame
+// threading model (combination of tile multithreading, superblock row
+// multithreading and post filter multithreading) performs better than the
+// frame parallel model until we reach the threshold of |thread_count| >
+// |tile_count| * kFrameParallelThresholdMultiplier.
+//
+// It is a function of |tile_count| since tile threading and superblock row
+// multithreading will scale only as a factor of |tile_count|. The threshold
+// multiplier is arrived at based on empirical data.
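+//
+// For example, with |thread_count| = 8, |tile_count| = 2 and
+// |tile_columns| = 2: 8 > 2 * kFrameParallelThresholdMultiplier (with the
+// default multiplier of 3), so ComputeFrameThreadCount() below returns
+// std::max(2, 8 / (1 + 2)) = 2 frame threads.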
+//
+// The general idea is that superblock row multithreading plateaus at
+// 4 * |tile_count| because in most practical cases there aren't more than that
+// many superblock rows and columns available to work on in parallel.
+int ComputeFrameThreadCount(int thread_count, int tile_count,
+                            int tile_columns) {
+  assert(thread_count > 0);
+  if (thread_count == 1) return 0;
+  return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
+             ? 0
+             : std::max(2, thread_count / (1 + tile_columns));
+}
+
+}  // namespace
+
+bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
+                              int thread_count) {
+  assert(thread_count > 0);
+  frame_parallel_ = false;
+
+  if (thread_count == 1) {
+    thread_pool_.reset(nullptr);
+    tile_thread_count_ = 0;
+    max_tile_index_for_row_threads_ = 0;
+    return true;
+  }
+
+  // We do work in the current thread, so it is sufficient to create
+  // |thread_count|-1 threads in the threadpool.
+  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
+
+  if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+    thread_pool_ = ThreadPool::Create("libgav1", thread_count);
+    if (thread_pool_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+                   thread_count);
+      tile_thread_count_ = 0;
+      max_tile_index_for_row_threads_ = 0;
+      return false;
+    }
+  }
+
+  // Prefer tile threads first (but only if there is more than one tile).
+  const int tile_count = frame_header.tile_info.tile_count;
+  if (tile_count > 1) {
+    // We want 1 + tile_thread_count_ <= tile_count because the current thread
+    // is also used to decode tiles. This is equivalent to
+    // tile_thread_count_ <= tile_count - 1.
+    tile_thread_count_ = std::min(thread_count, tile_count - 1);
+    thread_count -= tile_thread_count_;
+    if (thread_count == 0) {
+      max_tile_index_for_row_threads_ = 0;
+      return true;
+    }
+  } else {
+    tile_thread_count_ = 0;
+  }
+
+#if defined(__ANDROID__)
+  // Assign the remaining threads to each Tile. The heuristic used here is that
+  // we will assign two threads for each Tile. So for example, if
+  // |thread_count| is 2, for a stream with 2 tiles the first tile would get
+  // both the threads and the second tile would have row multi-threading
+  // turned off. This heuristic is based on the fact that row multi-threading
+  // is fast enough only when there are at least two threads to do the decoding
+  // (since one thread always does the parsing).
+  //
+  // This heuristic might stop working when SIMD optimizations make the
+  // decoding much faster and the parsing thread is only as fast as the
+  // decoding threads. So we will have to revisit this later to make sure that
+  // this is still optimal.
+  //
+  // Note that while this heuristic significantly improves performance on high
+  // end devices (like the Pixel 3), there are some performance regressions in
+  // some lower end devices (in some cases) and that needs to be revisited as
+  // we bring in more optimizations. Overall, the gains because of this
+  // heuristic seem to be much larger than the regressions.
+  for (int i = 0; i < tile_count; ++i) {
+    max_tile_index_for_row_threads_ = i + 1;
+    thread_count -= 2;
+    if (thread_count <= 0) break;
+  }
+#else   // !defined(__ANDROID__)
+  // Assign the remaining threads to each Tile.
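+  // For example, with 5 remaining threads and 3 tiles, the loop below assigns
+  // 2, 2 and 1 row-multithreading thread(s) to the tiles respectively.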
+  for (int i = 0; i < tile_count; ++i) {
+    const int count = thread_count / tile_count +
+                      static_cast<int>(i < thread_count % tile_count);
+    if (count == 0) {
+      // Once we see a 0 value, all subsequent values will be 0 since it is
+      // supposed to be assigned in a round-robin fashion.
+      break;
+    }
+    max_tile_index_for_row_threads_ = i + 1;
+  }
+#endif  // defined(__ANDROID__)
+  return true;
+}
+
+bool ThreadingStrategy::Reset(int thread_count) {
+  assert(thread_count > 0);
+  frame_parallel_ = true;
+
+  // In frame parallel mode, we simply access the underlying |thread_pool_|
+  // directly. So ensure all the other threadpool getter functions return
+  // nullptr. Also, superblock row multithreading is always disabled in frame
+  // parallel mode.
+  tile_thread_count_ = 0;
+  max_tile_index_for_row_threads_ = 0;
+
+  if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
+    thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
+    if (thread_pool_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
+                   thread_count);
+      return false;
+    }
+  }
+  return true;
+}
+
+bool InitializeThreadPoolsForFrameParallel(
+    int thread_count, int tile_count, int tile_columns,
+    std::unique_ptr<ThreadPool>* const frame_thread_pool,
+    FrameScratchBufferPool* const frame_scratch_buffer_pool) {
+  assert(*frame_thread_pool == nullptr);
+  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
+  const int frame_threads =
+      ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
+  if (frame_threads == 0) return true;
+  *frame_thread_pool = ThreadPool::Create(frame_threads);
+  if (*frame_thread_pool == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
+                 frame_threads);
+    return false;
+  }
+  int remaining_threads = thread_count - frame_threads;
+  if (remaining_threads == 0) return true;
+  int threads_per_frame = remaining_threads / frame_threads;
+  const int extra_threads = remaining_threads % frame_threads;
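+  // For example, with |thread_count| = 10, |tile_count| = 2 and
+  // |tile_columns| = 2, |frame_threads| is 3 and 7 threads remain: the first
+  // frame thread gets 3 tile threads (2 + 1 extra) and the other two get 2
+  // each, so all 10 threads are accounted for.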
+  Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
+  if (!frame_scratch_buffers.reserve(frame_threads)) return false;
+  // Create the tile thread pools.
+  for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
+    std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+        frame_scratch_buffer_pool->Get();
+    if (frame_scratch_buffer == nullptr) {
+      return false;
+    }
+    // If the number of tile threads cannot be divided equally amongst all the
+    // frame threads, assign one extra thread to the first |extra_threads|
+    // frame threads.
+    const int current_frame_thread_count =
+        threads_per_frame + static_cast<int>(i < extra_threads);
+    if (!frame_scratch_buffer->threading_strategy.Reset(
+            current_frame_thread_count)) {
+      return false;
+    }
+    remaining_threads -= current_frame_thread_count;
+    frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
+  }
+  // We release the frame scratch buffers in reverse order so that the extra
+  // threads are allocated to buffers at the top of the stack.
+  for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
+       --i) {
+    frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/threading_strategy.h b/src/threading_strategy.h
new file mode 100644
index 0000000..84b3589
--- /dev/null
+++ b/src/threading_strategy.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_THREADING_STRATEGY_H_
+#define LIBGAV1_SRC_THREADING_STRATEGY_H_
+
+#include <memory>
+
+#include "src/obu_parser.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/threadpool.h"
+
+namespace libgav1 {
+
+class FrameScratchBufferPool;
+
+// This class allocates and manages the worker threads among thread pools used
+// for multi-threaded decoding.
+class ThreadingStrategy {
+ public:
+  ThreadingStrategy() = default;
+
+  // Not copyable or movable.
+  ThreadingStrategy(const ThreadingStrategy&) = delete;
+  ThreadingStrategy& operator=(const ThreadingStrategy&) = delete;
+
+  // Creates or re-allocates the thread pools based on the |frame_header| and
+  // |thread_count|. This function is used only in non frame-parallel mode.
+  // This function is idempotent if the |frame_header| and |thread_count| don't
+  // change between calls (it will only create new threads on the first call
+  // and do nothing on the subsequent calls). This function also starts the
+  // worker threads whenever it creates new thread pools.
+  // The following strategy is used to allocate threads:
+  //   * One thread is allocated for decoding each Tile.
+  //   * Any remaining threads are allocated for superblock row multi-threading
+  //     within each of the tiles in a round robin fashion.
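+  // For example, on a non-Android build with |thread_count| == 8 and a frame
+  // with 2 tiles, a pool of 7 worker threads is created: 1 thread is used for
+  // tile decoding (the calling thread decodes the other tile) and the
+  // remaining 6 are allocated 3 to each tile for superblock row
+  // multi-threading.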
+  // Note: During the lifetime of a ThreadingStrategy object, only one of the
+  // Reset() variants will be used.
+  LIBGAV1_MUST_USE_RESULT bool Reset(const ObuFrameHeader& frame_header,
+                                     int thread_count);
+
+  // Creates or re-allocates a thread pool with |thread_count| threads. This
+  // function is used only in frame parallel mode. This function is idempotent
+  // if the |thread_count| doesn't change between calls (it will only create
+  // new threads on the first call and do nothing on the subsequent calls).
+  // Note: During the lifetime of a ThreadingStrategy object, only one of the
+  // Reset() variants will be used.
+  LIBGAV1_MUST_USE_RESULT bool Reset(int thread_count);
+
+  // Returns a pointer to the ThreadPool that is to be used for Tile
+  // multi-threading.
+  ThreadPool* tile_thread_pool() const {
+    return (tile_thread_count_ != 0) ? thread_pool_.get() : nullptr;
+  }
+
+  int tile_thread_count() const { return tile_thread_count_; }
+
+  // Returns a pointer to the underlying ThreadPool.
+  // Note: Valid only when |frame_parallel_| is true. This is used for
+  // facilitating in-frame multi-threading in that case.
+  ThreadPool* thread_pool() const { return thread_pool_.get(); }
+
+  // Returns a pointer to the ThreadPool that is to be used within the Tile at
+  // index |tile_index| for superblock row multi-threading.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* row_thread_pool(int tile_index) const {
+    return tile_index < max_tile_index_for_row_threads_ ? thread_pool_.get()
+                                                        : nullptr;
+  }
+
+  // Returns a pointer to the ThreadPool that is to be used for post filter
+  // multi-threading.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* post_filter_thread_pool() const {
+    return frame_parallel_ ? nullptr : thread_pool_.get();
+  }
+
+  // Returns a pointer to the ThreadPool that is to be used for film grain
+  // synthesis and blending.
+  // Note: Valid only when |frame_parallel_| is false.
+  ThreadPool* film_grain_thread_pool() const { return thread_pool_.get(); }
+
+ private:
+  std::unique_ptr<ThreadPool> thread_pool_;
+  int tile_thread_count_ = 0;
+  int max_tile_index_for_row_threads_ = 0;
+  bool frame_parallel_ = false;
+};
+
+// Initializes the |frame_thread_pool| and the necessary worker threadpools
+// (the threading_strategy objects in each of the frame scratch buffers in
+// |frame_scratch_buffer_pool|) as follows:
+//  * frame_threads = ComputeFrameThreadCount();
+//  * For more details on how frame_threads is computed, see the function
+//    comment in ComputeFrameThreadCount().
+//  * |frame_thread_pool| is created with |frame_threads| threads.
+//  * divide the remaining number of threads into each frame thread and
+//    initialize a frame_scratch_buffer.threading_strategy for each frame
+//    thread.
+// When this function is called, |frame_scratch_buffer_pool| must be empty. If
+// this function returns true, it means the initialization was successful and
+// one of the following is true:
+//  * |frame_thread_pool| has been successfully initialized and
+//    |frame_scratch_buffer_pool| has been successfully populated with
+//    |frame_threads| buffers to be used by each frame thread. The total
+//    number of threads that this function creates will always be equal to
+//    |thread_count|.
+//  * |frame_thread_pool| is nullptr. |frame_scratch_buffer_pool| is not
+//    modified. This means that frame threading will not be used and the
+//    decoder will continue to operate normally in non frame parallel mode.
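+//
+// A caller might use this function as follows (an illustrative sketch; the
+// error handling shown is hypothetical, not part of this API):
+//   std::unique_ptr<ThreadPool> frame_thread_pool;
+//   if (!InitializeThreadPoolsForFrameParallel(
+//           thread_count, tile_count, tile_columns, &frame_thread_pool,
+//           &frame_scratch_buffer_pool)) {
+//     return false;  // Initialization failed.
+//   }
+//   const bool use_frame_parallel = (frame_thread_pool != nullptr);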
+LIBGAV1_MUST_USE_RESULT bool InitializeThreadPoolsForFrameParallel(
+    int thread_count, int tile_count, int tile_columns,
+    std::unique_ptr<ThreadPool>* frame_thread_pool,
+    FrameScratchBufferPool* frame_scratch_buffer_pool);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_THREADING_STRATEGY_H_
diff --git a/src/tile.h b/src/tile.h
new file mode 100644
index 0000000..73bb5fd
--- /dev/null
+++ b/src/tile.h
@@ -0,0 +1,914 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_TILE_H_
+#define LIBGAV1_SRC_TILE_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/decoder_state.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
+#include "src/post_filter.h"
+#include "src/quantizer.h"
+#include "src/residual_buffer_pool.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile_scratch_buffer.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/memory.h"
+#include "src/utils/parameter_tree.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/threadpool.h"
+#include "src/utils/types.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+
+// Indicates what the ProcessSuperBlock() and TransformBlock() functions should
+// do. "Parse" refers to consuming the bitstream, reading the transform
+// coefficients and performing the dequantization. "Decode" refers to computing
+// the prediction, applying the inverse transforms and adding the residual.
+enum ProcessingMode {
+  kProcessingModeParseOnly,
+  kProcessingModeDecodeOnly,
+  kProcessingModeParseAndDecode,
+};
+
+class Tile : public Allocable {
+ public:
+  static std::unique_ptr<Tile> Create(
+      int tile_number, const uint8_t* const data, size_t size,
+      const ObuSequenceHeader& sequence_header,
+      const ObuFrameHeader& frame_header,
+      RefCountedBuffer* const current_frame, const DecoderState& state,
+      FrameScratchBuffer* const frame_scratch_buffer,
+      const WedgeMaskArray& wedge_masks,
+      const QuantizerMatrix& quantizer_matrix,
+      SymbolDecoderContext* const saved_symbol_decoder_context,
+      const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
+      const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
+      BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+      bool use_intra_prediction_buffer) {
+    std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
+        tile_number, data, size, sequence_header, frame_header, current_frame,
+        state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+        saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+        thread_pool, pending_tiles, frame_parallel,
+        use_intra_prediction_buffer));
+    return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
+  }
+
+  // Move only.
+  Tile(Tile&& tile) noexcept;
+  Tile& operator=(Tile&& tile) noexcept;
+  Tile(const Tile&) = delete;
+  Tile& operator=(const Tile&) = delete;
+
+  struct Block;  // Defined after this class.
+
+  // Parses the entire tile.
+  bool Parse();
+  // Decodes the entire tile. |superblock_row_progress| and
+  // |superblock_row_progress_condvar| are arrays of size equal to the number
+  // of superblock rows in the frame. Increments |superblock_row_progress[i]|
+  // after each superblock row at index |i| is decoded. If the count reaches
+  // the number of tile columns, then it notifies
+  // |superblock_row_progress_condvar[i]|.
+  bool Decode(std::mutex* mutex, int* superblock_row_progress,
+              std::condition_variable* superblock_row_progress_condvar);
+  // Parses and decodes the entire tile.
+  // Depending on the configuration of this Tile, this function may do
+  // multithreaded decoding.
+  bool ParseAndDecode();  // 5.11.2.
+  // Processes all the columns of the superblock row at |row4x4| that are
+  // within this Tile. If |save_symbol_decoder_context| is true, then
+  // SaveSymbolDecoderContext() is invoked for the last superblock row.
+  template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+  bool ProcessSuperBlockRow(int row4x4, TileScratchBuffer* scratch_buffer);
+
+  const ObuSequenceHeader& sequence_header() const { return sequence_header_; }
+  const ObuFrameHeader& frame_header() const { return frame_header_; }
+  const RefCountedBuffer& current_frame() const { return current_frame_; }
+  const TemporalMotionField& motion_field() const { return motion_field_; }
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias()
+      const {
+    return reference_frame_sign_bias_;
+  }
+
+  bool IsRow4x4Inside(int row4x4) const {
+    return row4x4 >= row4x4_start_ && row4x4 < row4x4_end_;
+  }
+
+  // 5.11.51.
+  bool IsInside(int row4x4, int column4x4) const {
+    return IsRow4x4Inside(row4x4) && column4x4 >= column4x4_start_ &&
+           column4x4 < column4x4_end_;
+  }
+
+  bool IsLeftInside(int column4x4) const {
+    // We use "larger than" as the condition. Don't pass in the left column
+    // offset column4x4 - 1.
+    assert(column4x4 <= column4x4_end_);
+    return column4x4 > column4x4_start_;
+  }
+
+  bool IsTopInside(int row4x4) const {
+    // We use "larger than" as the condition. Don't pass in the top row offset
+    // row4x4 - 1.
+    assert(row4x4 <= row4x4_end_);
+    return row4x4 > row4x4_start_;
+  }
+
+  bool IsTopLeftInside(int row4x4, int column4x4) const {
+    // We use "larger than" as the condition. Don't pass in the top row offset
+    // row4x4 - 1 or the left column offset column4x4 - 1.
+    assert(row4x4 <= row4x4_end_);
+    assert(column4x4 <= column4x4_end_);
+    return row4x4 > row4x4_start_ && column4x4 > column4x4_start_;
+  }
+
+  bool IsBottomRightInside(int row4x4, int column4x4) const {
+    assert(row4x4 >= row4x4_start_);
+    assert(column4x4 >= column4x4_start_);
+    return row4x4 < row4x4_end_ && column4x4 < column4x4_end_;
+  }
+
+  BlockParameters** BlockParametersAddress(int row4x4, int column4x4) const {
+    return block_parameters_holder_.Address(row4x4, column4x4);
+  }
+
+  int BlockParametersStride() const {
+    return block_parameters_holder_.columns4x4();
+  }
+
+  // Returns true if Parameters() can be called with |row| and |column| as
+  // inputs, false otherwise.
+  bool HasParameters(int row, int column) const {
+    return block_parameters_holder_.Find(row, column) != nullptr;
+  }
+  const BlockParameters& Parameters(int row, int column) const {
+    return *block_parameters_holder_.Find(row, column);
+  }
+
+  int number() const { return number_; }
+  int superblock_rows() const { return superblock_rows_; }
+  int superblock_columns() const { return superblock_columns_; }
+  int row4x4_start() const { return row4x4_start_; }
+  int column4x4_start() const { return column4x4_start_; }
+  int column4x4_end() const { return column4x4_end_; }
+
+ private:
+  // Stores the transform tree state when reading variable size transform trees
+  // and when applying the transform tree. When applying the transform tree,
+  // |depth| is not used.
+  struct TransformTreeNode {
+    // The default constructor is invoked by the Stack<TransformTreeNode, n>
+    // constructor. Stack<> does not use the default-constructed elements, so
+    // it is safe for the default constructor to not initialize the members.
+    TransformTreeNode() = default;
+    TransformTreeNode(int x, int y, TransformSize tx_size, int depth = -1)
+        : x(x), y(y), tx_size(tx_size), depth(depth) {}
+
+    int x;
+    int y;
+    TransformSize tx_size;
+    int depth;
+  };
+
+  // Enum to track the processing state of a superblock.
+  enum SuperBlockState : uint8_t {
+    kSuperBlockStateNone,       // Not yet parsed or decoded.
+    kSuperBlockStateParsed,     // Parsed but not yet decoded.
+    kSuperBlockStateScheduled,  // Scheduled for decoding.
+    kSuperBlockStateDecoded     // Parsed and decoded.
+  };
+
+  // Parameters used to facilitate multi-threading within the Tile.
+  struct ThreadingParameters {
+    std::mutex mutex;
+    // 2d array of size |superblock_rows_| by |superblock_columns_| containing
+    // the processing state of each superblock.
+    Array2D<SuperBlockState> sb_state LIBGAV1_GUARDED_BY(mutex);
+    // Variable used to indicate either parse or decode failure.
+    bool abort LIBGAV1_GUARDED_BY(mutex) = false;
+    int pending_jobs LIBGAV1_GUARDED_BY(mutex) = 0;
+    std::condition_variable pending_jobs_zero_condvar;
+  };
+
+  // The residual pointer is used to traverse the |residual_buffer_|. It is
+  // used in two different ways.
+  // If |split_parse_and_decode_| is true:
+  //   The pointer points to the beginning of the |residual_buffer_| when the
+  //   "parse" and "decode" steps begin. It is then moved forward tx_size in
+  //   each iteration of the "parse" and the "decode" steps. In this case, the
+  //   ResidualPtr variable passed into various functions starting from
+  //   ProcessSuperBlock is used as an in/out parameter to keep track of the
+  //   residual pointer.
+  // If |split_parse_and_decode_| is false:
+  //   The pointer is reset to the beginning of the |residual_buffer_| for
+  //   every transform block.
+  using ResidualPtr = uint8_t*;
+
+  Tile(int tile_number, const uint8_t* data, size_t size,
+       const ObuSequenceHeader& sequence_header,
+       const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+       const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+       const WedgeMaskArray& wedge_masks,
+       const QuantizerMatrix& quantizer_matrix,
+       SymbolDecoderContext* saved_symbol_decoder_context,
+       const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+       const dsp::Dsp* dsp, ThreadPool* thread_pool,
+       BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+       bool use_intra_prediction_buffer);
+
+  // Performs member initializations that may fail. Helper function used by
+  // Create().
+  LIBGAV1_MUST_USE_RESULT bool Init();
+
+  // Saves the symbol decoder context of this tile into
+  // |saved_symbol_decoder_context_| if necessary.
+  void SaveSymbolDecoderContext();
+
+  // Entry point for multi-threaded decoding. This function performs the same
+  // functionality as ParseAndDecode(). The current thread does the "parse"
+  // step while the worker threads do the "decode" step.
+  bool ThreadedParseAndDecode();
+
+  // Returns whether or not the prerequisites for decoding the superblock at
+  // |row_index| and |column_index| are satisfied. |threading_.mutex| must be
+  // held when calling this function.
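+  // For example, since a decoded superblock schedules its right and
+  // bottom-left neighbors (see DecodeSuperBlock() below), the superblock at
+  // (row_index, column_index) typically becomes decodable only after the
+  // superblock to its left and the superblock to its top-right have been
+  // decoded.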
+  bool CanDecode(int row_index, int column_index) const;
+
+  // This function is run by the worker threads when multi-threaded decoding is
+  // enabled. Once a superblock is decoded, this function will set the
+  // corresponding |threading_.sb_state| entry to kSuperBlockStateDecoded. On
+  // failure, |threading_.abort| will be set to true. If at any point
+  // |threading_.abort| becomes true, this function will return as early as it
+  // can. If the decoding succeeds, this function will also schedule the
+  // decoding jobs for the superblock to the bottom-left and the superblock to
+  // the right of this superblock (if it is allowed).
+  void DecodeSuperBlock(int row_index, int column_index, int block_width4x4);
+
+  // If |use_intra_prediction_buffer_| is true, then this function copies the
+  // last row of the superblock row starting at |row4x4| into the
+  // |intra_prediction_buffer_| (which may be used by the intra prediction
+  // process for the next superblock row).
+  void PopulateIntraPredictionBuffer(int row4x4);
+
+  uint16_t* GetPartitionCdf(int row4x4, int column4x4, BlockSize block_size);
+  bool ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+                     bool has_rows, bool has_columns, Partition* partition);
+  // Processes the Partition starting at |row4x4_start|, |column4x4_start|
+  // iteratively. It performs a DFS traversal over the partition tree to
+  // process the blocks in the right order.
+  bool ProcessPartition(
+      int row4x4_start, int column4x4_start, ParameterTree* root,
+      TileScratchBuffer* scratch_buffer,
+      ResidualPtr* residual);  // Iterative implementation of 5.11.4.
+  bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+                    ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+                    ResidualPtr* residual);   // 5.11.5.
+  void ResetCdef(int row4x4, int column4x4);  // 5.11.55.
+
+  // This function is used to decode a superblock when the parsing has already
+  // been done for that superblock.
+  bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+                        ResidualPtr* residual);
+  // Helper function used by DecodeSuperBlock(). Note that the decode_block()
+  // function in the spec is equivalent to ProcessBlock() in the code.
+  bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+                   ResidualPtr* residual);
+
+  void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
+                         int column4x4);  // 5.11.3.
+  bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+                         TileScratchBuffer* scratch_buffer,
+                         ProcessingMode mode);
+  void ResetLoopRestorationParams();
+  void ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+                                       BlockSize block_size);  // 5.11.57.
+
+  // Helper functions for DecodeBlock.
+  bool ReadSegmentId(const Block& block);       // 5.11.9.
+  bool ReadIntraSegmentId(const Block& block);  // 5.11.8.
+  void ReadSkip(const Block& block);            // 5.11.11.
+  void ReadSkipMode(const Block& block);        // 5.11.10.
+  void ReadCdef(const Block& block);            // 5.11.56.
+  // Returns the new value. |cdf| is an array of size kDeltaSymbolCount + 1.
+  int ReadAndClipDelta(uint16_t* cdf, int delta_small, int scale, int min_value,
+                       int max_value, int value);
+  void ReadQuantizerIndexDelta(const Block& block);  // 5.11.12.
+  void ReadLoopFilterDelta(const Block& block);      // 5.11.13.
+  // Populates |BlockParameters::deblock_filter_level| for the given |block|
+  // using |deblock_filter_levels_|.
+  void PopulateDeblockFilterLevel(const Block& block);
+  void ReadPredictionModeY(const Block& block, bool intra_y_mode);
+  void ReadIntraAngleInfo(const Block& block,
+                          PlaneType plane_type);  // 5.11.42 and 5.11.43.
+  void ReadPredictionModeUV(const Block& block);
+  void ReadCflAlpha(const Block& block);  // 5.11.45.
+  int GetPaletteCache(const Block& block, PlaneType plane_type,
+                      uint16_t* cache);
+  void ReadPaletteColors(const Block& block, Plane plane);
+  void ReadPaletteModeInfo(const Block& block);  // 5.11.46.
+  void ReadFilterIntraModeInfo(const Block& block);  // 5.11.24.
+  int ReadMotionVectorComponent(const Block& block,
+                                int component);         // 5.11.32.
+  void ReadMotionVector(const Block& block, int index);  // 5.11.31.
+  bool DecodeIntraModeInfo(const Block& block);          // 5.11.7.
+  int8_t ComputePredictedSegmentId(const Block& block) const;  // 5.11.21.
+  bool ReadInterSegmentId(const Block& block, bool pre_skip);  // 5.11.19.
+  void ReadIsInter(const Block& block);                        // 5.11.20.
+  bool ReadIntraBlockModeInfo(const Block& block,
+                              bool intra_y_mode);  // 5.11.22.
+  CompoundReferenceType ReadCompoundReferenceType(const Block& block);
+  template <bool is_single, bool is_backward, int index>
+  uint16_t* GetReferenceCdf(const Block& block, CompoundReferenceType type =
+                                                    kNumCompoundReferenceTypes);
+  void ReadReferenceFrames(const Block& block);  // 5.11.25.
+  void ReadInterPredictionModeY(const Block& block,
+                                const MvContexts& mode_contexts);
+  void ReadRefMvIndex(const Block& block);
+  void ReadInterIntraMode(const Block& block, bool is_compound);  // 5.11.28.
+  bool IsScaled(ReferenceFrameType type) const {  // Part of 5.11.27.
+    const int index =
+        frame_header_.reference_frame_index[type - kReferenceFrameLast];
+    return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+           reference_frames_[index]->frame_height() != frame_header_.height;
+  }
+  void ReadMotionMode(const Block& block, bool is_compound);  // 5.11.27.
+  uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
+  uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
+  void ReadCompoundType(const Block& block, bool is_compound);  // 5.11.29.
+  uint16_t* GetInterpolationFilterCdf(const Block& block, int direction);
+  void ReadInterpolationFilter(const Block& block);
+  bool ReadInterBlockModeInfo(const Block& block);  // 5.11.23.
+  bool DecodeInterModeInfo(const Block& block);     // 5.11.18.
+  bool DecodeModeInfo(const Block& block);          // 5.11.6.
+  bool IsMvValid(const Block& block, bool is_compound) const;  // 6.10.25.
+  bool AssignInterMv(const Block& block, bool is_compound);    // 5.11.26.
+  bool AssignIntraMv(const Block& block);                      // 5.11.26.
+  int GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+                           bool ignore_skip);
+  int GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+                             bool ignore_skip);
+  TransformSize ReadFixedTransformSize(const Block& block);  // 5.11.15.
+  // Iterative implementation of 5.11.17.
+  void ReadVariableTransformTree(const Block& block, int row4x4, int column4x4,
+                                 TransformSize tx_size);
+  void DecodeTransformSize(const Block& block);  // 5.11.16.
+  bool ComputePrediction(const Block& block);    // 5.11.33.
+  // |x4| and |y4| are the column and row positions of the 4x4 block. |w4| and
+  // |h4| are the width and height in 4x4 units of |tx_size|.
+  int GetTransformAllZeroContext(const Block& block, Plane plane,
+                                 TransformSize tx_size, int x4, int y4, int w4,
+                                 int h4);
+  TransformSet GetTransformSet(TransformSize tx_size,
+                               bool is_inter) const;  // 5.11.48.
+  TransformType ComputeTransformType(const Block& block, Plane plane,
+                                     TransformSize tx_size, int block_x,
+                                     int block_y);  // 5.11.40.
+  void ReadTransformType(const Block& block, int x4, int y4,
+                         TransformSize tx_size);  // 5.11.47.
+  template <typename ResidualType>
+  void ReadCoeffBase2D(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  template <typename ResidualType>
+  void ReadCoeffBaseHorizontal(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  template <typename ResidualType>
+  void ReadCoeffBaseVertical(
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
+      uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
+  int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
+  void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+                          uint8_t coefficient_level, int8_t dc_category);
+  void InterIntraPrediction(
+      uint16_t* prediction_0, const uint8_t* prediction_mask,
+      ptrdiff_t prediction_mask_stride,
+      const PredictionParameters& prediction_parameters, int prediction_width,
+      int prediction_height, int subsampling_x, int subsampling_y,
+      uint8_t* dest,
+      ptrdiff_t dest_stride);  // Part of section 7.11.3.1 in the spec.
+  void CompoundInterPrediction(
+      const Block& block, const uint8_t* prediction_mask,
+      ptrdiff_t prediction_mask_stride, int prediction_width,
+      int prediction_height, int subsampling_x, int subsampling_y,
+      int candidate_row, int candidate_column, uint8_t* dest,
+      ptrdiff_t dest_stride);  // Part of section 7.11.3.1 in the spec.
+  GlobalMotion* GetWarpParams(const Block& block, Plane plane,
+                              int prediction_width, int prediction_height,
+                              const PredictionParameters& prediction_parameters,
+                              ReferenceFrameType reference_type,
+                              bool* is_local_valid,
+                              GlobalMotion* global_motion_params,
+                              GlobalMotion* local_warp_params)
+      const;  // Part of section 7.11.3.1 in the spec.
+  bool InterPrediction(const Block& block, Plane plane, int x, int y,
+                       int prediction_width, int prediction_height,
+                       int candidate_row, int candidate_column,
+                       bool* is_local_valid,
+                       GlobalMotion* local_warp_params);  // 7.11.3.1.
+  void ScaleMotionVector(const MotionVector& mv, Plane plane,
+                         int reference_frame_index, int x, int y, int* start_x,
+                         int* start_y, int* step_x, int* step_y);  // 7.11.3.3.
+  // If the method returns false, the caller only uses the output parameters
+  // *ref_block_start_x and *ref_block_start_y. If the method returns true, the
+  // caller uses all three output parameters.
+  static bool GetReferenceBlockPosition(
+      int reference_frame_index, bool is_scaled, int width, int height,
+      int ref_start_x, int ref_last_x, int ref_start_y, int ref_last_y,
+      int start_x, int start_y, int step_x, int step_y, int left_border,
+      int right_border, int top_border, int bottom_border,
+      int* ref_block_start_x, int* ref_block_start_y, int* ref_block_end_x);
+
+  template <typename Pixel>
+  void BuildConvolveBlock(Plane plane, int reference_frame_index,
+                          bool is_scaled, int height, int ref_start_x,
+                          int ref_last_x, int ref_start_y, int ref_last_y,
+                          int step_y, int ref_block_start_x,
+                          int ref_block_end_x, int ref_block_start_y,
+                          uint8_t* block_buffer,
+                          ptrdiff_t convolve_buffer_stride,
+                          ptrdiff_t block_extended_width);
+  bool BlockInterPrediction(const Block& block, Plane plane,
+                            int reference_frame_index, const MotionVector& mv,
+                            int x, int y, int width, int height,
+                            int candidate_row, int candidate_column,
+                            uint16_t* prediction, bool is_compound,
+                            bool is_inter_intra, uint8_t* dest,
+                            ptrdiff_t dest_stride);  // 7.11.3.4.
+  bool BlockWarpProcess(const Block& block, Plane plane, int index,
+                        int block_start_x, int block_start_y, int width,
+                        int height, GlobalMotion* warp_params, bool is_compound,
+                        bool is_inter_intra, uint8_t* dest,
+                        ptrdiff_t dest_stride);  // 7.11.3.5.
+  bool ObmcBlockPrediction(const Block& block, const MotionVector& mv,
+                           Plane plane, int reference_frame_index, int width,
+                           int height, int x, int y, int candidate_row,
+                           int candidate_column,
+                           ObmcDirection blending_direction);
+  bool ObmcPrediction(const Block& block, Plane plane, int width,
+                      int height);  // 7.11.3.9.
+  void DistanceWeightedPrediction(void* prediction_0, void* prediction_1,
+                                  int width, int height, int candidate_row,
+                                  int candidate_column, uint8_t* dest,
+                                  ptrdiff_t dest_stride);  // 7.11.3.15.
+  // This function specializes the parsing of the DC coefficient by removing
+  // some of the branches when i == 0 (since scan[0] is always 0 and scan[i] is
+  // always non-zero for all other possible values of i). |dc_category| is an
+  // output parameter that is populated when |is_dc_coefficient| is true.
+  // |coefficient_level| is an output parameter which accumulates the
+  // coefficient level.
+  template <typename ResidualType, bool is_dc_coefficient>
+  LIBGAV1_ALWAYS_INLINE bool ReadSignAndApplyDequantization(
+      const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
+      int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
+      int* coefficient_level,
+      ResidualType* residual_buffer);     // Part of 5.11.39.
+  int ReadCoeffBaseRange(uint16_t* cdf);  // Part of 5.11.39.
+  // Returns the number of non-zero coefficients that were read. |tx_type| is
+  // an output parameter that stores the computed transform type for the plane
+  // whose coefficients were read. Returns -1 on failure.
+  template <typename ResidualType>
+  int ReadTransformCoefficients(const Block& block, Plane plane, int start_x,
+                                int start_y, TransformSize tx_size,
+                                TransformType* tx_type);  // 5.11.39.
+  bool TransformBlock(const Block& block, Plane plane, int base_x, int base_y,
+                      TransformSize tx_size, int x, int y,
+                      ProcessingMode mode);  // 5.11.35.
+  // Iterative implementation of 5.11.36.
+  bool TransformTree(const Block& block, int start_x, int start_y,
+                     BlockSize plane_size, ProcessingMode mode);
+  void ReconstructBlock(const Block& block, Plane plane, int start_x,
+                        int start_y, TransformSize tx_size,
+                        TransformType tx_type,
+                        int non_zero_coeff_count);  // Part of 7.12.3.
+  bool Residual(const Block& block, ProcessingMode mode);  // 5.11.34.
+  // part of 5.11.5 (reset_block_context() in the spec).
+  void ResetEntropyContext(const Block& block);
+  // Populates the |color_context| and |color_order| for the |i|th iteration
+  // with entries counting down from |start| to |end| (|start| > |end|).
+  void PopulatePaletteColorContexts(
+      const Block& block, PlaneType plane_type, int i, int start, int end,
+      uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+      uint8_t color_context[kMaxPaletteSquare]);  // 5.11.50.
+  bool ReadPaletteTokens(const Block& block);     // 5.11.49.
+  template <typename Pixel>
+  void IntraPrediction(const Block& block, Plane plane, int x, int y,
+                       bool has_left, bool has_top, bool has_top_right,
+                       bool has_bottom_left, PredictionMode mode,
+                       TransformSize tx_size);
+  bool IsSmoothPrediction(int row, int column, Plane plane) const;
+  int GetIntraEdgeFilterType(const Block& block,
+                             Plane plane) const;  // 7.11.2.8.
+  template <typename Pixel>
+  void DirectionalPrediction(const Block& block, Plane plane, int x, int y,
+                             bool has_left, bool has_top, bool needs_left,
+                             bool needs_top, int prediction_angle, int width,
+                             int height, int max_x, int max_y,
+                             TransformSize tx_size, Pixel* top_row,
+                             Pixel* left_column);  // 7.11.2.4.
+  template <typename Pixel>
+  void PalettePrediction(const Block& block, Plane plane, int start_x,
+                         int start_y, int x, int y,
+                         TransformSize tx_size);  // 7.11.4.
+  template <typename Pixel>
+  void ChromaFromLumaPrediction(const Block& block, Plane plane, int start_x,
+                                int start_y,
+                                TransformSize tx_size);  // 7.11.5.
+  // Section 7.19. Applies some filtering and reordering to the motion vectors
+  // for the given |block| and stores them into |current_frame_|.
+  void StoreMotionFieldMvsIntoCurrentFrame(const Block& block);
+
+  // Returns the zero-based index of the super block that contains |row4x4|
+  // relative to the start of this tile.
+  int SuperBlockRowIndex(int row4x4) const {
+    return (row4x4 - row4x4_start_) >>
+           (sequence_header_.use_128x128_superblock ? 5 : 4);
+  }
+
+  // Returns the zero-based index of the super block that contains |column4x4|
+  // relative to the start of this tile.
+  int SuperBlockColumnIndex(int column4x4) const {
+    return (column4x4 - column4x4_start_) >>
+           (sequence_header_.use_128x128_superblock ? 5 : 4);
+  }
+
+  BlockSize SuperBlockSize() const {
+    return sequence_header_.use_128x128_superblock ? kBlock128x128
+                                                   : kBlock64x64;
+  }
+  int PlaneCount() const {
+    return sequence_header_.color_config.is_monochrome ? kMaxPlanesMonochrome
+                                                       : kMaxPlanes;
+  }
+
+  const int number_;
+  const int row_;
+  const int column_;
+  const uint8_t* const data_;
+  size_t size_;
+  int row4x4_start_;
+  int row4x4_end_;
+  int column4x4_start_;
+  int column4x4_end_;
+  int superblock_rows_;
+  int superblock_columns_;
+  bool read_deltas_;
+  const int8_t subsampling_x_[kMaxPlanes];
+  const int8_t subsampling_y_[kMaxPlanes];
+  int deblock_row_limit_[kMaxPlanes];
+  int deblock_column_limit_[kMaxPlanes];
+
+  // The dimensions (in order) are: segment_id, level_index (based on plane and
+  // direction), reference_frame and mode_id.
+  uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount]
+                                [kNumReferenceFrameTypes][2];
+
+  // current_quantizer_index_ is in the range [0, 255].
+  uint8_t current_quantizer_index_;
+  // These two arrays (|coefficient_levels_| and |dc_categories_|) are used to
+  // store the entropy context. Their dimensions are as follows: First -
+  // left/top; Second - plane; Third - row4x4 (if first dimension is
+  // left)/column4x4 (if first dimension is top).
+  //
+  // This is equivalent to the LeftLevelContext and AboveLevelContext arrays in
+  // the spec. In the spec, it stores values from 0 through 63 (inclusive). The
+  // stored values are used to compute the left and top contexts in
+  // GetTransformAllZeroContext. In that function, we only care about the
+  // following values: 0, 1, 2, 3 and >= 4. So instead of clamping to 63, we
+  // clamp to 4 (i.e.) all the values greater than 4 are stored as 4.
+  std::array<Array2D<uint8_t>, 2> coefficient_levels_;
+  // This is equivalent to the LeftDcContext and AboveDcContext arrays in the
+  // spec. In the spec, it can store 3 possible values: 0, 1 and 2 (where 1
+  // means the value is < 0, 2 means the value is > 0 and 0 means the value is
+  // equal to 0).
+  //
+  // The stored values are used in two places:
+  //  * GetTransformAllZeroContext: Here, we only care about whether the
+  //    value is 0 or not (whether it is 1 or 2 is irrelevant).
+  //  * GetDcSignContext: Here, we do the following computation: if the
+  //    stored value is 1, we decrement a counter. If the stored value is 2
+  //    we increment a counter.
+  //
+  // Based on this usage, we can simply replace 1 with -1 and 2 with 1 and
+  // use that value to compute the counter.
+  //
+  // The usage on GetTransformAllZeroContext is unaffected since there we
+  // only care about whether it is 0 or not.
+  std::array<Array2D<int8_t>, 2> dc_categories_;
+  const ObuSequenceHeader& sequence_header_;
+  const ObuFrameHeader& frame_header_;
+  const std::array<bool, kNumReferenceFrameTypes>& reference_frame_sign_bias_;
+  const std::array<RefCountedBufferPtr, kNumReferenceFrameTypes>&
+      reference_frames_;
+  TemporalMotionField& motion_field_;
+  const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
+  const WedgeMaskArray& wedge_masks_;
+  const QuantizerMatrix& quantizer_matrix_;
+  DaalaBitReader reader_;
+  SymbolDecoderContext symbol_decoder_context_;
+  SymbolDecoderContext* const saved_symbol_decoder_context_;
+  const SegmentationMap* prev_segment_ids_;
+  const dsp::Dsp& dsp_;
+  PostFilter& post_filter_;
+  BlockParametersHolder& block_parameters_holder_;
+  Quantizer quantizer_;
+  // When there is no multi-threading within the Tile, |residual_buffer_| is
+  // used. When there is multi-threading within the Tile,
+  // |residual_buffer_threaded_| is used. In the following comment,
+  // |residual_buffer| refers to either |residual_buffer_| or
+  // |residual_buffer_threaded_| depending on whether multi-threading is
+  // enabled within the Tile or not.
+  // The |residual_buffer| is used to help with the dequantization and the
+  // inverse transform processes. It is declared as a uint8_t, but is always
+  // accessed either as an int16_t or int32_t depending on |bitdepth|. Here is
+  // what it stores at various stages of the decoding process (in the order
+  // which they happen):
+  //   1) In ReadTransformCoefficients(), this buffer is used to store the
+  //      dequantized values.
+  //   2) In Reconstruct(), this buffer is used as the input to the row
+  //      transform process.
+  // The size of this buffer would be:
+  //    For |residual_buffer_|: (4096 + 32 * |kResidualPaddingVertical|) *
+  //        |residual_size_|. Where 4096 = 64x64 which is the maximum transform
+  //        size, and 32 * |kResidualPaddingVertical| is the padding to avoid
+  //        bottom boundary checks when parsing quantized coefficients. This
+  //        memory is allocated and owned by the Tile class.
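+  //        For example, for a 10-bit stream the residual values are accessed
+  //        as int32_t, so |residual_size_| is sizeof(int32_t) = 4 and the
+  //        allocation is (4096 + 32 * |kResidualPaddingVertical|) * 4 bytes.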
+  AlignedUniquePtr<uint8_t> residual_buffer_;
+  // This is a 2d array of pointers of size |superblock_rows_| by
+  // |superblock_columns_| where each pointer points to a ResidualBuffer for a
+  // single super block. The array is populated when the parsing process begins
+  // by calling |residual_buffer_pool_->Get()| and the memory is released back
+  // to the pool by calling |residual_buffer_pool_->Release()| when the
+  // decoding process is complete.
+  Array2D<std::unique_ptr<ResidualBuffer>> residual_buffer_threaded_;
+  // sizeof(int16_t or int32_t) depending on |bitdepth|.
+  const size_t residual_size_;
+  // Number of superblocks on the top-right that will have to be decoded before
+  // the current superblock can be decoded. This will be 1 if allow_intrabc is
+  // false. If allow_intrabc is true, then this value will be
+  // use_128x128_superblock ? 3 : 5. This is the allowed range of reference for
+  // the top rows for intrabc.
+  const int intra_block_copy_lag_;
+
+  // In the Tile class, we use the "current_frame" in two ways:
+  //   1) To write the decoded output into (using the |buffer_| view).
+  //   2) To read the pixels for intra block copy (using the |current_frame_|
+  //      reference).
+  //
+  // When intra block copy is off, |buffer_| and |current_frame_| may or may
+  // not point to the same plane pointers. But that is okay since
+  // |current_frame_| is never used in this case.
+  //
+  // When intra block copy is on, |buffer_| and |current_frame_| always point
+  // to the same plane pointers (since post filtering is disabled). So the
+  // usage in both case 1 and case 2 remains valid.
+  Array2DView<uint8_t> buffer_[kMaxPlanes];
+  RefCountedBuffer& current_frame_;
+
+  Array2D<int16_t>& cdef_index_;
+  Array2D<TransformSize>& inter_transform_sizes_;
+  std::array<RestorationUnitInfo, kMaxPlanes> reference_unit_info_;
+  // If |thread_pool_| is nullptr, the calling thread will do the parsing and
+  // the decoding in one pass. If |thread_pool_| is not nullptr, then the main
+  // thread will do the parsing while the thread pool workers will do the
+  // decoding.
+  ThreadPool* const thread_pool_;
+  ThreadingParameters threading_;
+  ResidualBufferPool* const residual_buffer_pool_;
+  TileScratchBufferPool* const tile_scratch_buffer_pool_;
+  BlockingCounterWithStatus* const pending_tiles_;
+  bool split_parse_and_decode_;
+  // This is used only when |split_parse_and_decode_| is false.
+  std::unique_ptr<PredictionParameters> prediction_parameters_ = nullptr;
+  // Stores the |transform_type| for the super block being decoded at a 4x4
+  // granularity. The spec uses absolute indices for this array but it is
+  // sufficient to use indices relative to the super block being decoded.
+  TransformType transform_types_[32][32];
+  // delta_lf_[i] is in the range [-63, 63].
+  int8_t delta_lf_[kFrameLfCount];
+  // True if all the values in |delta_lf_| are zero. False otherwise.
+  bool delta_lf_all_zero_;
+  const bool frame_parallel_;
+  const bool use_intra_prediction_buffer_;
+  // Buffer used to store the unfiltered pixels that are necessary for decoding
+  // the next superblock row (for the intra prediction process). Used only if
+  // |use_intra_prediction_buffer_| is true. The |frame_scratch_buffer| contains
+  // one row buffer for each tile row. This tile will have to use the buffer
+  // corresponding to this tile's row.
+  IntraPredictionBuffer* const intra_prediction_buffer_;
+  // Stores the progress of the reference frames. This will be used to avoid
+  // unnecessary calls into RefCountedBuffer::WaitUntil().
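+  // (Each entry caches the last row known to be decoded in the corresponding
+  // reference frame; WaitUntil() only needs to be called when a block requires
+  // rows beyond the cached value.)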
+  std::array<int, kNumReferenceFrameTypes> reference_frame_progress_cache_;
+};
+
+struct Tile::Block {
+  Block(const Tile& tile, BlockSize size, int row4x4, int column4x4,
+        TileScratchBuffer* const scratch_buffer, ResidualPtr* residual)
+      : tile(tile),
+        size(size),
+        row4x4(row4x4),
+        column4x4(column4x4),
+        width(kBlockWidthPixels[size]),
+        height(kBlockHeightPixels[size]),
+        width4x4(width >> 2),
+        height4x4(height >> 2),
+        scratch_buffer(scratch_buffer),
+        residual(residual) {
+    assert(size != kBlockInvalid);
+    residual_size[kPlaneY] = kPlaneResidualSize[size][0][0];
+    residual_size[kPlaneU] = residual_size[kPlaneV] =
+        kPlaneResidualSize[size][tile.subsampling_x_[kPlaneU]]
+                          [tile.subsampling_y_[kPlaneU]];
+    assert(residual_size[kPlaneY] != kBlockInvalid);
+    if (tile.PlaneCount() > 1) {
+      assert(residual_size[kPlaneU] != kBlockInvalid);
+    }
+    if ((row4x4 & 1) == 0 &&
+        (tile.sequence_header_.color_config.subsampling_y & height4x4) == 1) {
+      has_chroma = false;
+    } else if ((column4x4 & 1) == 0 &&
+               (tile.sequence_header_.color_config.subsampling_x & width4x4) ==
+                   1) {
+      has_chroma = false;
+    } else {
+      has_chroma = !tile.sequence_header_.color_config.is_monochrome;
+    }
+    top_available[kPlaneY] = tile.IsTopInside(row4x4);
+    left_available[kPlaneY] = tile.IsLeftInside(column4x4);
+    if (has_chroma) {
+      // top_available[kPlaneU] and top_available[kPlaneV] are valid only if
+      // has_chroma is true.
+      // The next 3 lines are equivalent to:
+      //   top_available[kPlaneU] = top_available[kPlaneV] =
+      //       top_available[kPlaneY] &&
+      //       ((tile.sequence_header_.color_config.subsampling_y & height4x4)
+      //        == 0 || tile.IsTopInside(row4x4 - 1));
+      top_available[kPlaneU] = top_available[kPlaneV] = tile.IsTopInside(
+          row4x4 -
+          (tile.sequence_header_.color_config.subsampling_y & height4x4));
+      // left_available[kPlaneU] and left_available[kPlaneV] are valid only if
+      // has_chroma is true.
+      // The next 3 lines are equivalent to:
+      //   left_available[kPlaneU] = left_available[kPlaneV] =
+      //       left_available[kPlaneY] &&
+      //       ((tile.sequence_header_.color_config.subsampling_x & width4x4)
+      //        == 0 || tile.IsLeftInside(column4x4 - 1));
+      left_available[kPlaneU] = left_available[kPlaneV] = tile.IsLeftInside(
+          column4x4 -
+          (tile.sequence_header_.color_config.subsampling_x & width4x4));
+    }
+    const ptrdiff_t stride = tile.BlockParametersStride();
+    BlockParameters** const bps =
+        tile.BlockParametersAddress(row4x4, column4x4);
+    bp = *bps;
+    // bp_top is valid only if top_available[kPlaneY] is true.
+    if (top_available[kPlaneY]) {
+      bp_top = *(bps - stride);
+    }
+    // bp_left is valid only if left_available[kPlaneY] is true.
+    if (left_available[kPlaneY]) {
+      bp_left = *(bps - 1);
+    }
+  }
+
+  bool HasChroma() const { return has_chroma; }
+
+  // The return values of this group of functions are valid only if the
+  // corresponding top_available or left_available is true.
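+  // For example, check |top_available[kPlaneY]| before calling TopReference()
+  // or IsTopIntra().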
+  ReferenceFrameType TopReference(int index) const {
+    return bp_top->reference_frame[index];
+  }
+
+  ReferenceFrameType LeftReference(int index) const {
+    return bp_left->reference_frame[index];
+  }
+
+  bool IsTopIntra() const { return TopReference(0) <= kReferenceFrameIntra; }
+  bool IsLeftIntra() const { return LeftReference(0) <= kReferenceFrameIntra; }
+
+  bool IsTopSingle() const { return TopReference(1) <= kReferenceFrameIntra; }
+  bool IsLeftSingle() const { return LeftReference(1) <= kReferenceFrameIntra; }
+
+  int CountReferences(ReferenceFrameType type) const {
+    return static_cast<int>(top_available[kPlaneY] &&
+                            bp_top->reference_frame[0] == type) +
+           static_cast<int>(top_available[kPlaneY] &&
+                            bp_top->reference_frame[1] == type) +
+           static_cast<int>(left_available[kPlaneY] &&
+                            bp_left->reference_frame[0] == type) +
+           static_cast<int>(left_available[kPlaneY] &&
+                            bp_left->reference_frame[1] == type);
+  }
+
+  // 7.10.3.
+  // Checks if there are any inter blocks to the left or above. If so, it
+  // returns true indicating that the block has neighbors that are suitable for
+  // use by overlapped motion compensation.
+  bool HasOverlappableCandidates() const {
+    const ptrdiff_t stride = tile.BlockParametersStride();
+    BlockParameters** const bps = tile.BlockParametersAddress(0, 0);
+    if (top_available[kPlaneY]) {
+      BlockParameters** bps_top = bps + (row4x4 - 1) * stride + (column4x4 | 1);
+      const int columns = std::min(tile.frame_header_.columns4x4 - column4x4,
+                                   static_cast<int>(width4x4));
+      BlockParameters** const bps_top_end = bps_top + columns;
+      do {
+        if ((*bps_top)->reference_frame[0] > kReferenceFrameIntra) {
+          return true;
+        }
+        bps_top += 2;
+      } while (bps_top < bps_top_end);
+    }
+    if (left_available[kPlaneY]) {
+      BlockParameters** bps_left = bps + (row4x4 | 1) * stride + column4x4 - 1;
+      const int rows = std::min(tile.frame_header_.rows4x4 - row4x4,
+                                static_cast<int>(height4x4));
+      BlockParameters** const bps_left_end = bps_left + rows * stride;
+      do {
+        if ((*bps_left)->reference_frame[0] > kReferenceFrameIntra) {
+          return true;
+        }
+        bps_left += 2 * stride;
+      } while (bps_left < bps_left_end);
+    }
+    return false;
+  }
+
+  const Tile& tile;
+  bool has_chroma;
+  const BlockSize size;
+  bool top_available[kMaxPlanes];
+  bool left_available[kMaxPlanes];
+  BlockSize residual_size[kMaxPlanes];
+  const int row4x4;
+  const int column4x4;
+  const int width;
+  const int height;
+  const int width4x4;
+  const int height4x4;
+  const BlockParameters* bp_top;
+  const BlockParameters* bp_left;
+  BlockParameters* bp;
+  TileScratchBuffer* const scratch_buffer;
+  ResidualPtr* const residual;
+};
+
+extern template bool
+Tile::ProcessSuperBlockRow<Tile::kProcessingModeDecodeOnly>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+extern template bool
+Tile::ProcessSuperBlockRow<Tile::kProcessingModeParseAndDecode>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_TILE_H_
diff --git a/src/tile/bitstream/mode_info.cc b/src/tile/bitstream/mode_info.cc
new file mode 100644
index 0000000..0b22eb0
--- /dev/null
+++ b/src/tile/bitstream/mode_info.cc
@@ -0,0 +1,1303 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/segmentation_map.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr int kDeltaQSmall = 3;
+constexpr int kDeltaLfSmall = 3;
+
+constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
+    0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
+
+constexpr uint8_t kSizeGroup[kMaxBlockSizes] = {
+    0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 3, 3};
+
+constexpr int kCompoundModeNewMvContexts = 5;
+constexpr uint8_t kCompoundModeContextMap[3][kCompoundModeNewMvContexts] = {
+    {0, 1, 1, 1, 1}, {1, 2, 3, 4, 4}, {4, 4, 5, 6, 7}};
+
+enum CflSign : uint8_t {
+  kCflSignZero = 0,
+  kCflSignNegative = 1,
+  kCflSignPositive = 2
+};
+
+// For each possible value of the combined signs (which is read from the
+// bitstream), this array stores the following: sign_u, sign_v,
+// alpha_u_context, alpha_v_context. Only positive entries are used. Entry at
+// index i is computed as follows:
+//   sign_u = (i + 1) / 3
+//   sign_v = (i + 1) % 3
+//   alpha_u_context = i - 2
+//   alpha_v_context = (sign_v - 1) * 3 + sign_u
+constexpr int8_t kCflAlphaLookup[kCflAlphaSignsSymbolCount][4] = {
+    {0, 1, -2, 0}, {0, 2, -1, 3}, {1, 0, 0, -2}, {1, 1, 1, 1},
+    {1, 2, 2, 4},  {2, 0, 3, -1}, {2, 1, 4, 2},  {2, 2, 5, 5},
+};
+
+constexpr BitMaskSet kPredictionModeHasNearMvMask(kPredictionModeNearMv,
+                                                  kPredictionModeNearNearMv,
+                                                  kPredictionModeNearNewMv,
+                                                  kPredictionModeNewNearMv);
+
+constexpr BitMaskSet kIsInterIntraModeAllowedMask(kBlock8x8, kBlock8x16,
+                                                  kBlock16x8, kBlock16x16,
+                                                  kBlock16x32, kBlock32x16,
+                                                  kBlock32x32);
+
+bool IsBackwardReference(ReferenceFrameType type) {
+  return type >= kReferenceFrameBackward && type <= kReferenceFrameAlternate;
+}
+
+bool IsSameDirectionReferencePair(ReferenceFrameType type1,
+                                  ReferenceFrameType type2) {
+  return (type1 >= kReferenceFrameBackward) ==
+         (type2 >= kReferenceFrameBackward);
+}
+
+// This is called neg_deinterleave() in the spec.
+int DecodeSegmentId(int diff, int reference, int max) {
+  if (reference == 0) return diff;
+  if (reference >= max - 1) return max - diff - 1;
+  const int value = ((diff & 1) != 0) ? reference + ((diff + 1) >> 1)
+                                      : reference - (diff >> 1);
+  const int reference2 = (reference << 1);
+  if (reference2 < max) {
+    return (diff <= reference2) ? value : diff;
+  }
+  return (diff <= ((max - reference - 1) << 1)) ? value : max - (diff + 1);
+}
+
+// This is called DrlCtxStack in section 7.10.2.14 of the spec.
+// In the spec, the weights of all the nearest mvs are incremented by a bonus +// weight which is larger than any natural weight, and the weights of the mvs +// are compared with this bonus weight to determine their contexts. We replace +// this procedure by introducing |nearest_mv_count| in PredictionParameters, +// which records the count of the nearest mvs. Since all the nearest mvs are in +// the beginning of the mv stack, the |index| of a mv in the mv stack can be +// compared with |nearest_mv_count| to get that mv's context. +int GetRefMvIndexContext(int nearest_mv_count, int index) { + if (index + 1 < nearest_mv_count) { + return 0; + } + if (index + 1 == nearest_mv_count) { + return 1; + } + return 2; +} + +// Returns true if both the width and height of the block is less than 64. +bool IsBlockDimensionLessThan64(BlockSize size) { + return size <= kBlock32x32 && size != kBlock16x64; +} + +int GetUseCompoundReferenceContext(const Tile::Block& block) { + if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) { + if (block.IsTopSingle() && block.IsLeftSingle()) { + return static_cast(IsBackwardReference(block.TopReference(0))) ^ + static_cast(IsBackwardReference(block.LeftReference(0))); + } + if (block.IsTopSingle()) { + return 2 + static_cast(IsBackwardReference(block.TopReference(0)) || + block.IsTopIntra()); + } + if (block.IsLeftSingle()) { + return 2 + static_cast(IsBackwardReference(block.LeftReference(0)) || + block.IsLeftIntra()); + } + return 4; + } + if (block.top_available[kPlaneY]) { + return block.IsTopSingle() + ? static_cast(IsBackwardReference(block.TopReference(0))) + : 3; + } + if (block.left_available[kPlaneY]) { + return block.IsLeftSingle() + ? static_cast(IsBackwardReference(block.LeftReference(0))) + : 3; + } + return 1; +} + +// Calculates count0 by calling block.CountReferences() on the frame types from +// type0_start to type0_end, inclusive, and summing the results. +// Calculates count1 by calling block.CountReferences() on the frame types from +// type1_start to type1_end, inclusive, and summing the results. +// Compares count0 with count1 and returns 0, 1 or 2. +// +// See count_refs and ref_count_ctx in 8.3.2. +int GetReferenceContext(const Tile::Block& block, + ReferenceFrameType type0_start, + ReferenceFrameType type0_end, + ReferenceFrameType type1_start, + ReferenceFrameType type1_end) { + int count0 = 0; + int count1 = 0; + for (int type = type0_start; type <= type0_end; ++type) { + count0 += block.CountReferences(static_cast(type)); + } + for (int type = type1_start; type <= type1_end; ++type) { + count1 += block.CountReferences(static_cast(type)); + } + return (count0 < count1) ? 0 : (count0 == count1 ? 1 : 2); +} + +} // namespace + +bool Tile::ReadSegmentId(const Block& block) { + int top_left = -1; + if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) { + top_left = + block_parameters_holder_.Find(block.row4x4 - 1, block.column4x4 - 1) + ->segment_id; + } + int top = -1; + if (block.top_available[kPlaneY]) { + top = block.bp_top->segment_id; + } + int left = -1; + if (block.left_available[kPlaneY]) { + left = block.bp_left->segment_id; + } + int pred; + if (top == -1) { + pred = (left == -1) ? 0 : left; + } else if (left == -1) { + pred = top; + } else { + pred = (top_left == top) ? 
top : left; + } + BlockParameters& bp = *block.bp; + if (bp.skip) { + bp.segment_id = pred; + return true; + } + int context = 0; + if (top_left < 0) { + context = 0; + } else if (top_left == top && top_left == left) { + context = 2; + } else if (top_left == top || top_left == left || top == left) { + context = 1; + } + uint16_t* const segment_id_cdf = + symbol_decoder_context_.segment_id_cdf[context]; + const int encoded_segment_id = + reader_.ReadSymbol(segment_id_cdf); + bp.segment_id = + DecodeSegmentId(encoded_segment_id, pred, + frame_header_.segmentation.last_active_segment_id + 1); + // Check the bitstream conformance requirement in Section 6.10.8 of the spec. + if (bp.segment_id < 0 || + bp.segment_id > frame_header_.segmentation.last_active_segment_id) { + LIBGAV1_DLOG( + ERROR, + "Corrupted segment_ids: encoded %d, last active %d, postprocessed %d", + encoded_segment_id, frame_header_.segmentation.last_active_segment_id, + bp.segment_id); + return false; + } + return true; +} + +bool Tile::ReadIntraSegmentId(const Block& block) { + BlockParameters& bp = *block.bp; + if (!frame_header_.segmentation.enabled) { + bp.segment_id = 0; + return true; + } + return ReadSegmentId(block); +} + +void Tile::ReadSkip(const Block& block) { + BlockParameters& bp = *block.bp; + if (frame_header_.segmentation.segment_id_pre_skip && + frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureSkip)) { + bp.skip = true; + return; + } + int context = 0; + if (block.top_available[kPlaneY] && block.bp_top->skip) { + ++context; + } + if (block.left_available[kPlaneY] && block.bp_left->skip) { + ++context; + } + uint16_t* const skip_cdf = symbol_decoder_context_.skip_cdf[context]; + bp.skip = reader_.ReadSymbol(skip_cdf); +} + +void Tile::ReadSkipMode(const Block& block) { + BlockParameters& bp = *block.bp; + if (!frame_header_.skip_mode_present || + frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureSkip) || + frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureReferenceFrame) || + frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureGlobalMv) || + IsBlockDimension4(block.size)) { + bp.skip_mode = false; + return; + } + const int context = + (block.left_available[kPlaneY] + ? static_cast(block.bp_left->skip_mode) + : 0) + + (block.top_available[kPlaneY] ? static_cast(block.bp_top->skip_mode) + : 0); + bp.skip_mode = + reader_.ReadSymbol(symbol_decoder_context_.skip_mode_cdf[context]); +} + +void Tile::ReadCdef(const Block& block) { + BlockParameters& bp = *block.bp; + if (bp.skip || frame_header_.coded_lossless || + !sequence_header_.enable_cdef || frame_header_.allow_intrabc) { + return; + } + const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64]; + const int cdef_mask4x4 = ~(cdef_size4x4 - 1); + const int row4x4 = block.row4x4 & cdef_mask4x4; + const int column4x4 = block.column4x4 & cdef_mask4x4; + const int row = DivideBy16(row4x4); + const int column = DivideBy16(column4x4); + if (cdef_index_[row][column] == -1) { + cdef_index_[row][column] = + frame_header_.cdef.bits > 0 + ? 
static_cast(reader_.ReadLiteral(frame_header_.cdef.bits)) + : 0; + for (int i = row4x4; i < row4x4 + block.height4x4; i += cdef_size4x4) { + for (int j = column4x4; j < column4x4 + block.width4x4; + j += cdef_size4x4) { + cdef_index_[DivideBy16(i)][DivideBy16(j)] = cdef_index_[row][column]; + } + } + } +} + +int Tile::ReadAndClipDelta(uint16_t* const cdf, int delta_small, int scale, + int min_value, int max_value, int value) { + int abs = reader_.ReadSymbol(cdf); + if (abs == delta_small) { + const int remaining_bit_count = + static_cast(reader_.ReadLiteral(3)) + 1; + const int abs_remaining_bits = + static_cast(reader_.ReadLiteral(remaining_bit_count)); + abs = abs_remaining_bits + (1 << remaining_bit_count) + 1; + } + if (abs != 0) { + const bool sign = static_cast(reader_.ReadBit()); + const int scaled_abs = abs << scale; + const int reduced_delta = sign ? -scaled_abs : scaled_abs; + value += reduced_delta; + value = Clip3(value, min_value, max_value); + } + return value; +} + +void Tile::ReadQuantizerIndexDelta(const Block& block) { + assert(read_deltas_); + BlockParameters& bp = *block.bp; + if ((block.size == SuperBlockSize() && bp.skip)) { + return; + } + current_quantizer_index_ = + ReadAndClipDelta(symbol_decoder_context_.delta_q_cdf, kDeltaQSmall, + frame_header_.delta_q.scale, kMinLossyQuantizer, + kMaxQuantizer, current_quantizer_index_); +} + +void Tile::ReadLoopFilterDelta(const Block& block) { + assert(read_deltas_); + BlockParameters& bp = *block.bp; + if (!frame_header_.delta_lf.present || + (block.size == SuperBlockSize() && bp.skip)) { + return; + } + int frame_lf_count = 1; + if (frame_header_.delta_lf.multi) { + frame_lf_count = kFrameLfCount - (PlaneCount() > 1 ? 0 : 2); + } + bool recompute_deblock_filter_levels = false; + for (int i = 0; i < frame_lf_count; ++i) { + uint16_t* const delta_lf_abs_cdf = + frame_header_.delta_lf.multi + ? symbol_decoder_context_.delta_lf_multi_cdf[i] + : symbol_decoder_context_.delta_lf_cdf; + const int8_t old_delta_lf = delta_lf_[i]; + delta_lf_[i] = ReadAndClipDelta( + delta_lf_abs_cdf, kDeltaLfSmall, frame_header_.delta_lf.scale, + -kMaxLoopFilterValue, kMaxLoopFilterValue, delta_lf_[i]); + recompute_deblock_filter_levels = + recompute_deblock_filter_levels || (old_delta_lf != delta_lf_[i]); + } + delta_lf_all_zero_ = + (delta_lf_[0] | delta_lf_[1] | delta_lf_[2] | delta_lf_[3]) == 0; + if (!delta_lf_all_zero_ && recompute_deblock_filter_levels) { + post_filter_.ComputeDeblockFilterLevels(delta_lf_, deblock_filter_levels_); + } +} + +void Tile::ReadPredictionModeY(const Block& block, bool intra_y_mode) { + uint16_t* cdf; + if (intra_y_mode) { + const PredictionMode top_mode = + block.top_available[kPlaneY] ? block.bp_top->y_mode : kPredictionModeDc; + const PredictionMode left_mode = block.left_available[kPlaneY] + ? block.bp_left->y_mode + : kPredictionModeDc; + const int top_context = kIntraYModeContext[top_mode]; + const int left_context = kIntraYModeContext[left_mode]; + cdf = symbol_decoder_context_ + .intra_frame_y_mode_cdf[top_context][left_context]; + } else { + cdf = symbol_decoder_context_.y_mode_cdf[kSizeGroup[block.size]]; + } + block.bp->y_mode = static_cast( + reader_.ReadSymbol(cdf)); +} + +void Tile::ReadIntraAngleInfo(const Block& block, PlaneType plane_type) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.angle_delta[plane_type] = 0; + const PredictionMode mode = + (plane_type == kPlaneTypeY) ? 
bp.y_mode : bp.uv_mode; + if (IsBlockSmallerThan8x8(block.size) || !IsDirectionalMode(mode)) return; + uint16_t* const cdf = + symbol_decoder_context_.angle_delta_cdf[mode - kPredictionModeVertical]; + prediction_parameters.angle_delta[plane_type] = + reader_.ReadSymbol(cdf); + prediction_parameters.angle_delta[plane_type] -= kMaxAngleDelta; +} + +void Tile::ReadCflAlpha(const Block& block) { + const int signs = reader_.ReadSymbol( + symbol_decoder_context_.cfl_alpha_signs_cdf); + const int8_t* const cfl_lookup = kCflAlphaLookup[signs]; + const auto sign_u = static_cast(cfl_lookup[0]); + const auto sign_v = static_cast(cfl_lookup[1]); + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.cfl_alpha_u = 0; + if (sign_u != kCflSignZero) { + assert(cfl_lookup[2] >= 0); + prediction_parameters.cfl_alpha_u = + reader_.ReadSymbol( + symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[2]]) + + 1; + if (sign_u == kCflSignNegative) prediction_parameters.cfl_alpha_u *= -1; + } + prediction_parameters.cfl_alpha_v = 0; + if (sign_v != kCflSignZero) { + assert(cfl_lookup[3] >= 0); + prediction_parameters.cfl_alpha_v = + reader_.ReadSymbol( + symbol_decoder_context_.cfl_alpha_cdf[cfl_lookup[3]]) + + 1; + if (sign_v == kCflSignNegative) prediction_parameters.cfl_alpha_v *= -1; + } +} + +void Tile::ReadPredictionModeUV(const Block& block) { + BlockParameters& bp = *block.bp; + bool chroma_from_luma_allowed; + if (frame_header_.segmentation.lossless[bp.segment_id]) { + chroma_from_luma_allowed = block.residual_size[kPlaneU] == kBlock4x4; + } else { + chroma_from_luma_allowed = IsBlockDimensionLessThan64(block.size); + } + uint16_t* const cdf = + symbol_decoder_context_ + .uv_mode_cdf[static_cast(chroma_from_luma_allowed)][bp.y_mode]; + if (chroma_from_luma_allowed) { + bp.uv_mode = static_cast( + reader_.ReadSymbol(cdf)); + } else { + bp.uv_mode = static_cast( + reader_.ReadSymbol(cdf)); + } +} + +int Tile::ReadMotionVectorComponent(const Block& block, const int component) { + const int context = + static_cast(block.bp->prediction_parameters->use_intra_block_copy); + const bool sign = reader_.ReadSymbol( + symbol_decoder_context_.mv_sign_cdf[component][context]); + const int mv_class = reader_.ReadSymbol( + symbol_decoder_context_.mv_class_cdf[component][context]); + int magnitude = 1; + int value; + uint16_t* fraction_cdf; + uint16_t* precision_cdf; + if (mv_class == 0) { + value = static_cast(reader_.ReadSymbol( + symbol_decoder_context_.mv_class0_bit_cdf[component][context])); + fraction_cdf = symbol_decoder_context_ + .mv_class0_fraction_cdf[component][context][value]; + precision_cdf = symbol_decoder_context_ + .mv_class0_high_precision_cdf[component][context]; + } else { + assert(mv_class <= kMvBitSymbolCount); + value = 0; + for (int i = 0; i < mv_class; ++i) { + const int bit = static_cast(reader_.ReadSymbol( + symbol_decoder_context_.mv_bit_cdf[component][context][i])); + value |= bit << i; + } + magnitude += 2 << (mv_class + 2); + fraction_cdf = symbol_decoder_context_.mv_fraction_cdf[component][context]; + precision_cdf = + symbol_decoder_context_.mv_high_precision_cdf[component][context]; + } + const int fraction = + (frame_header_.force_integer_mv == 0) + ? reader_.ReadSymbol(fraction_cdf) + : 3; + const int precision = + frame_header_.allow_high_precision_mv + ? static_cast(reader_.ReadSymbol(precision_cdf)) + : 1; + magnitude += (value << 3) | (fraction << 1) | precision; + return sign ? 
-magnitude : magnitude; +} + +void Tile::ReadMotionVector(const Block& block, int index) { + BlockParameters& bp = *block.bp; + const int context = + static_cast(block.bp->prediction_parameters->use_intra_block_copy); + const auto mv_joint = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_.mv_joint_cdf[context])); + if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero || + mv_joint == kMvJointTypeNonZero) { + bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0); + } + if (mv_joint == kMvJointTypeHorizontalNonZeroVerticalZero || + mv_joint == kMvJointTypeNonZero) { + bp.mv.mv[index].mv[1] = ReadMotionVectorComponent(block, 1); + } +} + +void Tile::ReadFilterIntraModeInfo(const Block& block) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.use_filter_intra = false; + if (!sequence_header_.enable_filter_intra || bp.y_mode != kPredictionModeDc || + bp.palette_mode_info.size[kPlaneTypeY] != 0 || + !IsBlockDimensionLessThan64(block.size)) { + return; + } + prediction_parameters.use_filter_intra = reader_.ReadSymbol( + symbol_decoder_context_.use_filter_intra_cdf[block.size]); + if (prediction_parameters.use_filter_intra) { + prediction_parameters.filter_intra_mode = static_cast( + reader_.ReadSymbol( + symbol_decoder_context_.filter_intra_mode_cdf)); + } +} + +bool Tile::DecodeIntraModeInfo(const Block& block) { + BlockParameters& bp = *block.bp; + bp.skip = false; + if (frame_header_.segmentation.segment_id_pre_skip && + !ReadIntraSegmentId(block)) { + return false; + } + bp.skip_mode = false; + ReadSkip(block); + if (!frame_header_.segmentation.segment_id_pre_skip && + !ReadIntraSegmentId(block)) { + return false; + } + ReadCdef(block); + if (read_deltas_) { + ReadQuantizerIndexDelta(block); + ReadLoopFilterDelta(block); + read_deltas_ = false; + } + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.use_intra_block_copy = false; + if (frame_header_.allow_intrabc) { + prediction_parameters.use_intra_block_copy = + reader_.ReadSymbol(symbol_decoder_context_.intra_block_copy_cdf); + } + if (prediction_parameters.use_intra_block_copy) { + bp.is_inter = true; + bp.reference_frame[0] = kReferenceFrameIntra; + bp.reference_frame[1] = kReferenceFrameNone; + bp.y_mode = kPredictionModeDc; + bp.uv_mode = kPredictionModeDc; + prediction_parameters.motion_mode = kMotionModeSimple; + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; + bp.palette_mode_info.size[kPlaneTypeY] = 0; + bp.palette_mode_info.size[kPlaneTypeUV] = 0; + bp.interpolation_filter[0] = kInterpolationFilterBilinear; + bp.interpolation_filter[1] = kInterpolationFilterBilinear; + MvContexts dummy_mode_contexts; + FindMvStack(block, /*is_compound=*/false, &dummy_mode_contexts); + return AssignIntraMv(block); + } + bp.is_inter = false; + return ReadIntraBlockModeInfo(block, /*intra_y_mode=*/true); +} + +int8_t Tile::ComputePredictedSegmentId(const Block& block) const { + // If prev_segment_ids_ is null, treat it as if it pointed to a segmentation + // map containing all 0s. 
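+  // The loop below starts |id| at 7 (the largest segment id) and takes the
+  // minimum over every 4x4 unit that the block covers in the previous frame's
+  // segmentation map, with the scan limits clipped to the frame boundary.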
+ if (prev_segment_ids_ == nullptr) return 0; + + const int x_limit = std::min(frame_header_.columns4x4 - block.column4x4, + static_cast(block.width4x4)); + const int y_limit = std::min(frame_header_.rows4x4 - block.row4x4, + static_cast(block.height4x4)); + int8_t id = 7; + for (int y = 0; y < y_limit; ++y) { + for (int x = 0; x < x_limit; ++x) { + const int8_t prev_segment_id = + prev_segment_ids_->segment_id(block.row4x4 + y, block.column4x4 + x); + id = std::min(id, prev_segment_id); + } + } + return id; +} + +bool Tile::ReadInterSegmentId(const Block& block, bool pre_skip) { + BlockParameters& bp = *block.bp; + if (!frame_header_.segmentation.enabled) { + bp.segment_id = 0; + return true; + } + if (!frame_header_.segmentation.update_map) { + bp.segment_id = ComputePredictedSegmentId(block); + return true; + } + if (pre_skip) { + if (!frame_header_.segmentation.segment_id_pre_skip) { + bp.segment_id = 0; + return true; + } + } else if (bp.skip) { + bp.use_predicted_segment_id = false; + return ReadSegmentId(block); + } + if (frame_header_.segmentation.temporal_update) { + const int context = + (block.left_available[kPlaneY] + ? static_cast(block.bp_left->use_predicted_segment_id) + : 0) + + (block.top_available[kPlaneY] + ? static_cast(block.bp_top->use_predicted_segment_id) + : 0); + bp.use_predicted_segment_id = reader_.ReadSymbol( + symbol_decoder_context_.use_predicted_segment_id_cdf[context]); + if (bp.use_predicted_segment_id) { + bp.segment_id = ComputePredictedSegmentId(block); + return true; + } + } + return ReadSegmentId(block); +} + +void Tile::ReadIsInter(const Block& block) { + BlockParameters& bp = *block.bp; + if (bp.skip_mode) { + bp.is_inter = true; + return; + } + if (frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureReferenceFrame)) { + bp.is_inter = + frame_header_.segmentation + .feature_data[bp.segment_id][kSegmentFeatureReferenceFrame] != + kReferenceFrameIntra; + return; + } + if (frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureGlobalMv)) { + bp.is_inter = true; + return; + } + int context = 0; + if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) { + context = (block.IsTopIntra() && block.IsLeftIntra()) + ? 3 + : static_cast(block.IsTopIntra() || block.IsLeftIntra()); + } else if (block.top_available[kPlaneY] || block.left_available[kPlaneY]) { + context = 2 * static_cast(block.top_available[kPlaneY] + ? block.IsTopIntra() + : block.IsLeftIntra()); + } + bp.is_inter = + reader_.ReadSymbol(symbol_decoder_context_.is_inter_cdf[context]); +} + +bool Tile::ReadIntraBlockModeInfo(const Block& block, bool intra_y_mode) { + BlockParameters& bp = *block.bp; + bp.reference_frame[0] = kReferenceFrameIntra; + bp.reference_frame[1] = kReferenceFrameNone; + ReadPredictionModeY(block, intra_y_mode); + ReadIntraAngleInfo(block, kPlaneTypeY); + if (block.HasChroma()) { + ReadPredictionModeUV(block); + if (bp.uv_mode == kPredictionModeChromaFromLuma) { + ReadCflAlpha(block); + } + ReadIntraAngleInfo(block, kPlaneTypeUV); + } + ReadPaletteModeInfo(block); + ReadFilterIntraModeInfo(block); + return true; +} + +CompoundReferenceType Tile::ReadCompoundReferenceType(const Block& block) { + // compound and inter. + const bool top_comp_inter = block.top_available[kPlaneY] && + !block.IsTopIntra() && !block.IsTopSingle(); + const bool left_comp_inter = block.left_available[kPlaneY] && + !block.IsLeftIntra() && !block.IsLeftSingle(); + // unidirectional compound. 
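+  // (A pair is unidirectional when both references point in the same temporal
+  // direction relative to the current frame; see
+  // IsSameDirectionReferencePair().)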
+ const bool top_uni_comp = + top_comp_inter && IsSameDirectionReferencePair(block.TopReference(0), + block.TopReference(1)); + const bool left_uni_comp = + left_comp_inter && IsSameDirectionReferencePair(block.LeftReference(0), + block.LeftReference(1)); + int context; + if (block.top_available[kPlaneY] && !block.IsTopIntra() && + block.left_available[kPlaneY] && !block.IsLeftIntra()) { + const int same_direction = static_cast(IsSameDirectionReferencePair( + block.TopReference(0), block.LeftReference(0))); + if (!top_comp_inter && !left_comp_inter) { + context = 1 + MultiplyBy2(same_direction); + } else if (!top_comp_inter) { + context = left_uni_comp ? 3 + same_direction : 1; + } else if (!left_comp_inter) { + context = top_uni_comp ? 3 + same_direction : 1; + } else { + if (!top_uni_comp && !left_uni_comp) { + context = 0; + } else if (!top_uni_comp || !left_uni_comp) { + context = 2; + } else { + context = 3 + static_cast( + (block.TopReference(0) == kReferenceFrameBackward) == + (block.LeftReference(0) == kReferenceFrameBackward)); + } + } + } else if (block.top_available[kPlaneY] && block.left_available[kPlaneY]) { + if (top_comp_inter) { + context = 1 + MultiplyBy2(static_cast(top_uni_comp)); + } else if (left_comp_inter) { + context = 1 + MultiplyBy2(static_cast(left_uni_comp)); + } else { + context = 2; + } + } else if (top_comp_inter) { + context = MultiplyBy4(static_cast(top_uni_comp)); + } else if (left_comp_inter) { + context = MultiplyBy4(static_cast(left_uni_comp)); + } else { + context = 2; + } + return static_cast(reader_.ReadSymbol( + symbol_decoder_context_.compound_reference_type_cdf[context])); +} + +template +uint16_t* Tile::GetReferenceCdf( + const Block& block, + CompoundReferenceType type /*= kNumCompoundReferenceTypes*/) { + int context = 0; + if ((type == kCompoundReferenceUnidirectional && index == 0) || + (is_single && index == 1)) { + // uni_comp_ref and single_ref_p1. + context = + GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameGolden, + kReferenceFrameBackward, kReferenceFrameAlternate); + } else if (type == kCompoundReferenceUnidirectional && index == 1) { + // uni_comp_ref_p1. + context = + GetReferenceContext(block, kReferenceFrameLast2, kReferenceFrameLast2, + kReferenceFrameLast3, kReferenceFrameGolden); + } else if ((type == kCompoundReferenceUnidirectional && index == 2) || + (type == kCompoundReferenceBidirectional && index == 2) || + (is_single && index == 5)) { + // uni_comp_ref_p2, comp_ref_p2 and single_ref_p5. + context = + GetReferenceContext(block, kReferenceFrameLast3, kReferenceFrameLast3, + kReferenceFrameGolden, kReferenceFrameGolden); + } else if ((type == kCompoundReferenceBidirectional && index == 0) || + (is_single && index == 3)) { + // comp_ref and single_ref_p3. + context = + GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast2, + kReferenceFrameLast3, kReferenceFrameGolden); + } else if ((type == kCompoundReferenceBidirectional && index == 1) || + (is_single && index == 4)) { + // comp_ref_p1 and single_ref_p4. + context = + GetReferenceContext(block, kReferenceFrameLast, kReferenceFrameLast, + kReferenceFrameLast2, kReferenceFrameLast2); + } else if ((is_single && index == 2) || (is_backward && index == 0)) { + // single_ref_p2 and comp_bwdref. 
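+    // (Counts how often the neighbors use kReferenceFrameBackward or
+    // kReferenceFrameAlternate2 versus kReferenceFrameAlternate.)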
+ context = GetReferenceContext( + block, kReferenceFrameBackward, kReferenceFrameAlternate2, + kReferenceFrameAlternate, kReferenceFrameAlternate); + } else if ((is_single && index == 6) || (is_backward && index == 1)) { + // single_ref_p6 and comp_bwdref_p1. + context = GetReferenceContext( + block, kReferenceFrameBackward, kReferenceFrameBackward, + kReferenceFrameAlternate2, kReferenceFrameAlternate2); + } + if (is_single) { + // The index parameter for single references is offset by one since the spec + // uses 1-based index for these elements. + return symbol_decoder_context_.single_reference_cdf[context][index - 1]; + } + if (is_backward) { + return symbol_decoder_context_ + .compound_backward_reference_cdf[context][index]; + } + return symbol_decoder_context_.compound_reference_cdf[type][context][index]; +} + +void Tile::ReadReferenceFrames(const Block& block) { + BlockParameters& bp = *block.bp; + if (bp.skip_mode) { + bp.reference_frame[0] = frame_header_.skip_mode_frame[0]; + bp.reference_frame[1] = frame_header_.skip_mode_frame[1]; + return; + } + if (frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureReferenceFrame)) { + bp.reference_frame[0] = static_cast( + frame_header_.segmentation + .feature_data[bp.segment_id][kSegmentFeatureReferenceFrame]); + bp.reference_frame[1] = kReferenceFrameNone; + return; + } + if (frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureSkip) || + frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureGlobalMv)) { + bp.reference_frame[0] = kReferenceFrameLast; + bp.reference_frame[1] = kReferenceFrameNone; + return; + } + const bool use_compound_reference = + frame_header_.reference_mode_select && + std::min(block.width4x4, block.height4x4) >= 2 && + reader_.ReadSymbol(symbol_decoder_context_.use_compound_reference_cdf + [GetUseCompoundReferenceContext(block)]); + if (use_compound_reference) { + CompoundReferenceType reference_type = ReadCompoundReferenceType(block); + if (reference_type == kCompoundReferenceUnidirectional) { + // uni_comp_ref. + if (reader_.ReadSymbol( + GetReferenceCdf(block, reference_type))) { + bp.reference_frame[0] = kReferenceFrameBackward; + bp.reference_frame[1] = kReferenceFrameAlternate; + return; + } + // uni_comp_ref_p1. + if (!reader_.ReadSymbol( + GetReferenceCdf(block, reference_type))) { + bp.reference_frame[0] = kReferenceFrameLast; + bp.reference_frame[1] = kReferenceFrameLast2; + return; + } + // uni_comp_ref_p2. + if (reader_.ReadSymbol( + GetReferenceCdf(block, reference_type))) { + bp.reference_frame[0] = kReferenceFrameLast; + bp.reference_frame[1] = kReferenceFrameGolden; + return; + } + bp.reference_frame[0] = kReferenceFrameLast; + bp.reference_frame[1] = kReferenceFrameLast3; + return; + } + assert(reference_type == kCompoundReferenceBidirectional); + // comp_ref. + if (reader_.ReadSymbol( + GetReferenceCdf(block, reference_type))) { + // comp_ref_p2. + bp.reference_frame[0] = + reader_.ReadSymbol( + GetReferenceCdf(block, reference_type)) + ? kReferenceFrameGolden + : kReferenceFrameLast3; + } else { + // comp_ref_p1. + bp.reference_frame[0] = + reader_.ReadSymbol( + GetReferenceCdf(block, reference_type)) + ? kReferenceFrameLast2 + : kReferenceFrameLast; + } + // comp_bwdref. + if (reader_.ReadSymbol(GetReferenceCdf(block))) { + bp.reference_frame[1] = kReferenceFrameAlternate; + } else { + // comp_bwdref_p1. + bp.reference_frame[1] = + reader_.ReadSymbol(GetReferenceCdf(block)) + ? 
kReferenceFrameAlternate2 + : kReferenceFrameBackward; + } + return; + } + assert(!use_compound_reference); + bp.reference_frame[1] = kReferenceFrameNone; + // single_ref_p1. + if (reader_.ReadSymbol(GetReferenceCdf(block))) { + // single_ref_p2. + if (reader_.ReadSymbol(GetReferenceCdf(block))) { + bp.reference_frame[0] = kReferenceFrameAlternate; + return; + } + // single_ref_p6. + bp.reference_frame[0] = + reader_.ReadSymbol(GetReferenceCdf(block)) + ? kReferenceFrameAlternate2 + : kReferenceFrameBackward; + return; + } + // single_ref_p3. + if (reader_.ReadSymbol(GetReferenceCdf(block))) { + // single_ref_p5. + bp.reference_frame[0] = + reader_.ReadSymbol(GetReferenceCdf(block)) + ? kReferenceFrameGolden + : kReferenceFrameLast3; + return; + } + // single_ref_p4. + bp.reference_frame[0] = + reader_.ReadSymbol(GetReferenceCdf(block)) + ? kReferenceFrameLast2 + : kReferenceFrameLast; +} + +void Tile::ReadInterPredictionModeY(const Block& block, + const MvContexts& mode_contexts) { + BlockParameters& bp = *block.bp; + if (bp.skip_mode) { + bp.y_mode = kPredictionModeNearestNearestMv; + return; + } + if (frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureSkip) || + frame_header_.segmentation.FeatureActive(bp.segment_id, + kSegmentFeatureGlobalMv)) { + bp.y_mode = kPredictionModeGlobalMv; + return; + } + if (bp.reference_frame[1] > kReferenceFrameIntra) { + const int idx0 = mode_contexts.reference_mv >> 1; + const int idx1 = + std::min(mode_contexts.new_mv, kCompoundModeNewMvContexts - 1); + const int context = kCompoundModeContextMap[idx0][idx1]; + const int offset = reader_.ReadSymbol( + symbol_decoder_context_.compound_prediction_mode_cdf[context]); + bp.y_mode = + static_cast(kPredictionModeNearestNearestMv + offset); + return; + } + // new_mv. + if (!reader_.ReadSymbol( + symbol_decoder_context_.new_mv_cdf[mode_contexts.new_mv])) { + bp.y_mode = kPredictionModeNewMv; + return; + } + // zero_mv. + if (!reader_.ReadSymbol( + symbol_decoder_context_.zero_mv_cdf[mode_contexts.zero_mv])) { + bp.y_mode = kPredictionModeGlobalMv; + return; + } + // ref_mv. + bp.y_mode = + reader_.ReadSymbol( + symbol_decoder_context_.reference_mv_cdf[mode_contexts.reference_mv]) + ? kPredictionModeNearMv + : kPredictionModeNearestMv; +} + +void Tile::ReadRefMvIndex(const Block& block) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.ref_mv_index = 0; + if (bp.y_mode != kPredictionModeNewMv && + bp.y_mode != kPredictionModeNewNewMv && + !kPredictionModeHasNearMvMask.Contains(bp.y_mode)) { + return; + } + const int start = + static_cast(kPredictionModeHasNearMvMask.Contains(bp.y_mode)); + prediction_parameters.ref_mv_index = start; + for (int i = start; i < start + 2; ++i) { + if (prediction_parameters.ref_mv_count <= i + 1) break; + // drl_mode in the spec. 
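+    // Each bit read below advances one entry deeper into the mv stack; the
+    // first zero bit stops the scan, so |ref_mv_index| ends up at most two
+    // entries past |start|.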
+ const bool ref_mv_index_bit = reader_.ReadSymbol( + symbol_decoder_context_.ref_mv_index_cdf[GetRefMvIndexContext( + prediction_parameters.nearest_mv_count, i)]); + prediction_parameters.ref_mv_index = i + static_cast(ref_mv_index_bit); + if (!ref_mv_index_bit) return; + } +} + +void Tile::ReadInterIntraMode(const Block& block, bool is_compound) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + prediction_parameters.inter_intra_mode = kNumInterIntraModes; + prediction_parameters.is_wedge_inter_intra = false; + if (bp.skip_mode || !sequence_header_.enable_interintra_compound || + is_compound || !kIsInterIntraModeAllowedMask.Contains(block.size)) { + return; + } + // kSizeGroup[block.size] is guaranteed to be non-zero because of the block + // size constraint enforced in the above condition. + assert(kSizeGroup[block.size] - 1 >= 0); + if (!reader_.ReadSymbol( + symbol_decoder_context_ + .is_inter_intra_cdf[kSizeGroup[block.size] - 1])) { + prediction_parameters.inter_intra_mode = kNumInterIntraModes; + return; + } + prediction_parameters.inter_intra_mode = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_ + .inter_intra_mode_cdf[kSizeGroup[block.size] - 1])); + bp.reference_frame[1] = kReferenceFrameIntra; + prediction_parameters.angle_delta[kPlaneTypeY] = 0; + prediction_parameters.angle_delta[kPlaneTypeUV] = 0; + prediction_parameters.use_filter_intra = false; + prediction_parameters.is_wedge_inter_intra = reader_.ReadSymbol( + symbol_decoder_context_.is_wedge_inter_intra_cdf[block.size]); + if (!prediction_parameters.is_wedge_inter_intra) return; + prediction_parameters.wedge_index = + reader_.ReadSymbol( + symbol_decoder_context_.wedge_index_cdf[block.size]); + prediction_parameters.wedge_sign = 0; +} + +void Tile::ReadMotionMode(const Block& block, bool is_compound) { + BlockParameters& bp = *block.bp; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + const auto global_motion_type = + frame_header_.global_motion[bp.reference_frame[0]].type; + if (bp.skip_mode || !frame_header_.is_motion_mode_switchable || + IsBlockDimension4(block.size) || + (frame_header_.force_integer_mv == 0 && + (bp.y_mode == kPredictionModeGlobalMv || + bp.y_mode == kPredictionModeGlobalGlobalMv) && + global_motion_type > kGlobalMotionTransformationTypeTranslation) || + is_compound || bp.reference_frame[1] == kReferenceFrameIntra || + !block.HasOverlappableCandidates()) { + prediction_parameters.motion_mode = kMotionModeSimple; + return; + } + prediction_parameters.num_warp_samples = 0; + int num_samples_scanned = 0; + memset(prediction_parameters.warp_estimate_candidates, 0, + sizeof(prediction_parameters.warp_estimate_candidates)); + FindWarpSamples(block, &prediction_parameters.num_warp_samples, + &num_samples_scanned, + prediction_parameters.warp_estimate_candidates); + if (frame_header_.force_integer_mv != 0 || + prediction_parameters.num_warp_samples == 0 || + !frame_header_.allow_warped_motion || IsScaled(bp.reference_frame[0])) { + prediction_parameters.motion_mode = + reader_.ReadSymbol(symbol_decoder_context_.use_obmc_cdf[block.size]) + ? 
kMotionModeObmc + : kMotionModeSimple; + return; + } + prediction_parameters.motion_mode = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_.motion_mode_cdf[block.size])); +} + +uint16_t* Tile::GetIsExplicitCompoundTypeCdf(const Block& block) { + int context = 0; + if (block.top_available[kPlaneY]) { + if (!block.IsTopSingle()) { + context += static_cast(block.bp_top->is_explicit_compound_type); + } else if (block.TopReference(0) == kReferenceFrameAlternate) { + context += 3; + } + } + if (block.left_available[kPlaneY]) { + if (!block.IsLeftSingle()) { + context += static_cast(block.bp_left->is_explicit_compound_type); + } else if (block.LeftReference(0) == kReferenceFrameAlternate) { + context += 3; + } + } + return symbol_decoder_context_.is_explicit_compound_type_cdf[std::min( + context, kIsExplicitCompoundTypeContexts - 1)]; +} + +uint16_t* Tile::GetIsCompoundTypeAverageCdf(const Block& block) { + const BlockParameters& bp = *block.bp; + const ReferenceInfo& reference_info = *current_frame_.reference_info(); + const int forward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[0]]); + const int backward = + std::abs(reference_info.relative_distance_from[bp.reference_frame[1]]); + int context = (forward == backward) ? 3 : 0; + if (block.top_available[kPlaneY]) { + if (!block.IsTopSingle()) { + context += static_cast(block.bp_top->is_compound_type_average); + } else if (block.TopReference(0) == kReferenceFrameAlternate) { + ++context; + } + } + if (block.left_available[kPlaneY]) { + if (!block.IsLeftSingle()) { + context += static_cast(block.bp_left->is_compound_type_average); + } else if (block.LeftReference(0) == kReferenceFrameAlternate) { + ++context; + } + } + return symbol_decoder_context_.is_compound_type_average_cdf[context]; +} + +void Tile::ReadCompoundType(const Block& block, bool is_compound) { + BlockParameters& bp = *block.bp; + bp.is_explicit_compound_type = false; + bp.is_compound_type_average = true; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + if (bp.skip_mode) { + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; + return; + } + if (is_compound) { + if (sequence_header_.enable_masked_compound) { + bp.is_explicit_compound_type = + reader_.ReadSymbol(GetIsExplicitCompoundTypeCdf(block)); + } + if (bp.is_explicit_compound_type) { + if (kIsWedgeCompoundModeAllowed.Contains(block.size)) { + // Only kCompoundPredictionTypeWedge and + // kCompoundPredictionTypeDiffWeighted are signaled explicitly. + prediction_parameters.compound_prediction_type = + static_cast(reader_.ReadSymbol( + symbol_decoder_context_.compound_type_cdf[block.size])); + } else { + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeDiffWeighted; + } + } else { + if (sequence_header_.enable_jnt_comp) { + bp.is_compound_type_average = + reader_.ReadSymbol(GetIsCompoundTypeAverageCdf(block)); + prediction_parameters.compound_prediction_type = + bp.is_compound_type_average ? 
kCompoundPredictionTypeAverage + : kCompoundPredictionTypeDistance; + } else { + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; + return; + } + } + if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeWedge) { + prediction_parameters.wedge_index = + reader_.ReadSymbol( + symbol_decoder_context_.wedge_index_cdf[block.size]); + prediction_parameters.wedge_sign = static_cast(reader_.ReadBit()); + } else if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeDiffWeighted) { + prediction_parameters.mask_is_inverse = + static_cast(reader_.ReadBit()); + } + return; + } + if (prediction_parameters.inter_intra_mode != kNumInterIntraModes) { + prediction_parameters.compound_prediction_type = + prediction_parameters.is_wedge_inter_intra + ? kCompoundPredictionTypeWedge + : kCompoundPredictionTypeIntra; + return; + } + prediction_parameters.compound_prediction_type = + kCompoundPredictionTypeAverage; +} + +uint16_t* Tile::GetInterpolationFilterCdf(const Block& block, int direction) { + const BlockParameters& bp = *block.bp; + int context = MultiplyBy8(direction) + + MultiplyBy4(static_cast(bp.reference_frame[1] > + kReferenceFrameIntra)); + int top_type = kNumExplicitInterpolationFilters; + if (block.top_available[kPlaneY]) { + if (block.bp_top->reference_frame[0] == bp.reference_frame[0] || + block.bp_top->reference_frame[1] == bp.reference_frame[0]) { + top_type = block.bp_top->interpolation_filter[direction]; + } + } + int left_type = kNumExplicitInterpolationFilters; + if (block.left_available[kPlaneY]) { + if (block.bp_left->reference_frame[0] == bp.reference_frame[0] || + block.bp_left->reference_frame[1] == bp.reference_frame[0]) { + left_type = block.bp_left->interpolation_filter[direction]; + } + } + if (left_type == top_type) { + context += left_type; + } else if (left_type == kNumExplicitInterpolationFilters) { + context += top_type; + } else if (top_type == kNumExplicitInterpolationFilters) { + context += left_type; + } else { + context += kNumExplicitInterpolationFilters; + } + return symbol_decoder_context_.interpolation_filter_cdf[context]; +} + +void Tile::ReadInterpolationFilter(const Block& block) { + BlockParameters& bp = *block.bp; + if (frame_header_.interpolation_filter != kInterpolationFilterSwitchable) { + static_assert( + sizeof(bp.interpolation_filter) / sizeof(bp.interpolation_filter[0]) == + 2, + "Interpolation filter array size is not 2"); + for (auto& interpolation_filter : bp.interpolation_filter) { + interpolation_filter = frame_header_.interpolation_filter; + } + return; + } + bool interpolation_filter_present = true; + if (bp.skip_mode || + block.bp->prediction_parameters->motion_mode == kMotionModeLocalWarp) { + interpolation_filter_present = false; + } else if (!IsBlockDimension4(block.size) && + bp.y_mode == kPredictionModeGlobalMv) { + interpolation_filter_present = + frame_header_.global_motion[bp.reference_frame[0]].type == + kGlobalMotionTransformationTypeTranslation; + } else if (!IsBlockDimension4(block.size) && + bp.y_mode == kPredictionModeGlobalGlobalMv) { + interpolation_filter_present = + frame_header_.global_motion[bp.reference_frame[0]].type == + kGlobalMotionTransformationTypeTranslation || + frame_header_.global_motion[bp.reference_frame[1]].type == + kGlobalMotionTransformationTypeTranslation; + } + for (int i = 0; i < (sequence_header_.enable_dual_filter ? 2 : 1); ++i) { + bp.interpolation_filter[i] = + interpolation_filter_present + ? 
static_cast( + reader_.ReadSymbol( + GetInterpolationFilterCdf(block, i))) + : kInterpolationFilterEightTap; + } + if (!sequence_header_.enable_dual_filter) { + bp.interpolation_filter[1] = bp.interpolation_filter[0]; + } +} + +bool Tile::ReadInterBlockModeInfo(const Block& block) { + BlockParameters& bp = *block.bp; + bp.palette_mode_info.size[kPlaneTypeY] = 0; + bp.palette_mode_info.size[kPlaneTypeUV] = 0; + ReadReferenceFrames(block); + const bool is_compound = bp.reference_frame[1] > kReferenceFrameIntra; + MvContexts mode_contexts; + FindMvStack(block, is_compound, &mode_contexts); + ReadInterPredictionModeY(block, mode_contexts); + ReadRefMvIndex(block); + if (!AssignInterMv(block, is_compound)) return false; + ReadInterIntraMode(block, is_compound); + ReadMotionMode(block, is_compound); + ReadCompoundType(block, is_compound); + ReadInterpolationFilter(block); + return true; +} + +bool Tile::DecodeInterModeInfo(const Block& block) { + BlockParameters& bp = *block.bp; + block.bp->prediction_parameters->use_intra_block_copy = false; + bp.skip = false; + if (!ReadInterSegmentId(block, /*pre_skip=*/true)) return false; + ReadSkipMode(block); + if (bp.skip_mode) { + bp.skip = true; + } else { + ReadSkip(block); + } + if (!frame_header_.segmentation.segment_id_pre_skip && + !ReadInterSegmentId(block, /*pre_skip=*/false)) { + return false; + } + ReadCdef(block); + if (read_deltas_) { + ReadQuantizerIndexDelta(block); + ReadLoopFilterDelta(block); + read_deltas_ = false; + } + ReadIsInter(block); + return bp.is_inter ? ReadInterBlockModeInfo(block) + : ReadIntraBlockModeInfo(block, /*intra_y_mode=*/false); +} + +bool Tile::DecodeModeInfo(const Block& block) { + return IsIntraFrame(frame_header_.frame_type) ? DecodeIntraModeInfo(block) + : DecodeInterModeInfo(block); +} + +} // namespace libgav1 diff --git a/src/tile/bitstream/palette.cc b/src/tile/bitstream/palette.cc new file mode 100644 index 0000000..674d210 --- /dev/null +++ b/src/tile/bitstream/palette.cc @@ -0,0 +1,319 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "src/obu_parser.h" +#include "src/symbol_decoder_context.h" +#include "src/tile.h" +#include "src/utils/bit_mask_set.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/entropy_decoder.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" + +namespace libgav1 { + +int Tile::GetPaletteCache(const Block& block, PlaneType plane_type, + uint16_t* const cache) { + const int top_size = + (block.top_available[kPlaneY] && Mod64(MultiplyBy4(block.row4x4)) != 0) + ? block.bp_top->palette_mode_info.size[plane_type] + : 0; + const int left_size = block.left_available[kPlaneY] + ? block.bp_left->palette_mode_info.size[plane_type] + : 0; + if (left_size == 0 && top_size == 0) return 0; + // Merge the left and top colors in sorted order and store them in |cache|. 
+ uint16_t dummy[1]; + const uint16_t* top = (top_size > 0) + ? block.bp_top->palette_mode_info.color[plane_type] + : dummy; + const uint16_t* left = + (left_size > 0) ? block.bp_left->palette_mode_info.color[plane_type] + : dummy; + std::merge(top, top + top_size, left, left + left_size, cache); + // Deduplicate the entries in |cache| and return the number of unique + // entries. + return static_cast( + std::distance(cache, std::unique(cache, cache + left_size + top_size))); +} + +void Tile::ReadPaletteColors(const Block& block, Plane plane) { + const PlaneType plane_type = GetPlaneType(plane); + uint16_t cache[2 * kMaxPaletteSize]; + const int n = GetPaletteCache(block, plane_type, cache); + BlockParameters& bp = *block.bp; + const uint8_t palette_size = bp.palette_mode_info.size[plane_type]; + uint16_t* const palette_color = bp.palette_mode_info.color[plane]; + const int8_t bitdepth = sequence_header_.color_config.bitdepth; + int index = 0; + for (int i = 0; i < n && index < palette_size; ++i) { + if (reader_.ReadBit() != 0) { // use_palette_color_cache. + palette_color[index++] = cache[i]; + } + } + const int merge_pivot = index; + if (index < palette_size) { + palette_color[index++] = + static_cast(reader_.ReadLiteral(bitdepth)); + } + const int max_value = (1 << bitdepth) - 1; + if (index < palette_size) { + int bits = bitdepth - 3 + static_cast(reader_.ReadLiteral(2)); + do { + const int delta = static_cast(reader_.ReadLiteral(bits)) + + (plane_type == kPlaneTypeY ? 1 : 0); + palette_color[index] = + std::min(palette_color[index - 1] + delta, max_value); + if (palette_color[index] + (plane_type == kPlaneTypeY ? 1 : 0) >= + max_value) { + // Once the color exceeds max_value, all others can be set to max_value + // (since they are computed as a delta on top of the current color and + // then clipped). + Memset(&palette_color[index + 1], max_value, palette_size - index - 1); + break; + } + const int range = (1 << bitdepth) - palette_color[index] - + (plane_type == kPlaneTypeY ? 1 : 0); + bits = std::min(bits, CeilLog2(range)); + } while (++index < palette_size); + } + // Palette colors are generated using two ascending arrays. So sorting them is + // simply a matter of merging the two sorted portions of the array. + std::inplace_merge(palette_color, palette_color + merge_pivot, + palette_color + palette_size); + if (plane_type == kPlaneTypeUV) { + uint16_t* const palette_color_v = bp.palette_mode_info.color[kPlaneV]; + if (reader_.ReadBit() != 0) { // delta_encode_palette_colors_v. + const int bits = bitdepth - 4 + static_cast(reader_.ReadLiteral(2)); + palette_color_v[0] = reader_.ReadLiteral(bitdepth); + for (int i = 1; i < palette_size; ++i) { + int delta = static_cast(reader_.ReadLiteral(bits)); + if (delta != 0 && reader_.ReadBit() != 0) delta = -delta; + // This line is equivalent to the following lines in the spec: + // val = palette_colors_v[ idx - 1 ] + palette_delta_v + // if ( val < 0 ) val += maxVal + // if ( val >= maxVal ) val -= maxVal + // palette_colors_v[ idx ] = Clip1( val ) + // + // The difference is that in the code, max_value is (1 << bitdepth) - 1. + // So "& max_value" has the desired effect of computing both the "if" + // conditions and the Clip. 
+
+void Tile::ReadPaletteModeInfo(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
+      !frame_header_.allow_screen_content_tools) {
+    bp.palette_mode_info.size[kPlaneTypeY] = 0;
+    bp.palette_mode_info.size[kPlaneTypeUV] = 0;
+    return;
+  }
+  const int block_size_context =
+      k4x4WidthLog2[block.size] + k4x4HeightLog2[block.size] - 2;
+  if (bp.y_mode == kPredictionModeDc) {
+    const int context =
+        static_cast<int>(block.top_available[kPlaneY] &&
+                         block.bp_top->palette_mode_info.size[kPlaneTypeY] >
+                             0) +
+        static_cast<int>(block.left_available[kPlaneY] &&
+                         block.bp_left->palette_mode_info.size[kPlaneTypeY] >
+                             0);
+    const bool has_palette_y = reader_.ReadSymbol(
+        symbol_decoder_context_.has_palette_y_cdf[block_size_context][context]);
+    if (has_palette_y) {
+      bp.palette_mode_info.size[kPlaneTypeY] =
+          kMinPaletteSize +
+          reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+              symbol_decoder_context_.palette_y_size_cdf[block_size_context]);
+      ReadPaletteColors(block, kPlaneY);
+    }
+  }
+  if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) {
+    const int context =
+        static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0);
+    const bool has_palette_uv =
+        reader_.ReadSymbol(symbol_decoder_context_.has_palette_uv_cdf[context]);
+    if (has_palette_uv) {
+      bp.palette_mode_info.size[kPlaneTypeUV] =
+          kMinPaletteSize +
+          reader_.ReadSymbol<kPaletteSizeSymbolCount>(
+              symbol_decoder_context_.palette_uv_size_cdf[block_size_context]);
+      ReadPaletteColors(block, kPlaneU);
+    }
+  }
+}
+
+void Tile::PopulatePaletteColorContexts(
+    const Block& block, PlaneType plane_type, int i, int start, int end,
+    uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize],
+    uint8_t color_context[kMaxPaletteSquare]) {
+  const PredictionParameters& prediction_parameters =
+      *block.bp->prediction_parameters;
+  for (int column = start, counter = 0; column >= end; --column, ++counter) {
+    const int row = i - column;
+    assert(row > 0 || column > 0);
+    const uint8_t top =
+        (row > 0)
+            ? prediction_parameters.color_index_map[plane_type][row - 1][column]
+            : 0;
+    const uint8_t left =
+        (column > 0)
+            ?
prediction_parameters.color_index_map[plane_type][row][column - 1] + : 0; + uint8_t index_mask; + static_assert(kMaxPaletteSize <= 8, ""); + int index; + if (column <= 0) { + color_context[counter] = 0; + color_order[counter][0] = top; + index_mask = 1 << top; + index = 1; + } else if (row <= 0) { + color_context[counter] = 0; + color_order[counter][0] = left; + index_mask = 1 << left; + index = 1; + } else { + const uint8_t top_left = + prediction_parameters + .color_index_map[plane_type][row - 1][column - 1]; + index_mask = (1 << top) | (1 << left) | (1 << top_left); + if (top == left && top == top_left) { + color_context[counter] = 4; + color_order[counter][0] = top; + index = 1; + } else if (top == left) { + color_context[counter] = 3; + color_order[counter][0] = top; + color_order[counter][1] = top_left; + index = 2; + } else if (top == top_left) { + color_context[counter] = 2; + color_order[counter][0] = top_left; + color_order[counter][1] = left; + index = 2; + } else if (left == top_left) { + color_context[counter] = 2; + color_order[counter][0] = top_left; + color_order[counter][1] = top; + index = 2; + } else { + color_context[counter] = 1; + color_order[counter][0] = std::min(top, left); + color_order[counter][1] = std::max(top, left); + color_order[counter][2] = top_left; + index = 3; + } + } + // Even though only the first |palette_size| entries of this array are ever + // used, it is faster to populate all 8 because of the vectorization of the + // constant sized loop. + for (uint8_t j = 0; j < kMaxPaletteSize; ++j) { + if (BitMaskSet::MaskContainsValue(index_mask, j)) continue; + color_order[counter][index++] = j; + } + } +} + +bool Tile::ReadPaletteTokens(const Block& block) { + const PaletteModeInfo& palette_mode_info = block.bp->palette_mode_info; + PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + for (int plane_type = kPlaneTypeY; + plane_type < (block.HasChroma() ? 
kNumPlaneTypes : kPlaneTypeUV); + ++plane_type) { + const int palette_size = palette_mode_info.size[plane_type]; + if (palette_size == 0) continue; + int block_height = block.height; + int block_width = block.width; + int screen_height = std::min( + block_height, MultiplyBy4(frame_header_.rows4x4 - block.row4x4)); + int screen_width = std::min( + block_width, MultiplyBy4(frame_header_.columns4x4 - block.column4x4)); + if (plane_type == kPlaneTypeUV) { + block_height >>= sequence_header_.color_config.subsampling_y; + block_width >>= sequence_header_.color_config.subsampling_x; + screen_height >>= sequence_header_.color_config.subsampling_y; + screen_width >>= sequence_header_.color_config.subsampling_x; + if (block_height < 4) { + block_height += 2; + screen_height += 2; + } + if (block_width < 4) { + block_width += 2; + screen_width += 2; + } + } + if (!prediction_parameters.color_index_map[plane_type].Reset( + block_height, block_width, /*zero_initialize=*/false)) { + return false; + } + int first_value = 0; + reader_.DecodeUniform(palette_size, &first_value); + prediction_parameters.color_index_map[plane_type][0][0] = first_value; + for (int i = 1; i < screen_height + screen_width - 1; ++i) { + const int start = std::min(i, screen_width - 1); + const int end = std::max(0, i - screen_height + 1); + uint8_t color_order[kMaxPaletteSquare][kMaxPaletteSize]; + uint8_t color_context[kMaxPaletteSquare]; + PopulatePaletteColorContexts(block, static_cast(plane_type), i, + start, end, color_order, color_context); + for (int j = start, counter = 0; j >= end; --j, ++counter) { + uint16_t* const cdf = + symbol_decoder_context_ + .palette_color_index_cdf[plane_type] + [palette_size - kMinPaletteSize] + [color_context[counter]]; + const int color_order_index = reader_.ReadSymbol(cdf, palette_size); + prediction_parameters.color_index_map[plane_type][i - j][j] = + color_order[counter][color_order_index]; + } + } + if (screen_width < block_width) { + for (int i = 0; i < screen_height; ++i) { + memset( + &prediction_parameters.color_index_map[plane_type][i][screen_width], + prediction_parameters + .color_index_map[plane_type][i][screen_width - 1], + block_width - screen_width); + } + } + for (int i = screen_height; i < block_height; ++i) { + memcpy( + prediction_parameters.color_index_map[plane_type][i], + prediction_parameters.color_index_map[plane_type][screen_height - 1], + block_width); + } + } + return true; +} + +} // namespace libgav1 diff --git a/src/tile/bitstream/partition.cc b/src/tile/bitstream/partition.cc new file mode 100644 index 0000000..f3dbbb0 --- /dev/null +++ b/src/tile/bitstream/partition.cc @@ -0,0 +1,148 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
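For reference, ReadPaletteTokens above walks the color index map in anti-diagonal order, which guarantees that the left, top, and top-left neighbors of every position are already decoded when PopulatePaletteColorContexts needs them. A standalone sketch of the traversal, with made-up dimensions:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int height = 2, width = 3;
  std::printf("(0,0)");  // the first index is coded separately (DecodeUniform)
  for (int i = 1; i < height + width - 1; ++i) {
    const int start = std::min(i, width - 1);
    const int end = std::max(0, i - height + 1);
    for (int j = start; j >= end; --j) {
      std::printf(" (%d,%d)", i - j, j);  // (row, column) = (i - j, j)
    }
  }
  std::printf("\n");  // (0,0) (0,1) (1,0) (0,2) (1,1) (1,2)
  return 0;
}
```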
+
+#include <cassert>
+#include <cstdint>
+
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+uint16_t PartitionCdfGatherHorizontalAlike(const uint16_t* const partition_cdf,
+                                           BlockSize block_size) {
+  // The spec computes the cdf value using the following formula (not writing
+  // partition_cdf[] and using short forms for partition names for clarity):
+  //   cdf = None - H + V - S + S - HTS + HTS - HBS + HBS - VLS;
+  //   if (block_size != 128x128) {
+  //     cdf += VRS - H4;
+  //   }
+  // After canceling out the repeated terms with opposite signs, we have:
+  //   cdf = None - H + V - VLS;
+  //   if (block_size != 128x128) {
+  //     cdf += VRS - H4;
+  //   }
+  uint16_t cdf = partition_cdf[kPartitionNone] -
+                 partition_cdf[kPartitionHorizontal] +
+                 partition_cdf[kPartitionVertical] -
+                 partition_cdf[kPartitionVerticalWithLeftSplit];
+  if (block_size != kBlock128x128) {
+    cdf += partition_cdf[kPartitionVerticalWithRightSplit] -
+           partition_cdf[kPartitionHorizontal4];
+  }
+  return cdf;
+}
+
+uint16_t PartitionCdfGatherVerticalAlike(const uint16_t* const partition_cdf,
+                                         BlockSize block_size) {
+  // The spec computes the cdf value using the following formula (not writing
+  // partition_cdf[] and using short forms for partition names for clarity):
+  //   cdf = H - V + V - S + HBS - VLS + VLS - VRS + S - HTS;
+  //   if (block_size != 128x128) {
+  //     cdf += H4 - V4;
+  //   }
+  // V4 is always zero. So, after canceling out the repeated terms with
+  // opposite signs, we have:
+  //   cdf = H + HBS - VRS - HTS;
+  //   if (block_size != 128x128) {
+  //     cdf += H4;
+  //   }
+  // VRS is zero for 128x128 blocks. So, further simplifying we have:
+  //   cdf = H + HBS - HTS;
+  //   if (block_size != 128x128) {
+  //     cdf += H4 - VRS;
+  //   }
+  uint16_t cdf = partition_cdf[kPartitionHorizontal] +
+                 partition_cdf[kPartitionHorizontalWithBottomSplit] -
+                 partition_cdf[kPartitionHorizontalWithTopSplit];
+  if (block_size != kBlock128x128) {
+    cdf += partition_cdf[kPartitionHorizontal4] -
+           partition_cdf[kPartitionVerticalWithRightSplit];
+  }
+  return cdf;
+}
+
+}  // namespace
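The cancellation argument in the comments above is a telescoping sum: each spec term is the difference of two adjacent CDF entries (one symbol's probability mass), so the interior entries cancel pairwise. A numeric check with invented, monotonically decreasing CDF values:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical CDF-style values; only their differences matter here.
  const uint16_t n = 900, h = 800, v = 650, s = 500, hts = 400, hbs = 300,
                 vls = 200, vrs = 120, h4 = 40;
  // Spec form: a sum of per-symbol probabilities.
  const int spec = (n - h) + (v - s) + (s - hts) + (hts - hbs) + (hbs - vls) +
                   (vrs - h4);
  // Simplified form used by the decoder.
  const int simplified = n - h + v - vls + (vrs - h4);
  assert(spec == simplified);  // the interior terms cancel pairwise
  return 0;
}
```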
+
+uint16_t* Tile::GetPartitionCdf(int row4x4, int column4x4,
+                                BlockSize block_size) {
+  const int block_size_log2 = k4x4WidthLog2[block_size];
+  int top = 0;
+  if (IsTopInside(row4x4)) {
+    top = static_cast<int>(
+        k4x4WidthLog2[block_parameters_holder_.Find(row4x4 - 1, column4x4)
+                          ->size] < block_size_log2);
+  }
+  int left = 0;
+  if (IsLeftInside(column4x4)) {
+    left = static_cast<int>(
+        k4x4HeightLog2[block_parameters_holder_.Find(row4x4, column4x4 - 1)
+                           ->size] < block_size_log2);
+  }
+  const int context = left * 2 + top;
+  return symbol_decoder_context_.partition_cdf[block_size_log2 - 1][context];
+}
+
+bool Tile::ReadPartition(int row4x4, int column4x4, BlockSize block_size,
+                         bool has_rows, bool has_columns,
+                         Partition* const partition) {
+  if (IsBlockSmallerThan8x8(block_size)) {
+    *partition = kPartitionNone;
+    return true;
+  }
+  if (!has_rows && !has_columns) {
+    *partition = kPartitionSplit;
+    return true;
+  }
+  uint16_t* const partition_cdf =
+      GetPartitionCdf(row4x4, column4x4, block_size);
+  if (partition_cdf == nullptr) {
+    return false;
+  }
+  if (has_rows && has_columns) {
+    const int bsize_log2 = k4x4WidthLog2[block_size];
+    // The partition block size should be 8x8 or above.
+    assert(bsize_log2 > 0);
+    if (bsize_log2 == 1) {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kPartitionSplit + 1>(partition_cdf));
+    } else if (bsize_log2 == 5) {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kPartitionVerticalWithRightSplit + 1>(
+              partition_cdf));
+    } else {
+      *partition = static_cast<Partition>(
+          reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
+    }
+  } else if (has_columns) {
+    const uint16_t cdf =
+        PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
+    *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+                                                         : kPartitionHorizontal;
+  } else {
+    const uint16_t cdf =
+        PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
+    *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
+                                                         : kPartitionVertical;
+  }
+  return true;
+}
+
+}  // namespace libgav1
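Note how ReadPartition degrades at frame edges: if neither direction fits, nothing is coded and the split is forced; if only one direction fits, a single binary symbol, whose probability is gathered from the full partition CDF, decides between splitting further and the one legal non-split partition. A toy restatement of that control flow (the both-directions case, which codes a full partition symbol, is deliberately omitted):

```cpp
#include <cstdio>

enum Partition { kNone, kHorizontal, kVertical, kSplit };

// |coded_bit| stands in for the single arithmetic-coded decision.
Partition BoundaryPartition(bool has_rows, bool has_columns, bool coded_bit) {
  if (!has_rows && !has_columns) return kSplit;  // nothing is coded at all
  if (has_columns) return coded_bit ? kSplit : kHorizontal;  // bottom edge
  return coded_bit ? kSplit : kVertical;                     // right edge
}

int main() {
  std::printf("%d\n", BoundaryPartition(false, true, false));  // kHorizontal
  return 0;
}
```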
diff --git a/src/tile/bitstream/transform_size.cc b/src/tile/bitstream/transform_size.cc
new file mode 100644
index 0000000..b79851d
--- /dev/null
+++ b/src/tile/bitstream/transform_size.cc
@@ -0,0 +1,222 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/obu_parser.h"
+#include "src/symbol_decoder_context.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/entropy_decoder.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+namespace {
+
+constexpr uint8_t kMaxVariableTransformTreeDepth = 2;
+// Max_Tx_Depth array from section 5.11.5 in the spec with the following
+// modification: if an element is not zero, one is subtracted from it. That is
+// the only way in which this array is used.
+constexpr int kTxDepthCdfIndex[kMaxBlockSizes] = {
+    0, 0, 1, 0, 0, 1, 2, 1, 1, 1, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3};
+
+constexpr TransformSize kMaxTransformSizeRectangle[kMaxBlockSizes] = {
+    kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
+    kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
+    kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x64,
+    kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x64, kTransformSize64x16, kTransformSize64x32,
+    kTransformSize64x64, kTransformSize64x64, kTransformSize64x64,
+    kTransformSize64x64};
+
+TransformSize GetSquareTransformSize(uint8_t pixels) {
+  switch (pixels) {
+    case 128:
+    case 64:
+      return kTransformSize64x64;
+    case 32:
+      return kTransformSize32x32;
+    case 16:
+      return kTransformSize16x16;
+    case 8:
+      return kTransformSize8x8;
+    default:
+      return kTransformSize4x4;
+  }
+}
+
+}  // namespace
+
+int Tile::GetTopTransformWidth(const Block& block, int row4x4, int column4x4,
+                               bool ignore_skip) {
+  if (row4x4 == block.row4x4) {
+    if (!block.top_available[kPlaneY]) return 64;
+    const BlockParameters& bp_top =
+        *block_parameters_holder_.Find(row4x4 - 1, column4x4);
+    if ((ignore_skip || bp_top.skip) && bp_top.is_inter) {
+      return kBlockWidthPixels[bp_top.size];
+    }
+  }
+  return kTransformWidth[inter_transform_sizes_[row4x4 - 1][column4x4]];
+}
+
+int Tile::GetLeftTransformHeight(const Block& block, int row4x4, int column4x4,
+                                 bool ignore_skip) {
+  if (column4x4 == block.column4x4) {
+    if (!block.left_available[kPlaneY]) return 64;
+    const BlockParameters& bp_left =
+        *block_parameters_holder_.Find(row4x4, column4x4 - 1);
+    if ((ignore_skip || bp_left.skip) && bp_left.is_inter) {
+      return kBlockHeightPixels[bp_left.size];
+    }
+  }
+  return kTransformHeight[inter_transform_sizes_[row4x4][column4x4 - 1]];
+}
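kMaxTransformSizeRectangle and the coded tx_depth (read below) interact simply: depth 0 keeps the block's largest rectangular transform, and each extra level applies one kSplitTransformSize step. The halving rule in this sketch is only illustrative of that table-driven mapping; the real successor sizes live in kSplitTransformSize:

```cpp
#include <cstdio>

struct TxSize { int width, height; };

// Illustrative stand-in for one kSplitTransformSize step: a square transform
// splits into four quadrants, a rectangle halves its longer side.
TxSize Split(TxSize tx) {
  if (tx.width == tx.height) { tx.width /= 2; tx.height /= 2; }
  else if (tx.width > tx.height) tx.width /= 2;
  else tx.height /= 2;
  return tx;
}

int main() {
  TxSize tx = {64, 32};  // hypothetical max rectangular transform
  for (int depth = 0; depth < 2; ++depth) tx = Split(tx);  // tx_depth == 2
  std::printf("%dx%d\n", tx.width, tx.height);  // 64x32 -> 32x32 -> 16x16
  return 0;
}
```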
+
+TransformSize Tile::ReadFixedTransformSize(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.segmentation.lossless[bp.segment_id]) {
+    return kTransformSize4x4;
+  }
+  const TransformSize max_rect_tx_size = kMaxTransformSizeRectangle[block.size];
+  const bool allow_select = !bp.skip || !bp.is_inter;
+  if (block.size == kBlock4x4 || !allow_select ||
+      frame_header_.tx_mode != kTxModeSelect) {
+    return max_rect_tx_size;
+  }
+  const int max_tx_width = kTransformWidth[max_rect_tx_size];
+  const int max_tx_height = kTransformHeight[max_rect_tx_size];
+  const int top_width =
+      block.top_available[kPlaneY]
+          ? GetTopTransformWidth(block, block.row4x4, block.column4x4, true)
+          : 0;
+  const int left_height =
+      block.left_available[kPlaneY]
+          ? GetLeftTransformHeight(block, block.row4x4, block.column4x4, true)
+          : 0;
+  const auto context = static_cast<int>(top_width >= max_tx_width) +
+                       static_cast<int>(left_height >= max_tx_height);
+  const int cdf_index = kTxDepthCdfIndex[block.size];
+  uint16_t* const cdf =
+      symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+  const int tx_depth = (cdf_index == 0)
+                           ? static_cast<int>(reader_.ReadSymbol(cdf))
+                           : reader_.ReadSymbol<3>(cdf);
+  assert(tx_depth < 3);
+  TransformSize tx_size = max_rect_tx_size;
+  if (tx_depth == 0) return tx_size;
+  tx_size = kSplitTransformSize[tx_size];
+  if (tx_depth == 1) return tx_size;
+  return kSplitTransformSize[tx_size];
+}
+
+void Tile::ReadVariableTransformTree(const Block& block, int row4x4,
+                                     int column4x4, TransformSize tx_size) {
+  const uint8_t pixels = std::max(block.width, block.height);
+  const TransformSize max_tx_size = GetSquareTransformSize(pixels);
+  const int context_delta = (kNumSquareTransformSizes - 1 -
+                             TransformSizeToSquareTransformIndex(max_tx_size)) *
+                            6;
+
+  // Branching factor is 4 and maximum depth is 2. So the maximum stack size
+  // necessary is (4 - 1) + 4 = 7.
+  Stack<TransformTreeNode, 7> stack;
+  stack.Push(TransformTreeNode(column4x4, row4x4, tx_size, 0));
+
+  do {
+    TransformTreeNode node = stack.Pop();
+    const int tx_width4x4 = kTransformWidth4x4[node.tx_size];
+    const int tx_height4x4 = kTransformHeight4x4[node.tx_size];
+    if (node.tx_size != kTransformSize4x4 &&
+        node.depth != kMaxVariableTransformTreeDepth) {
+      const auto top =
+          static_cast<int>(GetTopTransformWidth(block, node.y, node.x, false) <
+                           kTransformWidth[node.tx_size]);
+      const auto left = static_cast<int>(
+          GetLeftTransformHeight(block, node.y, node.x, false) <
+          kTransformHeight[node.tx_size]);
+      const int context =
+          static_cast<int>(max_tx_size > kTransformSize8x8 &&
+                           kTransformSizeSquareMax[node.tx_size] !=
+                               max_tx_size) *
+              3 +
+          context_delta + top + left;
+      // tx_split.
+      if (reader_.ReadSymbol(symbol_decoder_context_.tx_split_cdf[context])) {
+        const TransformSize sub_tx_size = kSplitTransformSize[node.tx_size];
+        const int step_width4x4 = kTransformWidth4x4[sub_tx_size];
+        const int step_height4x4 = kTransformHeight4x4[sub_tx_size];
+        // The loops have to run in reverse order because we use a stack for
+        // DFS.
+        for (int i = tx_height4x4 - step_height4x4; i >= 0;
+             i -= step_height4x4) {
+          for (int j = tx_width4x4 - step_width4x4; j >= 0;
+               j -= step_width4x4) {
+            if (node.y + i >= frame_header_.rows4x4 ||
+                node.x + j >= frame_header_.columns4x4) {
+              continue;
+            }
+            stack.Push(TransformTreeNode(node.x + j, node.y + i, sub_tx_size,
+                                         node.depth + 1));
+          }
+        }
+        continue;
+      }
+    }
+    // tx_split is false.
+    for (int i = 0; i < tx_height4x4; ++i) {
+      static_assert(sizeof(TransformSize) == 1, "");
+      memset(&inter_transform_sizes_[node.y + i][node.x], node.tx_size,
+             tx_width4x4);
+    }
+    block_parameters_holder_.Find(node.y, node.x)->transform_size =
+        node.tx_size;
+  } while (!stack.Empty());
+}
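The reverse iteration order in ReadVariableTransformTree is the standard trick for depth-first traversal with an explicit LIFO stack: children pushed last-to-first pop in natural raster order. A minimal illustration with std::stack:

```cpp
#include <cstdio>
#include <stack>

int main() {
  std::stack<int> stack;
  // Push the four "sub-blocks" in reverse: 3, 2, 1, 0.
  for (int child = 3; child >= 0; --child) stack.push(child);
  while (!stack.empty()) {
    std::printf("%d ", stack.top());  // pops 0 1 2 3 -- natural order
    stack.pop();
  }
  std::printf("\n");
  return 0;
}
```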
+
+void Tile::DecodeTransformSize(const Block& block) {
+  BlockParameters& bp = *block.bp;
+  if (frame_header_.tx_mode == kTxModeSelect && block.size > kBlock4x4 &&
+      bp.is_inter && !bp.skip &&
+      !frame_header_.segmentation.lossless[bp.segment_id]) {
+    const TransformSize max_tx_size = kMaxTransformSizeRectangle[block.size];
+    const int tx_width4x4 = kTransformWidth4x4[max_tx_size];
+    const int tx_height4x4 = kTransformHeight4x4[max_tx_size];
+    for (int row = block.row4x4; row < block.row4x4 + block.height4x4;
+         row += tx_height4x4) {
+      for (int column = block.column4x4;
+           column < block.column4x4 + block.width4x4; column += tx_width4x4) {
+        ReadVariableTransformTree(block, row, column, max_tx_size);
+      }
+    }
+  } else {
+    bp.transform_size = ReadFixedTransformSize(block);
+    for (int row = block.row4x4; row < block.row4x4 + block.height4x4; ++row) {
+      static_assert(sizeof(TransformSize) == 1, "");
+      memset(&inter_transform_sizes_[row][block.column4x4], bp.transform_size,
+             block.width4x4);
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/tile/prediction.cc b/src/tile/prediction.cc
new file mode 100644
index 0000000..c5560a6
--- /dev/null
+++ b/src/tile/prediction.cc
@@ -0,0 +1,1361 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/buffer_pool.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/motion_vector.h"
+#include "src/obu_parser.h"
+#include "src/prediction_mask.h"
+#include "src/tile.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/block_parameters_holder.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+#include "src/warp_prediction.h"
+#include "src/yuv_buffer.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/inter_intra_masks.inc"
+
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
+constexpr int kAngleStep = 3;
+constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
+    0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
+
+// The following modes need both the left_column and top_row for intra
+// prediction. For directional modes, the left/top requirement is inferred
+// based on the prediction angle. For Dc modes, the left/top requirement is
+// inferred based on whether or not left/top is available.
+constexpr BitMaskSet kNeedsLeftAndTop(kPredictionModeSmooth, + kPredictionModeSmoothHorizontal, + kPredictionModeSmoothVertical, + kPredictionModePaeth); + +int16_t GetDirectionalIntraPredictorDerivative(const int angle) { + assert(angle >= 3); + assert(angle <= 87); + return kDirectionalIntraPredictorDerivative[DivideBy2(angle) - 1]; +} + +// Maps the block_size to an index as follows: +// kBlock8x8 => 0. +// kBlock8x16 => 1. +// kBlock8x32 => 2. +// kBlock16x8 => 3. +// kBlock16x16 => 4. +// kBlock16x32 => 5. +// kBlock32x8 => 6. +// kBlock32x16 => 7. +// kBlock32x32 => 8. +int GetWedgeBlockSizeIndex(BlockSize block_size) { + assert(block_size >= kBlock8x8); + return block_size - kBlock8x8 - static_cast(block_size >= kBlock16x8) - + static_cast(block_size >= kBlock32x8); +} + +// Maps a dimension of 4, 8, 16 and 32 to indices 0, 1, 2 and 3 respectively. +int GetInterIntraMaskLookupIndex(int dimension) { + assert(dimension == 4 || dimension == 8 || dimension == 16 || + dimension == 32); + return FloorLog2(dimension) - 2; +} + +// 7.11.2.9. +int GetIntraEdgeFilterStrength(int width, int height, int filter_type, + int delta) { + const int sum = width + height; + delta = std::abs(delta); + if (filter_type == 0) { + if (sum <= 8) { + if (delta >= 56) return 1; + } else if (sum <= 16) { + if (delta >= 40) return 1; + } else if (sum <= 24) { + if (delta >= 32) return 3; + if (delta >= 16) return 2; + if (delta >= 8) return 1; + } else if (sum <= 32) { + if (delta >= 32) return 3; + if (delta >= 4) return 2; + return 1; + } else { + return 3; + } + } else { + if (sum <= 8) { + if (delta >= 64) return 2; + if (delta >= 40) return 1; + } else if (sum <= 16) { + if (delta >= 48) return 2; + if (delta >= 20) return 1; + } else if (sum <= 24) { + if (delta >= 4) return 3; + } else { + return 3; + } + } + return 0; +} + +// 7.11.2.10. +bool DoIntraEdgeUpsampling(int width, int height, int filter_type, int delta) { + const int sum = width + height; + delta = std::abs(delta); + // This function should not be called when the prediction angle is 90 or 180. + assert(delta != 0); + if (delta >= 40) return false; + return (filter_type == 1) ? sum <= 8 : sum <= 16; +} + +constexpr uint8_t kQuantizedDistanceWeight[4][2] = { + {2, 3}, {2, 5}, {2, 7}, {1, kMaxFrameDistance}}; + +constexpr uint8_t kQuantizedDistanceLookup[4][2] = { + {9, 7}, {11, 5}, {12, 4}, {13, 3}}; + +void GetDistanceWeights(const int distance[2], int weight[2]) { + // Note: distance[0] and distance[1] correspond to relative distance + // between current frame and reference frame [1] and [0], respectively. 
+ const int order = static_cast(distance[0] <= distance[1]); + if (distance[0] == 0 || distance[1] == 0) { + weight[0] = kQuantizedDistanceLookup[3][order]; + weight[1] = kQuantizedDistanceLookup[3][1 - order]; + } else { + int i; + for (i = 0; i < 3; ++i) { + const int weight_0 = kQuantizedDistanceWeight[i][order]; + const int weight_1 = kQuantizedDistanceWeight[i][1 - order]; + if (order == 0) { + if (distance[0] * weight_0 < distance[1] * weight_1) break; + } else { + if (distance[0] * weight_0 > distance[1] * weight_1) break; + } + } + weight[0] = kQuantizedDistanceLookup[i][order]; + weight[1] = kQuantizedDistanceLookup[i][1 - order]; + } +} + +dsp::IntraPredictor GetIntraPredictor(PredictionMode mode, bool has_left, + bool has_top) { + if (mode == kPredictionModeDc) { + if (has_left && has_top) { + return dsp::kIntraPredictorDc; + } + if (has_left) { + return dsp::kIntraPredictorDcLeft; + } + if (has_top) { + return dsp::kIntraPredictorDcTop; + } + return dsp::kIntraPredictorDcFill; + } + switch (mode) { + case kPredictionModePaeth: + return dsp::kIntraPredictorPaeth; + case kPredictionModeSmooth: + return dsp::kIntraPredictorSmooth; + case kPredictionModeSmoothVertical: + return dsp::kIntraPredictorSmoothVertical; + case kPredictionModeSmoothHorizontal: + return dsp::kIntraPredictorSmoothHorizontal; + default: + return dsp::kNumIntraPredictors; + } +} + +uint8_t* GetStartPoint(Array2DView* const buffer, const int plane, + const int x, const int y, const int bitdepth) { +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (bitdepth > 8) { + Array2DView buffer16( + buffer[plane].rows(), buffer[plane].columns() / sizeof(uint16_t), + reinterpret_cast(&buffer[plane][0][0])); + return reinterpret_cast(&buffer16[y][x]); + } +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + static_cast(bitdepth); + return &buffer[plane][y][x]; +} + +int GetPixelPositionFromHighScale(int start, int step, int offset) { + return (start + step * offset) >> kScaleSubPixelBits; +} + +dsp::MaskBlendFunc GetMaskBlendFunc(const dsp::Dsp& dsp, bool is_inter_intra, + bool is_wedge_inter_intra, + int subsampling_x, int subsampling_y) { + return (is_inter_intra && !is_wedge_inter_intra) + ? dsp.mask_blend[0][/*is_inter_intra=*/true] + : dsp.mask_blend[subsampling_x + subsampling_y][is_inter_intra]; +} + +} // namespace + +template +void Tile::IntraPrediction(const Block& block, Plane plane, int x, int y, + bool has_left, bool has_top, bool has_top_right, + bool has_bottom_left, PredictionMode mode, + TransformSize tx_size) { + const int width = 1 << kTransformWidthLog2[tx_size]; + const int height = 1 << kTransformHeightLog2[tx_size]; + const int x_shift = subsampling_x_[plane]; + const int y_shift = subsampling_y_[plane]; + const int max_x = (MultiplyBy4(frame_header_.columns4x4) >> x_shift) - 1; + const int max_y = (MultiplyBy4(frame_header_.rows4x4) >> y_shift) - 1; + // For performance reasons, do not initialize the following two buffers. + alignas(kMaxAlignment) Pixel top_row_data[160]; + alignas(kMaxAlignment) Pixel left_column_data[160]; +#if LIBGAV1_MSAN + if (IsDirectionalMode(mode)) { + memset(top_row_data, 0, sizeof(top_row_data)); + memset(left_column_data, 0, sizeof(left_column_data)); + } +#endif + // Some predictors use |top_row_data| and |left_column_data| with a negative + // offset to access pixels to the top-left of the current block. So have some + // space before the arrays to allow populating those without having to move + // the rest of the array. 
+ Pixel* const top_row = top_row_data + 16; + Pixel* const left_column = left_column_data + 16; + const int bitdepth = sequence_header_.color_config.bitdepth; + const int top_and_left_size = width + height; + const bool is_directional_mode = IsDirectionalMode(mode); + const PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + const bool use_filter_intra = + (plane == kPlaneY && prediction_parameters.use_filter_intra); + const int prediction_angle = + is_directional_mode + ? kPredictionModeToAngle[mode] + + prediction_parameters.angle_delta[GetPlaneType(plane)] * + kAngleStep + : 0; + // Directional prediction requires buffers larger than the width or height. + const int top_size = is_directional_mode ? top_and_left_size : width; + const int left_size = is_directional_mode ? top_and_left_size : height; + const int top_right_size = + is_directional_mode ? (has_top_right ? 2 : 1) * width : width; + const int bottom_left_size = + is_directional_mode ? (has_bottom_left ? 2 : 1) * height : height; + + Array2DView buffer(buffer_[plane].rows(), + buffer_[plane].columns() / sizeof(Pixel), + reinterpret_cast(&buffer_[plane][0][0])); + const bool needs_top = use_filter_intra || kNeedsLeftAndTop.Contains(mode) || + (is_directional_mode && prediction_angle < 180) || + (mode == kPredictionModeDc && has_top); + const bool needs_left = use_filter_intra || kNeedsLeftAndTop.Contains(mode) || + (is_directional_mode && prediction_angle > 90) || + (mode == kPredictionModeDc && has_left); + + const Pixel* top_row_src = buffer[y - 1]; + + // Determine if we need to retrieve the top row from + // |intra_prediction_buffer_|. + if ((needs_top || needs_left) && use_intra_prediction_buffer_) { + // Superblock index of block.row4x4. block.row4x4 is always in luma + // dimension (no subsampling). + const int current_superblock_index = + block.row4x4 >> (sequence_header_.use_128x128_superblock ? 5 : 4); + // Superblock index of y - 1. y is in the plane dimension (chroma planes + // could be subsampled). + const int plane_shift = (sequence_header_.use_128x128_superblock ? 7 : 6) - + subsampling_y_[plane]; + const int top_row_superblock_index = (y - 1) >> plane_shift; + // If the superblock index of y - 1 is not that of the current superblock, + // then we will have to retrieve the top row from the + // |intra_prediction_buffer_|. + if (current_superblock_index != top_row_superblock_index) { + top_row_src = reinterpret_cast( + (*intra_prediction_buffer_)[plane].get()); + } + } + + if (needs_top) { + // Compute top_row. + if (has_top || has_left) { + const int left_index = has_left ? x - 1 : x; + top_row[-1] = has_top ? top_row_src[left_index] : buffer[y][left_index]; + } else { + top_row[-1] = 1 << (bitdepth - 1); + } + if (!has_top && has_left) { + Memset(top_row, buffer[y][x - 1], top_size); + } else if (!has_top && !has_left) { + Memset(top_row, (1 << (bitdepth - 1)) - 1, top_size); + } else { + const int top_limit = std::min(max_x - x + 1, top_right_size); + memcpy(top_row, &top_row_src[x], top_limit * sizeof(Pixel)); + // Even though it is safe to call Memset with a size of 0, accessing + // top_row_src[top_limit - x + 1] is not allowed when this condition is + // false. + if (top_size - top_limit > 0) { + Memset(top_row + top_limit, top_row_src[top_limit + x - 1], + top_size - top_limit); + } + } + } + if (needs_left) { + // Compute left_column. + if (has_top || has_left) { + const int left_index = has_left ? x - 1 : x; + left_column[-1] = + has_top ? 
top_row_src[left_index] : buffer[y][left_index]; + } else { + left_column[-1] = 1 << (bitdepth - 1); + } + if (!has_left && has_top) { + Memset(left_column, top_row_src[x], left_size); + } else if (!has_left && !has_top) { + Memset(left_column, (1 << (bitdepth - 1)) + 1, left_size); + } else { + const int left_limit = std::min(max_y - y + 1, bottom_left_size); + for (int i = 0; i < left_limit; ++i) { + left_column[i] = buffer[y + i][x - 1]; + } + // Even though it is safe to call Memset with a size of 0, accessing + // buffer[left_limit - y + 1][x - 1] is not allowed when this condition is + // false. + if (left_size - left_limit > 0) { + Memset(left_column + left_limit, buffer[left_limit + y - 1][x - 1], + left_size - left_limit); + } + } + } + Pixel* const dest = &buffer[y][x]; + const ptrdiff_t dest_stride = buffer_[plane].columns(); + if (use_filter_intra) { + dsp_.filter_intra_predictor(dest, dest_stride, top_row, left_column, + prediction_parameters.filter_intra_mode, width, + height); + } else if (is_directional_mode) { + DirectionalPrediction(block, plane, x, y, has_left, has_top, needs_left, + needs_top, prediction_angle, width, height, max_x, + max_y, tx_size, top_row, left_column); + } else { + const dsp::IntraPredictor predictor = + GetIntraPredictor(mode, has_left, has_top); + assert(predictor != dsp::kNumIntraPredictors); + dsp_.intra_predictors[tx_size][predictor](dest, dest_stride, top_row, + left_column); + } +} + +template void Tile::IntraPrediction(const Block& block, Plane plane, + int x, int y, bool has_left, + bool has_top, bool has_top_right, + bool has_bottom_left, + PredictionMode mode, + TransformSize tx_size); +#if LIBGAV1_MAX_BITDEPTH >= 10 +template void Tile::IntraPrediction(const Block& block, Plane plane, + int x, int y, bool has_left, + bool has_top, bool has_top_right, + bool has_bottom_left, + PredictionMode mode, + TransformSize tx_size); +#endif + +constexpr BitMaskSet kPredictionModeSmoothMask(kPredictionModeSmooth, + kPredictionModeSmoothHorizontal, + kPredictionModeSmoothVertical); + +bool Tile::IsSmoothPrediction(int row, int column, Plane plane) const { + const BlockParameters& bp = *block_parameters_holder_.Find(row, column); + PredictionMode mode; + if (plane == kPlaneY) { + mode = bp.y_mode; + } else { + if (bp.reference_frame[0] > kReferenceFrameIntra) return false; + mode = bp.uv_mode; + } + return kPredictionModeSmoothMask.Contains(mode); +} + +int Tile::GetIntraEdgeFilterType(const Block& block, Plane plane) const { + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + if (block.top_available[plane]) { + const int row = block.row4x4 - 1 - (block.row4x4 & subsampling_y); + const int column = block.column4x4 + (~block.column4x4 & subsampling_x); + if (IsSmoothPrediction(row, column, plane)) return 1; + } + if (block.left_available[plane]) { + const int row = block.row4x4 + (~block.row4x4 & subsampling_y); + const int column = block.column4x4 - 1 - (block.column4x4 & subsampling_x); + if (IsSmoothPrediction(row, column, plane)) return 1; + } + return 0; +} + +template +void Tile::DirectionalPrediction(const Block& block, Plane plane, int x, int y, + bool has_left, bool has_top, bool needs_left, + bool needs_top, int prediction_angle, + int width, int height, int max_x, int max_y, + TransformSize tx_size, Pixel* const top_row, + Pixel* const left_column) { + Array2DView buffer(buffer_[plane].rows(), + buffer_[plane].columns() / sizeof(Pixel), + reinterpret_cast(&buffer_[plane][0][0])); + Pixel* 
const dest = &buffer[y][x]; + const ptrdiff_t stride = buffer_[plane].columns(); + if (prediction_angle == 90) { + dsp_.intra_predictors[tx_size][dsp::kIntraPredictorVertical]( + dest, stride, top_row, left_column); + return; + } + if (prediction_angle == 180) { + dsp_.intra_predictors[tx_size][dsp::kIntraPredictorHorizontal]( + dest, stride, top_row, left_column); + return; + } + + bool upsampled_top = false; + bool upsampled_left = false; + if (sequence_header_.enable_intra_edge_filter) { + const int filter_type = GetIntraEdgeFilterType(block, plane); + if (prediction_angle > 90 && prediction_angle < 180 && + (width + height) >= 24) { + // 7.11.2.7. + left_column[-1] = top_row[-1] = RightShiftWithRounding( + left_column[0] * 5 + top_row[-1] * 6 + top_row[0] * 5, 4); + } + if (has_top && needs_top) { + const int strength = GetIntraEdgeFilterStrength( + width, height, filter_type, prediction_angle - 90); + if (strength > 0) { + const int num_pixels = std::min(width, max_x - x + 1) + + ((prediction_angle < 90) ? height : 0) + 1; + dsp_.intra_edge_filter(top_row - 1, num_pixels, strength); + } + } + if (has_left && needs_left) { + const int strength = GetIntraEdgeFilterStrength( + width, height, filter_type, prediction_angle - 180); + if (strength > 0) { + const int num_pixels = std::min(height, max_y - y + 1) + + ((prediction_angle > 180) ? width : 0) + 1; + dsp_.intra_edge_filter(left_column - 1, num_pixels, strength); + } + } + upsampled_top = DoIntraEdgeUpsampling(width, height, filter_type, + prediction_angle - 90); + if (upsampled_top && needs_top) { + const int num_pixels = width + ((prediction_angle < 90) ? height : 0); + dsp_.intra_edge_upsampler(top_row, num_pixels); + } + upsampled_left = DoIntraEdgeUpsampling(width, height, filter_type, + prediction_angle - 180); + if (upsampled_left && needs_left) { + const int num_pixels = height + ((prediction_angle > 180) ? 
width : 0); + dsp_.intra_edge_upsampler(left_column, num_pixels); + } + } + + if (prediction_angle < 90) { + const int dx = GetDirectionalIntraPredictorDerivative(prediction_angle); + dsp_.directional_intra_predictor_zone1(dest, stride, top_row, width, height, + dx, upsampled_top); + } else if (prediction_angle < 180) { + const int dx = + GetDirectionalIntraPredictorDerivative(180 - prediction_angle); + const int dy = + GetDirectionalIntraPredictorDerivative(prediction_angle - 90); + dsp_.directional_intra_predictor_zone2(dest, stride, top_row, left_column, + width, height, dx, dy, upsampled_top, + upsampled_left); + } else { + assert(prediction_angle < 270); + const int dy = + GetDirectionalIntraPredictorDerivative(270 - prediction_angle); + dsp_.directional_intra_predictor_zone3(dest, stride, left_column, width, + height, dy, upsampled_left); + } +} + +template +void Tile::PalettePrediction(const Block& block, const Plane plane, + const int start_x, const int start_y, const int x, + const int y, const TransformSize tx_size) { + const int tx_width = kTransformWidth[tx_size]; + const int tx_height = kTransformHeight[tx_size]; + const uint16_t* const palette = block.bp->palette_mode_info.color[plane]; + const PlaneType plane_type = GetPlaneType(plane); + const int x4 = MultiplyBy4(x); + const int y4 = MultiplyBy4(y); + Array2DView buffer(buffer_[plane].rows(), + buffer_[plane].columns() / sizeof(Pixel), + reinterpret_cast(&buffer_[plane][0][0])); + for (int row = 0; row < tx_height; ++row) { + assert(block.bp->prediction_parameters + ->color_index_map[plane_type][y4 + row] != nullptr); + for (int column = 0; column < tx_width; ++column) { + buffer[start_y + row][start_x + column] = + palette[block.bp->prediction_parameters + ->color_index_map[plane_type][y4 + row][x4 + column]]; + } + } +} + +template void Tile::PalettePrediction( + const Block& block, const Plane plane, const int start_x, const int start_y, + const int x, const int y, const TransformSize tx_size); +#if LIBGAV1_MAX_BITDEPTH >= 10 +template void Tile::PalettePrediction( + const Block& block, const Plane plane, const int start_x, const int start_y, + const int x, const int y, const TransformSize tx_size); +#endif + +template +void Tile::ChromaFromLumaPrediction(const Block& block, const Plane plane, + const int start_x, const int start_y, + const TransformSize tx_size) { + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + const PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + Array2DView y_buffer( + buffer_[kPlaneY].rows(), buffer_[kPlaneY].columns() / sizeof(Pixel), + reinterpret_cast(&buffer_[kPlaneY][0][0])); + if (!block.scratch_buffer->cfl_luma_buffer_valid) { + const int luma_x = start_x << subsampling_x; + const int luma_y = start_y << subsampling_y; + dsp_.cfl_subsamplers[tx_size][subsampling_x + subsampling_y]( + block.scratch_buffer->cfl_luma_buffer, + prediction_parameters.max_luma_width - luma_x, + prediction_parameters.max_luma_height - luma_y, + reinterpret_cast(&y_buffer[luma_y][luma_x]), + buffer_[kPlaneY].columns()); + block.scratch_buffer->cfl_luma_buffer_valid = true; + } + Array2DView buffer(buffer_[plane].rows(), + buffer_[plane].columns() / sizeof(Pixel), + reinterpret_cast(&buffer_[plane][0][0])); + dsp_.cfl_intra_predictors[tx_size]( + reinterpret_cast(&buffer[start_y][start_x]), + buffer_[plane].columns(), block.scratch_buffer->cfl_luma_buffer, + (plane == kPlaneU) ? 
prediction_parameters.cfl_alpha_u + : prediction_parameters.cfl_alpha_v); +} + +template void Tile::ChromaFromLumaPrediction( + const Block& block, const Plane plane, const int start_x, const int start_y, + const TransformSize tx_size); +#if LIBGAV1_MAX_BITDEPTH >= 10 +template void Tile::ChromaFromLumaPrediction( + const Block& block, const Plane plane, const int start_x, const int start_y, + const TransformSize tx_size); +#endif + +void Tile::InterIntraPrediction( + uint16_t* const prediction_0, const uint8_t* const prediction_mask, + const ptrdiff_t prediction_mask_stride, + const PredictionParameters& prediction_parameters, + const int prediction_width, const int prediction_height, + const int subsampling_x, const int subsampling_y, uint8_t* const dest, + const ptrdiff_t dest_stride) { + assert(prediction_mask != nullptr); + assert(prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeIntra || + prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeWedge); + // The first buffer of InterIntra is from inter prediction. + // The second buffer is from intra prediction. +#if LIBGAV1_MAX_BITDEPTH >= 10 + if (sequence_header_.color_config.bitdepth > 8) { + GetMaskBlendFunc(dsp_, /*is_inter_intra=*/true, + prediction_parameters.is_wedge_inter_intra, subsampling_x, + subsampling_y)( + prediction_0, reinterpret_cast(dest), + dest_stride / sizeof(uint16_t), prediction_mask, prediction_mask_stride, + prediction_width, prediction_height, dest, dest_stride); + return; + } +#endif + const int function_index = prediction_parameters.is_wedge_inter_intra + ? subsampling_x + subsampling_y + : 0; + // |is_inter_intra| prediction values are stored in a Pixel buffer but it is + // currently declared as a uint16_t buffer. + // TODO(johannkoenig): convert the prediction buffer to a uint8_t buffer and + // remove the reinterpret_cast. 
+ dsp_.inter_intra_mask_blend_8bpp[function_index]( + reinterpret_cast(prediction_0), dest, dest_stride, + prediction_mask, prediction_mask_stride, prediction_width, + prediction_height); +} + +void Tile::CompoundInterPrediction( + const Block& block, const uint8_t* const prediction_mask, + const ptrdiff_t prediction_mask_stride, const int prediction_width, + const int prediction_height, const int subsampling_x, + const int subsampling_y, const int candidate_row, + const int candidate_column, uint8_t* dest, const ptrdiff_t dest_stride) { + const PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + + void* prediction[2]; +#if LIBGAV1_MAX_BITDEPTH >= 10 + const int bitdepth = sequence_header_.color_config.bitdepth; + if (bitdepth > 8) { + prediction[0] = block.scratch_buffer->prediction_buffer[0]; + prediction[1] = block.scratch_buffer->prediction_buffer[1]; + } else { +#endif + prediction[0] = block.scratch_buffer->compound_prediction_buffer_8bpp[0]; + prediction[1] = block.scratch_buffer->compound_prediction_buffer_8bpp[1]; +#if LIBGAV1_MAX_BITDEPTH >= 10 + } +#endif + + switch (prediction_parameters.compound_prediction_type) { + case kCompoundPredictionTypeWedge: + case kCompoundPredictionTypeDiffWeighted: + GetMaskBlendFunc(dsp_, /*is_inter_intra=*/false, + prediction_parameters.is_wedge_inter_intra, + subsampling_x, subsampling_y)( + prediction[0], prediction[1], + /*prediction_stride=*/prediction_width, prediction_mask, + prediction_mask_stride, prediction_width, prediction_height, dest, + dest_stride); + break; + case kCompoundPredictionTypeDistance: + DistanceWeightedPrediction(prediction[0], prediction[1], prediction_width, + prediction_height, candidate_row, + candidate_column, dest, dest_stride); + break; + default: + assert(prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeAverage); + dsp_.average_blend(prediction[0], prediction[1], prediction_width, + prediction_height, dest, dest_stride); + break; + } +} + +GlobalMotion* Tile::GetWarpParams( + const Block& block, const Plane plane, const int prediction_width, + const int prediction_height, + const PredictionParameters& prediction_parameters, + const ReferenceFrameType reference_type, bool* const is_local_valid, + GlobalMotion* const global_motion_params, + GlobalMotion* const local_warp_params) const { + if (prediction_width < 8 || prediction_height < 8 || + frame_header_.force_integer_mv == 1) { + return nullptr; + } + if (plane == kPlaneY) { + *is_local_valid = + prediction_parameters.motion_mode == kMotionModeLocalWarp && + WarpEstimation( + prediction_parameters.num_warp_samples, DivideBy4(prediction_width), + DivideBy4(prediction_height), block.row4x4, block.column4x4, + block.bp->mv.mv[0], prediction_parameters.warp_estimate_candidates, + local_warp_params) && + SetupShear(local_warp_params); + } + if (prediction_parameters.motion_mode == kMotionModeLocalWarp && + *is_local_valid) { + return local_warp_params; + } + if (!IsScaled(reference_type)) { + GlobalMotionTransformationType global_motion_type = + (reference_type != kReferenceFrameIntra) + ? global_motion_params->type + : kNumGlobalMotionTransformationTypes; + const bool is_global_valid = + IsGlobalMvBlock(block.bp->is_global_mv_block, global_motion_type) && + SetupShear(global_motion_params); + // Valid global motion type implies reference type can't be intra. 
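The selection order implemented by GetWarpParams reads as a precedence list: local warp (when estimable and shearable) beats global motion, and both are off the table for sub-8x8 blocks, force_integer_mv frames, or, for global motion, scaled references. A hedged restatement with booleans standing in for the real parameter structs (all names here are invented for illustration):

```cpp
#include <cstdio>

enum WarpChoice { kUseLocalWarp, kUseGlobalMotion, kUseTranslationalMv };

WarpChoice ChooseWarp(bool block_at_least_8x8, bool force_integer_mv,
                      bool local_warp_valid, bool reference_scaled,
                      bool global_motion_valid) {
  if (!block_at_least_8x8 || force_integer_mv) return kUseTranslationalMv;
  if (local_warp_valid) return kUseLocalWarp;  // local warp wins when valid
  if (!reference_scaled && global_motion_valid) return kUseGlobalMotion;
  return kUseTranslationalMv;  // fall back to plain motion compensation
}

int main() {
  // Large block, no local warp, unscaled reference with valid global motion.
  std::printf("%d\n", ChooseWarp(true, false, false, false, true));
  return 0;
}
```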
+ assert(!is_global_valid || reference_type != kReferenceFrameIntra); + if (is_global_valid) return global_motion_params; + } + return nullptr; +} + +bool Tile::InterPrediction(const Block& block, const Plane plane, const int x, + const int y, const int prediction_width, + const int prediction_height, int candidate_row, + int candidate_column, bool* const is_local_valid, + GlobalMotion* const local_warp_params) { + const int bitdepth = sequence_header_.color_config.bitdepth; + const BlockParameters& bp = *block.bp; + const BlockParameters& bp_reference = + *block_parameters_holder_.Find(candidate_row, candidate_column); + const bool is_compound = + bp_reference.reference_frame[1] > kReferenceFrameIntra; + assert(bp.is_inter); + const bool is_inter_intra = bp.reference_frame[1] == kReferenceFrameIntra; + + const PredictionParameters& prediction_parameters = + *block.bp->prediction_parameters; + uint8_t* const dest = GetStartPoint(buffer_, plane, x, y, bitdepth); + const ptrdiff_t dest_stride = buffer_[plane].columns(); // In bytes. + for (int index = 0; index < 1 + static_cast(is_compound); ++index) { + const ReferenceFrameType reference_type = + bp_reference.reference_frame[index]; + GlobalMotion global_motion_params = + frame_header_.global_motion[reference_type]; + GlobalMotion* warp_params = + GetWarpParams(block, plane, prediction_width, prediction_height, + prediction_parameters, reference_type, is_local_valid, + &global_motion_params, local_warp_params); + if (warp_params != nullptr) { + if (!BlockWarpProcess(block, plane, index, x, y, prediction_width, + prediction_height, warp_params, is_compound, + is_inter_intra, dest, dest_stride)) { + return false; + } + } else { + const int reference_index = + prediction_parameters.use_intra_block_copy + ? -1 + : frame_header_.reference_frame_index[reference_type - + kReferenceFrameLast]; + if (!BlockInterPrediction( + block, plane, reference_index, bp_reference.mv.mv[index], x, y, + prediction_width, prediction_height, candidate_row, + candidate_column, block.scratch_buffer->prediction_buffer[index], + is_compound, is_inter_intra, dest, dest_stride)) { + return false; + } + } + } + + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + ptrdiff_t prediction_mask_stride = 0; + const uint8_t* prediction_mask = nullptr; + if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeWedge) { + const Array2D& wedge_mask = + wedge_masks_[GetWedgeBlockSizeIndex(block.size)] + [prediction_parameters.wedge_sign] + [prediction_parameters.wedge_index]; + prediction_mask = wedge_mask[0]; + prediction_mask_stride = wedge_mask.columns(); + } else if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeIntra) { + // 7.11.3.13. The inter intra masks are precomputed and stored as a set of + // look up tables. 
+ assert(prediction_parameters.inter_intra_mode < kNumInterIntraModes); + prediction_mask = + kInterIntraMasks[prediction_parameters.inter_intra_mode] + [GetInterIntraMaskLookupIndex(prediction_width)] + [GetInterIntraMaskLookupIndex(prediction_height)]; + prediction_mask_stride = prediction_width; + } else if (prediction_parameters.compound_prediction_type == + kCompoundPredictionTypeDiffWeighted) { + if (plane == kPlaneY) { + assert(prediction_width >= 8); + assert(prediction_height >= 8); + dsp_.weight_mask[FloorLog2(prediction_width) - 3] + [FloorLog2(prediction_height) - 3] + [static_cast(prediction_parameters.mask_is_inverse)]( + block.scratch_buffer->prediction_buffer[0], + block.scratch_buffer->prediction_buffer[1], + block.scratch_buffer->weight_mask, + kMaxSuperBlockSizeInPixels); + } + prediction_mask = block.scratch_buffer->weight_mask; + prediction_mask_stride = kMaxSuperBlockSizeInPixels; + } + + if (is_compound) { + CompoundInterPrediction(block, prediction_mask, prediction_mask_stride, + prediction_width, prediction_height, subsampling_x, + subsampling_y, candidate_row, candidate_column, + dest, dest_stride); + } else if (prediction_parameters.motion_mode == kMotionModeObmc) { + // Obmc mode is allowed only for single reference (!is_compound). + return ObmcPrediction(block, plane, prediction_width, prediction_height); + } else if (is_inter_intra) { + // InterIntra and obmc must be mutually exclusive. + InterIntraPrediction( + block.scratch_buffer->prediction_buffer[0], prediction_mask, + prediction_mask_stride, prediction_parameters, prediction_width, + prediction_height, subsampling_x, subsampling_y, dest, dest_stride); + } + return true; +} + +bool Tile::ObmcBlockPrediction(const Block& block, const MotionVector& mv, + const Plane plane, + const int reference_frame_index, const int width, + const int height, const int x, const int y, + const int candidate_row, + const int candidate_column, + const ObmcDirection blending_direction) { + const int bitdepth = sequence_header_.color_config.bitdepth; + // Obmc's prediction needs to be clipped before blending with above/left + // prediction blocks. + // Obmc prediction is used only when is_compound is false. So it is safe to + // use prediction_buffer[1] as a temporary buffer for the Obmc prediction. + static_assert(sizeof(block.scratch_buffer->prediction_buffer[1]) >= + 64 * 64 * sizeof(uint16_t), + ""); + auto* const obmc_buffer = + reinterpret_cast(block.scratch_buffer->prediction_buffer[1]); + const ptrdiff_t obmc_buffer_stride = + (bitdepth == 8) ? 
width : width * sizeof(uint16_t); + if (!BlockInterPrediction(block, plane, reference_frame_index, mv, x, y, + width, height, candidate_row, candidate_column, + nullptr, false, false, obmc_buffer, + obmc_buffer_stride)) { + return false; + } + + uint8_t* const prediction = GetStartPoint(buffer_, plane, x, y, bitdepth); + const ptrdiff_t prediction_stride = buffer_[plane].columns(); + dsp_.obmc_blend[blending_direction](prediction, prediction_stride, width, + height, obmc_buffer, obmc_buffer_stride); + return true; +} + +bool Tile::ObmcPrediction(const Block& block, const Plane plane, + const int width, const int height) { + const int subsampling_x = subsampling_x_[plane]; + const int subsampling_y = subsampling_y_[plane]; + if (block.top_available[kPlaneY] && + !IsBlockSmallerThan8x8(block.residual_size[plane])) { + const int num_limit = std::min(uint8_t{4}, k4x4WidthLog2[block.size]); + const int column4x4_max = + std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4); + const int candidate_row = block.row4x4 - 1; + const int block_start_y = MultiplyBy4(block.row4x4) >> subsampling_y; + int column4x4 = block.column4x4; + const int prediction_height = std::min(height >> 1, 32 >> subsampling_y); + for (int i = 0, step; i < num_limit && column4x4 < column4x4_max; + column4x4 += step) { + const int candidate_column = column4x4 | 1; + const BlockParameters& bp_top = + *block_parameters_holder_.Find(candidate_row, candidate_column); + const int candidate_block_size = bp_top.size; + step = Clip3(kNum4x4BlocksWide[candidate_block_size], 2, 16); + if (bp_top.reference_frame[0] > kReferenceFrameIntra) { + i++; + const int candidate_reference_frame_index = + frame_header_.reference_frame_index[bp_top.reference_frame[0] - + kReferenceFrameLast]; + const int prediction_width = + std::min(width, MultiplyBy4(step) >> subsampling_x); + if (!ObmcBlockPrediction( + block, bp_top.mv.mv[0], plane, candidate_reference_frame_index, + prediction_width, prediction_height, + MultiplyBy4(column4x4) >> subsampling_x, block_start_y, + candidate_row, candidate_column, kObmcDirectionVertical)) { + return false; + } + } + } + } + + if (block.left_available[kPlaneY]) { + const int num_limit = std::min(uint8_t{4}, k4x4HeightLog2[block.size]); + const int row4x4_max = + std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4); + const int candidate_column = block.column4x4 - 1; + int row4x4 = block.row4x4; + const int block_start_x = MultiplyBy4(block.column4x4) >> subsampling_x; + const int prediction_width = std::min(width >> 1, 32 >> subsampling_x); + for (int i = 0, step; i < num_limit && row4x4 < row4x4_max; + row4x4 += step) { + const int candidate_row = row4x4 | 1; + const BlockParameters& bp_left = + *block_parameters_holder_.Find(candidate_row, candidate_column); + const int candidate_block_size = bp_left.size; + step = Clip3(kNum4x4BlocksHigh[candidate_block_size], 2, 16); + if (bp_left.reference_frame[0] > kReferenceFrameIntra) { + i++; + const int candidate_reference_frame_index = + frame_header_.reference_frame_index[bp_left.reference_frame[0] - + kReferenceFrameLast]; + const int prediction_height = + std::min(height, MultiplyBy4(step) >> subsampling_y); + if (!ObmcBlockPrediction( + block, bp_left.mv.mv[0], plane, candidate_reference_frame_index, + prediction_width, prediction_height, block_start_x, + MultiplyBy4(row4x4) >> subsampling_y, candidate_row, + candidate_column, kObmcDirectionHorizontal)) { + return false; + } + } + } + } + return true; +} + +void 
Tile::DistanceWeightedPrediction(void* prediction_0, void* prediction_1, + const int width, const int height, + const int candidate_row, + const int candidate_column, uint8_t* dest, + ptrdiff_t dest_stride) { + int distance[2]; + int weight[2]; + for (int reference = 0; reference < 2; ++reference) { + const BlockParameters& bp = + *block_parameters_holder_.Find(candidate_row, candidate_column); + // Note: distance[0] and distance[1] correspond to relative distance + // between current frame and reference frame [1] and [0], respectively. + distance[1 - reference] = std::min( + std::abs(static_cast( + current_frame_.reference_info() + ->relative_distance_from[bp.reference_frame[reference]])), + static_cast(kMaxFrameDistance)); + } + GetDistanceWeights(distance, weight); + + dsp_.distance_weighted_blend(prediction_0, prediction_1, weight[0], weight[1], + width, height, dest, dest_stride); +} + +void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane, + const int reference_frame_index, const int x, + const int y, int* const start_x, + int* const start_y, int* const step_x, + int* const step_y) { + const int reference_upscaled_width = + (reference_frame_index == -1) + ? frame_header_.upscaled_width + : reference_frames_[reference_frame_index]->upscaled_width(); + const int reference_height = + (reference_frame_index == -1) + ? frame_header_.height + : reference_frames_[reference_frame_index]->frame_height(); + assert(2 * frame_header_.width >= reference_upscaled_width && + 2 * frame_header_.height >= reference_height && + frame_header_.width <= 16 * reference_upscaled_width && + frame_header_.height <= 16 * reference_height); + const bool is_scaled_x = reference_upscaled_width != frame_header_.width; + const bool is_scaled_y = reference_height != frame_header_.height; + const int half_sample = 1 << (kSubPixelBits - 1); + int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]); + int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]); + const int rounding_offset = + DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits)); + if (is_scaled_x) { + const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) + + DivideBy2(frame_header_.width)) / + frame_header_.width; + *step_x = RightShiftWithRoundingSigned( + scale_x, kReferenceScaleShift - kScaleSubPixelBits); + orig_x += half_sample; + // When frame size is 4k and above, orig_x can be above 16 bits, scale_x can + // be up to 15 bits. So we use int64_t to hold base_x. + const int64_t base_x = static_cast(orig_x) * scale_x - + (half_sample << kReferenceScaleShift); + *start_x = + RightShiftWithRoundingSigned( + base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) + + rounding_offset; + } else { + *step_x = 1 << kScaleSubPixelBits; + *start_x = LeftShift(orig_x, 6) + rounding_offset; + } + if (is_scaled_y) { + const int scale_y = ((reference_height << kReferenceScaleShift) + + DivideBy2(frame_header_.height)) / + frame_header_.height; + *step_y = RightShiftWithRoundingSigned( + scale_y, kReferenceScaleShift - kScaleSubPixelBits); + orig_y += half_sample; + const int64_t base_y = static_cast(orig_y) * scale_y - + (half_sample << kReferenceScaleShift); + *start_y = + RightShiftWithRoundingSigned( + base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) + + rounding_offset; + } else { + *step_y = 1 << kScaleSubPixelBits; + *start_y = LeftShift(orig_y, 6) + rounding_offset; + } +} + +// static. 
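The fixed-point pipeline in ScaleMotionVector above is easiest to see with concrete numbers. A worked example assuming a reference frame exactly twice the current width, with kReferenceScaleShift = 14 and kScaleSubPixelBits = 10 as in the surrounding code:

```cpp
#include <cassert>

int main() {
  const int kReferenceScaleShift = 14;
  const int kScaleSubPixelBits = 10;
  const int reference_width = 1920, frame_width = 960;
  // Q14 ratio of reference width to current width, rounded.
  const int scale_x = ((reference_width << kReferenceScaleShift) +
                       frame_width / 2) /
                      frame_width;
  assert(scale_x == 2 << kReferenceScaleShift);  // exactly 2.0 in Q14
  // Convert to the convolve step in 1/1024-pel (Q10) units, with rounding.
  const int step_x =
      (scale_x + (1 << (kReferenceScaleShift - kScaleSubPixelBits - 1))) >>
      (kReferenceScaleShift - kScaleSubPixelBits);
  assert(step_x == 2 << kScaleSubPixelBits);  // 2 reference pixels per pixel
  return 0;
}
```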
+
+// static.
+bool Tile::GetReferenceBlockPosition(
+    const int reference_frame_index, const bool is_scaled, const int width,
+    const int height, const int ref_start_x, const int ref_last_x,
+    const int ref_start_y, const int ref_last_y, const int start_x,
+    const int start_y, const int step_x, const int step_y,
+    const int left_border, const int right_border, const int top_border,
+    const int bottom_border, int* ref_block_start_x, int* ref_block_start_y,
+    int* ref_block_end_x) {
+  *ref_block_start_x = GetPixelPositionFromHighScale(start_x, 0, 0);
+  *ref_block_start_y = GetPixelPositionFromHighScale(start_y, 0, 0);
+  if (reference_frame_index == -1) {
+    return false;
+  }
+  *ref_block_start_x -= kConvolveBorderLeftTop;
+  *ref_block_start_y -= kConvolveBorderLeftTop;
+  *ref_block_end_x = GetPixelPositionFromHighScale(start_x, step_x, width - 1) +
+                     kConvolveBorderRight;
+  int ref_block_end_y =
+      GetPixelPositionFromHighScale(start_y, step_y, height - 1) +
+      kConvolveBorderBottom;
+  if (is_scaled) {
+    const int block_height =
+        (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+         kScaleSubPixelBits) +
+        kSubPixelTaps;
+    ref_block_end_y = *ref_block_start_y + block_height - 1;
+  }
+  // Determines if we need to extend beyond the left/right/top/bottom border.
+  return *ref_block_start_x < (ref_start_x - left_border) ||
+         *ref_block_end_x > (ref_last_x + right_border) ||
+         *ref_block_start_y < (ref_start_y - top_border) ||
+         ref_block_end_y > (ref_last_y + bottom_border);
+}
+
+// Builds a block as the input for convolve, by copying the content of the
+// reference frame (either a decoded reference frame or the current frame).
+// |block_extended_width| is the combined width of the block and its borders.
+template <typename Pixel>
+void Tile::BuildConvolveBlock(
+    const Plane plane, const int reference_frame_index, const bool is_scaled,
+    const int height, const int ref_start_x, const int ref_last_x,
+    const int ref_start_y, const int ref_last_y, const int step_y,
+    const int ref_block_start_x, const int ref_block_end_x,
+    const int ref_block_start_y, uint8_t* block_buffer,
+    ptrdiff_t convolve_buffer_stride, ptrdiff_t block_extended_width) {
+  const YuvBuffer* const reference_buffer =
+      (reference_frame_index == -1)
+          ? current_frame_.buffer()
+          : reference_frames_[reference_frame_index]->buffer();
+  Array2DView<const Pixel> reference_block(
+      reference_buffer->height(plane),
+      reference_buffer->stride(plane) / sizeof(Pixel),
+      reinterpret_cast<const Pixel*>(reference_buffer->data(plane)));
+  auto* const block_head = reinterpret_cast<Pixel*>(block_buffer);
+  convolve_buffer_stride /= sizeof(Pixel);
+  int block_height = height + kConvolveBorderLeftTop + kConvolveBorderBottom;
+  if (is_scaled) {
+    block_height = (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
+                    kScaleSubPixelBits) +
+                   kSubPixelTaps;
+  }
+  const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+  const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+  const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
+  const int block_width = copy_end_x - copy_start_x + 1;
+  const bool extend_left = ref_block_start_x < ref_start_x;
+  const bool extend_right = ref_block_end_x > ref_last_x;
+  const bool out_of_left = copy_start_x > ref_block_end_x;
+  const bool out_of_right = copy_end_x < ref_block_start_x;
+  if (out_of_left || out_of_right) {
+    const int ref_x = out_of_left ? copy_start_x : copy_end_x;
+    Pixel* buf_ptr = block_head;
+    for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+      Memset(buf_ptr, reference_block[ref_y][ref_x], block_extended_width);
+      if (ref_block_start_y + y >= ref_start_y &&
+          ref_block_start_y + y < ref_last_y) {
+        ++ref_y;
+      }
+      buf_ptr += convolve_buffer_stride;
+    }
+  } else {
+    Pixel* buf_ptr = block_head;
+    const int left_width = copy_start_x - ref_block_start_x;
+    for (int y = 0, ref_y = copy_start_y; y < block_height; ++y) {
+      if (extend_left) {
+        Memset(buf_ptr, reference_block[ref_y][copy_start_x], left_width);
+      }
+      memcpy(buf_ptr + left_width, &reference_block[ref_y][copy_start_x],
+             block_width * sizeof(Pixel));
+      if (extend_right) {
+        Memset(buf_ptr + left_width + block_width,
+               reference_block[ref_y][copy_end_x],
+               block_extended_width - left_width - block_width);
+      }
+      if (ref_block_start_y + y >= ref_start_y &&
+          ref_block_start_y + y < ref_last_y) {
+        ++ref_y;
+      }
+      buf_ptr += convolve_buffer_stride;
+    }
+  }
+}
+
+bool Tile::BlockInterPrediction(
+    const Block& block, const Plane plane, const int reference_frame_index,
+    const MotionVector& mv, const int x, const int y, const int width,
+    const int height, const int candidate_row, const int candidate_column,
+    uint16_t* const prediction, const bool is_compound,
+    const bool is_inter_intra, uint8_t* const dest,
+    const ptrdiff_t dest_stride) {
+  const BlockParameters& bp =
+      *block_parameters_holder_.Find(candidate_row, candidate_column);
+  int start_x;
+  int start_y;
+  int step_x;
+  int step_y;
+  ScaleMotionVector(mv, plane, reference_frame_index, x, y, &start_x, &start_y,
+                    &step_x, &step_y);
+  const int horizontal_filter_index = bp.interpolation_filter[1];
+  const int vertical_filter_index = bp.interpolation_filter[0];
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  // reference_frame_index equal to -1 indicates using current frame as
+  // reference.
+  const YuvBuffer* const reference_buffer =
+      (reference_frame_index == -1)
+          ? current_frame_.buffer()
+          : reference_frames_[reference_frame_index]->buffer();
+  const int reference_upscaled_width =
+      (reference_frame_index == -1)
+          ? MultiplyBy4(frame_header_.columns4x4)
+          : reference_frames_[reference_frame_index]->upscaled_width();
+  const int reference_height =
+      (reference_frame_index == -1)
+          ? MultiplyBy4(frame_header_.rows4x4)
+          : reference_frames_[reference_frame_index]->frame_height();
+  const int ref_start_x = 0;
+  const int ref_last_x =
+      SubsampledValue(reference_upscaled_width, subsampling_x) - 1;
+  const int ref_start_y = 0;
+  const int ref_last_y = SubsampledValue(reference_height, subsampling_y) - 1;
+
+  const bool is_scaled = (reference_frame_index != -1) &&
+                         (frame_header_.width != reference_upscaled_width ||
+                          frame_header_.height != reference_height);
+  const int bitdepth = sequence_header_.color_config.bitdepth;
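+  // Why the borders below exist, assuming the usual libgav1 constants
+  // (kSubPixelTaps == 8, kConvolveBorderLeftTop == 3, kConvolveBorderRight ==
+  // 4; values given here for illustration): an 8-tap filter evaluated at
+  // integer position p reads source pixels p - 3 .. p + 4, so a
+  // width x height block needs up to (width + 7) x (height + 7) source
+  // pixels, 3 extra on the left/top and 4 extra on the right/bottom.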
+  const int pixel_size =
+      (bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  int ref_block_start_x;
+  int ref_block_start_y;
+  int ref_block_end_x;
+  const bool extend_block = GetReferenceBlockPosition(
+      reference_frame_index, is_scaled, width, height, ref_start_x, ref_last_x,
+      ref_start_y, ref_last_y, start_x, start_y, step_x, step_y,
+      reference_buffer->left_border(plane),
+      reference_buffer->right_border(plane),
+      reference_buffer->top_border(plane),
+      reference_buffer->bottom_border(plane), &ref_block_start_x,
+      &ref_block_start_y, &ref_block_end_x);
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+  if (reference_frame_index != -1 && frame_parallel_) {
+    int reference_y_max;
+    if (is_scaled) {
+      // TODO(vigneshv): For now, we wait for the entire reference frame to be
+      // decoded if we are using scaled references. This will eventually be
+      // fixed.
+      reference_y_max = reference_height;
+    } else {
+      reference_y_max =
+          std::min(ref_block_start_y + height + kSubPixelTaps, ref_last_y);
+      // For U and V planes with subsampling, we need to multiply
+      // reference_y_max by 2 since we only track the progress of Y planes.
+      reference_y_max = LeftShift(reference_y_max, subsampling_y);
+    }
+    if (reference_frame_progress_cache_[reference_frame_index] <
+            reference_y_max &&
+        !reference_frames_[reference_frame_index]->WaitUntil(
+            reference_y_max,
+            &reference_frame_progress_cache_[reference_frame_index])) {
+      return false;
+    }
+  }
+
+  const uint8_t* block_start = nullptr;
+  ptrdiff_t convolve_buffer_stride;
+  if (!extend_block) {
+    const YuvBuffer* const reference_buffer =
+        (reference_frame_index == -1)
+            ? current_frame_.buffer()
+            : reference_frames_[reference_frame_index]->buffer();
+    convolve_buffer_stride = reference_buffer->stride(plane);
+    if (reference_frame_index == -1 || is_scaled) {
+      block_start = reference_buffer->data(plane) +
+                    ref_block_start_y * reference_buffer->stride(plane) +
+                    ref_block_start_x * pixel_size;
+    } else {
+      block_start = reference_buffer->data(plane) +
+                    (ref_block_start_y + kConvolveBorderLeftTop) *
+                        reference_buffer->stride(plane) +
+                    (ref_block_start_x + kConvolveBorderLeftTop) * pixel_size;
+    }
+  } else {
+    // The block width can be at most 2 times as much as current
+    // block's width because of scaling.
+    auto block_extended_width = Align<ptrdiff_t>(
+        (2 * width + kConvolveBorderLeftTop + kConvolveBorderRight) *
+            pixel_size,
+        kMaxAlignment);
+    convolve_buffer_stride = block.scratch_buffer->convolve_block_buffer_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) {
+      BuildConvolveBlock<uint16_t>(
+          plane, reference_frame_index, is_scaled, height, ref_start_x,
+          ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+          ref_block_end_x, ref_block_start_y,
+          block.scratch_buffer->convolve_block_buffer.get(),
+          convolve_buffer_stride, block_extended_width);
+    } else {
+#endif
+      BuildConvolveBlock<uint8_t>(
+          plane, reference_frame_index, is_scaled, height, ref_start_x,
+          ref_last_x, ref_start_y, ref_last_y, step_y, ref_block_start_x,
+          ref_block_end_x, ref_block_start_y,
+          block.scratch_buffer->convolve_block_buffer.get(),
+          convolve_buffer_stride, block_extended_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif
+    block_start = block.scratch_buffer->convolve_block_buffer.get() +
+                  (is_scaled ? 0
+                             : kConvolveBorderLeftTop * convolve_buffer_stride +
+                                   kConvolveBorderLeftTop * pixel_size);
+  }
+
+  void* const output = (is_compound || is_inter_intra)
+                           ? prediction
+                           : static_cast<void*>(dest);
+  ptrdiff_t output_stride = (is_compound || is_inter_intra)
+                                ? /*prediction_stride=*/width
+                                : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  // |is_inter_intra| calculations are written to the |prediction| buffer.
+  // Unlike the |is_compound| calculations, the output is Pixel and not
+  // uint16_t. convolve_func() expects |output_stride| to be in bytes and not
+  // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+  // |output_stride| to account for this.
+  if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+    output_stride *= 2;
+  }
+#endif
+  assert(output != nullptr);
+  if (is_scaled) {
+    dsp::ConvolveScaleFunc convolve_func = dsp_.convolve_scale[is_compound];
+    assert(convolve_func != nullptr);
+
+    convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+                  vertical_filter_index, start_x, start_y, step_x, step_y,
+                  width, height, output, output_stride);
+  } else {
+    const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+    const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
+    dsp::ConvolveFunc convolve_func =
+        dsp_.convolve[reference_frame_index == -1][is_compound]
+                     [vertical_filter_id != 0][horizontal_filter_id != 0];
+    assert(convolve_func != nullptr);
+
+    convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
+                  vertical_filter_index, horizontal_filter_id,
+                  vertical_filter_id, width, height, output, output_stride);
+  }
+  return true;
+}
+
+bool Tile::BlockWarpProcess(const Block& block, const Plane plane,
+                            const int index, const int block_start_x,
+                            const int block_start_y, const int width,
+                            const int height, GlobalMotion* const warp_params,
+                            const bool is_compound, const bool is_inter_intra,
+                            uint8_t* const dest, const ptrdiff_t dest_stride) {
+  assert(width >= 8 && height >= 8);
+  const BlockParameters& bp = *block.bp;
+  const int reference_frame_index =
+      frame_header_.reference_frame_index[bp.reference_frame[index] -
+                                          kReferenceFrameLast];
+  const uint8_t* const source =
+      reference_frames_[reference_frame_index]->buffer()->data(plane);
+  ptrdiff_t source_stride =
+      reference_frames_[reference_frame_index]->buffer()->stride(plane);
+  const int source_width =
+      reference_frames_[reference_frame_index]->buffer()->width(plane);
+  const int source_height =
+      reference_frames_[reference_frame_index]->buffer()->height(plane);
+  uint16_t* const prediction = block.scratch_buffer->prediction_buffer[index];
+
+  // In frame parallel mode, ensure that the reference block has been decoded
+  // and is available for referencing.
+  if (frame_parallel_) {
+    int reference_y_max = -1;
+    // Find out the maximum y-coordinate for warping.
+    for (int start_y = block_start_y; start_y < block_start_y + height;
+         start_y += 8) {
+      for (int start_x = block_start_x; start_x < block_start_x + width;
+           start_x += 8) {
+        const int src_x = (start_x + 4) << subsampling_x_[plane];
+        const int src_y = (start_y + 4) << subsampling_y_[plane];
+        const int dst_y = src_x * warp_params->params[4] +
+                          src_y * warp_params->params[5] +
+                          warp_params->params[1];
+        const int y4 = dst_y >> subsampling_y_[plane];
+        const int iy4 = y4 >> kWarpedModelPrecisionBits;
+        reference_y_max = std::max(iy4 + 8, reference_y_max);
+      }
+    }
+    // For U and V planes with subsampling, we need to multiply reference_y_max
+    // by 2 since we only track the progress of Y planes.
+    reference_y_max = LeftShift(reference_y_max, subsampling_y_[plane]);
+    if (reference_frame_progress_cache_[reference_frame_index] <
+            reference_y_max &&
+        !reference_frames_[reference_frame_index]->WaitUntil(
+            reference_y_max,
+            &reference_frame_progress_cache_[reference_frame_index])) {
+      return false;
+    }
+  }
+  if (is_compound) {
+    dsp_.warp_compound(source, source_stride, source_width, source_height,
+                       warp_params->params, subsampling_x_[plane],
+                       subsampling_y_[plane], block_start_x, block_start_y,
+                       width, height, warp_params->alpha, warp_params->beta,
+                       warp_params->gamma, warp_params->delta, prediction,
+                       /*prediction_stride=*/width);
+  } else {
+    void* const output =
+        is_inter_intra ? static_cast<void*>(prediction) : dest;
+    ptrdiff_t output_stride =
+        is_inter_intra ? /*prediction_stride=*/width : dest_stride;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    // |is_inter_intra| calculations are written to the |prediction| buffer.
+    // Unlike the |is_compound| calculations, the output is Pixel and not
+    // uint16_t. warp_clip() expects |output_stride| to be in bytes and not
+    // Pixels. |prediction_stride| is in units of uint16_t. Adjust
+    // |output_stride| to account for this.
+    if (is_inter_intra && sequence_header_.color_config.bitdepth > 8) {
+      output_stride *= 2;
+    }
+#endif
+    dsp_.warp(source, source_stride, source_width, source_height,
+              warp_params->params, subsampling_x_[plane], subsampling_y_[plane],
+              block_start_x, block_start_y, width, height, warp_params->alpha,
+              warp_params->beta, warp_params->gamma, warp_params->delta, output,
+              output_stride);
+  }
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/tile/tile.cc b/src/tile/tile.cc
new file mode 100644
index 0000000..ee48f17
--- /dev/null
+++ b/src/tile/tile.cc
@@ -0,0 +1,2573 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/tile.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "src/frame_scratch_buffer.h"
+#include "src/motion_vector.h"
+#include "src/reconstruction.h"
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/logging.h"
+#include "src/utils/segmentation.h"
+#include "src/utils/stack.h"
+
+namespace libgav1 {
+namespace {
+
+// Import all the constants in the anonymous namespace.
+#include "src/scan_tables.inc"
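+
+// A quick arithmetic check on the coefficient-coding constants below: a
+// quantized level is coded as a base symbol (up to kNumQuantizerBaseLevels +
+// 1), then base-range symbols covering the next kQuantizerCoefficientBaseRange
+// = 12 values, then an exponential Golomb suffix for anything larger. Each
+// base-range read conveys at most kCoeffBaseRangeSymbolCount - 1 = 3, so
+// 12 / 3 = 4 reads (kCoeffBaseRangeMaxIterations) are enough to cover the
+// base range before falling back to the Golomb suffix.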
+
+// Range above kNumQuantizerBaseLevels for which the exponential Golomb coding
+// process is activated.
+constexpr int kQuantizerCoefficientBaseRange = 12;
+constexpr int kNumQuantizerBaseLevels = 2;
+constexpr int kCoeffBaseRangeMaxIterations =
+    kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
+constexpr int kEntropyContextLeft = 0;
+constexpr int kEntropyContextTop = 1;
+
+constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
+                                                     {2, 4, 4, 4, 5},
+                                                     {2, 4, 4, 4, 5},
+                                                     {2, 4, 4, 4, 5},
+                                                     {3, 5, 5, 5, 6}};
+
+// The space complexity of DFS is O(branching_factor * max_depth). For the
+// parameter tree, branching_factor = 4 (there could be up to 4 children for
+// every node) and max_depth (excluding the root) = 5 (to go from a 128x128
+// block all the way to a 4x4 block). The worst-case stack size is 16, by
+// counting the number of 'o' nodes in the diagram:
+//
+// |                 128x128  The highest level (corresponding to the
+// |                          root of the tree) has no node in the stack.
+// |-----------------+
+// |     |     |     |
+// |     o     o     o       64x64
+// |
+// |-----------------+
+// |     |     |     |
+// |     o     o     o       32x32   Higher levels have three nodes in the
+// |                                 stack, because we pop one node off the
+// |-----------------+               stack before pushing its four children
+// |     |     |     |               onto the stack.
+// |     o     o     o       16x16
+// |
+// |-----------------+
+// |     |     |     |
+// |     o     o     o       8x8
+// |
+// |-----------------+
+// |     |     |     |
+// o     o     o     o       4x4     Only the lowest level has four nodes in
+//                                   the stack.
+constexpr int kDfsStackSize = 16;
+
+// Mask indicating whether the transform sets contain a particular transform
+// type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
+constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
+    BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
+    BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
+
+constexpr PredictionMode
+    kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
+        kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+        kPredictionModeD157, kPredictionModeDc};
+
+// Mask used to determine the index for mode_deltas lookup.
+constexpr BitMaskSet kPredictionModeDeltasMask(
+    kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
+    kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
+    kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
+    kPredictionModeNearNewMv, kPredictionModeNewNearMv,
+    kPredictionModeNewNewMv);
+
+// This is computed as:
+// min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
+constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
+    0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
+
+/* clang-format off */
+constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
+    {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
+     {0, 0, 0, 0, 0}},
+    {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+     {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+    {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
+     {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
+     {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
+    {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
+     {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
+/* clang-format on */
+
+// Extended the table size from 3 to 16 by repeating the last element to avoid
+// the clips to row or column indices.
+constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
+    26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
+
+constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
+    kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
+    kPredictionModeSmooth};
+
+// Number of horizontal luma samples before intra block copy can be used.
+constexpr int kIntraBlockCopyDelayPixels = 256;
+// Number of 64 by 64 blocks before intra block copy can be used.
+constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
+
+// Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
+// height 1 << (j + 2).
+constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
+    {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+     kNumTransformSizes, kNumTransformSizes},
+    {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+     kTransformSize8x32, kNumTransformSizes},
+    {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
+     kTransformSize16x32, kTransformSize16x64},
+    {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
+     kTransformSize32x32, kTransformSize32x64},
+    {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
+     kTransformSize64x32, kTransformSize64x64}};
+
+// Defined in section 9.3 of the spec.
+constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
+    kTransformTypeDctDct, kTransformTypeDctAdst, kTransformTypeAdstDct,
+    kTransformTypeDctDct, kTransformTypeAdstAdst, kTransformTypeDctAdst,
+    kTransformTypeAdstDct, kTransformTypeAdstDct, kTransformTypeDctAdst,
+    kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct,
+    kTransformTypeAdstAdst, kTransformTypeDctDct};
+
+// Defined in section 5.11.47 of the spec. This array does not contain an entry
+// for kTransformSetDctOnly, so the first dimension needs to be
+// |kNumTransformSets| - 1.
+constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
+    {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+      kTransformTypeIdentityDct, kTransformTypeDctIdentity,
+      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+     {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
+      kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
+     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+      kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
+      kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
+      kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
+      kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+      kTransformTypeAdstFlipadst},
+     {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
+      kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
+      kTransformTypeAdstDct, kTransformTypeDctFlipadst,
+      kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
+      kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
+      kTransformTypeAdstFlipadst},
+     {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
+
+// Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
+constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
+    kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+    kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+    kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+    kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32};
+
+// This is the same as the Max_Tx_Size_Rect array in the spec but with *x64 and
+// 64x* transforms replaced with *x32 and 32x* respectively.
+constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
+    kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
+    kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
+    kTransformSize8x32, kTransformSize16x4, kTransformSize16x8,
+    kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
+    kTransformSize32x8, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
+    kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
+    kTransformSize32x32};
+
+// ith entry of this array is computed as:
+// DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
+//           TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
+//           1)
+constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
+    0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
+
+constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
+
+constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
+
+// Maps compound prediction modes into single modes. For example,
+// kPredictionModeNearestNewMv maps to kPredictionModeNearestMv for index 0
+// and kPredictionModeNewMv for index 1. It is used to simplify the logic in
+// AssignMv (and avoid duplicate code). This is section 5.11.30 in the spec.
+constexpr PredictionMode
+    kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
+        {kPredictionModeNearestMv, kPredictionModeNearestMv},
+        {kPredictionModeNearMv, kPredictionModeNearMv},
+        {kPredictionModeNearestMv, kPredictionModeNewMv},
+        {kPredictionModeNewMv, kPredictionModeNearestMv},
+        {kPredictionModeNearMv, kPredictionModeNewMv},
+        {kPredictionModeNewMv, kPredictionModeNearMv},
+        {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
+        {kPredictionModeNewMv, kPredictionModeNewMv},
+};
+PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
+  if (y_mode < kPredictionModeNearestNearestMv) {
+    return y_mode;
+  }
+  const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
+  assert(lookup_index >= 0);
+  return kCompoundToSinglePredictionMode[lookup_index][index];
+}
+
+// log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
+// dqDenom is always a power of two and hence right shift can be used instead
+// of division.
+constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
+
+// Returns the minimum of |length| or |max|-|start|. This is used to clamp
+// array indices when accessing arrays whose bound is equal to |max|.
+int GetNumElements(int length, int start, int max) {
+  return std::min(length, max - start);
+}
+
+template <typename T>
+void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
+  // Specialize all columns cases (values in kTransformWidth4x4[]) for better
+  // performance.
+  switch (columns) {
+    case 1:
+      MemSetBlock(rows, 1, value, dst, stride);
+      break;
+    case 2:
+      MemSetBlock(rows, 2, value, dst, stride);
+      break;
+    case 4:
+      MemSetBlock(rows, 4, value, dst, stride);
+      break;
+    case 8:
+      MemSetBlock(rows, 8, value, dst, stride);
+      break;
+    default:
+      assert(columns == 16);
+      MemSetBlock(rows, 16, value, dst, stride);
+      break;
+  }
+}
+
+void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
+                      TransformType tx_type,
+                      TransformType transform_types[32][32]) {
+  const int y_offset = y4 - block.row4x4;
+  const int x_offset = x4 - block.column4x4;
+  TransformType* const dst = &transform_types[y_offset][x_offset];
+  SetBlockValues(h4, w4, tx_type, dst, 32);
+}
+
+void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
+                         const MotionVector& mv_to_store, ptrdiff_t stride,
+                         int rows, int columns,
+                         ReferenceFrameType* reference_frame_row_start,
+                         MotionVector* mv) {
+  static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
+  do {
+    // Don't switch the following two memory setting functions.
+    // Some ARM CPUs are quite sensitive to the order.
+    memset(reference_frame_row_start, reference_frame_to_store, columns);
+    std::fill(mv, mv + columns, mv_to_store);
+    reference_frame_row_start += stride;
+    mv += stride;
+  } while (--rows != 0);
+}
+
+// The inverse transform process assumes that the quantized coefficients are
+// stored as a virtual 2d array of size |tx_width| x |tx_height|. If the
+// transform width is 64, then this assumption is broken because the scan order
+// used for populating the coefficients for such transforms is the same as the
+// one used for the corresponding transform with width 32 (e.g. the scan order
+// used for 64x16 is the same as the one used for 32x16). So we must restore
+// the coefficients to their correct positions and clean the positions they
+// occupied.
+template <typename ResidualType>
+void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
+                                  ResidualType* residual) {
+  if (tx_width != 64) return;
+  const int rows = clamped_tx_height - 2;
+  auto* src = residual + 32 * rows;
+  residual += 64 * rows;
+  // Process 2 rows in each loop in reverse order to avoid overwrite.
+  int x = rows >> 1;
+  do {
+    // The 2 rows can be processed in order.
+    memcpy(residual, src, 32 * sizeof(src[0]));
+    memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+    memset(src + 32, 0, 32 * sizeof(src[0]));
+    src -= 64;
+    residual -= 128;
+  } while (--x);
+  // Process the second row. The first row is already correct.
+  memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
+  memset(src + 32, 0, 32 * sizeof(src[0]));
+}
+
+void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
+  // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
+  // and 5.11.54).
+  constexpr int kMvBorder4x4 = 4;
+  const int row_border = kMvBorder4x4 + block.height4x4;
+  const int column_border = kMvBorder4x4 + block.width4x4;
+  const int macroblocks_to_top_edge = -block.row4x4;
+  const int macroblocks_to_bottom_edge =
+      block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
+  const int macroblocks_to_left_edge = -block.column4x4;
+  const int macroblocks_to_right_edge =
+      block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
+  min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
+  min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
+  max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
+  max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
+}
+
+// Section 8.3.2 in the spec, under coeff_base_eob.
+int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
+  if (index == 0) return 0;
+  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+  const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+  const int tx_height = kTransformHeight[adjusted_tx_size];
+  if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
+  if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
+  return 3;
+}
+
+// Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
+// on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
+// the end of block case.
+int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
+                                TransformClass tx_class) {
+  if (pos == 0) return 0;
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  const int row = pos >> adjusted_tx_width_log2;
+  const int column = pos & (tx_width - 1);
+  // This return statement is equivalent to:
+  //   return ((tx_class == kTransformClass2D && (row | column) < 2) ||
+  //           (tx_class == kTransformClassHorizontal && column == 0) ||
+  //           (tx_class == kTransformClassVertical && row == 0))
+  //              ? 7
+  //              : 14;
+  return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
+                 static_cast<int>((row | column) < 2)) |
+                (tx_class & static_cast<int>(column == 0)) |
+                ((tx_class >> 1) & static_cast<int>(row == 0)));
+}
+
+}  // namespace
+
+Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
+           const ObuSequenceHeader& sequence_header,
+           const ObuFrameHeader& frame_header,
+           RefCountedBuffer* const current_frame, const DecoderState& state,
+           FrameScratchBuffer* const frame_scratch_buffer,
+           const WedgeMaskArray& wedge_masks,
+           const QuantizerMatrix& quantizer_matrix,
+           SymbolDecoderContext* const saved_symbol_decoder_context,
+           const SegmentationMap* prev_segment_ids,
+           PostFilter* const post_filter, const dsp::Dsp* const dsp,
+           ThreadPool* const thread_pool,
+           BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
+           bool use_intra_prediction_buffer)
+    : number_(tile_number),
+      row_(number_ / frame_header.tile_info.tile_columns),
+      column_(number_ % frame_header.tile_info.tile_columns),
+      data_(data),
+      size_(size),
+      read_deltas_(false),
+      subsampling_x_{0, sequence_header.color_config.subsampling_x,
+                     sequence_header.color_config.subsampling_x},
+      subsampling_y_{0, sequence_header.color_config.subsampling_y,
+                     sequence_header.color_config.subsampling_y},
+      current_quantizer_index_(frame_header.quantizer.base_index),
+      sequence_header_(sequence_header),
+      frame_header_(frame_header),
+      reference_frame_sign_bias_(state.reference_frame_sign_bias),
+      reference_frames_(state.reference_frame),
+      motion_field_(frame_scratch_buffer->motion_field),
+      reference_order_hint_(state.reference_order_hint),
+      wedge_masks_(wedge_masks),
+      quantizer_matrix_(quantizer_matrix),
+      reader_(data_, size_, frame_header_.enable_cdf_update),
+      symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
+      saved_symbol_decoder_context_(saved_symbol_decoder_context),
+      prev_segment_ids_(prev_segment_ids),
+      dsp_(*dsp),
+      post_filter_(*post_filter),
+      block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
+      quantizer_(sequence_header_.color_config.bitdepth,
+                 &frame_header_.quantizer),
+      residual_size_((sequence_header_.color_config.bitdepth == 8)
+                         ? sizeof(int16_t)
+                         : sizeof(int32_t)),
+      intra_block_copy_lag_(
+          frame_header_.allow_intrabc
+              ? (sequence_header_.use_128x128_superblock ? 3 : 5)
+              : 1),
+      current_frame_(*current_frame),
+      cdef_index_(frame_scratch_buffer->cdef_index),
+      inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
+      thread_pool_(thread_pool),
+      residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
+      tile_scratch_buffer_pool_(
+          &frame_scratch_buffer->tile_scratch_buffer_pool),
+      pending_tiles_(pending_tiles),
+      frame_parallel_(frame_parallel),
+      use_intra_prediction_buffer_(use_intra_prediction_buffer),
+      intra_prediction_buffer_(
+          use_intra_prediction_buffer_
+              ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
+              : nullptr) {
+  row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
+  row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
+  column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
+  column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
+  superblock_rows_ =
+      (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
+  superblock_columns_ =
+      (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
+      block_width4x4_log2;
+  // If |split_parse_and_decode_| is true, we do the necessary setup for
+  // splitting the parsing and the decoding steps. This is done in the
+  // following two cases:
+  //  1) If there is multi-threading within a tile (this is done if
+  //     |thread_pool_| is not nullptr and if there are at least as many
+  //     superblock columns as |intra_block_copy_lag_|).
+  //  2) If |frame_parallel| is true.
+  split_parse_and_decode_ = (thread_pool_ != nullptr &&
+                             superblock_columns_ > intra_block_copy_lag_) ||
+                            frame_parallel;
+  if (frame_parallel_) {
+    reference_frame_progress_cache_.fill(INT_MIN);
+  }
+  memset(delta_lf_, 0, sizeof(delta_lf_));
+  delta_lf_all_zero_ = true;
+  const YuvBuffer& buffer = post_filter_.frame_buffer();
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    // Verify that the borders are big enough for Reconstruct(). max_tx_length
+    // is the maximum value of tx_width and tx_height for the plane.
+    const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
+    // Reconstruct() may overwrite on the right. Since the right border of a
+    // row is followed in memory by the left border of the next row, the
+    // number of extra pixels to the right of a row is at least the sum of the
+    // left and right borders.
+    //
+    // Note: This assertion actually checks the sum of the left and right
+    // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
+    // and vertically shifted version of |buffer|. Since the sum of the left
+    // and right borders is not changed by the shift, we can just check the
+    // sum of the left and right borders of |buffer|.
+    assert(buffer.left_border(plane) + buffer.right_border(plane) >=
+           max_tx_length - 1);
+    // Reconstruct() may overwrite on the bottom. We need an extra border row
+    // on the bottom because we need the left border of that row.
+    //
+    // Note: This assertion checks the bottom border of
+    // post_filter_.GetUnfilteredBuffer(). So we need to calculate the
+    // vertical shift that the PostFilter constructor applied to |buffer| and
+    // reduce the bottom border by that amount.
+#ifndef NDEBUG
+    const int vertical_shift = static_cast<int>(
+        (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
+        buffer.stride(plane));
+    const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
+    assert(bottom_border >= max_tx_length);
+#endif
+    // In AV1, a transform block of height H starts at a y coordinate that is
+    // a multiple of H. If a transform block at the bottom of the frame has
+    // height H, then Reconstruct() will write up to the row with index
+    // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
+    // rows Reconstruct() may write to is
+    // Align(buffer.height(plane), max_tx_length).
+    buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
+                         buffer.stride(plane),
+                         post_filter_.GetUnfilteredBuffer(plane));
+    const int plane_height =
+        SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+    deblock_row_limit_[plane] =
+        std::min(frame_header_.rows4x4,
+                 DivideBy4(plane_height + 3) << subsampling_y_[plane]);
+    const int plane_width =
+        SubsampledValue(frame_header_.width, subsampling_x_[plane]);
+    deblock_column_limit_[plane] =
+        std::min(frame_header_.columns4x4,
+                 DivideBy4(plane_width + 3) << subsampling_x_[plane]);
+  }
+}
+
+bool Tile::Init() {
+  assert(coefficient_levels_.size() == dc_categories_.size());
+  for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
+    const int contexts_per_plane = (i == kEntropyContextLeft)
+                                       ? frame_header_.rows4x4
+                                       : frame_header_.columns4x4;
+    if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
+      LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
+      return false;
+    }
+    if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
+      LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
+      return false;
+    }
+  }
+  if (split_parse_and_decode_) {
+    assert(residual_buffer_pool_ != nullptr);
+    if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
+                                         /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
+      return false;
+    }
+  } else {
+    // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
+    // checks when parsing quantized coefficients.
+    residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
+        32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
+    if (residual_buffer_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
+      return false;
+    }
+    prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
+    if (prediction_parameters_ == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
+      return false;
+    }
+  }
+  if (frame_header_.use_ref_frame_mvs) {
+    assert(sequence_header_.enable_order_hint);
+    SetupMotionField(frame_header_, current_frame_, reference_frames_,
+                     row4x4_start_, row4x4_end_, column4x4_start_,
+                     column4x4_end_, &motion_field_);
+  }
+  ResetLoopRestorationParams();
+  return true;
+}
+
+template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
+bool Tile::ProcessSuperBlockRow(int row4x4,
+                                TileScratchBuffer* const scratch_buffer) {
+  if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
+  assert(scratch_buffer != nullptr);
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
+       column4x4 += block_width4x4) {
+    if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+                           processing_mode)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
+  }
+  if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
+    SaveSymbolDecoderContext();
+  }
+  if (processing_mode == kProcessingModeDecodeOnly ||
+      processing_mode == kProcessingModeParseAndDecode) {
+    PopulateIntraPredictionBuffer(row4x4);
+  }
+  return true;
+}
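+
+// Rough usage sketch of the two instantiations below (the actual call sites
+// live in the decoder, outside this file): in frame parallel mode a tile is
+// parsed first and decoded later, roughly
+//   tile->Parse();           // per-superblock-row parsing only
+//   tile->Decode(...);       // kProcessingModeDecodeOnly rows
+// whereas in non frame parallel mode both steps run together via
+//   tile->ParseAndDecode();  // kProcessingModeParseAndDecode rows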
+
+// Used in frame parallel mode. The symbol decoder context need not be saved
+// in this case since it was done when parsing was complete.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+// Used in non frame parallel mode.
+template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+    int row4x4, TileScratchBuffer* scratch_buffer);
+
+void Tile::SaveSymbolDecoderContext() {
+  if (frame_header_.enable_frame_end_update_cdf &&
+      number_ == frame_header_.tile_info.context_update_id) {
+    *saved_symbol_decoder_context_ = symbol_decoder_context_;
+  }
+}
+
+bool Tile::ParseAndDecode() {
+  // If this is the main thread, we build the loop filter bit masks when
+  // parsing so that it happens in the current thread. This ensures that the
+  // main thread does as much work as possible.
+  if (split_parse_and_decode_) {
+    if (!ThreadedParseAndDecode()) return false;
+    SaveSymbolDecoderContext();
+    return true;
+  }
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    pending_tiles_->Decrement(false);
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4) {
+    if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+            row4x4, scratch_buffer.get())) {
+      pending_tiles_->Decrement(false);
+      return false;
+    }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  pending_tiles_->Decrement(true);
+  return true;
+}
+
+bool Tile::Parse() {
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4) {
+    if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
+            row4x4, scratch_buffer.get())) {
+      return false;
+    }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  SaveSymbolDecoderContext();
+  return true;
+}
+
+bool Tile::Decode(
+    std::mutex* const mutex, int* const superblock_row_progress,
+    std::condition_variable* const superblock_row_progress_condvar) {
+  const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+  const int block_width4x4_log2 =
+      sequence_header_.use_128x128_superblock ? 5 : 4;
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
+       row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
+    if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+            row4x4, scratch_buffer.get())) {
+      return false;
+    }
+    if (post_filter_.DoDeblock()) {
+      // Apply vertical deblock filtering for all the columns in this tile
+      // except for the first 64 columns.
+      post_filter_.ApplyDeblockFilter(
+          kLoopFilterTypeVertical, row4x4,
+          column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
+          block_width4x4);
+      // If this is the first superblock row of the tile, then we cannot apply
+      // horizontal deblocking here since we don't know if the top row is
+      // available. So it will be done by the calling thread in that case.
+      if (row4x4 != row4x4_start_) {
+        // Apply horizontal deblock filtering for all the columns in this tile
+        // except for the first and the last 64 columns.
+        // Note about the last tile of each row: For the last tile,
+        // column4x4_end may not be a multiple of 16. In that case it is still
+        // okay to simply subtract 16 since ApplyDeblockFilter() will only do
+        // the filters in increments of 64 columns (or 32 columns for chroma
+        // with subsampling).
+        post_filter_.ApplyDeblockFilter(
+            kLoopFilterTypeHorizontal, row4x4,
+            column4x4_start_ + kNum4x4InLoopFilterUnit,
+            column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
+      }
+    }
+    bool notify;
+    {
+      std::unique_lock<std::mutex> lock(*mutex);
+      notify = ++superblock_row_progress[index] ==
+               frame_header_.tile_info.tile_columns;
+    }
+    if (notify) {
+      // We are done decoding this superblock row. Notify the post filtering
+      // thread.
+      superblock_row_progress_condvar[index].notify_one();
+    }
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  return true;
+}
+
+bool Tile::ThreadedParseAndDecode() {
+  {
+    std::lock_guard<std::mutex> lock(threading_.mutex);
+    if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
+      pending_tiles_->Decrement(false);
+      LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
+      return false;
+    }
+    // Account for the parsing job.
+    ++threading_.pending_jobs;
+  }
+
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+
+  // Begin parsing.
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  if (scratch_buffer == nullptr) {
+    pending_tiles_->Decrement(false);
+    LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
+    return false;
+  }
+  for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
+       row4x4 += block_width4x4, ++row_index) {
+    for (int column4x4 = column4x4_start_, column_index = 0;
+         column4x4 < column4x4_end_;
+         column4x4 += block_width4x4, ++column_index) {
+      if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
+                             scratch_buffer.get(), kProcessingModeParseOnly)) {
+        std::lock_guard<std::mutex> lock(threading_.mutex);
+        threading_.abort = true;
+        break;
+      }
+      std::unique_lock<std::mutex> lock(threading_.mutex);
+      if (threading_.abort) break;
+      threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
+      // Schedule the decoding of this superblock if it is allowed.
+      if (CanDecode(row_index, column_index)) {
+        ++threading_.pending_jobs;
+        threading_.sb_state[row_index][column_index] =
+            kSuperBlockStateScheduled;
+        lock.unlock();
+        thread_pool_->Schedule(
+            [this, row_index, column_index, block_width4x4]() {
+              DecodeSuperBlock(row_index, column_index, block_width4x4);
+            });
+      }
+    }
+    std::lock_guard<std::mutex> lock(threading_.mutex);
+    if (threading_.abort) break;
+  }
+  tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+
+  // We are done parsing. We can return here since the calling thread will
+  // make sure that it waits for all the superblocks to be decoded.
+  //
+  // Finish using |threading_| before |pending_tiles_->Decrement()| because
+  // the Tile object could go out of scope as soon as
+  // |pending_tiles_->Decrement()| is called.
+  threading_.mutex.lock();
+  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+  const bool job_succeeded = !threading_.abort;
+  threading_.mutex.unlock();
+  if (no_pending_jobs) {
+    // We are done parsing and decoding this tile.
+    pending_tiles_->Decrement(job_succeeded);
+  }
+  return job_succeeded;
+}
+
+bool Tile::CanDecode(int row_index, int column_index) const {
+  assert(row_index >= 0);
+  assert(column_index >= 0);
+  // If |threading_.sb_state[row_index][column_index]| is not equal to
+  // kSuperBlockStateParsed, then return false. This is ok because if
+  // |threading_.sb_state[row_index][column_index]| is equal to:
+  //   kSuperBlockStateNone - then the superblock is not yet parsed.
+  //   kSuperBlockStateScheduled - then the superblock is already scheduled
+  //                               for decode.
+  //   kSuperBlockStateDecoded - then the superblock has already been decoded.
+  if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
+      threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
+    return false;
+  }
+  // First superblock has no dependencies.
+  if (row_index == 0 && column_index == 0) {
+    return true;
+  }
+  // Superblocks in the first row only depend on the superblock to the left of
+  // it.
+  if (row_index == 0) {
+    return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
+  }
+  // All other superblocks depend on the superblock to the left of it (if one
+  // exists) and the superblock to the top right with a lag of
+  // |intra_block_copy_lag_| (if one exists).
+  const int top_right_column_index =
+      std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
+  return threading_.sb_state[row_index - 1][top_right_column_index] ==
+             kSuperBlockStateDecoded &&
+         (column_index == 0 ||
+          threading_.sb_state[row_index][column_index - 1] ==
+              kSuperBlockStateDecoded);
+}
+
+void Tile::DecodeSuperBlock(int row_index, int column_index,
+                            int block_width4x4) {
+  const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
+  const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      tile_scratch_buffer_pool_->Get();
+  bool ok = scratch_buffer != nullptr;
+  if (ok) {
+    ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
+                           scratch_buffer.get(), kProcessingModeDecodeOnly);
+    tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
+  }
+  std::unique_lock<std::mutex> lock(threading_.mutex);
+  if (ok) {
+    threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
+    // Candidate rows and columns that we could potentially begin the decoding
+    // (if it is allowed to do so). The candidates are:
+    //   1) The superblock to the bottom-left of the current superblock with a
+    //      lag of |intra_block_copy_lag_| (or the beginning of the next
+    //      superblock row in case there are less than |intra_block_copy_lag_|
+    //      superblock columns in the Tile).
+    //   2) The superblock to the right of the current superblock.
+    const int candidate_row_indices[] = {row_index + 1, row_index};
+    const int candidate_column_indices[] = {
+        std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
+    for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
+         ++i) {
+      const int candidate_row_index = candidate_row_indices[i];
+      const int candidate_column_index = candidate_column_indices[i];
+      if (!CanDecode(candidate_row_index, candidate_column_index)) {
+        continue;
+      }
+      ++threading_.pending_jobs;
+      threading_.sb_state[candidate_row_index][candidate_column_index] =
+          kSuperBlockStateScheduled;
+      lock.unlock();
+      thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
+                              block_width4x4]() {
+        DecodeSuperBlock(candidate_row_index, candidate_column_index,
+                         block_width4x4);
+      });
+      lock.lock();
+    }
+  } else {
+    threading_.abort = true;
+  }
+  // Finish using |threading_| before |pending_tiles_->Decrement()| because
+  // the Tile object could go out of scope as soon as
+  // |pending_tiles_->Decrement()| is called.
+  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
+  const bool job_succeeded = !threading_.abort;
+  lock.unlock();
+  if (no_pending_jobs) {
+    // We are done parsing and decoding this tile.
+    pending_tiles_->Decrement(job_succeeded);
+  }
+}
+
+void Tile::PopulateIntraPredictionBuffer(int row4x4) {
+  const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
+  if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
+    return;
+  }
+  const size_t pixel_size =
+      (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                   : sizeof(uint16_t));
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    const int row_to_copy =
+        (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
+    const size_t pixels_to_copy =
+        (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
+         subsampling_x_[plane]) *
+        pixel_size;
+    const size_t column_start =
+        MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
+    void* start;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (sequence_header_.color_config.bitdepth > 8) {
+      Array2DView<uint16_t> buffer(
+          buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+          reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+      start = &buffer[row_to_copy][column_start];
+    } else  // NOLINT
+#endif
+    {
+      start = &buffer_[plane][row_to_copy][column_start];
+    }
+    memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
+           start, pixels_to_copy);
+  }
+}
+
+int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
+                                     TransformSize tx_size, int x4, int y4,
+                                     int w4, int h4) {
+  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const BlockSize plane_size = block.residual_size[plane];
+  const int block_width = kBlockWidthPixels[plane_size];
+  const int block_height = kBlockHeightPixels[plane_size];
+
+  int top = 0;
+  int left = 0;
+  const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+  const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+  if (plane == kPlaneY) {
+    if (block_width == tx_width && block_height == tx_height) return 0;
+    const uint8_t* coefficient_levels =
+        &coefficient_levels_[kEntropyContextTop][plane][x4];
+    for (int i = 0; i < num_top_elements; ++i) {
+      top = std::max(top, static_cast<int>(coefficient_levels[i]));
+    }
+    coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+    for (int i = 0; i < num_left_elements; ++i) {
+      left = std::max(left, static_cast<int>(coefficient_levels[i]));
+    }
+    assert(top <= 4);
+    assert(left <= 4);
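+    // For example, clamped maxima of top == 2 and left == 3 select
+    // kAllZeroContextsByTopLeft[2][3] == 4 in the lookup below.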
+    // kAllZeroContextsByTopLeft is pre-computed based on the logic in the
+    // spec for top and left.
+    return kAllZeroContextsByTopLeft[top][left];
+  }
+  const uint8_t* coefficient_levels =
+      &coefficient_levels_[kEntropyContextTop][plane][x4];
+  const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+  for (int i = 0; i < num_top_elements; ++i) {
+    top |= coefficient_levels[i];
+    top |= dc_categories[i];
+  }
+  coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
+  dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+  for (int i = 0; i < num_left_elements; ++i) {
+    left |= coefficient_levels[i];
+    left |= dc_categories[i];
+  }
+  return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
+         3 * static_cast<int>(block_width * block_height >
+                              tx_width * tx_height);
+}
+
+TransformSet Tile::GetTransformSet(TransformSize tx_size,
+                                   bool is_inter) const {
+  const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
+  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+  if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
+  if (is_inter) {
+    if (frame_header_.reduced_tx_set ||
+        tx_size_square_max == kTransformSize32x32) {
+      return kTransformSetInter3;
+    }
+    if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
+    return kTransformSetInter1;
+  }
+  if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
+  if (frame_header_.reduced_tx_set ||
+      tx_size_square_min == kTransformSize16x16) {
+    return kTransformSetIntra2;
+  }
+  return kTransformSetIntra1;
+}
+
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+                                         TransformSize tx_size, int block_x,
+                                         int block_y) {
+  const BlockParameters& bp = *block.bp;
+  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+  if (frame_header_.segmentation.lossless[bp.segment_id] ||
+      tx_size_square_max == kTransformSize64x64) {
+    return kTransformTypeDctDct;
+  }
+  if (plane == kPlaneY) {
+    return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+  }
+  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+  TransformType tx_type;
+  if (bp.is_inter) {
+    const int x4 =
+        std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+    const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+    tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+  } else {
+    tx_type = kModeToTransformType[bp.uv_mode];
+  }
+  return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+
+TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
+                                         TransformSize tx_size, int block_x,
+                                         int block_y) {
+  const BlockParameters& bp = *block.bp;
+  const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
+  if (frame_header_.segmentation.lossless[bp.segment_id] ||
+      tx_size_square_max == kTransformSize64x64) {
+    return kTransformTypeDctDct;
+  }
+  if (plane == kPlaneY) {
+    return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
+  }
+  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+  TransformType tx_type;
+  if (bp.is_inter) {
+    const int x4 =
+        std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
+    const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
+    tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
+  } else {
+    tx_type = kModeToTransformType[bp.uv_mode];
+  }
+  return kTransformTypeInSetMask[tx_set].Contains(tx_type)
+             ? tx_type
+             : kTransformTypeDctDct;
+}
+
+void Tile::ReadTransformType(const Block& block, int x4, int y4,
+                             TransformSize tx_size) {
+  BlockParameters& bp = *block.bp;
+  const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
+
+  TransformType tx_type = kTransformTypeDctDct;
+  if (tx_set != kTransformSetDctOnly &&
+      frame_header_.segmentation.qindex[bp.segment_id] > 0) {
+    const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
+    const int cdf_tx_size_index =
+        TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
+    uint16_t* cdf;
+    if (bp.is_inter) {
+      cdf = symbol_decoder_context_
+                .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+      switch (tx_set) {
+        case kTransformSetInter1:
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+          break;
+        case kTransformSetInter2:
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+          break;
+        default:
+          assert(tx_set == kTransformSetInter3);
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+          break;
+      }
+    } else {
+      const PredictionMode intra_direction =
+          block.bp->prediction_parameters->use_filter_intra
+              ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
+                                                     ->filter_intra_mode]
+              : bp.y_mode;
+      cdf =
+          symbol_decoder_context_
+              .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+      assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+      tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+                                               ? reader_.ReadSymbol<7>(cdf)
+                                               : reader_.ReadSymbol<5>(cdf));
+    }
+
+    // This array does not contain an entry for kTransformSetDctOnly, so the
+    // first dimension needs to be offset by 1.
+    tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
+  }
+  SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
+                   kTransformHeight4x4[tx_size], tx_type, transform_types_);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the two right neighbors and the
+// one bottom-right neighbor may be out of boundary. We don't check the right
+// boundary for them, because the out of boundary neighbors project to
+// positions above the diagonal line which goes through the current coefficient
+// and these positions are still all 0s according to the diagonal scan order.
+template <typename ResidualType>
+void Tile::ReadCoeffBase2D(
+    const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+    int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  for (int i = eob - 2; i >= 1; --i) {
+    const uint16_t pos = scan[i];
+    const int row = pos >> adjusted_tx_width_log2;
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+                             levels[tx_width + 1] + levels[2] +
+                             levels[MultiplyBy2(tx_width)];
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBaseContextOffset[tx_size][std::min(row, 4)]
+                                               [std::min(column, 4)];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      int context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                          quantized[tx_width] +       // {1, 0}
+                                          quantized[tx_width + 1]));  // {1, 1}
+      context += 14 >> static_cast<int>((row | column) < 2);
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  }
+  // Read position 0.
+  {
+    auto* const quantized = &quantized_buffer[0];
+    int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+    level_buffer[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      const int context =
+          std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                quantized[tx_width] +       // {1, 0}
+                                quantized[tx_width + 1]));  // {1, 1}
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  }
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// For a coefficient near the right boundary, the four right neighbors may be
+// out of boundary. We don't do the boundary check for the first three right
+// neighbors, because even for the transform blocks with smallest width 4, the
+// first three out of boundary neighbors project to positions left of the
+// current coefficient and these positions are still all 0s according to the
+// column scan order. However, when transform block width is 4 and the current
+// coefficient is on the right boundary, its fourth right neighbor projects to
+// the under position on the same column, which could be nonzero. Therefore, we
+// must skip the fourth right neighbor. To make it simple, for any coefficient,
+// we always do the boundary check for its fourth right neighbor.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseHorizontal(
+    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+    int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  int i = eob - 2;
+  do {
+    const uint16_t pos = scan[i];
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum =
+        1 + (levels[1] +                                  // {0, 1}
+             levels[tx_width] +                           // {1, 0}
+             levels[2] +                                  // {0, 2}
+             levels[3] +                                  // {0, 3}
+             ((column + 4 < tx_width) ? levels[4] : 0));  // {0, 4}
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBasePositionContextOffset[column];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      int context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
+                                          quantized[tx_width] +  // {1, 0}
+                                          quantized[2]));        // {0, 2}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>(column == 0);
+      }
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
+}
+
+// Section 8.3.2 in the spec, under coeff_base and coeff_br.
+// Bottom boundary checks are avoided by the padded rows.
+// Right boundary check is performed explicitly.
+template <typename ResidualType>
+void Tile::ReadCoeffBaseVertical(
+    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+    int eob,
+    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
+  const int tx_width = 1 << adjusted_tx_width_log2;
+  int i = eob - 2;
+  do {
+    const uint16_t pos = scan[i];
+    const int row = pos >> adjusted_tx_width_log2;
+    const int column = pos & (tx_width - 1);
+    auto* const quantized = &quantized_buffer[pos];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum =
+        1 + (((column + 1 < tx_width) ? levels[1] : 0) +  // {0, 1}
+             levels[tx_width] +                           // {1, 0}
+             levels[MultiplyBy2(tx_width)] +              // {2, 0}
+             levels[tx_width * 3] +                       // {3, 0}
+             levels[MultiplyBy4(tx_width)]);              // {4, 0}
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBasePositionContextOffset[row];
+    int level =
+        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
+      int context =
+          std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
+                                quantized[tx_width] +                // {1, 0}
+                                quantized[MultiplyBy2(tx_width)]));  // {2, 0}
+      if (pos != 0) {
+        context += 14 >> static_cast<int>(row == 0);
+      }
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  } while (--i >= 0);
+}
+
+int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
+  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+  const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
+  // Set dc_sign to 8-bit long so that std::accumulate() saves sign extension.
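+  // For example, top dc categories {-1, 0, 1} and left dc categories {1, 1}
+  // accumulate to dc_sign = 2, so the return statement below yields 2.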
+  int8_t dc_sign = std::accumulate(
+      dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
+  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+  dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
+  dc_sign = std::accumulate(
+      dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
+  // This return statement is equivalent to:
+  //   if (dc_sign < 0) return 1;
+  //   if (dc_sign > 0) return 2;
+  //   return 0;
+  // And it is better than:
+  //   return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
+  return static_cast<int>(dc_sign < 0) +
+         MultiplyBy2(static_cast<int>(dc_sign > 0));
+}
+
+void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
+                              uint8_t coefficient_level, int8_t dc_category) {
+  const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
+  const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
+  memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
+         num_top_elements);
+  memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
+         num_top_elements);
+  const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
+  const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
+  memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
+         coefficient_level, num_left_elements);
+  memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
+         num_left_elements);
+}
+
+template <bool is_dc_coefficient, typename ResidualType>
+bool Tile::ReadSignAndApplyDequantization(
+    const uint16_t* const scan, int i, int q_value,
+    const uint8_t* const quantizer_matrix, int shift, int max_value,
+    uint16_t* const dc_sign_cdf, int8_t* const dc_category,
+    int* const coefficient_level, ResidualType* residual_buffer) {
+  const int pos = is_dc_coefficient ? 0 : scan[i];
+  // If residual_buffer[pos] is zero, then the rest of the function has no
+  // effect.
+  int level = residual_buffer[pos];
+  if (level == 0) return true;
+  const int sign = is_dc_coefficient
+                       ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
+                       : reader_.ReadBit();
+  if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
+    int length = 0;
+    bool golomb_length_bit = false;
+    do {
+      golomb_length_bit = static_cast<bool>(reader_.ReadBit());
+      ++length;
+      if (length > 20) {
+        LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
+        return false;
+      }
+    } while (!golomb_length_bit);
+    int x = 1;
+    for (int i = length - 2; i >= 0; --i) {
+      x = (x << 1) | reader_.ReadBit();
+    }
+    level += x - 1;
+  }
+  if (is_dc_coefficient) {
+    *dc_category = (sign != 0) ? -1 : 1;
+  }
+  level &= 0xfffff;
+  *coefficient_level += level;
+  // Apply dequantization. Step 1 of section 7.12.3 in the spec.
+  int q = q_value;
+  if (quantizer_matrix != nullptr) {
+    q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
+  }
+  // The intermediate multiplication can exceed 32 bits, so it has to be
+  // performed by promoting one of the values to int64_t.
+  int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
+  dequantized_value >>= shift;
+  // At this point:
+  //   * |dequantized_value| is always non-negative.
+  //   * |sign| can be either 0 or 1.
+  //   * min_value = -(max_value + 1).
+  // We need to apply the following:
+  //   dequantized_value = sign ? -dequantized_value : dequantized_value;
+  //   dequantized_value = Clip3(dequantized_value, min_value, max_value);
+  //
+  // Note that -x == ~(x - 1). The above two lines can therefore be done with
+  // a single std::min() and an xor as follows:
+  dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
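+  // Worked example with max_value = 7 (so min_value = -8):
+  //   dequantized_value = 5, sign = 0: std::min(5 - 0, 7) ^ -0 = 5
+  //   dequantized_value = 5, sign = 1: std::min(5 - 1, 7) ^ -1 = ~4 = -5
+  //   dequantized_value = 9, sign = 1: std::min(9 - 1, 7) ^ -1 = ~7 = -8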
+  residual_buffer[pos] = dequantized_value;
+  return true;
+}
+
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
+  int level = 0;
+  for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
+    const int coeff_base_range =
+        reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
+    level += coeff_base_range;
+    if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
+  }
+  return level;
+}
+
+template <typename ResidualType>
+int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
+                                    int start_x, int start_y,
+                                    TransformSize tx_size,
+                                    TransformType* const tx_type) {
+  const int x4 = DivideBy4(start_x);
+  const int y4 = DivideBy4(start_y);
+  const int w4 = kTransformWidth4x4[tx_size];
+  const int h4 = kTransformHeight4x4[tx_size];
+  const int tx_size_context = kTransformSizeContext[tx_size];
+  int context =
+      GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
+  const bool all_zero = reader_.ReadSymbol(
+      symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
+  if (all_zero) {
+    if (plane == kPlaneY) {
+      SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
+                       transform_types_);
+    }
+    SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
+    // This is not used in this case, so it can be set to any value.
+    *tx_type = kNumTransformTypes;
+    return 0;
+  }
+  const int tx_width = kTransformWidth[tx_size];
+  const int tx_height = kTransformHeight[tx_size];
+  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
+  const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
+  const int tx_padding =
+      (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
+  auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
+  // Clear padding to avoid bottom boundary checks when parsing quantized
+  // coefficients.
+  memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+  uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+  memset(
+      level_buffer, 0,
+      kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+          tx_padding);
+  const int clamped_tx_height = std::min(tx_height, 32);
+  if (plane == kPlaneY) {
+    ReadTransformType(block, x4, y4, tx_size);
+  }
+  BlockParameters& bp = *block.bp;
+  *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
+  const int eob_multi_size = kEobMultiSizeLookup[tx_size];
+  const PlaneType plane_type = GetPlaneType(plane);
+  const TransformClass tx_class = GetTransformClass(*tx_type);
+  context = static_cast<int>(tx_class != kTransformClass2D);
+  int eob_pt = 1;
+  switch (eob_multi_size) {
+    case 0:
+      eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+          symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
+      break;
+    case 1:
+      eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+          symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
+      break;
+    case 2:
+      eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+          symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
+      break;
+    case 3:
+      eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+          symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
+      break;
+    case 4:
+      eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+          symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
+      break;
+    case 5:
+      eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+          symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
+      break;
+    case 6:
+    default:
+      eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+          symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
+      break;
+  }
+  int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
+  if (eob_pt >= 3) {
+    context = eob_pt - 3;
+    const bool eob_extra = reader_.ReadSymbol(
+        symbol_decoder_context_
+            .eob_extra_cdf[tx_size_context][plane_type][context]);
+    if (eob_extra) eob += 1 << (eob_pt - 3);
+    for (int i = 1; i < eob_pt - 2; ++i) {
+      assert(eob_pt - i >= 3);
+      assert(eob_pt <= kEobPt1024SymbolCount);
+      if (static_cast<bool>(reader_.ReadBit())) {
+        eob += 1 << (eob_pt - i - 3);
+      }
+    }
+  }
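+  // For example, eob_pt = 5 gives a base eob of (1 << 3) + 1 = 9. eob_extra
+  // adds 1 << 2 = 4 when set, and the two remaining bits add 2 and 1, so the
+  // final eob lies in [9, 16].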
+  const uint16_t* scan = kScan[tx_class][tx_size];
+  const int clamped_tx_size_context = std::min(tx_size_context, 3);
+  auto coeff_base_range_cdf =
+      symbol_decoder_context_
+          .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
+  // Read the last coefficient.
+  {
+    context = GetCoeffBaseContextEob(tx_size, eob - 1);
+    const uint16_t pos = scan[eob - 1];
+    int level =
+        1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
+                symbol_decoder_context_
+                    .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+    level_buffer[pos] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      level +=
+          ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+              adjusted_tx_width_log2, pos, tx_class)]);
+    }
+    residual[pos] = level;
+  }
+  if (eob > 1) {
+    // Read all the other coefficients.
+    // Lookup used to call the right variant of ReadCoeffBase*() based on the
+    // transform class.
+    static constexpr void (Tile::*kGetCoeffBaseFunc[])(
+        const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+        int eob,
+        uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
+        uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                     [kCoeffBaseRangeSymbolCount + 1],
+        ResidualType* quantized_buffer,
+        uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+                                  &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+                                  &Tile::ReadCoeffBaseVertical<ResidualType>};
+    (this->*kGetCoeffBaseFunc[tx_class])(
+        scan, tx_size, adjusted_tx_width_log2, eob,
+        symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
+        coeff_base_range_cdf, residual, level_buffer);
+  }
+  const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
+  const int current_quantizer_index = GetQIndex(
+      frame_header_.segmentation, bp.segment_id, current_quantizer_index_);
+  const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
+  const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
+  const int shift = kQuantizationShift[tx_size];
+  const uint8_t* const quantizer_matrix =
+      (frame_header_.quantizer.use_matrix &&
+       *tx_type < kTransformTypeIdentityIdentity &&
+       !frame_header_.segmentation.lossless[bp.segment_id] &&
+       frame_header_.quantizer.matrix_level[plane] < 15)
+          ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+                             [plane_type][adjusted_tx_size]
+                                 .get()
+          : nullptr;
+  int coefficient_level = 0;
+  int8_t dc_category = 0;
+  uint16_t* const dc_sign_cdf =
+      (residual[0] != 0)
+          ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
+                x4, y4, w4, h4, plane)]
+          : nullptr;
+  assert(scan[0] == 0);
+  if (!ReadSignAndApplyDequantization</*is_dc_coefficient=*/true>(
+          scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
+          &dc_category, &coefficient_level, residual)) {
+    return -1;
+  }
+  if (eob > 1) {
+    int i = 1;
+    do {
+      if (!ReadSignAndApplyDequantization</*is_dc_coefficient=*/false>(
+              scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
+              nullptr, &coefficient_level, residual)) {
+        return -1;
+      }
+    } while (++i < eob);
+    MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
+  }
+  SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
+                     dc_category);
+  if (split_parse_and_decode_) {
+    *block.residual += tx_width * tx_height * residual_size_;
+  }
+  return eob;
+}
+
+// CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
+// |function| depending on the value of
+// |sequence_header_.color_config.bitdepth| with the variadic arguments.
+#if LIBGAV1_MAX_BITDEPTH >= 10
+#define CALL_BITDEPTH_FUNCTION(function, ...)         \
+  do {                                                \
+    if (sequence_header_.color_config.bitdepth > 8) { \
+      function<uint16_t>(__VA_ARGS__);                \
+    } else {                                          \
+      function<uint8_t>(__VA_ARGS__);                 \
+    }                                                 \
+  } while (false)
+#else
+#define CALL_BITDEPTH_FUNCTION(function, ...) \
+  do {                                        \
+    function<uint8_t>(__VA_ARGS__);           \
+  } while (false)
+#endif
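+
+// For example, with a 10-bit stream,
+//   CALL_BITDEPTH_FUNCTION(IntraPrediction, block, plane, x, y);
+// expands to IntraPrediction<uint16_t>(block, plane, x, y); with an 8-bit
+// stream it expands to IntraPrediction<uint8_t>(block, plane, x, y).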
+
+bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
+                          int base_y, TransformSize tx_size, int x, int y,
+                          ProcessingMode mode) {
+  BlockParameters& bp = *block.bp;
+  const int subsampling_x = subsampling_x_[plane];
+  const int subsampling_y = subsampling_y_[plane];
+  const int start_x = base_x + MultiplyBy4(x);
+  const int start_y = base_y + MultiplyBy4(y);
+  const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
+  const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
+  if (start_x >= max_x || start_y >= max_y) return true;
+  const int row = DivideBy4(start_y << subsampling_y);
+  const int column = DivideBy4(start_x << subsampling_x);
+  const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
+  const int sub_block_row4x4 = row & mask;
+  const int sub_block_column4x4 = column & mask;
+  const int step_x = kTransformWidth4x4[tx_size];
+  const int step_y = kTransformHeight4x4[tx_size];
+  const bool do_decode = mode == kProcessingModeDecodeOnly ||
+                         mode == kProcessingModeParseAndDecode;
+  if (do_decode && !bp.is_inter) {
+    if (bp.palette_mode_info.size[GetPlaneType(plane)] > 0) {
+      CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
+                             x, y, tx_size);
+    } else {
+      const PredictionMode mode =
+          (plane == kPlaneY)
+              ? bp.y_mode
+              : (bp.uv_mode == kPredictionModeChromaFromLuma ? kPredictionModeDc
+                                                             : bp.uv_mode);
+      const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
+      const int tr_column4x4 =
+          (sub_block_column4x4 >> subsampling_x) + step_x + 1;
+      const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
+      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
+      const bool has_left = x > 0 || block.left_available[plane];
+      const bool has_top = y > 0 || block.top_available[plane];
+
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          mode, tx_size);
+      if (plane != kPlaneY && bp.uv_mode == kPredictionModeChromaFromLuma) {
+        CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
+                               start_y, tx_size);
+      }
+    }
+    if (plane == kPlaneY) {
+      block.bp->prediction_parameters->max_luma_width =
+          start_x + MultiplyBy4(step_x);
+      block.bp->prediction_parameters->max_luma_height =
+          start_y + MultiplyBy4(step_y);
+      block.scratch_buffer->cfl_luma_buffer_valid = false;
+    }
+  }
+  if (!bp.skip) {
+    const int sb_row_index = SuperBlockRowIndex(block.row4x4);
+    const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
+    if (mode == kProcessingModeDecodeOnly) {
+      TransformParameterQueue& tx_params =
+          *residual_buffer_threaded_[sb_row_index][sb_column_index]
+               ->transform_parameters();
+      ReconstructBlock(block, plane, start_x, start_y, tx_size,
+                       tx_params.Type(), tx_params.NonZeroCoeffCount());
+      tx_params.Pop();
+    } else {
+      TransformType tx_type;
+      int non_zero_coeff_count;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      if (sequence_header_.color_config.bitdepth > 8) {
+        non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
+            block, plane, start_x, start_y, tx_size, &tx_type);
+      } else  // NOLINT
+#endif
+      {
+        non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
+            block, plane, start_x, start_y, tx_size, &tx_type);
+      }
+      if (non_zero_coeff_count < 0) return false;
+      if (mode == kProcessingModeParseAndDecode) {
+        ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
+                         non_zero_coeff_count);
+      } else {
+        assert(mode == kProcessingModeParseOnly);
+        residual_buffer_threaded_[sb_row_index][sb_column_index]
+            ->transform_parameters()
+            ->Push(non_zero_coeff_count, tx_type);
+      }
+    }
+  }
+  if (do_decode) {
+    bool* block_decoded =
+        &block.scratch_buffer
+             ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
+                            [(sub_block_column4x4 >> subsampling_x) + 1];
+    SetBlockValues(step_y, step_x, true, block_decoded,
+                   TileScratchBuffer::kBlockDecodedStride);
+  }
+  return true;
+}
+
+bool Tile::TransformTree(const Block& block, int start_x, int start_y,
+                         BlockSize plane_size, ProcessingMode mode) {
+  assert(plane_size <= kBlock64x64);
+  // Branching factor is 4; maximum depth is 4; so the maximum stack size
+  // required is (4 - 1) * 4 + 1 = 13.
+  Stack<TransformTreeNode, 13> stack;
+  // It is okay to cast BlockSize to TransformSize here since the enums are
+  // equivalent for all BlockSize values <= kBlock64x64.
+  stack.Push(TransformTreeNode(start_x, start_y,
+                               static_cast<TransformSize>(plane_size)));
+
+  do {
+    TransformTreeNode node = stack.Pop();
+    const int row = DivideBy4(node.y);
+    const int column = DivideBy4(node.x);
+    if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
+      continue;
+    }
+    const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
+    const int width = kTransformWidth[node.tx_size];
+    const int height = kTransformHeight[node.tx_size];
+    if (width <= kTransformWidth[inter_tx_size] &&
+        height <= kTransformHeight[inter_tx_size]) {
+      if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
+                          mode)) {
+        return false;
+      }
+      continue;
+    }
+    // The split transform size lookup gives the right transform size that we
+    // should push in the stack.
+    //   if (width > height) => transform size whose width is half.
+    //   if (width < height) => transform size whose height is half.
+    //   if (width == height) => transform size whose width and height are half.
+    const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
+    const int half_width = DivideBy2(width);
+    if (width > height) {
+      stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+      continue;
+    }
+    const int half_height = DivideBy2(height);
+    if (width < height) {
+      stack.Push(
+          TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+      stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+      continue;
+    }
+    stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
+                                 split_tx_size));
+    stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
+    stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
+    stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
+  } while (!stack.Empty());
+  return true;
+}
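+
+// For example, a 64x32 node that must be split has width > height, so
+// kSplitTransformSize yields 32x32 and two children are pushed: (x + 32, y)
+// first and (x, y) second, so that the left child is processed first.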
+
+void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
+                            int start_y, TransformSize tx_size,
+                            TransformType tx_type, int non_zero_coeff_count) {
+  // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
+  assert(non_zero_coeff_count >= 0);
+  if (non_zero_coeff_count == 0) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (sequence_header_.color_config.bitdepth > 8) {
+    Array2DView<uint16_t> buffer(
+        buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
+        reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation.lossless[block.bp->segment_id],
+                reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
+                &buffer, non_zero_coeff_count);
+  } else  // NOLINT
+#endif
+  {
+    Reconstruct(dsp_, tx_type, tx_size,
+                frame_header_.segmentation.lossless[block.bp->segment_id],
+                reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
+                &buffer_[plane], non_zero_coeff_count);
+  }
+  if (split_parse_and_decode_) {
+    *block.residual +=
+        kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
+  }
+}
+
+bool Tile::Residual(const Block& block, ProcessingMode mode) {
+  const int width_chunks = std::max(1, block.width >> 6);
+  const int height_chunks = std::max(1, block.height >> 6);
+  const BlockSize size_chunk4x4 =
+      (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
+  const BlockParameters& bp = *block.bp;
+  for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
+    for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
+      const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+      int plane = kPlaneY;
+      do {
+        const int subsampling_x = subsampling_x_[plane];
+        const int subsampling_y = subsampling_y_[plane];
+        // For the Y plane, when lossless is true |bp.transform_size| is always
+        // kTransformSize4x4. So we can simply use |bp.transform_size| here as
+        // the Y plane's transform size (part of Section 5.11.37 in the spec).
+        const TransformSize tx_size =
+            (plane == kPlaneY) ? bp.transform_size : bp.uv_transform_size;
+        const BlockSize plane_size =
+            kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
+        assert(plane_size != kBlockInvalid);
+        if (bp.is_inter &&
+            !frame_header_.segmentation.lossless[bp.segment_id] &&
+            plane == kPlaneY) {
+          const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
+          const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
+          const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
+          const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
+          if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
+            return false;
+          }
+        } else {
+          const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+          const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+          const int step_x = kTransformWidth4x4[tx_size];
+          const int step_y = kTransformHeight4x4[tx_size];
+          const int num4x4_wide = kNum4x4BlocksWide[plane_size];
+          const int num4x4_high = kNum4x4BlocksHigh[plane_size];
+          for (int y = 0; y < num4x4_high; y += step_y) {
+            for (int x = 0; x < num4x4_wide; x += step_x) {
+              if (!TransformBlock(
+                      block, static_cast<Plane>(plane), base_x, base_y, tx_size,
+                      x + (MultiplyBy16(chunk_x) >> subsampling_x),
+                      y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
+                return false;
+              }
+            }
+          }
+        }
+      } while (++plane < num_planes);
+    }
+  }
+  return true;
+}
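+
+// For example, a 128x128 block yields width_chunks = height_chunks = 2, so
+// Residual() walks a 2x2 grid of 64x64 chunks; blocks of 64x64 or smaller
+// form a single chunk of their own size.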
+
+// The purpose of this function is to limit the maximum size of motion vectors
+// and also, if use_intra_block_copy is true, to additionally constrain the
+// motion vector so that the data is fetched from parts of the tile that have
+// already been decoded and are not too close to the current block (in order to
+// make a pipelined decoder implementation feasible).
+bool Tile::IsMvValid(const Block& block, bool is_compound) const {
+  const BlockParameters& bp = *block.bp;
+  for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
+    for (int mv_component : bp.mv.mv[i].mv) {
+      if (std::abs(mv_component) >= (1 << 14)) {
+        return false;
+      }
+    }
+  }
+  if (!block.bp->prediction_parameters->use_intra_block_copy) {
+    return true;
+  }
+  if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
+    return false;
+  }
+  const int delta_row = bp.mv.mv[0].mv[0] >> 3;
+  const int delta_column = bp.mv.mv[0].mv[1] >> 3;
+  int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
+  int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
+  const int src_bottom_edge = src_top_edge + block.height;
+  const int src_right_edge = src_left_edge + block.width;
+  if (block.HasChroma()) {
+    if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
+      src_left_edge -= 4;
+    }
+    if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
+      src_top_edge -= 4;
+    }
+  }
+  if (src_top_edge < MultiplyBy4(row4x4_start_) ||
+      src_left_edge < MultiplyBy4(column4x4_start_) ||
+      src_bottom_edge > MultiplyBy4(row4x4_end_) ||
+      src_right_edge > MultiplyBy4(column4x4_end_)) {
+    return false;
+  }
+  // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
+  const int sb_height_log2 =
+      6 + static_cast<int>(sequence_header_.use_128x128_superblock);
+  const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
+  const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
+  const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
+  const int src_64x64_block_column = (src_right_edge - 1) >> 6;
+  const int total_64x64_blocks_per_row =
+      ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
+  const int active_64x64_block =
+      active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
+  const int src_64x64_block =
+      src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
+  if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
+    return false;
+  }
+
+  // Wavefront constraint: use only top left area of frame for reference.
+  if (src_sb_row > active_sb_row) return false;
+  const int gradient =
+      1 + kIntraBlockCopyDelay64x64Blocks +
+      static_cast<int>(sequence_header_.use_128x128_superblock);
+  const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
+  return src_64x64_block_column < active_64x64_block_column -
+                                      kIntraBlockCopyDelay64x64Blocks +
+                                      wavefront_offset;
+}
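+
+// For example, with 64x64 superblocks the gradient above is
+// 1 + kIntraBlockCopyDelay64x64Blocks, so each superblock row of vertical
+// distance between the source and the current block relaxes the column limit
+// by that amount, which produces the diagonal wavefront shape.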
+
+bool Tile::AssignInterMv(const Block& block, bool is_compound) {
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  if (is_compound) {
+    for (int i = 0; i < 2; ++i) {
+      const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
+      MotionVector predicted_mv;
+      if (mode == kPredictionModeGlobalMv) {
+        predicted_mv = prediction_parameters.global_mv[i];
+      } else {
+        const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                  (mode == kPredictionModeNewMv &&
+                                   prediction_parameters.ref_mv_count <= 1))
+                                     ? 0
+                                     : prediction_parameters.ref_mv_index;
+        predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
+        if (ref_mv_index < prediction_parameters.ref_mv_count) {
+          predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+          predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+        }
+      }
+      if (mode == kPredictionModeNewMv) {
+        ReadMotionVector(block, i);
+        bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
+        bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
+      } else {
+        bp.mv.mv[i] = predicted_mv;
+      }
+    }
+  } else {
+    const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
+    MotionVector predicted_mv;
+    if (mode == kPredictionModeGlobalMv) {
+      predicted_mv = prediction_parameters.global_mv[0];
+    } else {
+      const int ref_mv_index = (mode == kPredictionModeNearestMv ||
+                                (mode == kPredictionModeNewMv &&
+                                 prediction_parameters.ref_mv_count <= 1))
+                                   ? 0
+                                   : prediction_parameters.ref_mv_index;
+      predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
+      if (ref_mv_index < prediction_parameters.ref_mv_count) {
+        predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
+        predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
+      }
+    }
+    if (mode == kPredictionModeNewMv) {
+      ReadMotionVector(block, 0);
+      bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
+      bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
+    } else {
+      bp.mv.mv[0] = predicted_mv;
+    }
+  }
+  return IsMvValid(block, is_compound);
+}
+
+bool Tile::AssignIntraMv(const Block& block) {
+  // TODO(linfengz): Check if the clamping process is necessary.
+  int min[2];
+  int max[2];
+  GetClampParameters(block, min, max);
+  BlockParameters& bp = *block.bp;
+  const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+  ReadMotionVector(block, 0);
+  if (ref_mv_0.mv32 == 0) {
+    const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
+    if (ref_mv_1.mv32 == 0) {
+      const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
+      if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
+        bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
+        bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
+      } else {
+        bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
+      }
+    } else {
+      bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
+      bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
+    }
+  } else {
+    bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
+    bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
+  }
+  return IsMvValid(block, /*is_compound=*/false);
+}
+
+void Tile::ResetEntropyContext(const Block& block) {
+  const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+  int plane = kPlaneY;
+  do {
+    const int subsampling_x = subsampling_x_[plane];
+    const int start_x = block.column4x4 >> subsampling_x;
+    const int end_x =
+        std::min((block.column4x4 + block.width4x4) >> subsampling_x,
+                 frame_header_.columns4x4);
+    memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
+           end_x - start_x);
+    memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
+           end_x - start_x);
+    const int subsampling_y = subsampling_y_[plane];
+    const int start_y = block.row4x4 >> subsampling_y;
+    const int end_y =
+        std::min((block.row4x4 + block.height4x4) >> subsampling_y,
+                 frame_header_.rows4x4);
+    memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
+           end_y - start_y);
+    memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
+           end_y - start_y);
+  } while (++plane < num_planes);
+}
+
+bool Tile::ComputePrediction(const Block& block) {
+  const BlockParameters& bp = *block.bp;
+  if (!bp.is_inter) return true;
+  const int mask =
+      (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
+      1;
+  const int sub_block_row4x4 = block.row4x4 & mask;
+  const int sub_block_column4x4 = block.column4x4 & mask;
+  const int plane_count = block.HasChroma() ? PlaneCount() : 1;
+  // Returns true if this block applies local warping. The state is determined
+  // in the Y plane and carried over for use in the U/V planes.
+  // But the U/V planes will not apply warping when the block size is smaller
+  // than 8x8, even if this variable is true.
+  bool is_local_valid = false;
+  // Local warping parameters; usage is similar to |is_local_valid|.
+  GlobalMotion local_warp_params;
+  int plane = kPlaneY;
+  do {
+    const int8_t subsampling_x = subsampling_x_[plane];
+    const int8_t subsampling_y = subsampling_y_[plane];
+    const BlockSize plane_size = block.residual_size[plane];
+    const int block_width4x4 = kNum4x4BlocksWide[plane_size];
+    const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
+    const int block_width = MultiplyBy4(block_width4x4);
+    const int block_height = MultiplyBy4(block_height4x4);
+    const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
+    const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
+    if (bp.reference_frame[1] == kReferenceFrameIntra) {
+      const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
+      const int tr_column4x4 =
+          (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
+      const int bl_row4x4 =
+          (sub_block_row4x4 >> subsampling_y) + block_height4x4;
+      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
+      const TransformSize tx_size =
+          k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
+                                 [k4x4HeightLog2[plane_size]];
+      const bool has_left = block.left_available[plane];
+      const bool has_top = block.top_available[plane];
+      CALL_BITDEPTH_FUNCTION(
+          IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
+          has_left, has_top,
+          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
+          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
+          kInterIntraToIntraMode[block.bp->prediction_parameters
+                                     ->inter_intra_mode],
+          tx_size);
+    }
+    int candidate_row = block.row4x4;
+    int candidate_column = block.column4x4;
+    bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
+    if (!some_use_intra && plane != 0) {
+      candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
+      candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
+      if (candidate_row != block.row4x4) {
+        // Top block.
+        const BlockParameters& bp_top =
+            *block_parameters_holder_.Find(candidate_row, block.column4x4);
+        some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
+        if (!some_use_intra && candidate_column != block.column4x4) {
+          // Top-left block.
+          const BlockParameters& bp_top_left =
+              *block_parameters_holder_.Find(candidate_row, candidate_column);
+          some_use_intra =
+              bp_top_left.reference_frame[0] == kReferenceFrameIntra;
+        }
+      }
+      if (!some_use_intra && candidate_column != block.column4x4) {
+        // Left block.
+        const BlockParameters& bp_left =
+            *block_parameters_holder_.Find(block.row4x4, candidate_column);
+        some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
+      }
+    }
+    int prediction_width;
+    int prediction_height;
+    if (some_use_intra) {
+      candidate_row = block.row4x4;
+      candidate_column = block.column4x4;
+      prediction_width = block_width;
+      prediction_height = block_height;
+    } else {
+      prediction_width = block.width >> subsampling_x;
+      prediction_height = block.height >> subsampling_y;
+    }
+    int r = 0;
+    int y = 0;
+    do {
+      int c = 0;
+      int x = 0;
+      do {
+        if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
+                             base_y + y, prediction_width, prediction_height,
+                             candidate_row + r, candidate_column + c,
+                             &is_local_valid, &local_warp_params)) {
+          return false;
+        }
+        ++c;
+        x += prediction_width;
+      } while (x < block_width);
+      ++r;
+      y += prediction_height;
+    } while (y < block_height);
+  } while (++plane < plane_count);
+  return true;
+}
+
+#undef CALL_BITDEPTH_FUNCTION
+
+void Tile::PopulateDeblockFilterLevel(const Block& block) {
+  if (!post_filter_.DoDeblock()) return;
+  BlockParameters& bp = *block.bp;
+  const int mode_id =
+      static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
+  for (int i = 0; i < kFrameLfCount; ++i) {
+    if (delta_lf_all_zero_) {
+      bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
+          bp.segment_id, i, bp.reference_frame[0], mode_id);
+    } else {
+      bp.deblock_filter_level[i] =
+          deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]]
+                                [mode_id];
+    }
+  }
+}
+
+bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
+                        ParameterTree* const tree,
+                        TileScratchBuffer* const scratch_buffer,
+                        ResidualPtr* residual) {
+  // Do not process the block if the starting point is beyond the visible
+  // frame. This is equivalent to the has_row/has_column check in the
+  // decode_partition() section of the spec when partition equals
+  // kPartitionHorizontal or kPartitionVertical.
+  if (row4x4 >= frame_header_.rows4x4 ||
+      column4x4 >= frame_header_.columns4x4) {
+    return true;
+  }
+  BlockParameters& bp = *tree->parameters();
+  block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+  Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  bp.size = block_size;
+  bp.prediction_parameters =
+      split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
+                                    new (std::nothrow) PredictionParameters())
+                              : std::move(prediction_parameters_);
+  if (bp.prediction_parameters == nullptr) return false;
+  if (!DecodeModeInfo(block)) return false;
+  bp.is_global_mv_block = (bp.y_mode == kPredictionModeGlobalMv ||
+                           bp.y_mode == kPredictionModeGlobalGlobalMv) &&
+                          !IsBlockDimension4(bp.size);
+  PopulateDeblockFilterLevel(block);
+  if (!ReadPaletteTokens(block)) return false;
+  DecodeTransformSize(block);
+  // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
+  bp.uv_transform_size = frame_header_.segmentation.lossless[bp.segment_id]
+                             ? kTransformSize4x4
+                             : kUVTransformSize[block.residual_size[kPlaneU]];
+  if (bp.skip) ResetEntropyContext(block);
+  if (split_parse_and_decode_) {
+    if (!Residual(block, kProcessingModeParseOnly)) return false;
+  } else {
+    if (!ComputePrediction(block) ||
+        !Residual(block, kProcessingModeParseAndDecode)) {
+      return false;
+    }
+  }
+  // If frame_header_.segmentation.enabled is false, bp.segment_id is 0 for all
+  // blocks. We don't need to save bp.segment_id in the current frame because
+  // the current frame's segmentation map will be cleared to all 0s.
+  //
+  // If frame_header_.segmentation.enabled is true and
+  // frame_header_.segmentation.update_map is false, we will copy the previous
+  // frame's segmentation map to the current frame. So we don't need to save
+  // bp.segment_id in the current frame either.
+  if (frame_header_.segmentation.enabled &&
+      frame_header_.segmentation.update_map) {
+    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
+                                 static_cast<int>(block.width4x4));
+    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
+                                 static_cast<int>(block.height4x4));
+    current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit,
+                                                 y_limit, bp.segment_id);
+  }
+  StoreMotionFieldMvsIntoCurrentFrame(block);
+  if (!split_parse_and_decode_) {
+    prediction_parameters_ = std::move(bp.prediction_parameters);
+  }
+  return true;
+}
+
+bool Tile::DecodeBlock(ParameterTree* const tree,
+                       TileScratchBuffer* const scratch_buffer,
+                       ResidualPtr* residual) {
+  const int row4x4 = tree->row4x4();
+  const int column4x4 = tree->column4x4();
+  if (row4x4 >= frame_header_.rows4x4 ||
+      column4x4 >= frame_header_.columns4x4) {
+    return true;
+  }
+  const BlockSize block_size = tree->block_size();
+  Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
+  if (!ComputePrediction(block) ||
+      !Residual(block, kProcessingModeDecodeOnly)) {
+    return false;
+  }
+  block.bp->prediction_parameters.reset(nullptr);
+  return true;
+}
+
+bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
+                            ParameterTree* const root,
+                            TileScratchBuffer* const scratch_buffer,
+                            ResidualPtr* residual) {
+  Stack<ParameterTree*, kDfsStackSize> stack;
+
+  // Set up the first iteration.
+  ParameterTree* node = root;
+  int row4x4 = row4x4_start;
+  int column4x4 = column4x4_start;
+  BlockSize block_size = SuperBlockSize();
+
+  // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
+  // Otherwise, the children are pushed into the stack for future processing.
+  do {
+    if (!stack.Empty()) {
+      // Set up subsequent iterations.
+      node = stack.Pop();
+      row4x4 = node->row4x4();
+      column4x4 = node->column4x4();
+      block_size = node->block_size();
+    }
+    if (row4x4 >= frame_header_.rows4x4 ||
+        column4x4 >= frame_header_.columns4x4) {
+      continue;
+    }
+    const int block_width4x4 = kNum4x4BlocksWide[block_size];
+    assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
+    const int half_block4x4 = block_width4x4 >> 1;
+    const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
+    const bool has_columns =
+        (column4x4 + half_block4x4) < frame_header_.columns4x4;
+    Partition partition;
+    if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
+                       &partition)) {
+      LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
+    const BlockSize sub_size = kSubSize[partition][block_size];
+    // Section 6.10.4: It is a requirement of bitstream conformance that
+    // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
+    // every time subSize is computed.
+    if (sub_size == kBlockInvalid ||
+        kPlaneResidualSize[sub_size]
+                          [sequence_header_.color_config.subsampling_x]
+                          [sequence_header_.color_config.subsampling_y] ==
+            kBlockInvalid) {
+      LIBGAV1_DLOG(
+          ERROR,
+          "Invalid sub-block/plane size for row: %d column: %d partition: "
+          "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
+          row4x4, column4x4, partition, block_size, sub_size,
+          sequence_header_.color_config.subsampling_x,
+          sequence_header_.color_config.subsampling_y);
+      return false;
+    }
+    if (!node->SetPartitionType(partition)) {
+      LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed.");
+      return false;
+    }
+    switch (partition) {
+      case kPartitionNone:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer,
+                          residual)) {
+          return false;
+        }
+        break;
+      case kPartitionSplit:
+        // The children must be added in reverse order since a stack is being
+        // used.
+        for (int i = 3; i >= 0; --i) {
+          ParameterTree* const child = node->children(i);
+          assert(child != nullptr);
+          stack.Push(child);
+        }
+        break;
+      case kPartitionHorizontal:
+      case kPartitionVertical:
+      case kPartitionHorizontalWithTopSplit:
+      case kPartitionHorizontalWithBottomSplit:
+      case kPartitionVerticalWithLeftSplit:
+      case kPartitionVerticalWithRightSplit:
+      case kPartitionHorizontal4:
+      case kPartitionVertical4:
+        for (int i = 0; i < 4; ++i) {
+          ParameterTree* const child = node->children(i);
+          // Once a null child is seen, all the subsequent children will also
+          // be null.
+          if (child == nullptr) break;
+          if (!ProcessBlock(child->row4x4(), child->column4x4(),
+                            child->block_size(), child, scratch_buffer,
+                            residual)) {
+            return false;
+          }
+        }
+        break;
+    }
+  } while (!stack.Empty());
+  return true;
+}
+
+void Tile::ResetLoopRestorationParams() {
+  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
+    for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
+      reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
+          kSgrProjDefaultMultiplier[i];
+      for (int j = 0; j < kNumWienerCoefficients; ++j) {
+        reference_unit_info_[plane].wiener_info.filter[i][j] =
+            kWienerDefaultFilter[j];
+      }
+    }
+  }
+}
+
+void Tile::ResetCdef(const int row4x4, const int column4x4) {
+  if (!sequence_header_.enable_cdef) return;
+  const int row = DivideBy16(row4x4);
+  const int column = DivideBy16(column4x4);
+  cdef_index_[row][column] = -1;
+  if (sequence_header_.use_128x128_superblock) {
+    const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
+    const int border_row = DivideBy16(row4x4 + cdef_size4x4);
+    const int border_column = DivideBy16(column4x4 + cdef_size4x4);
+    cdef_index_[row][border_column] = -1;
+    cdef_index_[border_row][column] = -1;
+    cdef_index_[border_row][border_column] = -1;
+  }
+}
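+
+// For example, with 128x128 superblocks a call to ResetCdef() at
+// (row4x4, column4x4) = (32, 0) clears cdef_index_ entries (2, 0), (2, 1),
+// (3, 0) and (3, 1), one per 64x64 quadrant of the superblock.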
+
+void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
+                             int row4x4, int column4x4) {
+  // Set everything to false.
+  memset(scratch_buffer->block_decoded, 0,
+         sizeof(scratch_buffer->block_decoded));
+  // Set specific edge cases to true.
+  const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    const int subsampling_x = subsampling_x_[plane];
+    const int subsampling_y = subsampling_y_[plane];
+    const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
+    const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
+    // The memset is equivalent to the following lines in the spec:
+    //   for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
+    //     if ( y < 0 && x < sbWidth4 ) {
+    //       BlockDecoded[plane][y][x] = 1
+    //     }
+    //   }
+    const int num_elements =
+        std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
+    memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
+    // The for loop is equivalent to the following lines in the spec:
+    //   for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) {
+    //     if ( x < 0 && y < sbHeight4 ) {
+    //       BlockDecoded[plane][y][x] = 1
+    //     }
+    //   }
+    //   BlockDecoded[plane][sbSize4 >> subY][-1] = 0
+    for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
+         ++y) {
+      scratch_buffer->block_decoded[plane][y + 1][0] = true;
+    }
+  }
+}
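+
+// For example, for a 4:2:0 chroma plane in a 64x64 superblock (sb_size4 =
+// 16), the memset in ClearBlockDecoded() covers std::min(9, sb_width4) + 1
+// entries of the y = -1 border row, i.e. x = -1 .. 8 when the superblock is
+// not clipped by the tile boundary.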
+
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+                             TileScratchBuffer* const scratch_buffer,
+                             ProcessingMode mode) {
+  const bool parsing =
+      mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
+  const bool decoding = mode == kProcessingModeDecodeOnly ||
+                        mode == kProcessingModeParseAndDecode;
+  if (parsing) {
+    read_deltas_ = frame_header_.delta_q.present;
+    ResetCdef(row4x4, column4x4);
+  }
+  if (decoding) {
+    ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
+  }
+  const BlockSize block_size = SuperBlockSize();
+  if (parsing) {
+    ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
+  }
+  const int row = row4x4 / block_width4x4;
+  const int column = column4x4 / block_width4x4;
+  if (parsing && decoding) {
+    uint8_t* residual_buffer = residual_buffer_.get();
+    if (!ProcessPartition(row4x4, column4x4,
+                          block_parameters_holder_.Tree(row, column),
+                          scratch_buffer, &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
+                   column4x4);
+      return false;
+    }
+    return true;
+  }
+  const int sb_row_index = SuperBlockRowIndex(row4x4);
+  const int sb_column_index = SuperBlockColumnIndex(column4x4);
+  if (parsing) {
+    residual_buffer_threaded_[sb_row_index][sb_column_index] =
+        residual_buffer_pool_->Get();
+    if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
+      return false;
+    }
+    uint8_t* residual_buffer =
+        residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+    if (!ProcessPartition(row4x4, column4x4,
+                          block_parameters_holder_.Tree(row, column),
+                          scratch_buffer, &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
+                   column4x4);
+      return false;
+    }
+  } else {
+    uint8_t* residual_buffer =
+        residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+    if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column),
+                          scratch_buffer, &residual_buffer)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
+                   row4x4, column4x4);
+      return false;
+    }
+    residual_buffer_pool_->Release(
+        std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
+  }
+  return true;
+}
+
+bool Tile::DecodeSuperBlock(ParameterTree* const tree,
+                            TileScratchBuffer* const scratch_buffer,
+                            ResidualPtr* residual) {
+  Stack<ParameterTree*, kDfsStackSize> stack;
+  stack.Push(tree);
+  do {
+    ParameterTree* const node = stack.Pop();
+    if (node->partition() != kPartitionNone) {
+      for (int i = 3; i >= 0; --i) {
+        if (node->children(i) == nullptr) continue;
+        stack.Push(node->children(i));
+      }
+      continue;
+    }
+    if (!DecodeBlock(node, scratch_buffer, residual)) {
+      LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
+                   node->row4x4(), node->column4x4());
+      return false;
+    }
+  } while (!stack.Empty());
+  return true;
+}
+
+void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
+                                           BlockSize block_size) {
+  if (frame_header_.allow_intrabc) return;
+  LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
+  const bool is_superres_scaled =
+      frame_header_.width != frame_header_.upscaled_width;
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
+    LoopRestorationUnitInfo unit_info;
+    if (restoration_info->PopulateUnitInfoForSuperBlock(
+            static_cast<Plane>(plane), block_size, is_superres_scaled,
+            frame_header_.superres_scale_denominator, row4x4, column4x4,
+            &unit_info)) {
+      for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
+           ++unit_row) {
+        for (int unit_column = unit_info.column_start;
+             unit_column < unit_info.column_end; ++unit_column) {
+          const int unit_id = unit_row * restoration_info->num_horizontal_units(
+                                             static_cast<Plane>(plane)) +
+                              unit_column;
+          restoration_info->ReadUnitCoefficients(
+              &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
+              unit_id, &reference_unit_info_);
+        }
+      }
+    }
+  }
+}
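+
+// For example, a block at row4x4 = 4 with height4x4 = 4 covers 4x4 rows 4..7,
+// so the function below visits odd rows 5 and 7, which map to motion field
+// rows 2 and 3 at the 8x8 granularity.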
+void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
+  if (frame_header_.refresh_frame_flags == 0 ||
+      IsIntraFrame(frame_header_.frame_type)) {
+    return;
+  }
+  // Iterate over odd rows/columns beginning at the first odd row/column for
+  // the block. It is done this way because motion field mvs are only needed at
+  // an 8x8 granularity.
+  const int row_start4x4 = block.row4x4 | 1;
+  const int row_limit4x4 =
+      std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
+  if (row_start4x4 >= row_limit4x4) return;
+  const int column_start4x4 = block.column4x4 | 1;
+  const int column_limit4x4 =
+      std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
+  if (column_start4x4 >= column_limit4x4) return;
+
+  // The largest reference MV component that can be saved.
+  constexpr int kRefMvsLimit = (1 << 12) - 1;
+  const BlockParameters& bp = *block.bp;
+  ReferenceInfo* reference_info = current_frame_.reference_info();
+  for (int i = 1; i >= 0; --i) {
+    const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
+    // Must make a local copy so that StoreMotionFieldMvs() knows there is no
+    // overlap between load and store.
+    const MotionVector mv_to_store = bp.mv.mv[i];
+    const int mv_row = std::abs(mv_to_store.mv[MotionVector::kRow]);
+    const int mv_column = std::abs(mv_to_store.mv[MotionVector::kColumn]);
+    if (reference_frame_to_store > kReferenceFrameIntra &&
+        // kRefMvsLimit equals 0x0FFF (all ones in binary), so we can first
+        // bitwise OR the two absolute values and then compare with
+        // kRefMvsLimit to save a branch.
+        // The next line is equivalent to:
+        //   mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
+        (mv_row | mv_column) <= kRefMvsLimit &&
+        reference_info->relative_distance_from[reference_frame_to_store] < 0) {
+      const int row_start8x8 = DivideBy2(row_start4x4);
+      const int row_limit8x8 = DivideBy2(row_limit4x4);
+      const int column_start8x8 = DivideBy2(column_start4x4);
+      const int column_limit8x8 = DivideBy2(column_limit4x4);
+      const int rows = row_limit8x8 - row_start8x8;
+      const int columns = column_limit8x8 - column_start8x8;
+      const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
+      ReferenceFrameType* const reference_frame_row_start =
+          &reference_info
+               ->motion_field_reference_frame[row_start8x8][column_start8x8];
+      MotionVector* const mv =
+          &reference_info->motion_field_mv[row_start8x8][column_start8x8];
+
+      // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset()
+      // inlined and simplifies std::fill() for these cases.
+      if (columns <= 1) {
+        // Don't change the above condition to (columns == 1).
+        // Condition (columns <= 1) may help the compiler simplify the inlining
+        // of the general case of StoreMotionFieldMvs() by eliminating the
+        // (columns == 0) case.
+        assert(columns == 1);
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            1, reference_frame_row_start, mv);
+      } else if (columns == 2) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            2, reference_frame_row_start, mv);
+      } else if (columns == 4) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            4, reference_frame_row_start, mv);
+      } else if (columns == 8) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            8, reference_frame_row_start, mv);
+      } else if (columns == 16) {
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            16, reference_frame_row_start, mv);
+      } else if (columns < 16) {
+        // This always-true condition (columns < 16) may help the compiler
+        // simplify the inlining of the following function.
+        // This general case is rare and usually only happens to the blocks
+        // which contain the right boundary of the frame.
+        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
+                            columns, reference_frame_row_start, mv);
+      } else {
+        assert(false);
+      }
+      return;
+    }
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/tile_scratch_buffer.cc b/src/tile_scratch_buffer.cc
new file mode 100644
index 0000000..0b5ac96
--- /dev/null
+++ b/src/tile_scratch_buffer.cc
@@ -0,0 +1,26 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/tile_scratch_buffer.h" + +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +#if !LIBGAV1_CXX17 +// static +constexpr int TileScratchBuffer::kBlockDecodedStride; +#endif + +} // namespace libgav1 diff --git a/src/tile_scratch_buffer.h b/src/tile_scratch_buffer.h new file mode 100644 index 0000000..3eaf8b8 --- /dev/null +++ b/src/tile_scratch_buffer.h @@ -0,0 +1,160 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_ +#define LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_ + +#include +#include // NOLINT (unapproved c++11 header) + +#include "src/dsp/constants.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/stack.h" + +namespace libgav1 { + +// Buffer to facilitate decoding a superblock. +struct TileScratchBuffer : public MaxAlignedAllocable { + static constexpr int kBlockDecodedStride = 34; + + LIBGAV1_MUST_USE_RESULT bool Init(int bitdepth) { +#if LIBGAV1_MAX_BITDEPTH >= 10 + const int pixel_size = (bitdepth == 8) ? 1 : 2; +#else + assert(bitdepth == 8); + static_cast(bitdepth); + const int pixel_size = 1; +#endif + + constexpr int unaligned_convolve_buffer_stride = + kMaxScaledSuperBlockSizeInPixels + kConvolveBorderLeftTop + + kConvolveBorderRight; + convolve_block_buffer_stride = Align( + unaligned_convolve_buffer_stride * pixel_size, kMaxAlignment); + constexpr int convolve_buffer_height = kMaxScaledSuperBlockSizeInPixels + + kConvolveBorderLeftTop + + kConvolveBorderBottom; + + convolve_block_buffer = MakeAlignedUniquePtr( + kMaxAlignment, convolve_buffer_height * convolve_block_buffer_stride); + return convolve_block_buffer != nullptr; + } + + // kCompoundPredictionTypeDiffWeighted prediction mode needs a mask of the + // prediction block size. This buffer is used to store that mask. The masks + // will be created for the Y plane and will be re-used for the U & V planes. + alignas(kMaxAlignment) uint8_t weight_mask[kMaxSuperBlockSizeSquareInPixels]; + + // For each instance of the TileScratchBuffer, only one of the following + // buffers will be used at any given time, so it is ok to share them in a + // union. + union { + // Buffers used for prediction process. + // Compound prediction calculations always output 16-bit values. Depending + // on the bitdepth the values may be treated as int16_t or uint16_t. See + // src/dsp/convolve.cc and src/dsp/warp.cc for explanations. + // Inter/intra calculations output Pixel values. + // These buffers always use width as the stride. This enables packing the + // values in and simplifies loads/stores for small values. + + // 10/12 bit compound prediction and 10/12 bit inter/intra prediction. + alignas(kMaxAlignment) uint16_t + prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels]; + // 8 bit compound prediction buffer. 
+    alignas(kMaxAlignment) int16_t
+        compound_prediction_buffer_8bpp[2][kMaxSuperBlockSizeSquareInPixels];
+
+    // Union usage note: This is used only by functions in the "intra"
+    // prediction path.
+    //
+    // Buffer used for storing subsampled luma samples needed for CFL
+    // prediction. This buffer is used to avoid repetition of the subsampling
+    // for the V plane when it is already done for the U plane.
+    int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
+  };
+
+  // Buffer used for convolve. The maximum size required for this buffer is:
+  // maximum block height (with scaling and border) = 2 * 128 + 3 + 4 = 263.
+  // maximum block stride (with scaling and border aligned to 16) =
+  //     (2 * 128 + 3 + 8 + 5) * pixel_size = 272 * pixel_size.
+  // Where pixel_size is (bitdepth == 8) ? 1 : 2.
+  // Has an alignment of kMaxAlignment when allocated.
+  AlignedUniquePtr<uint8_t> convolve_block_buffer;
+  ptrdiff_t convolve_block_buffer_stride;
+
+  // Flag indicating whether the data in |cfl_luma_buffer| is valid.
+  bool cfl_luma_buffer_valid;
+
+  // Equivalent to BlockDecoded array in the spec. This stores the decoded
+  // state of every 4x4 block in a superblock. It has 1 row/column border on
+  // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
+  // spec uses "-1" as an index to access the left and top borders. In the
+  // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
+  // all accesses into this array will be offset by +1 when compared with the
+  // spec.
+  bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
+};
+
+class TileScratchBufferPool {
+ public:
+  void Reset(int bitdepth) {
+    if (bitdepth_ == bitdepth) return;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth_ == 8 && bitdepth != 8) {
+      // We are going from a pixel size of 1 to a pixel size of 2. So
+      // invalidate the stack.
+      std::lock_guard<std::mutex> lock(mutex_);
+      while (!buffers_.Empty()) {
+        buffers_.Pop();
+      }
+    }
+#endif
+    bitdepth_ = bitdepth;
+  }
+
+  std::unique_ptr<TileScratchBuffer> Get() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (buffers_.Empty()) {
+      std::unique_ptr<TileScratchBuffer> scratch_buffer(
+          new (std::nothrow) TileScratchBuffer);
+      if (scratch_buffer == nullptr || !scratch_buffer->Init(bitdepth_)) {
+        return nullptr;
+      }
+      return scratch_buffer;
+    }
+    return buffers_.Pop();
+  }
+
+  void Release(std::unique_ptr<TileScratchBuffer> scratch_buffer) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    buffers_.Push(std::move(scratch_buffer));
+  }
+
+ private:
+  std::mutex mutex_;
+  // We will never need more than kMaxThreads scratch buffers since that is
+  // the maximum amount of work that will be done at any given time.
+  Stack<std::unique_ptr<TileScratchBuffer>, kMaxThreads> buffers_
+      LIBGAV1_GUARDED_BY(mutex_);
+  int bitdepth_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_TILE_SCRATCH_BUFFER_H_
diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
new file mode 100644
index 0000000..2df6241
--- /dev/null
+++ b/src/utils/array_2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
+template <typename T>
+class Array2DView {
+ public:
+  Array2DView() = default;
+  Array2DView(int rows, int columns, T* const data) {
+    Reset(rows, columns, data);
+  }
+
+  // Copyable and Movable.
+  Array2DView(const Array2DView& rhs) = default;
+  Array2DView& operator=(const Array2DView& rhs) = default;
+
+  void Reset(int rows, int columns, T* const data) {
+    rows_ = rows;
+    columns_ = columns;
+    data_ = data;
+  }
+
+  int rows() const { return rows_; }
+  int columns() const { return columns_; }
+
+  T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+  const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+  const T* GetRow(int row) const {
+    assert(row < rows_);
+    const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+    return data_ + offset;
+  }
+
+  int rows_ = 0;
+  int columns_ = 0;
+  T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
+template <typename T>
+class Array2D {
+ public:
+  Array2D() = default;
+
+  // Copyable and Movable.
+  Array2D(const Array2D& rhs) = default;
+  Array2D& operator=(const Array2D& rhs) = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+                                     bool zero_initialize = true) {
+    size_ = rows * columns;
+    // If T is not a trivial type, we should always reallocate the data_
+    // buffer, so that the destructors of any existing objects are invoked.
+    if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+      // Note: This invokes the global operator new if T is a non-class type,
+      // such as integer or enum types, or a class type that is not derived
+      // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+      // maximum allocation size or keep track of our own heap memory
+      // consumption, we will need to handle the allocations here that use the
+      // global operator new.
+      if (zero_initialize) {
+        data_.reset(new (std::nothrow) T[size_]());
+      } else {
+        data_.reset(new (std::nothrow) T[size_]);
+      }
+      if (data_ == nullptr) {
+        allocated_size_ = 0;
+        return false;
+      }
+      allocated_size_ = size_;
+    } else if (zero_initialize) {
+      // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+      // warning. The memset is safe because T is a trivial type.
+      void* dest = data_.get();
+      memset(dest, 0, sizeof(T) * size_);
+    }
+    data_view_.Reset(rows, columns, data_.get());
+    return true;
+  }
+
+  int rows() const { return data_view_.rows(); }
+  int columns() const { return data_view_.columns(); }
+  size_t size() const { return size_; }
+  T* data() { return data_.get(); }
+  const T* data() const { return data_.get(); }
+
+  T* operator[](int row) { return data_view_[row]; }
+
+  const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+  std::unique_ptr<T[]> data_ = nullptr;
+  size_t allocated_size_ = 0;
+  size_t size_ = 0;
+  Array2DView<T> data_view_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
diff --git a/src/utils/bit_mask_set.h b/src/utils/bit_mask_set.h
new file mode 100644
index 0000000..7371753
--- /dev/null
+++ b/src/utils/bit_mask_set.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+//   constexpr BitMaskSet set(value1, value2);
+//   set.Contains(value1) => returns true.
+//   set.Contains(value3) => returns false.
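+//
+// A concrete illustration (an editor's sketch; the values are arbitrary):
+//   constexpr BitMaskSet set(2, 5, 9);  // mask_ == 0b1000100100
+//   set.Contains(5);                    // true: bit 5 of mask_ is set.
+//   set.Contains(4);                    // false: bit 4 of mask_ is clear.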
+class BitMaskSet { + public: + explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {} + + constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {} + + constexpr BitMaskSet(int v1, int v2, int v3) + : mask_((1U << v1) | (1U << v2) | (1U << v3)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) { + } + + constexpr bool Contains(uint8_t value) const { + return MaskContainsValue(mask_, value); + } + + static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) { + return ((mask >> value) & 1) != 0; + } + + private: + const uint32_t mask_; +}; + +} // namespace libgav1 +#endif // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_ diff --git a/src/utils/bit_reader.cc b/src/utils/bit_reader.cc new file mode 100644 index 0000000..3234128 --- /dev/null +++ b/src/utils/bit_reader.cc @@ -0,0 +1,117 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/bit_reader.h" + +#include +#include + +#include "src/utils/common.h" + +namespace libgav1 { +namespace { + +bool Assign(int* const value, int assignment, bool return_value) { + *value = assignment; + return return_value; +} + +// 5.9.29. 
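+// InverseRecenter() undoes the "recentering" used by the subexponential code:
+// small code values map to values close to the reference |r|. For example,
+// with r = 5, the code values v = 0, 1, 2, 3, 4 map to 5, 4, 6, 3, 7
+// respectively, and any v > 2 * r maps to itself.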
+int InverseRecenter(int r, int v) { + if (v > (r << 1)) { + return v; + } + if ((v & 1) != 0) { + return r - ((v + 1) >> 1); + } + return r + (v >> 1); +} + +} // namespace + +bool BitReader::DecodeSignedSubexpWithReference(int low, int high, + int reference, int control, + int* const value) { + if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control, + value)) { + return false; + } + *value += low; + return true; +} + +bool BitReader::DecodeUniform(int n, int* const value) { + if (n <= 1) { + return Assign(value, 0, true); + } + const int w = FloorLog2(n) + 1; + const int m = (1 << w) - n; + assert(w - 1 < 32); + const int v = static_cast(ReadLiteral(w - 1)); + if (v == -1) { + return Assign(value, 0, false); + } + if (v < m) { + return Assign(value, v, true); + } + const int extra_bit = ReadBit(); + if (extra_bit == -1) { + return Assign(value, 0, false); + } + return Assign(value, (v << 1) - m + extra_bit, true); +} + +bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference, + int control, + int* const value) { + int v; + if (!DecodeSubexp(mx, control, &v)) return false; + if ((reference << 1) <= mx) { + *value = InverseRecenter(reference, v); + } else { + *value = mx - 1 - InverseRecenter(mx - 1 - reference, v); + } + return true; +} + +bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) { + int i = 0; + int mk = 0; + while (true) { + const int b = (i != 0) ? control + i - 1 : control; + if (b >= 32) { + return Assign(value, 0, false); + } + const int a = 1 << b; + if (num_symbols <= mk + 3 * a) { + if (!DecodeUniform(num_symbols - mk, value)) return false; + *value += mk; + return true; + } + const int8_t subexp_more_bits = ReadBit(); + if (subexp_more_bits == -1) return false; + if (subexp_more_bits != 0) { + ++i; + mk += a; + } else { + const int subexp_bits = static_cast(ReadLiteral(b)); + if (subexp_bits == -1) { + return Assign(value, 0, false); + } + return Assign(value, subexp_bits + mk, true); + } + } +} + +} // namespace libgav1 diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h new file mode 100644 index 0000000..5a10e12 --- /dev/null +++ b/src/utils/bit_reader.h @@ -0,0 +1,49 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_ +#define LIBGAV1_SRC_UTILS_BIT_READER_H_ + +#include + +namespace libgav1 { + +class BitReader { + public: + virtual ~BitReader() = default; + + virtual int ReadBit() = 0; + // |num_bits| has to be <= 32. The function returns a value in the range [0, + // 2^num_bits - 1] (inclusive) on success and -1 on failure. + virtual int64_t ReadLiteral(int num_bits) = 0; + + bool DecodeSignedSubexpWithReference(int low, int high, int reference, + int control, int* value); // 5.9.26. + // Decodes a nonnegative integer with maximum number of values |n| (i.e., + // output in range 0..n-1) by following the process specified in Section + // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec. 
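+  // For example, with n = 5: w = 3 and m = 3, so the values 0, 1 and 2 are
+  // coded with w - 1 = 2 bits, and the values 3 and 4 with w = 3 bits (an
+  // editor's worked example of the ns(n) coding described above).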
+ bool DecodeUniform(int n, int* value); + + private: + // Helper functions for DecodeSignedSubexpWithReference. + bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control, + int* value); // 5.9.27. + bool DecodeSubexp(int num_symbols, int control, int* value); // 5.9.28. +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BIT_READER_H_ diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc new file mode 100644 index 0000000..3ccdb9b --- /dev/null +++ b/src/utils/block_parameters_holder.cc @@ -0,0 +1,107 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/block_parameters_holder.h" + +#include + +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/parameter_tree.h" +#include "src/utils/types.h" + +namespace libgav1 { + +namespace { + +// Returns the number of super block rows/columns for |value4x4| where value4x4 +// is either rows4x4 or columns4x4. +int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) { + return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127) + : DivideBy64(MultiplyBy4(value4x4) + 63); +} + +} // namespace + +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + use_128x128_superblock_ = use_128x128_superblock; + if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) { + LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed."); + return false; + } + const int rows = + RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_); + const int columns = + RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_); + const BlockSize sb_size = + use_128x128_superblock_ ? kBlock128x128 : kBlock64x64; + const int multiplier = kNum4x4BlocksWide[sb_size]; + if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) { + LIBGAV1_DLOG(ERROR, "trees_.Reset() failed."); + return false; + } + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < columns; ++j) { + trees_[i][j] = + ParameterTree::Create(i * multiplier, j * multiplier, sb_size); + if (trees_[i][j] == nullptr) { + LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j); + return false; + } + } + } + return true; +} + +void BlockParametersHolder::FillCache(int row4x4, int column4x4, + BlockSize block_size, + BlockParameters* const bp) { + int rows = std::min(static_cast(kNum4x4BlocksHigh[block_size]), + rows4x4_ - row4x4); + const int columns = std::min(static_cast(kNum4x4BlocksWide[block_size]), + columns4x4_ - column4x4); + auto* bp_dst = &block_parameters_cache_[row4x4][column4x4]; + // Specialize columns cases (values in kNum4x4BlocksWide[]) for better + // performance. 
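+  // With a compile-time constant column count, the std::fill() inside
+  // SetBlock() can be lowered to a fixed-size store sequence (see the comment
+  // above MemSetBlock()/SetBlock() in src/utils/common.h).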
+ if (columns == 1) { + SetBlock(rows, 1, bp, bp_dst, columns4x4_); + } else if (columns == 2) { + SetBlock(rows, 2, bp, bp_dst, columns4x4_); + } else if (columns == 4) { + SetBlock(rows, 4, bp, bp_dst, columns4x4_); + } else if (columns == 8) { + SetBlock(rows, 8, bp, bp_dst, columns4x4_); + } else if (columns == 16) { + SetBlock(rows, 16, bp, bp_dst, columns4x4_); + } else if (columns == 32) { + SetBlock(rows, 32, bp, bp_dst, columns4x4_); + } else { + do { + // The following loop has better performance than using std::fill(). + // std::fill() has some overhead in checking zero loop count. + int x = columns; + auto* d = bp_dst; + do { + *d++ = bp; + } while (--x != 0); + bp_dst += columns4x4_; + } while (--rows != 0); + } +} + +} // namespace libgav1 diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h new file mode 100644 index 0000000..35543c3 --- /dev/null +++ b/src/utils/block_parameters_holder.h @@ -0,0 +1,85 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ +#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ + +#include + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/parameter_tree.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters +// corresponding to a superblock. +class BlockParametersHolder { + public: + BlockParametersHolder() = default; + + // Not copyable or movable. + BlockParametersHolder(const BlockParametersHolder&) = delete; + BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; + + // If |use_128x128_superblock| is true, 128x128 superblocks will be used, + // otherwise 64x64 superblocks will be used. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock); + + // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This + // is done as a simple look up of the |block_parameters_cache_| matrix. + // Returns nullptr if the BlockParameters cannot be found. + BlockParameters* Find(int row4x4, int column4x4) const { + return block_parameters_cache_[row4x4][column4x4]; + } + + BlockParameters** Address(int row4x4, int column4x4) { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + + BlockParameters* const* Address(int row4x4, int column4x4) const { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + + int columns4x4() const { return columns4x4_; } + + // Returns the ParameterTree corresponding to superblock starting at (|row|, + // |column|). + ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); } + + // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of + // size |block_size| with the pointer |bp|. 
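+  // For example, a kBlock32x16 block whose top-left 4x4 index is
+  // (row4x4, column4x4) writes |bp| to the 4-row by 8-column region
+  // [row4x4, row4x4 + 4) x [column4x4, column4x4 + 8) of the cache, clamped
+  // to the frame boundary (an editor's illustration).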
+  void FillCache(int row4x4, int column4x4, BlockSize block_size,
+                 BlockParameters* bp);
+
+ private:
+  int rows4x4_ = 0;
+  int columns4x4_ = 0;
+  bool use_128x128_superblock_ = false;
+  Array2D<std::unique_ptr<ParameterTree>> trees_;
+
+  // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in
+  // by FillCache() and used by Find() to perform each lookup with exactly one
+  // array access (instead of traversing the entire tree).
+  Array2D<BlockParameters*> block_parameters_cache_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
diff --git a/src/utils/blocking_counter.h b/src/utils/blocking_counter.h
new file mode 100644
index 0000000..6d664f8
--- /dev/null
+++ b/src/utils/blocking_counter.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
+
+#include <cassert>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
+#include <mutex>               // NOLINT (unapproved c++11 header)
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Implementation of a Blocking Counter that is used for the "fork-join"
+// use case. Typical usage would be as follows:
+//   BlockingCounter counter(num_jobs);
+//   - spawn the jobs.
+//   - call counter.Wait() on the master thread.
+//   - worker threads will call counter.Decrement().
+//   - master thread will return from counter.Wait() when all workers are
+//     complete.
+template <bool has_failure_status>
+class BlockingCounterImpl {
+ public:
+  explicit BlockingCounterImpl(int initial_count)
+      : count_(initial_count), job_failed_(false) {}
+
+  // Increment the counter by |count|. This must be called before Wait() is
+  // called. This must be called from the same thread that will call Wait().
+  void IncrementBy(int count) {
+    assert(count >= 0);
+    std::unique_lock<std::mutex> lock(mutex_);
+    count_ += count;
+  }
+
+  // Decrement the counter by 1. This function can be called only when
+  // |has_failure_status| is false (i.e., when this class is being used with
+  // the |BlockingCounter| alias).
+  void Decrement() {
+    static_assert(!has_failure_status, "");
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (--count_ == 0) {
+      condition_.notify_one();
+    }
+  }
+
+  // Decrement the counter by 1. This function can be called only when
+  // |has_failure_status| is true (i.e., when this class is being used with
+  // the |BlockingCounterWithStatus| alias). |job_succeeded| is used to update
+  // the state of |job_failed_|.
+  void Decrement(bool job_succeeded) {
+    static_assert(has_failure_status, "");
+    std::unique_lock<std::mutex> lock(mutex_);
+    job_failed_ |= !job_succeeded;
+    if (--count_ == 0) {
+      condition_.notify_one();
+    }
+  }
+
+  // Block until the counter becomes 0. This function can be called only once
+  // per object. If |has_failure_status| is true, true is returned if all the
+  // jobs succeeded and false is returned if any of the jobs failed. If
+  // |has_failure_status| is false, this function always returns true.
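+  //
+  // A usage sketch for the status variant (an editor's illustration):
+  //   BlockingCounterWithStatus counter(num_jobs);
+  //   // Each worker calls counter.Decrement(job_succeeded) when done.
+  //   const bool all_jobs_succeeded = counter.Wait();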
+  bool Wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    condition_.wait(lock, [this]() { return count_ == 0; });
+    // If |has_failure_status| is false, we simply return true.
+    return has_failure_status ? !job_failed_ : true;
+  }
+
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  int count_ LIBGAV1_GUARDED_BY(mutex_);
+  bool job_failed_ LIBGAV1_GUARDED_BY(mutex_);
+};
+
+using BlockingCounterWithStatus = BlockingCounterImpl<true>;
+using BlockingCounter = BlockingCounterImpl<false>;
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_
diff --git a/src/utils/common.h b/src/utils/common.h
new file mode 100644
index 0000000..ae43c2b
--- /dev/null
+++ b/src/utils/common.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_COMMON_H_
+#define LIBGAV1_SRC_UTILS_COMMON_H_
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#define HAVE_BITSCANREVERSE64
+#endif  // defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#endif  // defined(_MSC_VER)
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <type_traits>
+
+#include "src/utils/bit_mask_set.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2.
+template <typename T>
+inline T Align(T value, T alignment) {
+  assert(alignment != 0);
+  const T alignment_mask = alignment - 1;
+  return (value + alignment_mask) & ~alignment_mask;
+}
+
+// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2.
+inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) {
+  const auto value = reinterpret_cast<uintptr_t>(addr);
+  return reinterpret_cast<uint8_t*>(Align(value, alignment));
+}
+
+inline int32_t Clip3(int32_t value, int32_t low, int32_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+                const int right) {
+  auto* const start = static_cast<Pixel*>(line_start);
+  const Pixel* src = start;
+  Pixel* dst = start - left;
+  // Copy to left and right borders.
+  Memset(dst, src[0], left);
+  Memset(dst + left + width, src[width - 1], right);
+}
+
+// The following 2 templates set a block of data with uncontiguous memory to
+// |value|. The compilers usually generate several branches to handle
+// different cases of |columns| when inlining memset() and std::fill(), and
+// these branches are unfortunately within the loop of |rows|. So calling
+// these templates directly could be inefficient. It is recommended to
+// specialize common cases of |columns|, such as 1, 2, 4, 8, 16 and 32, etc.
+// in advance before processing the generic case of |columns|. The code size
+// may be larger, but there would be big speed gains.
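+// For example (an editor's sketch), a caller that knows |columns| is usually
+// a small power of 2 might dispatch as:
+//   if (columns == 1) {
+//     SetBlock(rows, 1, value, dst, stride);
+//   } else if (columns == 2) {
+//     SetBlock(rows, 2, value, dst, stride);
+//   } else {
+//     SetBlock(rows, columns, value, dst, stride);
+//   }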
+// Call template MemSetBlock<> when sizeof(|T|) is 1. +// Call template SetBlock<> when sizeof(|T|) is larger than 1. +template +void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + static_assert(sizeof(T) == 1, ""); + do { + memset(dst, value, columns); + dst += stride; + } while (--rows != 0); +} + +template +void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + do { + std::fill(dst, dst + columns, value); + dst += stride; + } while (--rows != 0); +} + +#if defined(__GNUC__) + +inline int CountLeadingZeros(uint32_t n) { + assert(n != 0); + return __builtin_clz(n); +} + +inline int CountLeadingZeros(uint64_t n) { + assert(n != 0); + return __builtin_clzll(n); +} + +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + return __builtin_ctz(n); +} + +#elif defined(_MSC_VER) + +inline int CountLeadingZeros(uint32_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n); + assert(bit_set != 0); + static_cast(bit_set); + return 31 ^ static_cast(first_set_bit); +} + +inline int CountLeadingZeros(uint64_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) +#if defined(HAVE_BITSCANREVERSE64) + const unsigned char bit_set = + _BitScanReverse64(&first_set_bit, static_cast(n)); +#else // !defined(HAVE_BITSCANREVERSE64) + const auto n_hi = static_cast(n >> 32); // NOLINT(runtime/int) + if (n_hi != 0) { + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); + assert(bit_set != 0); + static_cast(bit_set); + return 31 ^ static_cast(first_set_bit); + } + const unsigned char bit_set = _BitScanReverse( + &first_set_bit, static_cast(n)); // NOLINT(runtime/int) +#endif // defined(HAVE_BITSCANREVERSE64) + assert(bit_set != 0); + static_cast(bit_set); + return 63 ^ static_cast(first_set_bit); +} + +#undef HAVE_BITSCANREVERSE64 + +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) + const unsigned char bit_set = _BitScanForward(&first_set_bit, n); + assert(bit_set != 0); + static_cast(bit_set); + return static_cast(first_set_bit); +} + +#else // !defined(__GNUC__) && !defined(_MSC_VER) + +template +inline int CountLeadingZeros(T n) { + assert(n != 0); + const T msb = T{1} << kMSB; + int count = 0; + while ((n & msb) == 0) { + ++count; + n <<= 1; + } + return count; +} + +inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); } + +inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); } + +// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second +// Edition, page 109. The book says: +// If the number of trailing 0's is expected to be small or large, then the +// simple loops shown in Figure 5-23 are quite fast. +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + // Create a word with 1's at the positions of the trailing 0's in |n|, and + // 0's elsewhere (e.g., 01011000 => 00000111). 
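+  // Worked example (editor's note): for n = 0b01011000, n - 1 = 0b01010111
+  // and ~n = 0b10100111, so ~n & (n - 1) = 0b00000111; the loop below then
+  // counts its 3 set bits, matching the 3 trailing zeros of n.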
+ n = ~n & (n - 1); + int count = 0; + while (n != 0) { + ++count; + n >>= 1; + } + return count; +} + +#endif // defined(__GNUC__) + +inline int FloorLog2(int32_t n) { + assert(n > 0); + return 31 ^ CountLeadingZeros(static_cast(n)); +} + +inline int FloorLog2(uint32_t n) { + assert(n > 0); + return 31 ^ CountLeadingZeros(n); +} + +inline int FloorLog2(int64_t n) { + assert(n > 0); + return 63 ^ CountLeadingZeros(static_cast(n)); +} + +inline int FloorLog2(uint64_t n) { + assert(n > 0); + return 63 ^ CountLeadingZeros(n); +} + +inline int CeilLog2(unsigned int n) { + // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but + // also for n == 1, so this expression must be guarded by the n < 2 test. An + // alternative implementation is: + // return (n == 0) ? 0 : FloorLog2(n) + static_cast((n & (n - 1)) != 0); + return (n < 2) ? 0 : FloorLog2(n - 1) + 1; +} + +inline int RightShiftWithCeiling(int value, int bits) { + assert(bits > 0); + return (value + (1 << bits) - 1) >> bits; +} + +inline int32_t RightShiftWithRounding(int32_t value, int bits) { + assert(bits >= 0); + return (value + ((1 << bits) >> 1)) >> bits; +} + +inline uint32_t RightShiftWithRounding(uint32_t value, int bits) { + assert(bits >= 0); + return (value + ((1 << bits) >> 1)) >> bits; +} + +// This variant is used when |value| can exceed 32 bits. Although the final +// result must always fit into int32_t. +inline int32_t RightShiftWithRounding(int64_t value, int bits) { + assert(bits >= 0); + return static_cast((value + ((int64_t{1} << bits) >> 1)) >> bits); +} + +inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) { + assert(bits > 0); + // The next line is equivalent to: + // return (value >= 0) ? RightShiftWithRounding(value, bits) + // : -RightShiftWithRounding(-value, bits); + return RightShiftWithRounding(value + (value >> 31), bits); +} + +// This variant is used when |value| can exceed 32 bits. Although the final +// result must always fit into int32_t. +inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) { + assert(bits > 0); + // The next line is equivalent to: + // return (value >= 0) ? RightShiftWithRounding(value, bits) + // : -RightShiftWithRounding(-value, bits); + return RightShiftWithRounding(value + (value >> 63), bits); +} + +constexpr int DivideBy2(int n) { return n >> 1; } +constexpr int DivideBy4(int n) { return n >> 2; } +constexpr int DivideBy8(int n) { return n >> 3; } +constexpr int DivideBy16(int n) { return n >> 4; } +constexpr int DivideBy32(int n) { return n >> 5; } +constexpr int DivideBy64(int n) { return n >> 6; } +constexpr int DivideBy128(int n) { return n >> 7; } + +// Convert |value| to unsigned before shifting to avoid undefined behavior with +// negative values. +inline int LeftShift(int value, int bits) { + assert(bits >= 0); + assert(value >= -(int64_t{1} << (31 - bits))); + assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 
1 : 0)); + return static_cast(static_cast(value) << bits); +} +inline int MultiplyBy2(int n) { return LeftShift(n, 1); } +inline int MultiplyBy4(int n) { return LeftShift(n, 2); } +inline int MultiplyBy8(int n) { return LeftShift(n, 3); } +inline int MultiplyBy16(int n) { return LeftShift(n, 4); } +inline int MultiplyBy32(int n) { return LeftShift(n, 5); } +inline int MultiplyBy64(int n) { return LeftShift(n, 6); } + +constexpr int Mod32(int n) { return n & 0x1f; } +constexpr int Mod64(int n) { return n & 0x3f; } + +//------------------------------------------------------------------------------ +// Bitstream functions + +constexpr bool IsIntraFrame(FrameType type) { + return type == kFrameKey || type == kFrameIntraOnly; +} + +inline TransformClass GetTransformClass(TransformType tx_type) { + constexpr BitMaskSet kTransformClassVerticalMask( + kTransformTypeIdentityDct, kTransformTypeIdentityAdst, + kTransformTypeIdentityFlipadst); + if (kTransformClassVerticalMask.Contains(tx_type)) { + return kTransformClassVertical; + } + constexpr BitMaskSet kTransformClassHorizontalMask( + kTransformTypeDctIdentity, kTransformTypeAdstIdentity, + kTransformTypeFlipadstIdentity); + if (kTransformClassHorizontalMask.Contains(tx_type)) { + return kTransformClassHorizontal; + } + return kTransformClass2D; +} + +inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane, + int8_t subsampling) { + return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling); +} + +constexpr PlaneType GetPlaneType(Plane plane) { + return static_cast(plane != kPlaneY); +} + +// 5.11.44. +constexpr bool IsDirectionalMode(PredictionMode mode) { + return mode >= kPredictionModeVertical && mode <= kPredictionModeD67; +} + +// 5.9.3. +// +// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit +// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32. +// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a +// value between 24 and 31 (inclusive). +// +// If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the +// result is zero. If |order_hint_shift_bits| is not zero, returns the +// signed difference |a| - |b| using "modular arithmetic". More precisely, the +// signed difference |a| - |b| is treated as a signed order_hint_bits-bit +// integer and cast to an int. The returned difference is between +// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1 +// (inclusive). +// +// NOTE: |a| and |b| are the order_hint_bits least significant bits of the +// actual values. This function returns the signed difference between the +// actual values. The returned difference is correct as long as the actual +// values are not more than 1 << (order_hint_bits - 1) - 1 apart. +// +// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits| +// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for +// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and +// |b| are exactly 8 apart, this function cannot tell whether the actual value +// for |a| is before or after the actual value for |b|.) +// +// First, consider the order hints 2 and 6. For this simple case, we have +// GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and +// GetRelativeDistance(6, 2, 28) = 6 - 2 = 4. +// +// On the other hand, consider the order hints 2 and 14. The order hints are +// 12 (> 7) apart, so we need to use the actual values instead. The actual +// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. 
Therefore
+// we have
+// GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and
+// GetRelativeDistance(14, 2, 28) = 30 - 34 = -4.
+//
+// The following comments apply only to specific CPUs' SIMD implementations,
+// such as intrinsics code.
+// For the 2 shift operations in this function, if the SIMD packed data is
+// 16-bit wide, try to use |order_hint_shift_bits| - 16 as the number of bits
+// to shift; if the SIMD packed data is 8-bit wide, try to use
+// |order_hint_shift_bits| - 24 as the number of bits to shift.
+// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16
+// or -24. In these cases diff is 0, and the behavior of left or right
+// shifting -16 or -24 bits is defined for x86 SIMD instructions and ARM NEON
+// instructions, and the result of shifting 0 is still 0. There is no
+// guarantee that this behavior and result apply to other CPUs' SIMD
+// instructions.
+inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
+                               const unsigned int order_hint_shift_bits) {
+  const int diff = a - b;
+  assert(order_hint_shift_bits <= 31);
+  if (order_hint_shift_bits == 0) {
+    assert(a == 0);
+    assert(b == 0);
+  } else {
+    assert(order_hint_shift_bits >= 24);  // i.e., order_hint_bits <= 8
+    assert(a < (1u << (32 - order_hint_shift_bits)));
+    assert(b < (1u << (32 - order_hint_shift_bits)));
+    assert(diff < (1 << (32 - order_hint_shift_bits)));
+    assert(diff >= -(1 << (32 - order_hint_shift_bits)));
+  }
+  // Sign extend the result of subtracting the values.
+  // Cast to unsigned int and then left shift to avoid undefined behavior with
+  // negative values. Cast to int to do the sign extension through right
+  // shift. This requires the right shift of a signed integer be an arithmetic
+  // shift, which is true for clang, gcc, and Visual C++.
+  // These two casts do not generate extra instructions.
+  // Don't use LeftShift(diff) since a valid diff may fail its assertions.
+  // For example, GetRelativeDistance(2, 14, 28), diff equals -12 and is less
+  // than the minimum allowed value of LeftShift() which is -8.
+  // The next 3 lines are equivalent to:
+  //   const int order_hint_bits = Mod32(32 - order_hint_shift_bits);
+  //   const int m = (1 << order_hint_bits) >> 1;
+  //   return (diff & (m - 1)) - (diff & m);
+  return static_cast<int>(static_cast<unsigned int>(diff)
+                          << order_hint_shift_bits) >>
+         order_hint_shift_bits;
+}
+
+// Applies |sign| (must be 0 or -1) to |value|, i.e.,
+//   return (sign == 0) ? value : -value;
+// and does so without a branch.
+constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
+
+// 7.9.3. (without the clamp for numerator and denominator).
+inline void GetMvProjection(const MotionVector& mv, int numerator,
+                            int division_multiplier,
+                            MotionVector* projection_mv) {
+  // Allow |numerator| and |division_multiplier| to be 0 so that this function
+  // can be called unconditionally. When |numerator| is 0, |projection_mv|
+  // will be 0, and this is what we want.
+  assert(std::abs(numerator) <= kMaxFrameDistance);
+  for (int i = 0; i < 2; ++i) {
+    projection_mv->mv[i] =
+        Clip3(RightShiftWithRoundingSigned(
+                  mv.mv[i] * numerator * division_multiplier, 14),
+              -kProjectionMvClamp, kProjectionMvClamp);
+  }
+}
+
+// 7.9.4.
+constexpr int Project(int value, int delta, int dst_sign) {
+  return value + ApplySign(delta / 64, dst_sign);
+}
+
+inline bool IsBlockSmallerThan8x8(BlockSize size) {
+  return size < kBlock8x8 && size != kBlock4x16;
+}
+
+// Returns true if either the width or the height of the block is equal to
+// four.
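+// In the BlockSize enum ordering, every size before kBlock8x8 (kBlock4x4,
+// kBlock4x8, kBlock4x16 and kBlock8x4) has a dimension of four; kBlock16x4 is
+// the only later size that does, hence the extra comparison (editor's note).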
+inline bool IsBlockDimension4(BlockSize size) { + return size < kBlock8x8 || size == kBlock16x4; +} + +// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively. +constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; } + +// Maps a square transform to an index between [0, 4]. kTransformSize4x4 maps +// to 0, kTransformSize8x8 maps to 1 and so on. +inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) { + assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]); + + // The values of the square transform sizes happen to be in the right + // ranges, so we can just divide them by 4 to get the indexes. + static_assert( + std::is_unsigned::type>::value, ""); + static_assert(kTransformSize4x4 < 4, ""); + static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, ""); + static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, ""); + static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, ""); + static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, ""); + return DivideBy4(tx_size); +} + +// Gets the corresponding Y/U/V position, to set and get filter masks +// in deblock filtering. +// Returns luma_position if it's Y plane, whose subsampling must be 0. +// Returns the odd position for U/V plane, if there is subsampling. +constexpr int GetDeblockPosition(const int luma_position, + const int subsampling) { + return luma_position | subsampling; +} + +// Returns the size of the residual buffer required to hold the residual values +// for a block or frame of size |rows| by |columns| (taking into account +// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the +// number of bytes required to represent one residual value. +inline size_t GetResidualBufferSize(const int rows, const int columns, + const int subsampling_x, + const int subsampling_y, + const size_t residual_size) { + // The subsampling multipliers are: + // Both x and y are subsampled: 3 / 2. + // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2). + // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2). + // So we compute the final subsampling multiplier as follows: + // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2. + // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks + // when parsing quantized coefficients. + const int subsampling_multiplier_num = + 2 + (4 >> subsampling_x >> subsampling_y); + const int number_elements = + (rows * columns * subsampling_multiplier_num) >> 1; + const int tx_padding = 32 * kResidualPaddingVertical; + return residual_size * (number_elements + tx_padding); +} + +// This function is equivalent to: +// std::min({kTransformWidthLog2[tx_size] - 2, +// kTransformWidthLog2[left_tx_size] - 2, +// 2}); +constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth( + TransformSize tx_size, TransformSize left_tx_size) { + return static_cast( + static_cast(tx_size > kTransformSize4x16 && + left_tx_size > kTransformSize4x16) + + static_cast(tx_size > kTransformSize8x32 && + left_tx_size > kTransformSize8x32)); +} + +// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve +// filters. 
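+// For example (an editor's illustration), GetFilterIndex(
+// kInterpolationFilterEightTap, 4) returns 4, selecting a filter short
+// enough for a 4-pixel block (presumably a 4-tap variant in the filter
+// tables), while GetFilterIndex(kInterpolationFilterEightTap, 8) returns the
+// index unchanged.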
+inline int GetFilterIndex(const int filter_index, const int length) { + if (length <= 4) { + if (filter_index == kInterpolationFilterEightTap || + filter_index == kInterpolationFilterEightTapSharp) { + return 4; + } + if (filter_index == kInterpolationFilterEightTapSmooth) { + return 5; + } + } + return filter_index; +} + +// This has identical results as RightShiftWithRounding since |subsampling| can +// only be 0 or 1. +constexpr int SubsampledValue(int value, int subsampling) { + return (value + subsampling) >> subsampling; +} + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_COMMON_H_ diff --git a/src/utils/compiler_attributes.h b/src/utils/compiler_attributes.h new file mode 100644 index 0000000..e122426 --- /dev/null +++ b/src/utils/compiler_attributes.h @@ -0,0 +1,181 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_ +#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_ + +// A collection of compiler attribute checks and defines to control for +// compatibility across toolchains. + +//------------------------------------------------------------------------------ +// Language version, attribute and feature helpers. + +// Detect c++17 support. Visual Studio sets __cplusplus to 199711L by default +// unless compiled with /Zc:__cplusplus, use the value controlled by /std +// instead. +// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define LIBGAV1_CXX17 1 +#else +#define LIBGAV1_CXX17 0 +#endif + +#if defined(__has_attribute) +#define LIBGAV1_HAS_ATTRIBUTE __has_attribute +#else +#define LIBGAV1_HAS_ATTRIBUTE(x) 0 +#endif + +#if defined(__has_feature) +#define LIBGAV1_HAS_FEATURE __has_feature +#else +#define LIBGAV1_HAS_FEATURE(x) 0 +#endif + +//------------------------------------------------------------------------------ +// Sanitizer attributes. + +#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define LIBGAV1_ASAN 1 +#else +#define LIBGAV1_ASAN 0 +#endif + +#if LIBGAV1_HAS_FEATURE(memory_sanitizer) +#define LIBGAV1_MSAN 1 +#else +#define LIBGAV1_MSAN 0 +#endif + +#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__) +#define LIBGAV1_TSAN 1 +#else +#define LIBGAV1_TSAN 0 +#endif + +//------------------------------------------------------------------------------ +// AddressSanitizer support. + +// Define the macros for AddressSanitizer manual memory poisoning. See +// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning. +#if LIBGAV1_ASAN +#include +#else +#define ASAN_POISON_MEMORY_REGION(addr, size) \ + (static_cast(addr), static_cast(size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + (static_cast(addr), static_cast(size)) +#endif + +//------------------------------------------------------------------------------ +// Function attributes. 
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html +// Clang: https://clang.llvm.org/docs/AttributeReference.html + +#if defined(__GNUC__) +#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define LIBGAV1_ALWAYS_INLINE __forceinline +#else +#define LIBGAV1_ALWAYS_INLINE inline +#endif + +// LIBGAV1_MUST_USE_RESULT +// +// Tells the compiler to warn about unused results. +// +// When annotating a function, it must appear as the first part of the +// declaration or definition. The compiler will warn if the return value from +// such a function is unused: +// +// LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket(); +// AllocateSprocket(); // Triggers a warning. +// +// When annotating a class, it is equivalent to annotating every function which +// returns an instance. +// +// class LIBGAV1_MUST_USE_RESULT Sprocket {}; +// Sprocket(); // Triggers a warning. +// +// Sprocket MakeSprocket(); +// MakeSprocket(); // Triggers a warning. +// +// Note that references and pointers are not instances: +// +// Sprocket* SprocketPointer(); +// SprocketPointer(); // Does *not* trigger a warning. +// +// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused +// result warning. For that, warn_unused_result is used only for clang but not +// for gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425 +#if LIBGAV1_HAS_ATTRIBUTE(nodiscard) +#define LIBGAV1_MUST_USE_RESULT [[nodiscard]] +#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result) +#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define LIBGAV1_MUST_USE_RESULT +#endif + +// LIBGAV1_PRINTF_ATTRIBUTE +// +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +// +// Note: As the GCC manual states, "[s]ince non-static C++ methods +// have an implicit 'this' argument, the arguments of such methods +// should be counted from two, not one." +#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__)) +#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#else +#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) +#endif + +//------------------------------------------------------------------------------ +// Thread annotations. + +// LIBGAV1_GUARDED_BY() +// +// Documents if a shared field or global variable needs to be protected by a +// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex +// that should be held when accessing the annotated variable. +// +// Although this annotation cannot be applied to local variables, a local +// variable and its associated mutex can often be combined into a small class +// or struct, thereby allowing the annotation. +// +// Example: +// +// class Foo { +// Mutex mu_; +// int p1_ LIBGAV1_GUARDED_BY(mu_); +// ... +// }; +// TODO(b/132506370): this can be reenabled after a local MutexLock +// implementation is added with proper thread annotations. 
+#if 0 // LIBGAV1_HAS_ATTRIBUTE(guarded_by) +#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x))) +#else +#define LIBGAV1_GUARDED_BY(x) +#endif + +//------------------------------------------------------------------------------ + +#undef LIBGAV1_HAS_ATTRIBUTE +#undef LIBGAV1_HAS_FEATURE + +#endif // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_ diff --git a/src/utils/constants.cc b/src/utils/constants.cc new file mode 100644 index 0000000..80d7acb --- /dev/null +++ b/src/utils/constants.cc @@ -0,0 +1,874 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/constants.h" + +namespace libgav1 { + +const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5}; + +const uint8_t k4x4HeightLog2[kMaxBlockSizes] = { + 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5}; + +const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = { + 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32}; + +const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = { + 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32}; + +const uint8_t kBlockWidthPixels[kMaxBlockSizes] = { + 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, + 16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128}; + +const uint8_t kBlockHeightPixels[kMaxBlockSizes] = { + 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, + 64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128}; + +// 9.3 -- Partition_Subsize[] +const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = { + // kPartitionNone + {kBlock4x4, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64, kBlockInvalid, + kBlockInvalid, kBlock128x128}, + // kPartitionHorizontal + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionVertical + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, kBlockInvalid, + kBlockInvalid, kBlock64x64}, + // kPartitionHorizontalWithTopSplit + {kBlockInvalid, kBlockInvalid, 
kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionHorizontalWithBottomSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionVerticalWithLeftSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionVerticalWithRightSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionHorizontal4 + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16, kBlockInvalid, + kBlockInvalid, kBlockInvalid}, + // kPartitionVertical4 + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64, kBlockInvalid, + kBlockInvalid, kBlockInvalid}}; + +// 5.11.38 (implemented as a simple look up. first dimension is block size, +// second and third are subsampling_x and subsampling_y). 
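+// For example, a caller performing this lookup (an illustrative, hypothetical
+// snippet, not a helper defined in this file) would write:
+//   const BlockSize residual_size =
+//       kPlaneResidualSize[block_size][subsampling_x][subsampling_y];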
+const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = { + {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}}, + {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}}, + {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}}, + {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}}, + {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}}, + {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}}, + {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}}, + {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}}, + {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}}, + {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}}, + {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}}, + {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}}, + {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}}, + {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}}, + {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}}, + {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}}, + {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}}, + {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}}, + {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}}, + {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}}, + {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}}, + {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}}; + +const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = { + 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, + 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, + 744, 712, 682, 655, 630, 606, 585, 564, 546, 528}; + +const uint8_t kTransformWidth[kNumTransformSizes] = { + 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64}; + +const uint8_t kTransformHeight[kNumTransformSizes] = { + 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64}; + +const uint8_t kTransformWidth4x4[kNumTransformSizes] = { + 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16}; + +const uint8_t kTransformHeight4x4[kNumTransformSizes] = { + 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16}; + +const uint8_t kTransformWidthLog2[kNumTransformSizes] = { + 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6}; + +const uint8_t kTransformHeightLog2[kNumTransformSizes] = { + 2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6}; + +// 9.3 -- Split_Tx_Size[] +const TransformSize kSplitTransformSize[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize4x4, kTransformSize4x8, + kTransformSize4x4, kTransformSize4x4, kTransformSize8x8, + kTransformSize8x16, kTransformSize8x4, kTransformSize8x8, + kTransformSize8x8, kTransformSize16x16, kTransformSize16x32, + kTransformSize16x8, kTransformSize16x16, kTransformSize16x16, + kTransformSize32x32, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x32}; + +// Square transform of size min(w,h). +const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize4x4, kTransformSize4x4, + kTransformSize4x4, kTransformSize8x8, kTransformSize8x8, + kTransformSize8x8, kTransformSize4x4, kTransformSize8x8, + kTransformSize16x16, kTransformSize16x16, kTransformSize16x16, + kTransformSize8x8, kTransformSize16x16, kTransformSize32x32, + kTransformSize32x32, kTransformSize16x16, kTransformSize32x32, + kTransformSize64x64}; + +// Square transform of size max(w,h). 
+const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize8x8, kTransformSize16x16, + kTransformSize8x8, kTransformSize8x8, kTransformSize16x16, + kTransformSize32x32, kTransformSize16x16, kTransformSize16x16, + kTransformSize16x16, kTransformSize32x32, kTransformSize64x64, + kTransformSize32x32, kTransformSize32x32, kTransformSize32x32, + kTransformSize64x64, kTransformSize64x64, kTransformSize64x64, + kTransformSize64x64}; + +const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2}; + +const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = { + {2, 12, 1, 4}, {2, 15, 1, 6}, {2, 18, 1, 8}, {2, 21, 1, 9}, + {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13}, + {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5}, {0, 0, 1, 8}, + {0, 0, 1, 11}, {0, 0, 1, 14}, {2, 30, 0, 0}, {2, 75, 0, 0}}; + +const int8_t kSgrProjMultiplierMin[2] = {-96, -32}; + +const int8_t kSgrProjMultiplierMax[2] = {31, 95}; + +const int8_t kWienerTapsMin[3] = {-5, -23, -17}; + +const int8_t kWienerTapsMax[3] = {10, 8, 46}; + +// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in +// order to support 16-bit packed NEON operations. +// The sign of each tap is: - + - + + - + - +alignas(16) const uint8_t + kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = { + {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, 1, 128, 2, 1, 0, 0}, + {0, 1, 3, 127, 4, 2, 1, 0}, {0, 1, 4, 127, 6, 3, 1, 0}, + {0, 2, 6, 126, 8, 3, 1, 0}, {0, 2, 7, 125, 11, 4, 1, 0}, + {1, 2, 8, 125, 13, 5, 2, 0}, {1, 3, 9, 124, 15, 6, 2, 0}, + {1, 3, 10, 123, 18, 6, 2, 1}, {1, 3, 11, 122, 20, 7, 3, 1}, + {1, 4, 12, 121, 22, 8, 3, 1}, {1, 4, 13, 120, 25, 9, 3, 1}, + {1, 4, 14, 118, 28, 9, 3, 1}, {1, 4, 15, 117, 30, 10, 4, 1}, + {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1}, + {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1}, + {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1}, + {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1}, + {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1}, + {1, 6, 20, 97, 58, 17, 6, 1}, {1, 6, 20, 95, 61, 18, 6, 1}, + {2, 7, 20, 93, 64, 18, 6, 2}, {2, 7, 20, 91, 66, 19, 6, 1}, + {2, 7, 20, 88, 69, 19, 6, 1}, {2, 7, 20, 86, 71, 19, 6, 1}, + {2, 7, 20, 84, 74, 20, 7, 2}, {2, 7, 20, 81, 76, 20, 7, 1}, + {2, 7, 20, 79, 79, 20, 7, 2}, {1, 7, 20, 76, 81, 20, 7, 2}, + {2, 7, 20, 74, 84, 20, 7, 2}, {1, 6, 19, 71, 86, 20, 7, 2}, + {1, 6, 19, 69, 88, 20, 7, 2}, {1, 6, 19, 66, 91, 20, 7, 2}, + {2, 6, 18, 64, 93, 20, 7, 2}, {1, 6, 18, 61, 95, 20, 6, 1}, + {1, 6, 17, 58, 97, 20, 6, 1}, {1, 6, 17, 56, 99, 20, 6, 1}, + {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1}, + {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1}, + {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1}, + {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1}, + {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1}, + {1, 3, 9, 28, 118, 14, 4, 1}, {1, 3, 9, 25, 120, 13, 4, 1}, + {1, 3, 8, 22, 121, 12, 4, 1}, {1, 3, 7, 20, 122, 11, 3, 1}, + {1, 2, 6, 18, 123, 10, 3, 1}, {0, 2, 6, 15, 124, 9, 3, 1}, + {0, 2, 5, 13, 125, 8, 2, 1}, {0, 1, 4, 11, 125, 7, 2, 0}, + {0, 1, 3, 8, 126, 6, 2, 0}, {0, 1, 3, 6, 127, 4, 1, 0}, + {0, 1, 2, 4, 127, 3, 1, 0}, {0, 0, 1, 2, 128, 1, 0, 0}, +}; + +alignas(8) const int8_t + kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = { + // [-1, 0). 
+ {0, 0, 127, 1, 0, 0, 0, 0}, + {0, -1, 127, 2, 0, 0, 0, 0}, + {1, -3, 127, 4, -1, 0, 0, 0}, + {1, -4, 126, 6, -2, 1, 0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, + {1, -6, 125, 11, -4, 1, 0, 0}, + {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -8, 123, 15, -5, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, + {2, -10, 121, 20, -6, 1, 0, 0}, + {2, -11, 120, 22, -7, 2, 0, 0}, + {2, -12, 119, 25, -8, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, + {3, -13, 116, 29, -9, 2, 0, 0}, + {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 113, 35, -10, 2, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, + {3, -16, 109, 40, -11, 3, 0, 0}, + {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 106, 45, -13, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, + {4, -17, 102, 50, -14, 3, 0, 0}, + {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 98, 55, -15, 4, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, + {4, -18, 94, 60, -16, 4, 0, 0}, + {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 89, 65, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, + {4, -18, 85, 70, -17, 4, 0, 0}, + {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 80, 75, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, + {4, -17, 75, 80, -18, 4, 0, 0}, + {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 70, 85, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, + {4, -16, 65, 89, -18, 4, 0, 0}, + {4, -16, 63, 91, -18, 4, 0, 0}, + {4, -16, 60, 94, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, + {4, -15, 55, 98, -18, 4, 0, 0}, + {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -14, 50, 102, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, + {3, -13, 45, 106, -17, 4, 0, 0}, + {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 40, 109, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, + {2, -10, 35, 113, -15, 3, 0, 0}, + {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -9, 29, 116, -13, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, + {2, -8, 25, 119, -12, 2, 0, 0}, + {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 20, 121, -10, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, + {1, -5, 15, 123, -8, 2, 0, 0}, + {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -4, 11, 125, -6, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, + {1, -2, 6, 126, -4, 1, 0, 0}, + {0, -1, 4, 127, -3, 1, 0, 0}, + {0, 0, 2, 127, -1, 0, 0, 0}, + // [0, 1). 
+ {0, 0, 0, 127, 1, 0, 0, 0}, + {0, 0, -1, 127, 2, 0, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, + {0, 1, -5, 127, 6, -2, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, + {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, + {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, + {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, + {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, + {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, + {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, + {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, + {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, + {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, + {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, + {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, + {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, + {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, + {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, + {-1, 2, -4, 11, 126, -7, 2, -1}, + {0, 1, -3, 8, 126, -6, 2, 0}, + {0, 1, -2, 6, 127, -5, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, + {0, 0, 0, 2, 127, -1, 0, 0}, + // [1, 2). 
+ {0, 0, 0, 1, 127, 0, 0, 0}, + {0, 0, 0, -1, 127, 2, 0, 0}, + {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -4, 126, 6, -2, 1}, + {0, 0, 1, -5, 126, 8, -3, 1}, + {0, 0, 1, -6, 125, 11, -4, 1}, + {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -8, 123, 15, -5, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, + {0, 0, 2, -10, 121, 20, -6, 1}, + {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 2, -12, 119, 25, -8, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, + {0, 0, 3, -13, 116, 29, -9, 2}, + {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 113, 35, -10, 2}, + {0, 0, 3, -15, 111, 37, -11, 3}, + {0, 0, 3, -16, 109, 40, -11, 3}, + {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 106, 45, -13, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, + {0, 0, 4, -17, 102, 50, -14, 3}, + {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 98, 55, -15, 4}, + {0, 0, 4, -18, 96, 58, -15, 3}, + {0, 0, 4, -18, 94, 60, -16, 4}, + {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 89, 65, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, + {0, 0, 4, -18, 85, 70, -17, 4}, + {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 80, 75, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, + {0, 0, 4, -17, 75, 80, -18, 4}, + {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 70, 85, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, + {0, 0, 4, -16, 65, 89, -18, 4}, + {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 4, -16, 60, 94, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, + {0, 0, 4, -15, 55, 98, -18, 4}, + {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -14, 50, 102, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, + {0, 0, 3, -13, 45, 106, -17, 4}, + {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 40, 109, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, + {0, 0, 2, -10, 35, 113, -15, 3}, + {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -9, 29, 116, -13, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, + {0, 0, 2, -8, 25, 119, -12, 2}, + {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 20, 121, -10, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, + {0, 0, 1, -5, 15, 123, -8, 2}, + {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -4, 11, 125, -6, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, + {0, 0, 1, -2, 6, 126, -4, 1}, + {0, 0, 0, -1, 4, 127, -3, 1}, + {0, 0, 0, 0, 2, 127, -1, 0}, + // dummy, replicate row index 191. + {0, 0, 0, 0, 2, 127, -1, 0}}; + +alignas(16) const int16_t + kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = { + // [-1, 0). 
+ {0, 0, 127, 1, 0, 0, 0, 0}, + {0, -1, 127, 2, 0, 0, 0, 0}, + {1, -3, 127, 4, -1, 0, 0, 0}, + {1, -4, 126, 6, -2, 1, 0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, + {1, -6, 125, 11, -4, 1, 0, 0}, + {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -8, 123, 15, -5, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, + {2, -10, 121, 20, -6, 1, 0, 0}, + {2, -11, 120, 22, -7, 2, 0, 0}, + {2, -12, 119, 25, -8, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, + {3, -13, 116, 29, -9, 2, 0, 0}, + {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 113, 35, -10, 2, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, + {3, -16, 109, 40, -11, 3, 0, 0}, + {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 106, 45, -13, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, + {4, -17, 102, 50, -14, 3, 0, 0}, + {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 98, 55, -15, 4, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, + {4, -18, 94, 60, -16, 4, 0, 0}, + {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 89, 65, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, + {4, -18, 85, 70, -17, 4, 0, 0}, + {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 80, 75, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, + {4, -17, 75, 80, -18, 4, 0, 0}, + {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 70, 85, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, + {4, -16, 65, 89, -18, 4, 0, 0}, + {4, -16, 63, 91, -18, 4, 0, 0}, + {4, -16, 60, 94, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, + {4, -15, 55, 98, -18, 4, 0, 0}, + {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -14, 50, 102, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, + {3, -13, 45, 106, -17, 4, 0, 0}, + {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 40, 109, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, + {2, -10, 35, 113, -15, 3, 0, 0}, + {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -9, 29, 116, -13, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, + {2, -8, 25, 119, -12, 2, 0, 0}, + {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 20, 121, -10, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, + {1, -5, 15, 123, -8, 2, 0, 0}, + {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -4, 11, 125, -6, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, + {1, -2, 6, 126, -4, 1, 0, 0}, + {0, -1, 4, 127, -3, 1, 0, 0}, + {0, 0, 2, 127, -1, 0, 0, 0}, + // [0, 1). 
+ {0, 0, 0, 127, 1, 0, 0, 0}, + {0, 0, -1, 127, 2, 0, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, + {0, 1, -5, 127, 6, -2, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, + {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, + {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, + {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, + {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, + {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, + {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, + {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, + {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, + {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, + {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, + {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, + {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, + {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, + {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, + {-1, 2, -4, 11, 126, -7, 2, -1}, + {0, 1, -3, 8, 126, -6, 2, 0}, + {0, 1, -2, 6, 127, -5, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, + {0, 0, 0, 2, 127, -1, 0, 0}, + // [1, 2). 
+ {0, 0, 0, 1, 127, 0, 0, 0}, + {0, 0, 0, -1, 127, 2, 0, 0}, + {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -4, 126, 6, -2, 1}, + {0, 0, 1, -5, 126, 8, -3, 1}, + {0, 0, 1, -6, 125, 11, -4, 1}, + {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -8, 123, 15, -5, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, + {0, 0, 2, -10, 121, 20, -6, 1}, + {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 2, -12, 119, 25, -8, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, + {0, 0, 3, -13, 116, 29, -9, 2}, + {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 113, 35, -10, 2}, + {0, 0, 3, -15, 111, 37, -11, 3}, + {0, 0, 3, -16, 109, 40, -11, 3}, + {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 106, 45, -13, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, + {0, 0, 4, -17, 102, 50, -14, 3}, + {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 98, 55, -15, 4}, + {0, 0, 4, -18, 96, 58, -15, 3}, + {0, 0, 4, -18, 94, 60, -16, 4}, + {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 89, 65, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, + {0, 0, 4, -18, 85, 70, -17, 4}, + {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 80, 75, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, + {0, 0, 4, -17, 75, 80, -18, 4}, + {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 70, 85, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, + {0, 0, 4, -16, 65, 89, -18, 4}, + {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 4, -16, 60, 94, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, + {0, 0, 4, -15, 55, 98, -18, 4}, + {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -14, 50, 102, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, + {0, 0, 3, -13, 45, 106, -17, 4}, + {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 40, 109, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, + {0, 0, 2, -10, 35, 113, -15, 3}, + {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -9, 29, 116, -13, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, + {0, 0, 2, -8, 25, 119, -12, 2}, + {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 20, 121, -10, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, + {0, 0, 1, -5, 15, 123, -8, 2}, + {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -4, 11, 125, -6, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, + {0, 0, 1, -2, 6, 126, -4, 1}, + {0, 0, 0, -1, 4, 127, -3, 1}, + {0, 0, 0, 0, 2, 127, -1, 0}, + // dummy, replicate row index 191. + {0, 0, 0, 0, 2, 127, -1, 0}}; + +// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify +// calculations by reducing the range by 1 bit. 
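+// An original |kSubPixelFilters| tap can therefore be recovered by doubling,
+// e.g. (illustrative only):
+//   const int tap = 2 * kHalfSubPixelFilters[filter_index][subpixel][k];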
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = { + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, -3, 63, 4, -1, 0, 0}, + {0, 1, -5, 61, 9, -2, 0, 0}, + {0, 1, -6, 58, 14, -4, 1, 0}, + {0, 1, -7, 55, 19, -5, 1, 0}, + {0, 1, -7, 51, 24, -6, 1, 0}, + {0, 1, -8, 47, 29, -6, 1, 0}, + {0, 1, -7, 42, 33, -6, 1, 0}, + {0, 1, -7, 38, 38, -7, 1, 0}, + {0, 1, -6, 33, 42, -7, 1, 0}, + {0, 1, -6, 29, 47, -8, 1, 0}, + {0, 1, -6, 24, 51, -7, 1, 0}, + {0, 1, -5, 19, 55, -7, 1, 0}, + {0, 1, -4, 14, 58, -6, 1, 0}, + {0, 0, -2, 9, 61, -5, 1, 0}, + {0, 0, -1, 4, 63, -3, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 14, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, -1, 8, 27, 24, 6, 0, 0}, + {0, -1, 7, 26, 26, 7, -1, 0}, + {0, 0, 6, 24, 27, 8, -1, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 14, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 1, -3, 63, 4, -1, 1, 0}, + {-1, 3, -6, 62, 8, -3, 2, -1}, + {-1, 4, -9, 60, 13, -5, 3, -1}, + {-2, 5, -11, 58, 19, -7, 3, -1}, + {-2, 5, -11, 54, 24, -9, 4, -1}, + {-2, 5, -12, 50, 30, -10, 4, -1}, + {-2, 5, -12, 45, 35, -11, 5, -1}, + {-2, 6, -12, 40, 40, -12, 6, -2}, + {-1, 5, -11, 35, 45, -12, 5, -2}, + {-1, 4, -10, 30, 50, -12, 5, -2}, + {-1, 4, -9, 24, 54, -11, 5, -2}, + {-1, 3, -7, 19, 58, -11, 5, -2}, + {-1, 3, -5, 13, 60, -9, 4, -1}, + {-1, 2, -3, 8, 62, -6, 3, -1}, + {0, 1, -1, 4, 63, -3, 1, -1}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 0, 60, 4, 0, 0, 0}, + {0, 0, 0, 56, 8, 0, 0, 0}, + {0, 0, 0, 52, 12, 0, 0, 0}, + {0, 0, 0, 48, 16, 0, 0, 0}, + {0, 0, 0, 44, 20, 0, 0, 0}, + {0, 0, 0, 40, 24, 0, 0, 0}, + {0, 0, 0, 36, 28, 0, 0, 0}, + {0, 0, 0, 32, 32, 0, 0, 0}, + {0, 0, 0, 28, 36, 0, 0, 0}, + {0, 0, 0, 24, 40, 0, 0, 0}, + {0, 0, 0, 20, 44, 0, 0, 0}, + {0, 0, 0, 16, 48, 0, 0, 0}, + {0, 0, 0, 12, 52, 0, 0, 0}, + {0, 0, 0, 8, 56, 0, 0, 0}, + {0, 0, 0, 4, 60, 0, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, -2, 63, 4, -1, 0, 0}, + {0, 0, -4, 61, 9, -2, 0, 0}, + {0, 0, -5, 58, 14, -3, 0, 0}, + {0, 0, -6, 55, 19, -4, 0, 0}, + {0, 0, -6, 51, 24, -5, 0, 0}, + {0, 0, -7, 47, 29, -5, 0, 0}, + {0, 0, -6, 42, 33, -5, 0, 0}, + {0, 0, -6, 38, 38, -6, 0, 0}, + {0, 0, -5, 33, 42, -6, 0, 0}, + {0, 0, -5, 29, 47, -7, 0, 0}, + {0, 0, -5, 24, 51, -6, 0, 0}, + {0, 0, -4, 19, 55, -6, 0, 0}, + {0, 0, -3, 14, 58, -5, 0, 0}, + {0, 0, -2, 9, 61, -4, 0, 0}, + {0, 0, -1, 4, 63, -2, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 15, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 0, 7, 27, 24, 6, 0, 0}, + {0, 0, 6, 26, 26, 6, 0, 0}, + {0, 0, 6, 24, 27, 7, 0, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 15, 0, 0}}}; + +// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know +// the pattern of the signs and account for it in other ways. 
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = { + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 3, 63, 4, 1, 0, 0}, + {0, 1, 5, 61, 9, 2, 0, 0}, + {0, 1, 6, 58, 14, 4, 1, 0}, + {0, 1, 7, 55, 19, 5, 1, 0}, + {0, 1, 7, 51, 24, 6, 1, 0}, + {0, 1, 8, 47, 29, 6, 1, 0}, + {0, 1, 7, 42, 33, 6, 1, 0}, + {0, 1, 7, 38, 38, 7, 1, 0}, + {0, 1, 6, 33, 42, 7, 1, 0}, + {0, 1, 6, 29, 47, 8, 1, 0}, + {0, 1, 6, 24, 51, 7, 1, 0}, + {0, 1, 5, 19, 55, 7, 1, 0}, + {0, 1, 4, 14, 58, 6, 1, 0}, + {0, 0, 2, 9, 61, 5, 1, 0}, + {0, 0, 1, 4, 63, 3, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 14, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 1, 8, 27, 24, 6, 0, 0}, + {0, 1, 7, 26, 26, 7, 1, 0}, + {0, 0, 6, 24, 27, 8, 1, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 14, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {1, 1, 3, 63, 4, 1, 1, 0}, + {1, 3, 6, 62, 8, 3, 2, 1}, + {1, 4, 9, 60, 13, 5, 3, 1}, + {2, 5, 11, 58, 19, 7, 3, 1}, + {2, 5, 11, 54, 24, 9, 4, 1}, + {2, 5, 12, 50, 30, 10, 4, 1}, + {2, 5, 12, 45, 35, 11, 5, 1}, + {2, 6, 12, 40, 40, 12, 6, 2}, + {1, 5, 11, 35, 45, 12, 5, 2}, + {1, 4, 10, 30, 50, 12, 5, 2}, + {1, 4, 9, 24, 54, 11, 5, 2}, + {1, 3, 7, 19, 58, 11, 5, 2}, + {1, 3, 5, 13, 60, 9, 4, 1}, + {1, 2, 3, 8, 62, 6, 3, 1}, + {0, 1, 1, 4, 63, 3, 1, 1}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 0, 60, 4, 0, 0, 0}, + {0, 0, 0, 56, 8, 0, 0, 0}, + {0, 0, 0, 52, 12, 0, 0, 0}, + {0, 0, 0, 48, 16, 0, 0, 0}, + {0, 0, 0, 44, 20, 0, 0, 0}, + {0, 0, 0, 40, 24, 0, 0, 0}, + {0, 0, 0, 36, 28, 0, 0, 0}, + {0, 0, 0, 32, 32, 0, 0, 0}, + {0, 0, 0, 28, 36, 0, 0, 0}, + {0, 0, 0, 24, 40, 0, 0, 0}, + {0, 0, 0, 20, 44, 0, 0, 0}, + {0, 0, 0, 16, 48, 0, 0, 0}, + {0, 0, 0, 12, 52, 0, 0, 0}, + {0, 0, 0, 8, 56, 0, 0, 0}, + {0, 0, 0, 4, 60, 0, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 2, 63, 4, 1, 0, 0}, + {0, 0, 4, 61, 9, 2, 0, 0}, + {0, 0, 5, 58, 14, 3, 0, 0}, + {0, 0, 6, 55, 19, 4, 0, 0}, + {0, 0, 6, 51, 24, 5, 0, 0}, + {0, 0, 7, 47, 29, 5, 0, 0}, + {0, 0, 6, 42, 33, 5, 0, 0}, + {0, 0, 6, 38, 38, 6, 0, 0}, + {0, 0, 5, 33, 42, 6, 0, 0}, + {0, 0, 5, 29, 47, 7, 0, 0}, + {0, 0, 5, 24, 51, 6, 0, 0}, + {0, 0, 4, 19, 55, 6, 0, 0}, + {0, 0, 3, 14, 58, 5, 0, 0}, + {0, 0, 2, 9, 61, 4, 0, 0}, + {0, 0, 1, 4, 63, 2, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 15, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 0, 7, 27, 24, 6, 0, 0}, + {0, 0, 6, 26, 26, 6, 0, 0}, + {0, 0, 6, 24, 27, 7, 0, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 15, 0, 0}}}; + +// 9.3 -- Dr_Intra_Derivative[] +// This is a more compact version of the table from the spec. angle / 2 - 1 is +// used as the lookup. Note angle / 3 - 1 would work too, but the calculation +// becomes more costly. +const int16_t kDirectionalIntraPredictorDerivative[44] = { + // Approx angle + 1023, 0, // 3, ... + 547, // 6, ... + 372, 0, 0, // 9, ... + 273, // 14, ... + 215, 0, // 17, ... + 178, // 20, ... + 151, 0, // 23, ... (113 & 203 are base angles) + 132, // 26, ... + 116, 0, // 29, ... + 102, 0, // 32, ... + 90, // 36, ... + 80, 0, // 39, ... + 71, // 42, ... + 64, 0, // 45, ... 
(45 & 135 are base angles)
+    57,      // 48, ...
+    51, 0,   // 51, ...
+    45, 0,   // 54, ...
+    40,      // 58, ...
+    35, 0,   // 61, ...
+    31,      // 64, ...
+    27, 0,   // 67, ... (67 & 157 are base angles)
+    23,      // 70, ...
+    19, 0,   // 73, ...
+    15, 0,   // 76, ...
+    11, 0,   // 81, ...
+    7,       // 84, ...
+    3,       // 87, ...
+};
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+    {0, 1}, {2, 2}, {3, 3}};
+
+}  // namespace libgav1
diff --git a/src/utils/constants.h b/src/utils/constants.h
new file mode 100644
index 0000000..34cf56d
--- /dev/null
+++ b/src/utils/constants.h
@@ -0,0 +1,744 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end
+// (inclusive).
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
+
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+  kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+  kMaxThreads = 128
+#endif
+};  // anonymous enum
+
+enum {
+  kInvalidMvValue = -32768,
+  kCdfMaxProbability = 32768,
+  kBlockWidthCount = 5,
+  kMaxSegments = 8,
+  kMinQuantizer = 0,
+  kMinLossyQuantizer = 1,
+  kMaxQuantizer = 255,
+  // Quantizer matrix is used only when level < 15.
+  kNumQuantizerLevelsForQuantizerMatrix = 15,
+  kFrameLfCount = 4,
+  kMaxLoopFilterValue = 63,
+  kNum4x4In64x64 = 256,
+  kMaxAngleDelta = 3,
+  kDirectionalIntraModes = 8,
+  kMaxSuperBlockSizeLog2 = 7,
+  kMinSuperBlockSizeLog2 = 6,
+  kGlobalMotionReadControl = 3,
+  kSuperResScaleNumerator = 8,
+  kBooleanSymbolCount = 2,
+  kRestorationTypeSymbolCount = 3,
+  kSgrProjParamsBits = 4,
+  kSgrProjPrecisionBits = 7,
+  // Padding on left and right side of a restoration block.
+  // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+  // source buffer 8-pixel aligned.
+  kRestorationHorizontalBorder = 4,
+  // Padding on top and bottom side of a restoration block.
+  kRestorationVerticalBorder = 2,
+  kCdefBorder = 2,             // Padding on each side of a cdef block.
+  kConvolveBorderLeftTop = 3,  // Left/top padding of a convolve block.
+  // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+  // but was increased to simplify the SIMD loads in
+  // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+  kConvolveBorderRight = 8,
+  kConvolveBorderBottom = 4,
+  kSubPixelTaps = 8,
+  kWienerFilterBits = 7,
+  kWienerFilterTaps = 7,
+  kMaxPaletteSize = 8,
+  kMinPaletteSize = 2,
+  kMaxPaletteSquare = 64,
+  kBorderPixels = 64,
+  // The final blending process for film grain needs room to overwrite and read
+  // with SIMD instructions. The maximum overwrite is 7 pixels, but the border
+  // is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+  // subsampled chroma borders are 16-aligned.
+  kBorderPixelsFilmGrain = 32,
+  // These constants are the minimum left, right, top, and bottom border sizes
+  // in pixels as an extension of the frame boundary. The minimum border sizes
+  // are derived from the following requirements:
+  // - Warp_C() may read up to 13 pixels before or after a row.
+  // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+  //   pixels after a row, but the value of the last read pixel is not used.
+  // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+  //   13 pixels below the bottom row.
+  kMinLeftBorderPixels = 13,
+  kMinRightBorderPixels = 13,
+  kMinTopBorderPixels = 13,
+  kMinBottomBorderPixels = 13,
+  kWarpedModelPrecisionBits = 16,
+  kMaxRefMvStackSize = 8,
+  kMaxLeastSquaresSamples = 8,
+  kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+  // process 2 or 4 elements together, so we pad the corresponding buffers to
+  // size 20.
+  kMaxTemporalMvCandidatesWithPadding = 20,
+  kMaxSuperBlockSizeInPixels = 128,
+  kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+  kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+  kNum4x4InLoopFilterUnit = 16,
+  kNum4x4InLoopRestorationUnit = 16,
+  kProjectionMvClamp = (1 << 14) - 1,  // == 16383
+  kProjectionMvMaxHorizontalOffset = 8,
+  kCdefUnitSize = 64,
+  kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+  kRestorationUnitOffset = 8,
+  // Loop restoration's processing unit size is fixed as 64x64.
+  kRestorationUnitHeight = 64,
+  kRestorationUnitWidth = 256,
+  kRestorationUnitHeightWithBorders =
+      kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+  kRestorationUnitWidthWithBorders =
+      kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+  kSuperResFilterBits = 6,
+  kSuperResFilterShifts = 1 << kSuperResFilterBits,
+  kSuperResFilterTaps = 8,
+  kSuperResScaleBits = 14,
+  kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+  kSuperResScaleMask = (1 << 14) - 1,
+  kSuperResHorizontalBorder = 4,
+  kSuperResVerticalBorder = 1,
+  // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels which will over-read up to 15 downscaled pixels in the end of each
+  // row. Set the padding to 16 for alignment purposes.
+  kSuperResHorizontalPadding = 16,
+  // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases coeff
+  // values up to 7 bits. We could combine them and use kFilterBits only.
+  kFilterBits = 7,
+  // Sub pixel is used in AV1 to represent a pixel location that is not at
+  // integer position. Sub pixel is in 1/16 (1 << kSubPixelBits) unit of
+  // integer pixel. Sub pixel values are interpolated using adjacent integer
+  // pixel values. The interpolation is a filtering process.
+  kSubPixelBits = 4,
+  kSubPixelMask = (1 << kSubPixelBits) - 1,
+  // Precision bits when computing inter prediction locations.
+  kScaleSubPixelBits = 10,
+  kWarpParamRoundingBits = 6,
+  // Number of fractional bits of lookup in divisor lookup table.
+  kDivisorLookupBits = 8,
+  // Number of fractional bits of entries in divisor lookup table.
+  kDivisorLookupPrecisionBits = 14,
+  // Number of phases used in warped filtering.
+ kWarpedPixelPrecisionShifts = 1 << 6, + kResidualPaddingVertical = 4, + kWedgeMaskMasterSize = 64, + kMaxFrameDistance = 31, + kReferenceFrameScalePrecision = 14, + kNumWienerCoefficients = 3, + kLoopFilterMaxModeDeltas = 2, + kMaxCdefStrengths = 8, + kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available. + kMaxTileColumns = 64, + kMaxTileRows = 64, + kMaxOperatingPoints = 32, + // There can be a maximum of 4 spatial layers and 8 temporal layers. + kMaxLayers = 32, + // The cache line size should ideally be queried at run time. 64 is a common + // cache line size of x86 CPUs. Web searches showed the cache line size of ARM + // CPUs is 32 or 64 bytes. So aligning to 64-byte boundary will work for all + // CPUs that we care about, even though it is excessive for some ARM + // CPUs. + // + // On Linux, the cache line size can be looked up with the command: + // getconf LEVEL1_DCACHE_LINESIZE + kCacheLineSize = 64, +}; // anonymous enum + +enum FrameType : uint8_t { + kFrameKey, + kFrameInter, + kFrameIntraOnly, + kFrameSwitch +}; + +enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV }; +enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 }; + +// The plane types, called luma and chroma in the spec. +enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes }; + +enum ReferenceFrameType : int8_t { + kReferenceFrameNone = -1, + kReferenceFrameIntra, + kReferenceFrameLast, + kReferenceFrameLast2, + kReferenceFrameLast3, + kReferenceFrameGolden, + kReferenceFrameBackward, + kReferenceFrameAlternate2, + kReferenceFrameAlternate, + kNumReferenceFrameTypes, + kNumInterReferenceFrameTypes = + EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate), + kNumForwardReferenceTypes = + EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden), + kNumBackwardReferenceTypes = + EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate) +}; + +enum { + // Unidirectional compound reference pairs that are signaled explicitly: + // {kReferenceFrameLast, kReferenceFrameLast2}, + // {kReferenceFrameLast, kReferenceFrameLast3}, + // {kReferenceFrameLast, kReferenceFrameGolden}, + // {kReferenceFrameBackward, kReferenceFrameAlternate} + kExplicitUnidirectionalCompoundReferences = 4, + // Other unidirectional compound reference pairs: + // {kReferenceFrameLast2, kReferenceFrameLast3}, + // {kReferenceFrameLast2, kReferenceFrameGolden}, + // {kReferenceFrameLast3, kReferenceFrameGolden}, + // {kReferenceFrameBackward, kReferenceFrameAlternate2}, + // {kReferenceFrameAlternate2, kReferenceFrameAlternate} + kUnidirectionalCompoundReferences = + kExplicitUnidirectionalCompoundReferences + 5, +}; // anonymous enum + +enum BlockSize : uint8_t { + kBlock4x4, + kBlock4x8, + kBlock4x16, + kBlock8x4, + kBlock8x8, + kBlock8x16, + kBlock8x32, + kBlock16x4, + kBlock16x8, + kBlock16x16, + kBlock16x32, + kBlock16x64, + kBlock32x8, + kBlock32x16, + kBlock32x32, + kBlock32x64, + kBlock64x16, + kBlock64x32, + kBlock64x64, + kBlock64x128, + kBlock128x64, + kBlock128x128, + kMaxBlockSizes, + kBlockInvalid +}; + +// Partition types. 
R: Recursive +// +// None Horizontal Vertical Split +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// Horizontal Horizontal Vertical Vertical +// with top with bottom with left with right +// split split split split +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// Horizontal4 Vertical4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum Partition : uint8_t { + kPartitionNone, + kPartitionHorizontal, + kPartitionVertical, + kPartitionSplit, + kPartitionHorizontalWithTopSplit, + kPartitionHorizontalWithBottomSplit, + kPartitionVerticalWithLeftSplit, + kPartitionVerticalWithRightSplit, + kPartitionHorizontal4, + kPartitionVertical4 +}; +enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 }; + +enum PredictionMode : uint8_t { + // Intra prediction modes. + kPredictionModeDc, + kPredictionModeVertical, + kPredictionModeHorizontal, + kPredictionModeD45, + kPredictionModeD135, + kPredictionModeD113, + kPredictionModeD157, + kPredictionModeD203, + kPredictionModeD67, + kPredictionModeSmooth, + kPredictionModeSmoothVertical, + kPredictionModeSmoothHorizontal, + kPredictionModePaeth, + kPredictionModeChromaFromLuma, + // Single inter prediction modes. + kPredictionModeNearestMv, + kPredictionModeNearMv, + kPredictionModeGlobalMv, + kPredictionModeNewMv, + // Compound inter prediction modes. + kPredictionModeNearestNearestMv, + kPredictionModeNearNearMv, + kPredictionModeNearestNewMv, + kPredictionModeNewNearestMv, + kPredictionModeNearNewMv, + kPredictionModeNewNearMv, + kPredictionModeGlobalGlobalMv, + kPredictionModeNewNewMv, + kNumPredictionModes, + kNumCompoundInterPredictionModes = + EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv), + kIntraPredictionModesY = + EnumRangeLength(kPredictionModeDc, kPredictionModePaeth), + kIntraPredictionModesUV = + EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma), + kPredictionModeInvalid = 255 +}; + +enum InterIntraMode : uint8_t { + kInterIntraModeDc, + kInterIntraModeVertical, + kInterIntraModeHorizontal, + kInterIntraModeSmooth, + kNumInterIntraModes +}; + +enum MotionMode : uint8_t { + kMotionModeSimple, + kMotionModeObmc, // Overlapped block motion compensation. + kMotionModeLocalWarp, + kNumMotionModes +}; + +enum TxMode : uint8_t { + kTxModeOnly4x4, + kTxModeLargest, + kTxModeSelect, + kNumTxModes +}; + +// These enums are named as kType1Type2 where Type1 is the transform type for +// the rows and Type2 is the transform type for the columns. 
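+// For example, kTransformTypeAdstDct applies an ADST to the rows and a DCT to
+// the columns.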
+enum TransformType : uint8_t { + kTransformTypeDctDct, + kTransformTypeAdstDct, + kTransformTypeDctAdst, + kTransformTypeAdstAdst, + kTransformTypeFlipadstDct, + kTransformTypeDctFlipadst, + kTransformTypeFlipadstFlipadst, + kTransformTypeAdstFlipadst, + kTransformTypeFlipadstAdst, + kTransformTypeIdentityIdentity, + kTransformTypeIdentityDct, + kTransformTypeDctIdentity, + kTransformTypeIdentityAdst, + kTransformTypeAdstIdentity, + kTransformTypeIdentityFlipadst, + kTransformTypeFlipadstIdentity, + kNumTransformTypes +}; + +constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct, + kTransformTypeFlipadstAdst, + kTransformTypeFlipadstIdentity, + kTransformTypeFlipadstFlipadst); +constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst, + kTransformTypeAdstFlipadst, + kTransformTypeIdentityFlipadst, + kTransformTypeFlipadstFlipadst); + +enum TransformSize : uint8_t { + kTransformSize4x4, + kTransformSize4x8, + kTransformSize4x16, + kTransformSize8x4, + kTransformSize8x8, + kTransformSize8x16, + kTransformSize8x32, + kTransformSize16x4, + kTransformSize16x8, + kTransformSize16x16, + kTransformSize16x32, + kTransformSize16x64, + kTransformSize32x8, + kTransformSize32x16, + kTransformSize32x32, + kTransformSize32x64, + kTransformSize64x16, + kTransformSize64x32, + kTransformSize64x64, + kNumTransformSizes +}; + +enum TransformSet : uint8_t { + // DCT Only (1). + kTransformSetDctOnly, + // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical + // DCT (2) = Total (7). + kTransformSetIntra1, + // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5). + kTransformSetIntra2, + // All transforms = Total (16). + kTransformSetInter1, + // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical + // DCT (2) = Total (12). + kTransformSetInter2, + // DCT (1) + Identity (1) = Total (2). + kTransformSetInter3, + kNumTransformSets +}; + +enum TransformClass : uint8_t { + kTransformClass2D, + kTransformClassHorizontal, + kTransformClassVertical, + kNumTransformClasses +}; + +enum FilterIntraPredictor : uint8_t { + kFilterIntraPredictorDc, + kFilterIntraPredictorVertical, + kFilterIntraPredictorHorizontal, + kFilterIntraPredictorD157, + kFilterIntraPredictorPaeth, + kNumFilterIntraPredictors +}; + +enum ObmcDirection : uint8_t { + kObmcDirectionVertical, + kObmcDirectionHorizontal, + kNumObmcDirections +}; + +// In AV1 the name of the filter refers to the direction of filter application. +// Horizontal refers to the column edge and vertical the row edge. +enum LoopFilterType : uint8_t { + kLoopFilterTypeVertical, + kLoopFilterTypeHorizontal, + kNumLoopFilterTypes +}; + +enum LoopFilterTransformSizeId : uint8_t { + kLoopFilterTransformSizeId4x4, + kLoopFilterTransformSizeId8x8, + kLoopFilterTransformSizeId16x16, + kNumLoopFilterTransformSizeIds +}; + +enum LoopRestorationType : uint8_t { + kLoopRestorationTypeNone, + kLoopRestorationTypeSwitchable, + kLoopRestorationTypeWiener, + kLoopRestorationTypeSgrProj, // self guided projection filter. 
+ kNumLoopRestorationTypes +}; + +enum CompoundReferenceType : uint8_t { + kCompoundReferenceUnidirectional, + kCompoundReferenceBidirectional, + kNumCompoundReferenceTypes +}; + +enum CompoundPredictionType : uint8_t { + kCompoundPredictionTypeWedge, + kCompoundPredictionTypeDiffWeighted, + kCompoundPredictionTypeAverage, + kCompoundPredictionTypeIntra, + kCompoundPredictionTypeDistance, + kNumCompoundPredictionTypes, + // Number of compound prediction types that are explicitly signaled in the + // bitstream (in the compound_type syntax element). + kNumExplicitCompoundPredictionTypes = 2 +}; + +enum InterpolationFilter : uint8_t { + kInterpolationFilterEightTap, + kInterpolationFilterEightTapSmooth, + kInterpolationFilterEightTapSharp, + kInterpolationFilterBilinear, + kInterpolationFilterSwitchable, + kNumInterpolationFilters, + // Number of interpolation filters that can be explicitly signaled in the + // compressed headers (when the uncompressed headers allow switchable + // interpolation filters) of the bitstream. + kNumExplicitInterpolationFilters = EnumRangeLength( + kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp) +}; + +enum MvJointType : uint8_t { + kMvJointTypeZero, + kMvJointTypeHorizontalNonZeroVerticalZero, + kMvJointTypeHorizontalZeroVerticalNonZero, + kMvJointTypeNonZero, + kNumMvJointTypes +}; + +enum ObuType : int8_t { + kObuInvalid = -1, + kObuSequenceHeader = 1, + kObuTemporalDelimiter = 2, + kObuFrameHeader = 3, + kObuTileGroup = 4, + kObuMetadata = 5, + kObuFrame = 6, + kObuRedundantFrameHeader = 7, + kObuTileList = 8, + kObuPadding = 15, +}; + +//------------------------------------------------------------------------------ +// ToString() +// +// These functions are meant to be used only in debug logging and within tests. +// They are defined inline to avoid including the strings in the release +// library when logging is disabled; unreferenced functions will not be added to +// any object file in that case. 
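+//
+// An illustrative call site (assuming the LIBGAV1_DLOG debug-logging macro
+// used elsewhere in this library):
+//   LIBGAV1_DLOG(INFO, "block size: %s", ToString(block_size));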
+ +inline const char* ToString(const BlockSize size) { + switch (size) { + case kBlock4x4: + return "kBlock4x4"; + case kBlock4x8: + return "kBlock4x8"; + case kBlock4x16: + return "kBlock4x16"; + case kBlock8x4: + return "kBlock8x4"; + case kBlock8x8: + return "kBlock8x8"; + case kBlock8x16: + return "kBlock8x16"; + case kBlock8x32: + return "kBlock8x32"; + case kBlock16x4: + return "kBlock16x4"; + case kBlock16x8: + return "kBlock16x8"; + case kBlock16x16: + return "kBlock16x16"; + case kBlock16x32: + return "kBlock16x32"; + case kBlock16x64: + return "kBlock16x64"; + case kBlock32x8: + return "kBlock32x8"; + case kBlock32x16: + return "kBlock32x16"; + case kBlock32x32: + return "kBlock32x32"; + case kBlock32x64: + return "kBlock32x64"; + case kBlock64x16: + return "kBlock64x16"; + case kBlock64x32: + return "kBlock64x32"; + case kBlock64x64: + return "kBlock64x64"; + case kBlock64x128: + return "kBlock64x128"; + case kBlock128x64: + return "kBlock128x64"; + case kBlock128x128: + return "kBlock128x128"; + case kMaxBlockSizes: + return "kMaxBlockSizes"; + case kBlockInvalid: + return "kBlockInvalid"; + } + abort(); +} + +inline const char* ToString(const InterIntraMode mode) { + switch (mode) { + case kInterIntraModeDc: + return "kInterIntraModeDc"; + case kInterIntraModeVertical: + return "kInterIntraModeVertical"; + case kInterIntraModeHorizontal: + return "kInterIntraModeHorizontal"; + case kInterIntraModeSmooth: + return "kInterIntraModeSmooth"; + case kNumInterIntraModes: + return "kNumInterIntraModes"; + } + abort(); +} + +inline const char* ToString(const ObmcDirection direction) { + switch (direction) { + case kObmcDirectionVertical: + return "kObmcDirectionVertical"; + case kObmcDirectionHorizontal: + return "kObmcDirectionHorizontal"; + case kNumObmcDirections: + return "kNumObmcDirections"; + } + abort(); +} + +inline const char* ToString(const LoopRestorationType type) { + switch (type) { + case kLoopRestorationTypeNone: + return "kLoopRestorationTypeNone"; + case kLoopRestorationTypeSwitchable: + return "kLoopRestorationTypeSwitchable"; + case kLoopRestorationTypeWiener: + return "kLoopRestorationTypeWiener"; + case kLoopRestorationTypeSgrProj: + return "kLoopRestorationTypeSgrProj"; + case kNumLoopRestorationTypes: + return "kNumLoopRestorationTypes"; + } + abort(); +} + +inline const char* ToString(const TransformType type) { + switch (type) { + case kTransformTypeDctDct: + return "kTransformTypeDctDct"; + case kTransformTypeAdstDct: + return "kTransformTypeAdstDct"; + case kTransformTypeDctAdst: + return "kTransformTypeDctAdst"; + case kTransformTypeAdstAdst: + return "kTransformTypeAdstAdst"; + case kTransformTypeFlipadstDct: + return "kTransformTypeFlipadstDct"; + case kTransformTypeDctFlipadst: + return "kTransformTypeDctFlipadst"; + case kTransformTypeFlipadstFlipadst: + return "kTransformTypeFlipadstFlipadst"; + case kTransformTypeAdstFlipadst: + return "kTransformTypeAdstFlipadst"; + case kTransformTypeFlipadstAdst: + return "kTransformTypeFlipadstAdst"; + case kTransformTypeIdentityIdentity: + return "kTransformTypeIdentityIdentity"; + case kTransformTypeIdentityDct: + return "kTransformTypeIdentityDct"; + case kTransformTypeDctIdentity: + return "kTransformTypeDctIdentity"; + case kTransformTypeIdentityAdst: + return "kTransformTypeIdentityAdst"; + case kTransformTypeAdstIdentity: + return "kTransformTypeAdstIdentity"; + case kTransformTypeIdentityFlipadst: + return "kTransformTypeIdentityFlipadst"; + case kTransformTypeFlipadstIdentity: + return 
"kTransformTypeFlipadstIdentity"; + // case to quiet compiler + case kNumTransformTypes: + return "kNumTransformTypes"; + } + abort(); +} + +//------------------------------------------------------------------------------ + +extern const uint8_t k4x4WidthLog2[kMaxBlockSizes]; + +extern const uint8_t k4x4HeightLog2[kMaxBlockSizes]; + +extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes]; + +extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes]; + +extern const uint8_t kBlockWidthPixels[kMaxBlockSizes]; + +extern const uint8_t kBlockHeightPixels[kMaxBlockSizes]; + +extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes]; + +extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2]; + +extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1]; + +extern const uint8_t kTransformWidth[kNumTransformSizes]; + +extern const uint8_t kTransformHeight[kNumTransformSizes]; + +extern const uint8_t kTransformWidth4x4[kNumTransformSizes]; + +extern const uint8_t kTransformHeight4x4[kNumTransformSizes]; + +extern const uint8_t kTransformWidthLog2[kNumTransformSizes]; + +extern const uint8_t kTransformHeightLog2[kNumTransformSizes]; + +extern const TransformSize kSplitTransformSize[kNumTransformSizes]; + +// Square transform of size min(w,h). +extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes]; + +// Square transform of size max(w,h). +extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes]; + +extern const uint8_t kNumTransformTypesInSet[kNumTransformSets]; + +extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4]; + +extern const int8_t kSgrProjMultiplierMin[2]; + +extern const int8_t kSgrProjMultiplierMax[2]; + +extern const int8_t kWienerTapsMin[3]; + +extern const int8_t kWienerTapsMax[3]; + +extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts] + [kSuperResFilterTaps]; + +// An int8_t version of the kWarpedFilters array. +// Note: The array could be removed with a performance penalty. +extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8]; + +extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8]; + +extern const int8_t kHalfSubPixelFilters[6][16][8]; + +extern const uint8_t kAbsHalfSubPixelFilters[6][16][8]; + +extern const int16_t kDirectionalIntraPredictorDerivative[44]; + +extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes]; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_CONSTANTS_H_ diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc new file mode 100644 index 0000000..a6b7057 --- /dev/null +++ b/src/utils/cpu.cc @@ -0,0 +1,84 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "src/utils/cpu.h" + +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#include +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) +#include // _xgetbv +#include +#endif + +namespace libgav1 { + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) +namespace { + +#if defined(__GNUC__) +void CpuId(int leaf, uint32_t info[4]) { + __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]); +} + +uint64_t Xgetbv() { + const uint32_t ecx = 0; // ecx specifies the extended control register + uint32_t eax; + uint32_t edx; + __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx)); + return (static_cast(edx) << 32) | eax; +} +#else // _MSC_VER +void CpuId(int leaf, uint32_t info[4]) { + __cpuidex(reinterpret_cast(info), leaf, 0 /*ecx=subleaf*/); +} + +uint64_t Xgetbv() { return _xgetbv(0); } +#endif // __GNUC__ + +} // namespace + +uint32_t GetCpuInfo() { + uint32_t info[4]; + + // Get the highest feature value cpuid supports + CpuId(0, info); + const int max_cpuid_value = info[0]; + if (max_cpuid_value < 1) return 0; + + CpuId(1, info); + uint32_t features = 0; + if ((info[3] & (1 << 26)) != 0) features |= kSSE2; + if ((info[2] & (1 << 9)) != 0) features |= kSSSE3; + if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1; + + // Bits 27 (OSXSAVE) & 28 (256-bit AVX) + if ((info[2] & (3 << 27)) == (3 << 27)) { + // XMM state and YMM state enabled by the OS + if ((Xgetbv() & 0x6) == 0x6) { + features |= kAVX; + if (max_cpuid_value >= 7) { + CpuId(7, info); + if ((info[1] & (1 << 5)) != 0) features |= kAVX2; + } + } + } + + return features; +} +#else +uint32_t GetCpuInfo() { return 0; } +#endif // x86 || x86_64 + +} // namespace libgav1 diff --git a/src/utils/cpu.h b/src/utils/cpu.h new file mode 100644 index 0000000..630b251 --- /dev/null +++ b/src/utils/cpu.h @@ -0,0 +1,107 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_CPU_H_ +#define LIBGAV1_SRC_UTILS_CPU_H_ + +#include + +namespace libgav1 { + +#if defined(__i386__) || defined(__x86_64__) +#define LIBGAV1_X86 +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) +#define LIBGAV1_X86 +#define LIBGAV1_X86_MSVC +#endif + +#if defined(LIBGAV1_X86) + +#if !defined(LIBGAV1_ENABLE_SSE4_1) +#define LIBGAV1_ENABLE_SSE4_1 1 +#endif + +#if LIBGAV1_ENABLE_SSE4_1 +#if !defined(LIBGAV1_ENABLE_AVX2) +#define LIBGAV1_ENABLE_AVX2 1 +#endif // !defined(LIBGAV1_ENABLE_AVX2) +#else // !LIBGAV1_ENABLE_SSE4_1 +// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components. +#undef LIBGAV1_ENABLE_AVX2 +#define LIBGAV1_ENABLE_AVX2 0 +#endif // LIBGAV1_ENABLE_SSE4_1 + +#else // !LIBGAV1_X86 + +#undef LIBGAV1_ENABLE_AVX2 +#define LIBGAV1_ENABLE_AVX2 0 +#undef LIBGAV1_ENABLE_SSE4_1 +#define LIBGAV1_ENABLE_SSE4_1 0 + +#endif // LIBGAV1_X86 + +// For x86 LIBGAV1_TARGETING_* indicate the source being built is targeting +// (at least) that instruction set. 
This prevents disabling other instruction +// sets if the current instruction set isn't a global target, e.g., building +// *_avx2.cc w/-mavx2, but the remaining files without the flag. +#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__) +#define LIBGAV1_TARGETING_AVX2 1 +#else +#define LIBGAV1_TARGETING_AVX2 0 +#endif + +// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but there +// is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be +// enabled in dsp.h to compensate for this. +#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)) +#define LIBGAV1_TARGETING_SSE4_1 1 +#else +#define LIBGAV1_TARGETING_SSE4_1 0 +#endif + +#undef LIBGAV1_X86 + +#if !defined(LIBGAV1_ENABLE_NEON) +// TODO(jzern): add support for _M_ARM64. +#if defined(__ARM_NEON__) || defined(__aarch64__) || \ + (defined(_MSC_VER) && defined(_M_ARM)) +#define LIBGAV1_ENABLE_NEON 1 +#else +#define LIBGAV1_ENABLE_NEON 0 +#endif +#endif // !defined(LIBGAV1_ENABLE_NEON) + +enum CpuFeatures : uint8_t { + kSSE2 = 1 << 0, +#define LIBGAV1_CPU_SSE2 (1 << 0) + kSSSE3 = 1 << 1, +#define LIBGAV1_CPU_SSSE3 (1 << 1) + kSSE4_1 = 1 << 2, +#define LIBGAV1_CPU_SSE4_1 (1 << 2) + kAVX = 1 << 3, +#define LIBGAV1_CPU_AVX (1 << 3) + kAVX2 = 1 << 4, +#define LIBGAV1_CPU_AVX2 (1 << 4) + kNEON = 1 << 5, +#define LIBGAV1_CPU_NEON (1 << 5) +}; + +// Returns a bit-wise OR of CpuFeatures supported by this platform. +uint32_t GetCpuInfo(); + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_CPU_H_ diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h new file mode 100644 index 0000000..b51345a --- /dev/null +++ b/src/utils/dynamic_buffer.h @@ -0,0 +1,82 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_ +#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_ + +#include +#include + +#include "src/utils/memory.h" + +namespace libgav1 { + +template +class DynamicBuffer { + public: + T* get() { return buffer_.get(); } + const T* get() const { return buffer_.get(); } + + // Resizes the buffer so that it can hold at least |size| elements. Existing + // contents will be destroyed when resizing to a larger size. + // + // Returns true on success. If Resize() returns false, then subsequent calls + // to get() will return nullptr. + bool Resize(size_t size) { + if (size <= size_) return true; + buffer_.reset(new (std::nothrow) T[size]); + if (buffer_ == nullptr) { + size_ = 0; + return false; + } + size_ = size; + return true; + } + + private: + std::unique_ptr buffer_; + size_t size_ = 0; +}; + +template +class AlignedDynamicBuffer { + public: + T* get() { return buffer_.get(); } + + // Resizes the buffer so that it can hold at least |size| elements. Existing + // contents will be destroyed when resizing to a larger size. + // + // Returns true on success. If Resize() returns false, then subsequent calls + // to get() will return nullptr. 
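+  //
+  // Illustrative usage (hypothetical caller, not part of this header):
+  //   AlignedDynamicBuffer<uint16_t, 16> buffer;
+  //   if (!buffer.Resize(block_size)) return false;  // allocation failed
+  //   uint16_t* const data = buffer.get();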
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+ private:
+  AlignedUniquePtr<T> buffer_;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644
index 0000000..bf21199
--- /dev/null
+++ b/src/utils/entropy_decoder.cc
@@ -0,0 +1,1117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. This function is monotonically
+// decreasing as the value of index increases (note that the |cdf| array is
+// sorted in decreasing order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+                  int index, int symbol_count) {
+  return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+         (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
+
+void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
+  const uint16_t count = cdf[symbol_count];
+  // rate is computed in the spec as:
+  //   3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+  // In this case cdf[N] is |count|.
+  // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+  // symbol_count > 3. So the equation becomes:
+  //   4 + (count > 15) + (count > 31) + (symbol_count > 3).
+  // Note that the largest value for count is 32 (it is not incremented beyond
+  // 32). So using that information:
+  //   count >> 4 is 0 for count from 0 to 15.
+  //   count >> 4 is 1 for count from 16 to 31.
+  //   count >> 4 is 2 for count == 32.
+  // Now, the equation becomes:
+  //   4 + (count >> 4) + (symbol_count > 3).
+  // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+  // with bitwise or:
+  //   (4 | (count >> 4)) + (symbol_count > 3).
+  // but using addition will allow the compiler to eliminate an operation when
+  // symbol_count is known and this function is inlined.
+  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+  // Hints for further optimizations:
+  //
+  // 1. clang can vectorize this for loop with width 4, even though the loop
+  // contains an if-else statement. Therefore, it may be advantageous to use
+  // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+  // (a multiple of 4 that's not too small).
+  //
+  // 2. The for loop can be rewritten in the following form, which would enable
+  // clang to vectorize the loop with width 8:
+  //
+  //   const int rounding = (1 << rate) - 1;
+  //   for (int i = 0; i < symbol_count - 1; ++i) {
+  //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+  //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+  //   }
+  //
+  // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+  // integer arithmetic. The result of the unsigned subtraction is cast to a
+  // signed integer and right-shifted. This requires the right shift of a
+  // signed integer be an arithmetic shift, which is true for clang, gcc, and
+  // Visual C++.
+  assert(symbol_count - 1 > 0);
+  int i = 0;
+  do {
+    if (i < symbol) {
+      cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+    } else {
+      cdf[i] -= cdf[i] >> rate;
+    }
+  } while (++i < symbol_count - 1);
+  cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     if (i < symbol) {
+//       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+//     } else {
+//       cdf[i] -= cdf[i] >> rate;
+//     }
+//   }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+//     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+//   }
+//
+// or:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+//     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+//   }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+//   cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1.
When symbol_count is 8 or 16, the vectorized code updates the first +// symbol_count elements in the array. +// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in +// the cdf array. Since an invalid CDF value is written into cdf[7], the +// count in cdf[7] needs to be fixed up after the vectorized code. + +void UpdateCdf5(uint16_t* const cdf, const int symbol) { + uint16x4_t cdf_vec = vld1_u16(cdf); + const uint16_t count = cdf[5]; + const int rate = (count >> 4) + 5; + const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability); + const uint16x4_t index = vcreate_u16(0x0003000200010000); + const uint16x4_t symbol_vec = vdup_n_u16(symbol); + const uint16x4_t mask = vcge_u16(index, symbol_vec); + // i < symbol: 32768, i >= symbol: 65535. + const uint16x4_t a = vorr_u16(mask, cdf_max_probability); + // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf. + const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec)); + // i < symbol: cdf - 0, i >= symbol: cdf - 65535. + const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask); + const int16x4_t negative_rate = vdup_n_s16(-rate); + // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate. + const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate)); + // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate). + // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate). + cdf_vec = vadd_u16(cdf_offset, delta); + vst1_u16(cdf, cdf_vec); + cdf[5] = count + static_cast(count < 32); +} + +// This version works for |symbol_count| = 7, 8, or 9. +// See UpdateCdf5 for implementation details. +template +void UpdateCdf7To9(uint16_t* const cdf, const int symbol) { + static_assert(symbol_count >= 7 && symbol_count <= 9, ""); + uint16x8_t cdf_vec = vld1q_u16(cdf); + const uint16_t count = cdf[symbol_count]; + const int rate = (count >> 4) + 5; + const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability); + const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)); + const uint16x8_t symbol_vec = vdupq_n_u16(symbol); + const uint16x8_t mask = vcgeq_u16(index, symbol_vec); + const uint16x8_t a = vorrq_u16(mask, cdf_max_probability); + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec)); + const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask); + const int16x8_t negative_rate = vdupq_n_s16(-rate); + const uint16x8_t delta = + vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf, cdf_vec); + cdf[symbol_count] = count + static_cast(count < 32); +} + +void UpdateCdf7(uint16_t* const cdf, const int symbol) { + UpdateCdf7To9<7>(cdf, symbol); +} + +void UpdateCdf8(uint16_t* const cdf, const int symbol) { + UpdateCdf7To9<8>(cdf, symbol); +} + +void UpdateCdf9(uint16_t* const cdf, const int symbol) { + UpdateCdf7To9<9>(cdf, symbol); +} + +// See UpdateCdf5 for implementation details. 
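+// Scalar model of one vector lane of the updates above (an illustrative
+// sketch matching the comments in UpdateCdf5; kCdfMaxProbability is 32768):
+//   const uint16_t mask = (i >= symbol) ? 0xffff : 0;
+//   const uint16_t a = mask | kCdfMaxProbability;  // 32768 or 65535
+//   const int16_t delta = static_cast<int16_t>(a - cdf[i]) >> rate;
+//   cdf[i] = static_cast<uint16_t>((cdf[i] - mask) + delta);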
+void UpdateCdf11(uint16_t* const cdf, const int symbol) { + uint16x8_t cdf_vec = vld1q_u16(cdf + 2); + const uint16_t count = cdf[11]; + cdf[11] = count + static_cast(count < 32); + const int rate = (count >> 4) + 5; + if (symbol > 1) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability); + const uint16x8_t symbol_vec = vdupq_n_u16(symbol); + const int16x8_t negative_rate = vdupq_n_s16(-rate); + const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002), + vcreate_u16(0x0009000800070006)); + const uint16x8_t mask = vcgeq_u16(index, symbol_vec); + const uint16x8_t a = vorrq_u16(mask, cdf_max_probability); + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec)); + const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask); + const uint16x8_t delta = + vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf + 2, cdf_vec); + } else { + if (symbol != 0) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] -= cdf[1] >> rate; + } else { + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + } + const int16x8_t negative_rate = vdupq_n_s16(-rate); + const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate); + cdf_vec = vsubq_u16(cdf_vec, delta); + vst1q_u16(cdf + 2, cdf_vec); + } +} + +// See UpdateCdf5 for implementation details. +void UpdateCdf13(uint16_t* const cdf, const int symbol) { + uint16x8_t cdf_vec0 = vld1q_u16(cdf); + uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4); + const uint16_t count = cdf[13]; + const int rate = (count >> 4) + 5; + const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability); + const uint16x8_t symbol_vec = vdupq_n_u16(symbol); + const int16x8_t negative_rate = vdupq_n_s16(-rate); + + uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)); + uint16x8_t mask = vcgeq_u16(index, symbol_vec); + uint16x8_t a = vorrq_u16(mask, cdf_max_probability); + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0)); + uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask); + uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec0 = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf, cdf_vec0); + + index = vcombine_u16(vcreate_u16(0x0007000600050004), + vcreate_u16(0x000b000a00090008)); + mask = vcgeq_u16(index, symbol_vec); + a = vorrq_u16(mask, cdf_max_probability); + diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1)); + cdf_offset = vsubq_u16(cdf_vec1, mask); + delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec1 = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf + 4, cdf_vec1); + + cdf[13] = count + static_cast(count < 32); +} + +// See UpdateCdf5 for implementation details. 
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+
+  cdf_vec = vld1q_u16(cdf + 8);
+  index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+                       vcreate_u16(0x000f000e000d000c));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  cdf_offset = vsubq_u16(cdf_vec, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 8, cdf_vec);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec = LoadLo8(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+  // i >= symbol.
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
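+// Note the SSE2 index vectors start at 1 and use the signed compare
+// _mm_cmpgt_epi16, so "index > symbol" here matches the unsigned
+// "index >= symbol" test (vcgeq_u16) used with 0-based index vectors in the
+// NEON code. For |symbol_count| = 5:
+//   NEON: index = {0, 1, 2, 3}, mask = (index >= symbol)
+//   SSE2: index = {1, 2, 3, 4}, mask = (index > symbol)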
+template +void UpdateCdf7To9(uint16_t* const cdf, const int symbol) { + static_assert(symbol_count >= 7 && symbol_count <= 9, ""); + __m128i cdf_vec = LoadUnaligned16(cdf); + const uint16_t count = cdf[symbol_count]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i index = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_add_epi16(cdf_offset, delta); + StoreUnaligned16(cdf, cdf_vec); + cdf[symbol_count] = count + static_cast(count < 32); +} + +void UpdateCdf7(uint16_t* const cdf, const int symbol) { + UpdateCdf7To9<7>(cdf, symbol); +} + +void UpdateCdf8(uint16_t* const cdf, const int symbol) { + UpdateCdf7To9<8>(cdf, symbol); +} + +void UpdateCdf9(uint16_t* const cdf, const int symbol) { + UpdateCdf7To9<9>(cdf, symbol); +} + +// See UpdateCdf5 for implementation details. +void UpdateCdf11(uint16_t* const cdf, const int symbol) { + __m128i cdf_vec = LoadUnaligned16(cdf + 2); + const uint16_t count = cdf[11]; + cdf[11] = count + static_cast(count < 32); + const int rate = (count >> 4) + 5; + if (symbol > 1) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i index = + _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_add_epi16(cdf_offset, delta); + StoreUnaligned16(cdf + 2, cdf_vec); + } else { + if (symbol != 0) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] -= cdf[1] >> rate; + } else { + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + } + const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_sub_epi16(cdf_vec, delta); + StoreUnaligned16(cdf + 2, cdf_vec); + } +} + +// See UpdateCdf5 for implementation details. 
+void UpdateCdf13(uint16_t* const cdf, const int symbol) { + __m128i cdf_vec0 = LoadLo8(cdf); + __m128i cdf_vec1 = LoadUnaligned16(cdf + 4); + const uint16_t count = cdf[13]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + + const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec0); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec0 = _mm_add_epi16(cdf_offset, delta); + StoreLo8(cdf, cdf_vec0); + + const __m128i index1 = + _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005); + const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec); + const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability); + const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1); + const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1); + const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); + cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1); + StoreUnaligned16(cdf + 4, cdf_vec1); + + cdf[13] = count + static_cast(count < 32); +} + +void UpdateCdf16(uint16_t* const cdf, const int symbol) { + __m128i cdf_vec0 = LoadUnaligned16(cdf); + const uint16_t count = cdf[16]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + + const __m128i index = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec0); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec0 = _mm_add_epi16(cdf_offset, delta); + StoreUnaligned16(cdf, cdf_vec0); + + __m128i cdf_vec1 = LoadUnaligned16(cdf + 8); + const __m128i index1 = + _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009); + const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec); + const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability); + const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1); + const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1); + const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); + cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1); + StoreUnaligned16(cdf + 8, cdf_vec1); + + cdf[16] = count + static_cast(count < 32); +} + +#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + +void UpdateCdf5(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 5, symbol); +} + +void UpdateCdf7(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 7, symbol); +} + +void UpdateCdf8(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 8, symbol); +} + +void UpdateCdf9(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 9, symbol); +} + +void UpdateCdf11(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 11, symbol); +} + +void UpdateCdf13(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 13, symbol); +} + +void UpdateCdf16(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 16, symbol); +} + +#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 +#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON + +inline 
DaalaBitReader::WindowSize HostToBigEndian(
+    const DaalaBitReader::WindowSize x) {
+  static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+  return x;
+#endif
+#elif defined(_WIN32)
+  // Note Windows targets are assumed to be little endian.
+  return static_cast<DaalaBitReader::WindowSize>(
+      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<uint64_t>(x))
+                       : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif  // defined(__GNUC__)
+}
+
+}  // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int DaalaBitReader::kWindowSize;  // static.
+#endif
+
+DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
+                               bool allow_update_cdf)
+    : data_(data),
+      data_end_(data + size),
+      data_memcpy_end_((size >= sizeof(WindowSize))
+                           ? data + size - sizeof(WindowSize) + 1
+                           : data),
+      allow_update_cdf_(allow_update_cdf),
+      values_in_range_(kCdfMaxProbability) {
+  if (data_ < data_memcpy_end_) {
+    // This is a simplified version of PopulateBits() which loads 8 extra bits
+    // and skips the unnecessary shifts of value and window_diff_.
+    WindowSize value;
+    memcpy(&value, data_, sizeof(value));
+    data_ += sizeof(value);
+    window_diff_ = HostToBigEndian(value) ^ -1;
+    // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+    // used to restore the most significant 0 bit that would be present after
+    // PopulateBits() when we extract the first symbol value.
+    // As shown in Section 8.2.2 Initialization process for symbol decoder,
+    // which uses a fixed offset to read the symbol values, the most
+    // significant bit is always 0:
+    //   The variable numBits is set equal to Min( sz * 8, 15).
+    //   The variable buf is read using the f(numBits) parsing process.
+    //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+    //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+    bits_ = kWindowSize - 15;
+    return;
+  }
+  window_diff_ = 0;
+  bits_ = -15;
+  PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized
+// based on the following facts:
+//   * The probability is fixed at half. So some multiplications can be
+//     replaced with bit operations.
+//   * Symbol count is fixed at 2.
+int DaalaBitReader::ReadBit() {
+  const uint32_t curr =
+      ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  int bit = 1;
+  if (symbol_value >= curr) {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+    bit = 0;
+  } else {
+    values_in_range_ = curr;
+  }
+  NormalizeRange();
+  return bit;
+}
+
+int64_t DaalaBitReader::ReadLiteral(int num_bits) {
+  assert(num_bits <= 32);
+  assert(num_bits > 0);
+  uint32_t literal = 0;
+  int bit = num_bits - 1;
+  do {
+    // ARM can combine a shift operation with a constant number of bits with
+    // some other operations, such as the OR operation.
+    // Here is an ARM disassembly example:
+    //   orr w1, w0, w1, lsl #1
+    // which left shifts register w1 by 1 bit and ORs the shift result with
+    // register w0.
+ // The next 2 lines are equivalent to: + // literal |= static_cast(ReadBit()) << bit; + literal <<= 1; + literal |= static_cast(ReadBit()); + } while (--bit >= 0); + return literal; +} + +int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) { + const int symbol = ReadSymbolImpl(cdf, symbol_count); + if (allow_update_cdf_) { + UpdateCdf(cdf, symbol_count, symbol); + } + return symbol; +} + +bool DaalaBitReader::ReadSymbol(uint16_t* cdf) { + assert(cdf[1] == 0); + const bool symbol = ReadSymbolImpl(cdf[0]) != 0; + if (allow_update_cdf_) { + const uint16_t count = cdf[2]; + // rate is computed in the spec as: + // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2) + // In this case N is 2 and cdf[N] is |count|. So the equation becomes: + // 4 + (count > 15) + (count > 31) + // Note that the largest value for count is 32 (it is not incremented beyond + // 32). So using that information: + // count >> 4 is 0 for count from 0 to 15. + // count >> 4 is 1 for count from 16 to 31. + // count >> 4 is 2 for count == 32. + // Now, the equation becomes: + // 4 + (count >> 4). + // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced + // with bitwise or. So the final equation is: + // 4 | (count >> 4). + const int rate = 4 | (count >> 4); + if (symbol) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + } else { + cdf[0] -= cdf[0] >> rate; + } + cdf[2] += static_cast(count < 32); + } + return symbol; +} + +bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) { + return ReadSymbolImpl(cdf) != 0; +} + +template +int DaalaBitReader::ReadSymbol(uint16_t* const cdf) { + static_assert(symbol_count >= 3 && symbol_count <= 16, ""); + if (symbol_count == 3 || symbol_count == 4) { + return ReadSymbol3Or4(cdf, symbol_count); + } + int symbol; + if (symbol_count == 8) { + symbol = ReadSymbolImpl8(cdf); + } else if (symbol_count <= 13) { + symbol = ReadSymbolImpl(cdf, symbol_count); + } else { + symbol = ReadSymbolImplBinarySearch(cdf, symbol_count); + } + if (allow_update_cdf_) { + if (symbol_count == 5) { + UpdateCdf5(cdf, symbol); + } else if (symbol_count == 7) { + UpdateCdf7(cdf, symbol); + } else if (symbol_count == 8) { + UpdateCdf8(cdf, symbol); + } else if (symbol_count == 9) { + UpdateCdf9(cdf, symbol); + } else if (symbol_count == 11) { + UpdateCdf11(cdf, symbol); + } else if (symbol_count == 13) { + UpdateCdf13(cdf, symbol); + } else if (symbol_count == 16) { + UpdateCdf16(cdf, symbol); + } else { + UpdateCdf(cdf, symbol_count, symbol); + } + } + return symbol; +} + +int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf, + int symbol_count) { + assert(cdf[symbol_count - 1] == 0); + --symbol_count; + uint32_t curr = values_in_range_; + int symbol = -1; + uint32_t prev; + const auto symbol_value = static_cast(window_diff_ >> bits_); + uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count; + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. 
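+  // Illustrative restatement (not upstream code): each |curr| computed below
+  // equals a call to the ScaleCdf() helper defined above,
+  //   curr = ScaleCdf(values_in_range_ >> 8, cdf, ++symbol, symbol_count);
+  // with |delta| precomputing the kMinimumProbabilityPerSymbol term.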
+ do { + prev = curr; + curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) + + delta; + delta -= kMinimumProbabilityPerSymbol; + } while (symbol_value < curr); + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return symbol; +} + +int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf, + int symbol_count) { + assert(cdf[symbol_count - 1] == 0); + assert(symbol_count > 1 && symbol_count <= 16); + --symbol_count; + const auto symbol_value = static_cast(window_diff_ >> bits_); + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. Since the CDFs are sorted, we can use binary + // search to do this. Let |symbol| be the index of the first |cdf| array + // entry whose scaled cdf value is less than or equal to |symbol_value|. The + // binary search maintains the invariant: + // low <= symbol <= high + 1 + // and terminates when low == high + 1. + int low = 0; + int high = symbol_count - 1; + // The binary search maintains the invariants that |prev| is the scaled cdf + // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By + // convention, the scaled cdf value for -1 is values_in_range_.) When the + // binary search terminates, |prev| is the scaled cdf value for symbol - 1 + // and |curr| is the scaled cdf value for |symbol|. + uint32_t prev = values_in_range_; + uint32_t curr = 0; + const uint32_t values_in_range_shifted = values_in_range_ >> 8; + do { + const int mid = DivideBy2(low + high); + const uint32_t scaled_cdf = + ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count); + if (symbol_value < scaled_cdf) { + low = mid + 1; + prev = scaled_cdf; + } else { + high = mid - 1; + curr = scaled_cdf; + } + } while (low <= high); + assert(low == high + 1); + // At this point, |low| is the symbol that has been decoded. + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return low; +} + +int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) { + const auto symbol_value = static_cast(window_diff_ >> bits_); + const uint32_t curr = + (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) + + kMinimumProbabilityPerSymbol; + const int symbol = static_cast(symbol_value < curr); + if (symbol == 1) { + values_in_range_ = curr; + } else { + values_in_range_ -= curr; + window_diff_ -= static_cast(curr) << bits_; + } + NormalizeRange(); + return symbol; +} + +// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf +// calls inlined. +int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf, + const int symbol_count) { + assert(cdf[symbol_count - 1] == 0); + uint32_t curr = values_in_range_; + uint32_t prev; + const auto symbol_value = static_cast(window_diff_ >> bits_); + uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1); + const uint32_t values_in_range_shifted = values_in_range_ >> 8; + + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf| + // array. 
+ // + // The original code is: + // + // int symbol = -1; + // do { + // prev = curr; + // curr = + // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1) + // + delta; + // delta -= kMinimumProbabilityPerSymbol; + // } while (symbol_value < curr); + // if (allow_update_cdf_) { + // UpdateCdf(cdf, [3,4], symbol); + // } + // + // The do-while loop is unrolled with three or four iterations, and the + // UpdateCdf call is inlined and merged into the iterations. + int symbol = 0; + // Iteration 0. + prev = curr; + curr = + ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; + if (symbol_value >= curr) { + // symbol == 0. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0). + const uint16_t count = cdf[symbol_count]; + cdf[symbol_count] += static_cast(count < 32); + const int rate = (count >> 4) + 4 + static_cast(symbol_count == 4); + if (symbol_count == 4) { +#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON + // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM + // NEON code is slower. Consider using the C version if __arm__ is + // defined. + // 2. The ARM NEON code (compiled for arm64) is slightly slower on + // Samsung Galaxy S8+ (SM-G955FD). + uint16x4_t cdf_vec = vld1_u16(cdf); + const int16x4_t negative_rate = vdup_n_s16(-rate); + const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate); + cdf_vec = vsub_u16(cdf_vec, delta); + vst1_u16(cdf, cdf_vec); +#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + __m128i cdf_vec = LoadLo8(cdf); + const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_sub_epi16(cdf_vec, delta); + StoreLo8(cdf, cdf_vec); +#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + cdf[2] -= cdf[2] >> rate; +#endif + } else { // symbol_count == 3. + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + } + } + goto found; + } + ++symbol; + delta -= kMinimumProbabilityPerSymbol; + // Iteration 1. + prev = curr; + curr = + ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; + if (symbol_value >= curr) { + // symbol == 1. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1). + const uint16_t count = cdf[symbol_count]; + cdf[symbol_count] += static_cast(count < 32); + const int rate = (count >> 4) + 4 + static_cast(symbol_count == 4); + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] -= cdf[1] >> rate; + if (symbol_count == 4) cdf[2] -= cdf[2] >> rate; + } + goto found; + } + ++symbol; + if (symbol_count == 4) { + delta -= kMinimumProbabilityPerSymbol; + // Iteration 2. + prev = curr; + curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + + delta; + if (symbol_value >= curr) { + // symbol == 2. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2). + const uint16_t count = cdf[4]; + cdf[4] += static_cast(count < 32); + const int rate = (count >> 4) + 5; + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + cdf[2] -= cdf[2] >> rate; + } + goto found; + } + ++symbol; + } + // |delta| is 0 for the last iteration. + // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4). + prev = curr; + // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0. + curr = 0; + // symbol == [2,3]. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]). 
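+    // Here (4 | (count >> 4)) is equivalent to 4 + (count >> 4) because
+    // count >> 4 is at most 2; see the rate derivation in UpdateCdf() above.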
+ const uint16_t count = cdf[symbol_count]; + cdf[symbol_count] += static_cast(count < 32); + const int rate = (4 | (count >> 4)) + static_cast(symbol_count == 4); + if (symbol_count == 4) { +#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON + // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON + // code is a tiny bit slower. Consider using the C version if __arm__ is + // defined. + uint16x4_t cdf_vec = vld1_u16(cdf); + const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability); + const int16x4_t diff = + vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec)); + const int16x4_t negative_rate = vdup_n_s16(-rate); + const uint16x4_t delta = + vreinterpret_u16_s16(vshl_s16(diff, negative_rate)); + cdf_vec = vadd_u16(cdf_vec, delta); + vst1_u16(cdf, cdf_vec); + cdf[3] = 0; +#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + __m128i cdf_vec = LoadLo8(cdf); + const __m128i cdf_max_probability = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0); + const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_add_epi16(cdf_vec, delta); + StoreLo8(cdf, cdf_vec); + cdf[3] = 0; +#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate; +#endif + } else { // symbol_count == 3. + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + } + } +found: + // End of unrolled do-while loop. + + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return symbol; +} + +int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) { + assert(cdf[7] == 0); + uint32_t curr = values_in_range_; + uint32_t prev; + const auto symbol_value = static_cast(window_diff_ >> bits_); + uint32_t delta = kMinimumProbabilityPerSymbol * 7; + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. + // + // The original code is: + // + // int symbol = -1; + // do { + // prev = curr; + // curr = + // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) + // + delta; + // delta -= kMinimumProbabilityPerSymbol; + // } while (symbol_value < curr); + // + // The do-while loop is unrolled with eight iterations. + int symbol = 0; + +#define READ_SYMBOL_ITERATION \ + prev = curr; \ + curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \ + delta; \ + if (symbol_value >= curr) goto found; \ + ++symbol; \ + delta -= kMinimumProbabilityPerSymbol + + READ_SYMBOL_ITERATION; // Iteration 0. + READ_SYMBOL_ITERATION; // Iteration 1. + READ_SYMBOL_ITERATION; // Iteration 2. + READ_SYMBOL_ITERATION; // Iteration 3. + READ_SYMBOL_ITERATION; // Iteration 4. + READ_SYMBOL_ITERATION; // Iteration 5. + + // The last two iterations can be simplified, so they don't use the + // READ_SYMBOL_ITERATION macro. +#undef READ_SYMBOL_ITERATION + + // Iteration 6. + prev = curr; + curr = + (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; + if (symbol_value >= curr) goto found; // symbol == 6. + ++symbol; + // |delta| is 0 for the last iteration. + // Iteration 7. + prev = curr; + // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0. + curr = 0; + // symbol == 7. +found: + // End of unrolled do-while loop. 
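+  // At this point |prev| and |curr| bound the decoded symbol's subinterval,
+  // exactly as at the end of the do-while loop in ReadSymbolImpl().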
+ + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return symbol; +} + +void DaalaBitReader::PopulateBits() { + constexpr int kMaxCachedBits = kWindowSize - 16; +#if defined(__aarch64__) + // Fast path: read eight bytes and add the first six bytes to window_diff_. + // This fast path makes the following assumptions. + // 1. We assume that unaligned load of uint64_t is fast. + // 2. When there are enough bytes in data_, the for loop below reads 6 or 7 + // bytes depending on the value of bits_. This fast path always reads 6 + // bytes, which results in more calls to PopulateBits(). We assume that + // making more calls to a faster PopulateBits() is overall a win. + // NOTE: Although this fast path could also be used on x86_64, it hurts + // performance (measured on Lenovo ThinkStation P920 running Linux). (The + // reason is still unknown.) Therefore this fast path is only used on arm64. + static_assert(kWindowSize == 64, ""); + if (data_ < data_memcpy_end_) { + uint64_t value; + // arm64 supports unaligned loads, so this memcpy call is compiled to a + // single ldr instruction. + memcpy(&value, data_, sizeof(value)); + data_ += kMaxCachedBits >> 3; + value = HostToBigEndian(value) ^ -1; + value >>= kWindowSize - kMaxCachedBits; + window_diff_ = value | (window_diff_ << kMaxCachedBits); + bits_ += kMaxCachedBits; + return; + } +#endif + + const uint8_t* data = data_; + int bits = bits_; + WindowSize window_diff = window_diff_; + + int count = kWindowSize - 9 - (bits + 15); + // The fast path above, if compiled, would cause clang 8.0.7 to vectorize + // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7 + // iterations when WindowSize is 64 bits. So it is not profitable to + // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if + // the fast path above is not compiled. + +#ifdef __clang__ +#pragma clang loop vectorize(disable) interleave(disable) +#endif + for (; count >= 0 && data < data_end_; count -= 8) { + const uint8_t value = *data++ ^ -1; + window_diff = static_cast(value) | (window_diff << 8); + bits += 8; + } + assert(bits <= kMaxCachedBits); + if (data == data_end_) { + // Shift in some 1s. This is equivalent to providing fake 0 data bits. + window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1; + bits = kMaxCachedBits; + } + + data_ = data; + bits_ = bits; + window_diff_ = window_diff; +} + +void DaalaBitReader::NormalizeRange() { + const int bits_used = 15 ^ FloorLog2(values_in_range_); + bits_ -= bits_used; + values_in_range_ <<= bits_used; + if (bits_ < 0) PopulateBits(); +} + +// Explicit instantiations. 
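+// ReadSymbol<symbol_count> is defined in this file, so each |symbol_count|
+// used by the decoder must be instantiated here; the matching extern template
+// declarations are in entropy_decoder.h.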
+template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf); +template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf); + +} // namespace libgav1 diff --git a/src/utils/entropy_decoder.h b/src/utils/entropy_decoder.h new file mode 100644 index 0000000..c066b98 --- /dev/null +++ b/src/utils/entropy_decoder.h @@ -0,0 +1,123 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_ +#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_ + +#include +#include + +#include "src/utils/bit_reader.h" +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +class DaalaBitReader : public BitReader { + public: + // WindowSize must be an unsigned integer type with at least 32 bits. Use the + // largest type with fast arithmetic. size_t should meet these requirements. + using WindowSize = size_t; + + DaalaBitReader(const uint8_t* data, size_t size, bool allow_update_cdf); + ~DaalaBitReader() override = default; + + // Move only. + DaalaBitReader(DaalaBitReader&& rhs) noexcept; + DaalaBitReader& operator=(DaalaBitReader&& rhs) noexcept; + + int ReadBit() final; + int64_t ReadLiteral(int num_bits) override; + // ReadSymbol() calls for which the |symbol_count| is only known at runtime + // will use this variant. + int ReadSymbol(uint16_t* cdf, int symbol_count); + // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean + // symbols) will use this variant. + bool ReadSymbol(uint16_t* cdf); + bool ReadSymbolWithoutCdfUpdate(uint16_t cdf); + // Use either linear search or binary search for decoding the symbol depending + // on |symbol_count|. ReadSymbol calls for which the |symbol_count| is known + // at compile time will use this variant. + template + int ReadSymbol(uint16_t* cdf); + + private: + static constexpr int kWindowSize = static_cast(sizeof(WindowSize)) * 8; + static_assert(kWindowSize >= 32, ""); + + // Reads a symbol using the |cdf| table which contains the probabilities of + // each symbol. On a high level, this function does the following: + // 1) Scale the |cdf| values. + // 2) Find the index in the |cdf| array where the scaled CDF value crosses + // the modified |window_diff_| threshold. + // 3) That index is the symbol that has been decoded. 
+ // 4) Update |window_diff_| and |values_in_range_| based on the symbol that + // has been decoded. + inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count); + // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in + // the comment above. As of now, this function is called when |symbol_count| + // is greater than or equal to 14. + inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count); + // Specialized implementation of ReadSymbolImpl based on the fact that + // symbol_count == 2. + inline int ReadSymbolImpl(uint16_t cdf); + // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N. + LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count); + // ReadSymbolImplN is a specialization of ReadSymbolImpl for + // symbol_count == N. + LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf); + inline void PopulateBits(); + // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also + // calls PopulateBits() if necessary. + inline void NormalizeRange(); + + const uint8_t* data_; + const uint8_t* const data_end_; + // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes + // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the + // constructor, not PopulateBits(). + const uint8_t* const data_memcpy_end_; + const bool allow_update_cdf_; + // Number of cached bits of data in the current value. + int bits_; + // Number of values in the current range. Declared as uint32_t for better + // performance but only the lower 16 bits are used. + uint32_t values_in_range_; + // The difference between the high end of the current range and the coded + // value minus 1. The 16 bits above |bits_| of this variable are used to + // decode the next symbol. It is filled in whenever |bits_| is less than 0. + // Note this implementation differs from the spec as it trades the need to + // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(), + // which occurs less frequently. + WindowSize window_diff_; +}; + +extern template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf); + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_ diff --git a/src/utils/executor.cc b/src/utils/executor.cc new file mode 100644 index 0000000..6934057 --- /dev/null +++ b/src/utils/executor.cc @@ -0,0 +1,21 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+}  // namespace libgav1
diff --git a/src/utils/executor.h b/src/utils/executor.h
new file mode 100644
index 0000000..21abdf8
--- /dev/null
+++ b/src/utils/executor.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+  virtual ~Executor();
+
+  // Schedules the specified "callback" for execution in this executor.
+  // Depending on the subclass implementation, this may block in some
+  // situations.
+  virtual void Schedule(std::function<void()> callback) = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_EXECUTOR_H_
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
new file mode 100644
index 0000000..8b6ec4b
--- /dev/null
+++ b/src/utils/libgav1_utils.cmake
@@ -0,0 +1,72 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
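+
+# As with the other libgav1 *.cmake files, the variable checked below acts as
+# an include guard: the first inclusion defines it and later inclusions
+# return early.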
+ +if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_) + return() +endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ +set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1) + +list(APPEND libgav1_utils_sources + "${libgav1_source}/utils/array_2d.h" + "${libgav1_source}/utils/bit_mask_set.h" + "${libgav1_source}/utils/bit_reader.cc" + "${libgav1_source}/utils/bit_reader.h" + "${libgav1_source}/utils/block_parameters_holder.cc" + "${libgav1_source}/utils/block_parameters_holder.h" + "${libgav1_source}/utils/blocking_counter.h" + "${libgav1_source}/utils/common.h" + "${libgav1_source}/utils/compiler_attributes.h" + "${libgav1_source}/utils/constants.cc" + "${libgav1_source}/utils/constants.h" + "${libgav1_source}/utils/cpu.cc" + "${libgav1_source}/utils/cpu.h" + "${libgav1_source}/utils/dynamic_buffer.h" + "${libgav1_source}/utils/entropy_decoder.cc" + "${libgav1_source}/utils/entropy_decoder.h" + "${libgav1_source}/utils/executor.cc" + "${libgav1_source}/utils/executor.h" + "${libgav1_source}/utils/logging.cc" + "${libgav1_source}/utils/logging.h" + "${libgav1_source}/utils/memory.h" + "${libgav1_source}/utils/parameter_tree.cc" + "${libgav1_source}/utils/parameter_tree.h" + "${libgav1_source}/utils/queue.h" + "${libgav1_source}/utils/raw_bit_reader.cc" + "${libgav1_source}/utils/raw_bit_reader.h" + "${libgav1_source}/utils/reference_info.h" + "${libgav1_source}/utils/segmentation.cc" + "${libgav1_source}/utils/segmentation.h" + "${libgav1_source}/utils/segmentation_map.cc" + "${libgav1_source}/utils/segmentation_map.h" + "${libgav1_source}/utils/stack.h" + "${libgav1_source}/utils/threadpool.cc" + "${libgav1_source}/utils/threadpool.h" + "${libgav1_source}/utils/types.h" + "${libgav1_source}/utils/unbounded_queue.h" + "${libgav1_source}/utils/vector.h") + +macro(libgav1_add_utils_targets) + libgav1_add_library(NAME + libgav1_utils + TYPE + OBJECT + SOURCES + ${libgav1_utils_sources} + DEFINES + ${libgav1_defines} + INCLUDES + ${libgav1_include_paths} + ${libgav1_gtest_include_paths}) + +endmacro() diff --git a/src/utils/logging.cc b/src/utils/logging.cc new file mode 100644 index 0000000..9a43c22 --- /dev/null +++ b/src/utils/logging.cc @@ -0,0 +1,65 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/logging.h" + +#include +#include +#include +#include // NOLINT (unapproved c++11 header) + +#if !defined(LIBGAV1_LOG_LEVEL) +#define LIBGAV1_LOG_LEVEL (1 << 30) +#endif + +namespace libgav1 { +namespace internal { +#if LIBGAV1_ENABLE_LOGGING +namespace { + +const char* LogSeverityName(LogSeverity severity) { + switch (severity) { + case LogSeverity::kInfo: + return "INFO"; + case LogSeverity::kError: + return "ERROR"; + case LogSeverity::kWarning: + return "WARNING"; + } + return "UNKNOWN"; +} + +} // namespace + +void Log(LogSeverity severity, const char* file, int line, const char* format, + ...) 
+{
+  if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+  std::ostringstream ss;
+  ss << std::hex << std::this_thread::get_id();
+  fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+          file, line);
+
+  va_list ap;
+  va_start(ap, format);
+  vfprintf(stderr, format, ap);
+  va_end(ap);
+  fprintf(stderr, "\n");
+}
+#else   // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+         const char* /*format*/, ...) {}
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+}  // namespace internal
+}  // namespace libgav1
diff --git a/src/utils/logging.h b/src/utils/logging.h
new file mode 100644
index 0000000..48928db
--- /dev/null
+++ b/src/utils/logging.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
+#define LIBGAV1_DLOG(severity, ...)                                    \
+  do {                                                                 \
+    constexpr const char* libgav1_logging_internal_basename =          \
+        ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
+    ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity,      \
+                             libgav1_logging_internal_basename,        \
+                             __LINE__, __VA_ARGS__);                   \
+  } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+  do {                              \
+  } while (0)
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+  ::libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+  kError,
+  kWarning,
+  kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+         const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
+
+}  // namespace internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_LOGGING_H_
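A minimal usage sketch, not part of this import: the severity name is the enum value without the leading 'k', the arguments are printf-style, and the Log() implementation above prefixes each message with severity, thread id, and call site.

#include "src/utils/logging.h"

void CheckAllocation(const void* ptr, int size) {
  if (ptr == nullptr) {
    // With LIBGAV1_ENABLE_LOGGING=1 this prints something like:
    //   ERROR 7f3a9c0e example.cc:8] Failed to allocate 1024 bytes.
    // With logging disabled, the macro compiles away entirely.
    LIBGAV1_DLOG(ERROR, "Failed to allocate %d bytes.", size);
  }
}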
diff --git a/src/utils/memory.h b/src/utils/memory.h
new file mode 100644
index 0000000..219a83f
--- /dev/null
+++ b/src/utils/memory.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+  kMaxAlignment = 32,  // extended alignment is safe on x86.
+#else
+  kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+//   Allocate aligned memory.
+//   |alignment| must be a power of 2.
+//   Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+//   Unlike aligned_alloc(), |size| does not need to be a multiple of
+//   |alignment|.
+//   The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+//   Free aligned memory.
+
+#if defined(_MSC_VER)  // MSVC
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+  return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) {
+  _aligned_free(aligned_memory);
+}
+
+#else  // !defined(_MSC_VER)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+  // Although posix_memalign() was introduced in Android API level 17, it is
+  // more convenient to use memalign(). Unlike glibc, Android does not consider
+  // memalign() an obsolete function.
+  return memalign(alignment, size);
+#else   // !defined(__ANDROID__)
+  void* ptr = nullptr;
+  // posix_memalign requires that the requested alignment be at least
+  // sizeof(void*). In this case, fall back on malloc which should return
+  // memory aligned to at least the size of a pointer.
+  const size_t required_alignment = sizeof(void*);
+  if (alignment < required_alignment) return malloc(size);
+  const int error = posix_memalign(&ptr, alignment, size);
+  if (error != 0) {
+    errno = error;
+    return nullptr;
+  }
+  return ptr;
+#endif  // defined(__ANDROID__)
+}
+
+inline void AlignedFree(void* aligned_memory) { free(aligned_memory); }
+
+#endif  // defined(_MSC_VER)
+
+inline void Memset(uint8_t* const dst, int value, size_t count) {
+  memset(dst, value, count);
+}
+
+inline void Memset(uint16_t* const dst, int value, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    dst[i] = static_cast<uint16_t>(value);
+  }
+}
+
+struct MallocDeleter {
+  void operator()(void* ptr) const { free(ptr); }
+};
+
+struct AlignedDeleter {
+  void operator()(void* ptr) const { AlignedFree(ptr); }
+};
+
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Allocates aligned memory for an array of |count| elements of type T.
+template <typename T>
+inline AlignedUniquePtr<T> MakeAlignedUniquePtr(size_t alignment,
+                                                size_t count) {
+  return AlignedUniquePtr<T>(
+      static_cast<T*>(AlignedAlloc(alignment, count * sizeof(T))));
+}
+
+// A base class with custom new and delete operators. The exception-throwing
+// new operators are deleted. The "new (std::nothrow)" form must be used.
+//
+// The new operators return nullptr if the requested size is greater than
+// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size
+// a compile-time configuration macro.
+//
+// See https://en.cppreference.com/w/cpp/memory/new/operator_new and
+// https://en.cppreference.com/w/cpp/memory/new/operator_delete.
+//
+// NOTE: The allocation and deallocation functions are static member functions
+// whether the keyword 'static' is used or not.
+struct Allocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+    return ::operator new(size, tag);
+  }
+  static void* operator new[](size_t size,
+                              const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+    return ::operator new[](size, tag);
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept { ::operator delete(ptr); }
+  static void operator delete[](void* ptr) noexcept {
+    ::operator delete[](ptr);
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+    ::operator delete(ptr, tag);
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr,
+                                const std::nothrow_t& tag) noexcept {
+    ::operator delete[](ptr, tag);
+  }
+};
+
+// A variant of Allocable that forces allocations to be aligned to
+// kMaxAlignment bytes. This is intended for use with classes that use
+// alignas() with this value. C++17 aligned new/delete are used if available,
+// otherwise we use AlignedAlloc/Free.
+struct MaxAlignedAllocable {
+  // Class-specific allocation functions.
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+  static void* operator new[](size_t size,
+                              const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+  static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr,
+                                const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_MEMORY_H_
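A brief usage sketch, illustrative only and not part of this import: the helpers above combine into an RAII pattern where AlignedDeleter releases what AlignedAlloc acquired, and Allocable-derived classes must use the nothrow new form.

#include <cstdint>

#include "src/utils/memory.h"

bool AllocateRowBuffer() {
  // 64 bytes aligned for SIMD loads/stores; AlignedFree runs automatically
  // via AlignedDeleter when |row| goes out of scope.
  libgav1::AlignedUniquePtr<uint8_t> row =
      libgav1::MakeAlignedUniquePtr<uint8_t>(libgav1::kMaxAlignment, 64);
  return row != nullptr;  // Allocation may fail; no exception is thrown.
}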
+ +#include "src/utils/parameter_tree.h" + +#include +#include +#include + +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// static +std::unique_ptr ParameterTree::Create(int row4x4, int column4x4, + BlockSize block_size, + bool is_leaf) { + std::unique_ptr tree( + new (std::nothrow) ParameterTree(row4x4, column4x4, block_size)); + if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) { + tree = nullptr; + } + return tree; +} + +bool ParameterTree::SetPartitionType(Partition partition) { + assert(!partition_type_set_); + partition_ = partition; + partition_type_set_ = true; + const int block_width4x4 = kNum4x4BlocksWide[block_size_]; + const int half_block4x4 = block_width4x4 >> 1; + const int quarter_block4x4 = half_block4x4 >> 1; + const BlockSize sub_size = kSubSize[partition][block_size_]; + const BlockSize split_size = kSubSize[kPartitionSplit][block_size_]; + assert(partition == kPartitionNone || sub_size != kBlockInvalid); + switch (partition) { + case kPartitionNone: + parameters_.reset(new (std::nothrow) BlockParameters()); + return parameters_ != nullptr; + case kPartitionHorizontal: + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + sub_size, true); + return children_[0] != nullptr && children_[1] != nullptr; + case kPartitionVertical: + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + sub_size, true); + return children_[0] != nullptr && children_[1] != nullptr; + case kPartitionSplit: + children_[0] = + ParameterTree::Create(row4x4_, column4x4_, sub_size, false); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + sub_size, false); + children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + sub_size, false); + children_[3] = ParameterTree::Create( + row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr && children_[3] != nullptr; + case kPartitionHorizontalWithTopSplit: + assert(split_size != kBlockInvalid); + children_[0] = + ParameterTree::Create(row4x4_, column4x4_, split_size, true); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + split_size, true); + children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + sub_size, true); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr; + case kPartitionHorizontalWithBottomSplit: + assert(split_size != kBlockInvalid); + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + split_size, true); + children_[2] = + ParameterTree::Create(row4x4_ + half_block4x4, + column4x4_ + half_block4x4, split_size, true); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr; + case kPartitionVerticalWithLeftSplit: + assert(split_size != kBlockInvalid); + children_[0] = + ParameterTree::Create(row4x4_, column4x4_, split_size, true); + children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + split_size, true); + children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + sub_size, true); + return children_[0] != nullptr && 
+             children_[1] != nullptr && children_[2] != nullptr;
+    case kPartitionVerticalWithRightSplit:
+      assert(split_size != kBlockInvalid);
+      children_[0] =
+          ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
+      children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
+                                           split_size, true);
+      children_[2] =
+          ParameterTree::Create(row4x4_ + half_block4x4,
+                                column4x4_ + half_block4x4, split_size, true);
+      return children_[0] != nullptr && children_[1] != nullptr &&
+             children_[2] != nullptr;
+    case kPartitionHorizontal4:
+      for (int i = 0; i < 4; ++i) {
+        children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4,
+                                             column4x4_, sub_size, true);
+        if (children_[i] == nullptr) return false;
+      }
+      return true;
+    default:
+      assert(partition == kPartitionVertical4);
+      for (int i = 0; i < 4; ++i) {
+        children_[i] = ParameterTree::Create(
+            row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true);
+        if (children_[i] == nullptr) return false;
+      }
+      return true;
+  }
+}
+
+}  // namespace libgav1
diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h
new file mode 100644
index 0000000..935f3eb
--- /dev/null
+++ b/src/utils/parameter_tree.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
+#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
+
+#include <cassert>
+#include <memory>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+class ParameterTree : public Allocable {
+ public:
+  // Creates a parameter tree to store the parameters of a block of size
+  // |block_size| starting at coordinates |row4x4| and |column4x4|. If
+  // |is_leaf| is set to true, the memory will be allocated for the
+  // BlockParameters for this node. Otherwise, no memory will be allocated.
+  // If |is_leaf| is set to false, |block_size| must be a square block, i.e.,
+  // kBlockWidthPixels[block_size] must be equal to
+  // kBlockHeightPixels[block_size].
+  static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4,
+                                               BlockSize block_size,
+                                               bool is_leaf = false);
+
+  // Move only (not Copyable).
+  ParameterTree(ParameterTree&& other) = default;
+  ParameterTree& operator=(ParameterTree&& other) = default;
+  ParameterTree(const ParameterTree&) = delete;
+  ParameterTree& operator=(const ParameterTree&) = delete;
+
+  // Set the partition type of the current node to |partition|.
+  // if (partition == kPartitionNone) {
+  //   Memory will be allocated for the BlockParameters for this node.
+  // } else if (partition != kPartitionSplit) {
+  //   The appropriate child nodes will be populated and memory will be
+  //   allocated for the BlockParameters of the children.
+  // } else {
+  //   The appropriate child nodes will be populated but they are considered
+  //   to be hanging, i.e., future calls to SetPartitionType() on the child
+  //   nodes will have to set them or their descendants to a terminal type.
+  // }
+  // This function must be called only once per node.
+  LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition);
+
+  // Basic getters.
+  int row4x4() const { return row4x4_; }
+  int column4x4() const { return column4x4_; }
+  BlockSize block_size() const { return block_size_; }
+  Partition partition() const { return partition_; }
+  ParameterTree* children(int index) const {
+    assert(index < 4);
+    return children_[index].get();
+  }
+  // Returns the BlockParameters object of the current node if one exists.
+  // Otherwise returns nullptr. This function will return a valid
+  // BlockParameters object only for leaf nodes.
+  BlockParameters* parameters() const { return parameters_.get(); }
+
+ private:
+  ParameterTree(int row4x4, int column4x4, BlockSize block_size)
+      : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {}
+
+  Partition partition_ = kPartitionNone;
+  std::unique_ptr<BlockParameters> parameters_ = nullptr;
+  int row4x4_ = -1;
+  int column4x4_ = -1;
+  BlockSize block_size_ = kBlockInvalid;
+  bool partition_type_set_ = false;
+
+  // Child values are defined as follows for various partition types:
+  //  * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr;
+  //  * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr;
+  //  * Split: 0 top-left partition; 1 top-right partition; 2 bottom-left
+  //    partition; 3 bottom-right partition;
+  //  * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition;
+  //    2 bottom partition; 3 nullptr;
+  //  * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition;
+  //    2 bottom-right partition; 3 nullptr;
+  //  * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition;
+  //    2 right partition; 3 nullptr;
+  //  * VerticalWithRightSplit: 0 left partition; 1 top-right partition; 2
+  //    bottom-right partition; 3 nullptr;
+  //  * Horizontal4: 0 top partition; 1 second top partition; 2 third top
+  //    partition; 3 bottom partition;
+  //  * Vertical4: 0 left partition; 1 second left partition; 2 third left
+  //    partition; 3 right partition;
+  std::unique_ptr<ParameterTree> children_[4] = {};
+
+  friend class ParameterTreeTest;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
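An illustrative sketch, not part of this import, assuming the kBlock128x128 and partition constants from constants.h: one split of a 128x128 superblock, where the four non-leaf children produced by kPartitionSplit hang until their own partition types are set.

#include "src/utils/parameter_tree.h"

bool BuildTree() {
  auto root = libgav1::ParameterTree::Create(0, 0, libgav1::kBlock128x128);
  if (root == nullptr) return false;
  // Populates four 64x64 children; per the header comment they are
  // "hanging" until each child's partition type is set as well.
  if (!root->SetPartitionType(libgav1::kPartitionSplit)) return false;
  for (int i = 0; i < 4; ++i) {
    // Terminal nodes: BlockParameters memory is allocated here.
    if (!root->children(i)->SetPartitionType(libgav1::kPartitionNone)) {
      return false;
    }
  }
  return true;
}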
diff --git a/src/utils/queue.h b/src/utils/queue.h
new file mode 100644
index 0000000..cffb9ca
--- /dev/null
+++ b/src/utils/queue.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// A FIFO queue of a fixed capacity.
+//
+// WARNING: No error checking is performed.
+template <typename T>
+class Queue {
+ public:
+  LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) {
+    elements_.reset(new (std::nothrow) T[capacity]);
+    if (elements_ == nullptr) return false;
+    capacity_ = capacity;
+    return true;
+  }
+
+  // Pushes the element |value| to the end of the queue. It is an error to
+  // call Push() when the queue is full.
+  void Push(T&& value) {
+    assert(size_ < capacity_);
+    elements_[end_++] = std::move(value);
+    if (end_ == capacity_) end_ = 0;
+    ++size_;
+  }
+
+  // Removes the element at the front of the queue. It is an error to call
+  // Pop() when the queue is empty.
+  void Pop() {
+    assert(size_ != 0);
+    const T element = std::move(elements_[begin_++]);
+    static_cast<void>(element);
+    if (begin_ == capacity_) begin_ = 0;
+    --size_;
+  }
+
+  // Returns a reference to the element at the front of the queue. It is an
+  // error to call Front() when the queue is empty.
+  T& Front() {
+    assert(size_ != 0);
+    return elements_[begin_];
+  }
+
+  // Returns a reference to the element at the back of the queue. It is an
+  // error to call Back() when the queue is empty.
+  T& Back() {
+    assert(size_ != 0);
+    const size_t back = ((end_ == 0) ? capacity_ : end_) - 1;
+    return elements_[back];
+  }
+
+  // Clears the queue.
+  void Clear() {
+    while (!Empty()) {
+      Pop();
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return size_ == 0; }
+
+  // Returns true if the queue is full.
+  bool Full() const { return size_ >= capacity_; }
+
+  // Returns the number of elements in the queue.
+  size_t Size() const { return size_; }
+
+ private:
+  // An array of |capacity| elements. Used as a circular array.
+  std::unique_ptr<T[]> elements_;
+  size_t capacity_ = 0;
+  // The index of the element to be removed by Pop().
+  size_t begin_ = 0;
+  // The index where the new element is inserted by Push().
+  size_t end_ = 0;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_QUEUE_H_
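A brief usage sketch, illustrative only and not part of this import: a Queue must be Init()ed before use, and the caller is responsible for staying within the capacity, since the class deliberately performs no error checking.

#include "src/utils/queue.h"

bool RunQueue() {
  libgav1::Queue<int> queue;
  if (!queue.Init(8)) return false;  // Allocation may fail.
  queue.Push(1);
  queue.Push(2);
  const int front = queue.Front();  // 1: elements come out in FIFO order.
  queue.Pop();
  return front == 1 && queue.Size() == 1;
}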
diff --git a/src/utils/raw_bit_reader.cc b/src/utils/raw_bit_reader.cc
new file mode 100644
index 0000000..15e980d
--- /dev/null
+++ b/src/utils/raw_bit_reader.cc
@@ -0,0 +1,224 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/raw_bit_reader.h"
+
+#include <cassert>
+#include <limits>
+
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+// Note <cinttypes> is only needed when logging is enabled (for the PRI*
+// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from
+// logging.h, thus the non-standard header ordering.
+#if LIBGAV1_ENABLE_LOGGING
+#include <cinttypes>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr int kMaximumLeb128Size = 8;
+constexpr uint8_t kLeb128ValueByteMask = 0x7f;
+constexpr uint8_t kLeb128TerminationByteMask = 0x80;
+
+uint8_t Mod8(size_t n) {
+  // Last 3 bits are the value of mod 8.
+  return n & 0x07;
+}
+
+size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 7 : 0)) >> 3; }
+
+}  // namespace
+
+RawBitReader::RawBitReader(const uint8_t* data, size_t size)
+    : data_(data), bit_offset_(0), size_(size) {
+  assert(data_ != nullptr || size_ == 0);
+}
+
+int RawBitReader::ReadBitImpl() {
+  const size_t byte_offset = DivideBy8(bit_offset_, false);
+  const uint8_t byte = data_[byte_offset];
+  const uint8_t shift = 7 - Mod8(bit_offset_);
+  ++bit_offset_;
+  return static_cast<int>((byte >> shift) & 0x01);
+}
+
+int RawBitReader::ReadBit() {
+  if (Finished()) return -1;
+  return ReadBitImpl();
+}
+
+int64_t RawBitReader::ReadLiteral(int num_bits) {
+  assert(num_bits <= 32);
+  if (!CanReadLiteral(num_bits)) return -1;
+  assert(num_bits > 0);
+  uint32_t literal = 0;
+  int bit = num_bits - 1;
+  do {
+    // ARM can combine a shift operation with a constant number of bits with
+    // some other operations, such as the OR operation.
+    // Here is an ARM disassembly example:
+    // orr w1, w0, w1, lsl #1
+    // which left shifts register w1 by 1 bit and ORs the shift result with
+    // register w0.
+    // The next 2 lines are equivalent to:
+    // literal |= static_cast<uint32_t>(ReadBitImpl()) << bit;
+    literal <<= 1;
+    literal |= static_cast<uint32_t>(ReadBitImpl());
+  } while (--bit >= 0);
+  return literal;
+}
+
+bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) {
+  assert(num_bits + 1 < 32);
+  *value = static_cast<int>(ReadLiteral(num_bits + 1));
+  if (*value == -1) return false;
+  const int sign_bit = 1 << num_bits;
+  if ((*value & sign_bit) != 0) {
+    *value -= 2 * sign_bit;
+  }
+  return true;
+}
+
+bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) {
+  // We must be at a byte boundary.
+  assert(Mod8(bit_offset_) == 0);
+  assert(num_bytes <= 4);
+  static_assert(sizeof(size_t) >= 4, "");
+  if (value == nullptr) return false;
+  size_t byte_offset = DivideBy8(bit_offset_, false);
+  if (Finished() || byte_offset + num_bytes > size_) {
+    LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value.");
+    return false;
+  }
+  *value = 0;
+  for (int i = 0; i < num_bytes; ++i) {
+    const size_t byte = data_[byte_offset];
+    *value |= (byte << (i * 8));
+    ++byte_offset;
+  }
+  bit_offset_ = byte_offset * 8;
+  return true;
+}
+
+bool RawBitReader::ReadUnsignedLeb128(size_t* const value) {
+  // We must be at a byte boundary.
+  assert(Mod8(bit_offset_) == 0);
+  if (value == nullptr) return false;
+  uint64_t value64 = 0;
+  for (int i = 0; i < kMaximumLeb128Size; ++i) {
+    if (Finished()) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value.");
+      return false;
+    }
+    const size_t byte_offset = DivideBy8(bit_offset_, false);
+    const uint8_t byte = data_[byte_offset];
+    bit_offset_ += 8;
+    value64 |= static_cast<uint64_t>(byte & kLeb128ValueByteMask) << (i * 7);
+    if ((byte & kLeb128TerminationByteMask) == 0) {
+      if (value64 != static_cast<size_t>(value64) ||
+          value64 > std::numeric_limits<uint32_t>::max()) {
+        LIBGAV1_DLOG(
+            ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).",
+            value64, std::numeric_limits<uint32_t>::max());
+        return false;
+      }
+      *value = static_cast<size_t>(value64);
+      return true;
+    }
+  }
+  LIBGAV1_DLOG(
+      ERROR,
+      "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value",
+      kMaximumLeb128Size);
+  return false;
+}
+
+bool RawBitReader::ReadUvlc(uint32_t* const value) {
+  if (value == nullptr) return false;
+  int leading_zeros = 0;
+  while (true) {
+    const int bit = ReadBit();
+    if (bit == -1) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+      return false;
+    }
+    if (bit == 1) break;
+    ++leading_zeros;
+    if (leading_zeros == 32) {
+      LIBGAV1_DLOG(ERROR,
+                   "Exceeded maximum size (32) when trying to read uvlc value");
+      return false;
+    }
+  }
+  int literal;
+  if (leading_zeros != 0) {
+    literal = static_cast<int>(ReadLiteral(leading_zeros));
+    if (literal == -1) {
+      LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value.");
+      return false;
+    }
+    literal += (1U << leading_zeros) - 1;
+  } else {
+    literal = 0;
+  }
+  *value = literal;
+  return true;
+}
+
+bool RawBitReader::AlignToNextByte() {
+  while ((bit_offset_ & 7) != 0) {
+    if (ReadBit() != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) {
+  if (ReadBit() != 1) return false;
+  for (size_t i = 0; i < num_bits - 1; ++i) {
+    if (ReadBit() != 0) return false;
+  }
+  return true;
+}
+
+bool RawBitReader::SkipBytes(size_t num_bytes) {
+  // If we are not at a byte boundary, return false.
+  return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8);
+}
+
+bool RawBitReader::SkipBits(size_t num_bits) {
+  // If the reader is already finished, return false.
+  if (Finished()) return false;
+  // If skipping |num_bits| runs out of buffer, return false.
+  const size_t bit_offset = bit_offset_ + num_bits - 1;
+  if (DivideBy8(bit_offset, false) >= size_) return false;
+  bit_offset_ += num_bits;
+  return true;
+}
+
+bool RawBitReader::CanReadLiteral(size_t num_bits) const {
+  if (Finished()) return false;
+  const size_t bit_offset = bit_offset_ + num_bits - 1;
+  return DivideBy8(bit_offset, false) < size_;
+}
+
+bool RawBitReader::Finished() const {
+  return DivideBy8(bit_offset_, false) >= size_;
+}
+
+}  // namespace libgav1
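A worked LEB128 example, illustrative only and not part of this import: each byte contributes its low 7 bits, least-significant group first, and a clear high bit terminates the sequence, so 0xe5 0x8e 0x26 decodes as 0x65 | (0x0e << 7) | (0x26 << 14) = 101 + 1792 + 622592 = 624485.

#include <cstdint>

#include "src/utils/raw_bit_reader.h"

bool DecodeLeb128Example() {
  const uint8_t data[] = {0xe5, 0x8e, 0x26};
  libgav1::RawBitReader reader(data, sizeof(data));
  size_t value;
  // Succeeds because the reader starts at a byte boundary and the third
  // byte (0x26) has its termination (high) bit clear.
  return reader.ReadUnsignedLeb128(&value) && value == 624485;
}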
diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h
new file mode 100644
index 0000000..76e7bfa
--- /dev/null
+++ b/src/utils/raw_bit_reader.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+class RawBitReader : public BitReader, public Allocable {
+ public:
+  RawBitReader(const uint8_t* data, size_t size);
+  ~RawBitReader() override = default;
+
+  int ReadBit() override;
+  int64_t ReadLiteral(int num_bits) override;  // f(n) in the spec.
+  bool ReadInverseSignedLiteral(int num_bits,
+                                int* value);  // su(1+num_bits) in the spec.
+  bool ReadLittleEndian(int num_bytes,
+                        size_t* value);        // le(n) in the spec.
+  bool ReadUnsignedLeb128(size_t* value);      // leb128() in the spec.
+  // Reads a variable length unsigned number and stores it in |*value|. On a
+  // successful return, |*value| is in the range of 0 to UINT32_MAX - 1,
+  // inclusive.
+  bool ReadUvlc(uint32_t* value);  // uvlc() in the spec.
+  bool Finished() const;
+  size_t bit_offset() const { return bit_offset_; }
+  // Returns the number of bytes consumed so far (rounded up).
+  size_t byte_offset() const { return (bit_offset() + 7) >> 3; }
+  size_t size() const { return size_; }
+  // Moves to the next byte boundary if not already at one. Returns false if
+  // any of the bits being skipped over is non-zero; returns true otherwise.
+  // If this function returns false, the reader is left in an undefined state
+  // and must not be used further. Section 5.3.5.
+  bool AlignToNextByte();
+  // Makes sure that the trailing bits structure is as expected and skips over
+  // it. Section 5.3.4.
+  bool VerifyAndSkipTrailingBits(size_t num_bits);
+  // Skips |num_bytes| bytes. This only works if the current position is at a
+  // byte boundary. The function returns false if the current position is not
+  // at a byte boundary or if skipping |num_bytes| causes the reader to run
+  // out of buffer. Returns true otherwise.
+  bool SkipBytes(size_t num_bytes);
+  // Skips |num_bits| bits. The function returns false if skipping |num_bits|
+  // causes the reader to run out of buffer. Returns true otherwise.
+  bool SkipBits(size_t num_bits);
+
+ private:
+  // Returns true if it is safe to read a literal of size |num_bits|.
+  bool CanReadLiteral(size_t num_bits) const;
+  int ReadBitImpl();
+
+  const uint8_t* const data_;
+  size_t bit_offset_;
+  const size_t size_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_
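A brief f(n) sketch, illustrative only and not part of this import: literals are read most-significant bit first, so one byte can be consumed as two back-to-back fields.

#include <cstdint>

#include "src/utils/raw_bit_reader.h"

bool ReadLiteralExample() {
  const uint8_t data[] = {0xb4};  // 1011 0100 in binary, read MSB first.
  libgav1::RawBitReader reader(data, sizeof(data));
  const int64_t first = reader.ReadLiteral(3);   // 101b  -> 5.
  const int64_t second = reader.ReadLiteral(4);  // 1010b -> 10.
  return first == 5 && second == 10 && reader.bit_offset() == 7;
}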
diff --git a/src/utils/reference_info.h b/src/utils/reference_info.h
new file mode 100644
index 0000000..a660791
--- /dev/null
+++ b/src/utils/reference_info.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// This struct collects some members related to reference frames in one place
+// to make it easier to pass them as parameters to some dsp functions.
+struct ReferenceInfo {
+  // Initialize |motion_field_reference_frame| so that
+  // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when
+  // the updates are the same as the initialized value.
+  // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify
+  // branch conditions in motion field projection.
+  // The following memory initialization of contiguous memory is very fast. It
+  // is not recommended to make the initialization multi-threaded, unless the
+  // memory which needs to be initialized in each thread is still contiguous.
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) {
+    return motion_field_reference_frame.Reset(rows, columns,
+                                              /*zero_initialize=*/true) &&
+           motion_field_mv.Reset(
+               rows, columns,
+#if LIBGAV1_MSAN
+               // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only
+               // for qualified blocks. In MotionFieldProjectionKernel() dsp
+               // optimizations, it is read whether or not it was set.
+               /*zero_initialize=*/true
+#else
+               /*zero_initialize=*/false
+#endif
+           );
+  }
+
+  // All members are used by inter frames only.
+  // For intra frames, they are not initialized.
+
+  std::array<uint8_t, kNumReferenceFrameTypes> order_hint;
+
+  // An example when |relative_distance_from| does not equal
+  // -|relative_distance_to|:
+  // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64
+  // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64
+  // This is why we need both |relative_distance_from| and
+  // |relative_distance_to|.
+  // |relative_distance_from|: Relative distances from reference frames to
+  // this frame.
+  std::array<int8_t, kNumReferenceFrameTypes> relative_distance_from;
+  // |relative_distance_to|: Relative distances to reference frames.
+  std::array<int8_t, kNumReferenceFrameTypes> relative_distance_to;
+
+  // Skip motion field projection of specific types of frames if their
+  // |relative_distance_to| is negative or too large.
+  std::array<bool, kNumReferenceFrameTypes> skip_references;
+  // Lookup table to get the motion field projection division multiplier of
+  // specific types of frames. Derived from kProjectionMvDivisionLookup.
+  std::array<int16_t, kNumReferenceFrameTypes> projection_divisions;
+
+  // The current frame's |motion_field_reference_frame| and |motion_field_mv_|
+  // are guaranteed to be allocated only when refresh_frame_flags is not 0.
+  // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+  // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
+  Array2D<ReferenceFrameType> motion_field_reference_frame;
+  // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
+  // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
+  Array2D<MotionVector> motion_field_mv;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_
diff --git a/src/utils/segmentation.cc b/src/utils/segmentation.cc
new file mode 100644
index 0000000..75fa776
--- /dev/null
+++ b/src/utils/segmentation.cc
@@ -0,0 +1,31 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/segmentation.h"
+
+namespace libgav1 {
+
+const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6,
+                                                             6, 3, 0, 0};
+const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = {
+    255,
+    kMaxLoopFilterValue,
+    kMaxLoopFilterValue,
+    kMaxLoopFilterValue,
+    kMaxLoopFilterValue,
+    7,
+    0,
+    0};
+
+}  // namespace libgav1
diff --git a/src/utils/segmentation.h b/src/utils/segmentation.h
new file mode 100644
index 0000000..67ff74c
--- /dev/null
+++ b/src/utils/segmentation.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_
+
+#include <cstdint>
+
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax];
+extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_SEGMENTATION_H_
diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc
new file mode 100644
index 0000000..4284ca2
--- /dev/null
+++ b/src/utils/segmentation_map.cc
@@ -0,0 +1,49 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "src/utils/segmentation_map.h" + +#include +#include +#include + +namespace libgav1 { + +bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]); + if (segment_id_buffer_ == nullptr) return false; + segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get()); + return true; +} + +void SegmentationMap::Clear() { + memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_); +} + +void SegmentationMap::CopyFrom(const SegmentationMap& from) { + assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_); + memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(), + rows4x4_ * columns4x4_); +} + +void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4, + int block_height4x4, int8_t segment_id) { + for (int y = 0; y < block_height4x4; ++y) { + memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4); + } +} + +} // namespace libgav1 diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h new file mode 100644 index 0000000..499be24 --- /dev/null +++ b/src/utils/segmentation_map.h @@ -0,0 +1,71 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ +#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ + +#include +#include + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +// SegmentationMap stores the segment id associated with each 4x4 block in the +// frame. +class SegmentationMap { + public: + SegmentationMap() = default; + + // Not copyable or movable + SegmentationMap(const SegmentationMap&) = delete; + SegmentationMap& operator=(const SegmentationMap&) = delete; + + // Allocates an internal buffer of the given dimensions to hold the + // segmentation map. The memory in the buffer is not initialized. Returns + // true on success, false on failure (for example, out of memory). + LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4); + + int8_t segment_id(int row4x4, int column4x4) const { + return segment_id_[row4x4][column4x4]; + } + + // Sets every element in the segmentation map to 0. + void Clear(); + + // Copies the entire segmentation map. |from| must be of the same dimensions. + void CopyFrom(const SegmentationMap& from); + + // Sets the region of segmentation map covered by the block to |segment_id|. + // The block is located at |row4x4|, |column4x4| and has dimensions + // |block_width4x4| and |block_height4x4|. + void FillBlock(int row4x4, int column4x4, int block_width4x4, + int block_height4x4, int8_t segment_id); + + private: + int32_t rows4x4_ = 0; + int32_t columns4x4_ = 0; + + // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data + // buffer is dynamically allocated and owned by segment_id_buffer_. 
diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h
new file mode 100644
index 0000000..499be24
--- /dev/null
+++ b/src/utils/segmentation_map.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// SegmentationMap stores the segment id associated with each 4x4 block in the
+// frame.
+class SegmentationMap {
+ public:
+  SegmentationMap() = default;
+
+  // Not copyable or movable
+  SegmentationMap(const SegmentationMap&) = delete;
+  SegmentationMap& operator=(const SegmentationMap&) = delete;
+
+  // Allocates an internal buffer of the given dimensions to hold the
+  // segmentation map. The memory in the buffer is not initialized. Returns
+  // true on success, false on failure (for example, out of memory).
+  LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4);
+
+  int8_t segment_id(int row4x4, int column4x4) const {
+    return segment_id_[row4x4][column4x4];
+  }
+
+  // Sets every element in the segmentation map to 0.
+  void Clear();
+
+  // Copies the entire segmentation map. |from| must be of the same
+  // dimensions.
+  void CopyFrom(const SegmentationMap& from);
+
+  // Sets the region of the segmentation map covered by the block to
+  // |segment_id|. The block is located at |row4x4|, |column4x4| and has
+  // dimensions |block_width4x4| and |block_height4x4|.
+  void FillBlock(int row4x4, int column4x4, int block_width4x4,
+                 int block_height4x4, int8_t segment_id);
+
+ private:
+  int32_t rows4x4_ = 0;
+  int32_t columns4x4_ = 0;
+
+  // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data
+  // buffer is dynamically allocated and owned by segment_id_buffer_.
+  std::unique_ptr<int8_t[]> segment_id_buffer_;
+  Array2DView<int8_t> segment_id_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_
diff --git a/src/utils/stack.h b/src/utils/stack.h
new file mode 100644
index 0000000..39133b9
--- /dev/null
+++ b/src/utils/stack.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_STACK_H_
+#define LIBGAV1_SRC_UTILS_STACK_H_
+
+#include <cassert>
+#include <utility>
+
+namespace libgav1 {
+
+// A LIFO stack of a fixed capacity. The elements are moved using std::move,
+// so the element type T has to be movable.
+//
+// WARNING: No error checking is performed.
+template <typename T, int capacity>
+class Stack {
+ public:
+  // Pushes the element |value| to the top of the stack. It is an error to
+  // call Push() when the stack is full.
+  void Push(T value) {
+    ++top_;
+    assert(top_ < capacity);
+    elements_[top_] = std::move(value);
+  }
+
+  // Returns the element at the top of the stack and removes it from the
+  // stack. It is an error to call Pop() when the stack is empty.
+  T Pop() {
+    assert(top_ >= 0);
+    return std::move(elements_[top_--]);
+  }
+
+  // Returns true if the stack is empty.
+  bool Empty() const { return top_ < 0; }
+
+ private:
+  static_assert(capacity > 0, "");
+  T elements_[capacity];
+  // The array index of the top of the stack. The stack is empty if top_ is
+  // -1.
+  int top_ = -1;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_STACK_H_
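A brief usage sketch, illustrative only and not part of this import: the capacity is a template parameter, so the storage lives inline in the Stack object and no allocation can fail.

#include "src/utils/stack.h"

int SumStack() {
  libgav1::Stack<int, 4> stack;
  stack.Push(1);
  stack.Push(2);
  int sum = 0;
  while (!stack.Empty()) sum += stack.Pop();  // Pops 2, then 1.
  return sum;  // 3
}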
diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc
new file mode 100644
index 0000000..8c8f4fe
--- /dev/null
+++ b/src/utils/threadpool.cc
@@ -0,0 +1,323 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/threadpool.h"
+
+#if defined(_MSC_VER)
+#include <process.h>
+#include <windows.h>
+#else  // defined(_MSC_VER)
+#include <pthread.h>
+#endif  // defined(_MSC_VER)
+#if defined(__ANDROID__) || defined(__GLIBC__)
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <new>
+
+#if defined(__ANDROID__)
+#include <chrono>  // NOLINT (unapproved c++11 header)
+#endif
+
+// The glibc wrapper for the gettid() system call was added in glibc 2.30.
+// Emulate it for older versions of glibc.
+#if defined(__GLIBC_PREREQ)
+#if !__GLIBC_PREREQ(2, 30)
+
+#include <sys/syscall.h>
+
+static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+
+#endif
+#endif  // defined(__GLIBC_PREREQ)
+
+namespace libgav1 {
+
+#if defined(__ANDROID__)
+namespace {
+
+using Clock = std::chrono::steady_clock;
+using Duration = Clock::duration;
+constexpr Duration kBusyWaitDuration =
+    std::chrono::duration_cast<Duration>(std::chrono::duration<double>(2e-3));
+
+}  // namespace
+#endif  // defined(__ANDROID__)
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(int num_threads) {
+  return Create(/*name_prefix=*/"", num_threads);
+}
+
+// static
+std::unique_ptr<ThreadPool> ThreadPool::Create(const char name_prefix[],
+                                               int num_threads) {
+  if (name_prefix == nullptr || num_threads <= 0) return nullptr;
+  std::unique_ptr<WorkerThread*[]> threads(new (std::nothrow)
+                                               WorkerThread*[num_threads]);
+  if (threads == nullptr) return nullptr;
+  std::unique_ptr<ThreadPool> pool(new (std::nothrow) ThreadPool(
+      name_prefix, std::move(threads), num_threads));
+  if (pool != nullptr && !pool->StartWorkers()) {
+    pool = nullptr;
+  }
+  return pool;
+}
+
+ThreadPool::ThreadPool(const char name_prefix[],
+                       std::unique_ptr<WorkerThread*[]> threads,
+                       int num_threads)
+    : threads_(std::move(threads)), num_threads_(num_threads) {
+  threads_[0] = nullptr;
+  assert(name_prefix != nullptr);
+  const size_t name_prefix_len =
+      std::min(strlen(name_prefix), sizeof(name_prefix_) - 1);
+  memcpy(name_prefix_, name_prefix, name_prefix_len);
+  name_prefix_[name_prefix_len] = '\0';
+}
+
+ThreadPool::~ThreadPool() { Shutdown(); }
+
+void ThreadPool::Schedule(std::function<void()> closure) {
+  LockMutex();
+  if (!queue_.GrowIfNeeded()) {
+    // queue_ is full and we can't grow it. Run |closure| directly.
+    UnlockMutex();
+    closure();
+    return;
+  }
+  queue_.Push(std::move(closure));
+  UnlockMutex();
+  SignalOne();
+}
+
+int ThreadPool::num_threads() const { return num_threads_; }
+
+// A simple implementation that mirrors the non-portable Thread. We may
+// choose to expand this in the future as a portable implementation of
+// Thread, or replace it at such a time as one is implemented.
+class ThreadPool::WorkerThread : public Allocable {
+ public:
+  // Creates and starts a thread that runs pool->WorkerFunction().
+  explicit WorkerThread(ThreadPool* pool);
+
+  // Not copyable or movable.
+  WorkerThread(const WorkerThread&) = delete;
+  WorkerThread& operator=(const WorkerThread&) = delete;
+
+  // REQUIRES: Join() must have been called if Start() was called and
+  // succeeded.
+  ~WorkerThread() = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Start();
+
+  // Joins with the running thread.
+  void Join();
+
+ private:
+#if defined(_MSC_VER)
+  static unsigned int __stdcall ThreadBody(void* arg);
+#else
+  static void* ThreadBody(void* arg);
+#endif
+
+  void SetupName();
+  void Run();
+
+  ThreadPool* pool_;
+#if defined(_MSC_VER)
+  HANDLE handle_;
+#else
+  pthread_t thread_;
+#endif
+};
+
+ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {}
+
+#if defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+  // Since our code calls the C run-time library (CRT), use _beginthreadex
+  // rather than CreateThread. Microsoft documentation says "If a thread
+  // created using CreateThread calls the CRT, the CRT may terminate the
+  // process in low-memory conditions."
+  uintptr_t handle = _beginthreadex(
+      /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this,
+      /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr);
+  if (handle == 0) return false;
+  handle_ = reinterpret_cast<HANDLE>(handle);
+  ResumeThread(handle_);
+  return true;
+}
+
+void ThreadPool::WorkerThread::Join() {
+  WaitForSingleObject(handle_, INFINITE);
+  CloseHandle(handle_);
+}
+
+unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) {
+  auto* thread = static_cast<WorkerThread*>(arg);
+  thread->Run();
+  return 0;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+  // Not currently supported on Windows.
+}
+
+#else  // defined(_MSC_VER)
+
+bool ThreadPool::WorkerThread::Start() {
+  return pthread_create(&thread_, nullptr, ThreadBody, this) == 0;
+}
+
+void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); }
+
+void* ThreadPool::WorkerThread::ThreadBody(void* arg) {
+  auto* thread = static_cast<WorkerThread*>(arg);
+  thread->Run();
+  return nullptr;
+}
+
+void ThreadPool::WorkerThread::SetupName() {
+  if (pool_->name_prefix_[0] != '\0') {
+#if defined(__APPLE__)
+    // Apple's version of pthread_setname_np takes one argument and operates
+    // on the current thread only. Also, pthread_mach_thread_np is
+    // Apple-specific. The maximum size of the |name| buffer was noted in the
+    // Chromium source code and was confirmed by experiments.
+    char name[64];
+    mach_port_t id = pthread_mach_thread_np(pthread_self());
+    int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+                      static_cast<int64_t>(id));
+    assert(rv >= 0);
+    rv = pthread_setname_np(name);
+    assert(rv == 0);
+    static_cast<void>(rv);
+#elif defined(__ANDROID__) || defined(__GLIBC__)
+    // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
+    // with error 34 (ERANGE) on Android.
+    char name[16];
+    pid_t id = gettid();
+    int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
+                      static_cast<int64_t>(id));
+    assert(rv >= 0);
+    rv = pthread_setname_np(pthread_self(), name);
+    assert(rv == 0);
+    static_cast<void>(rv);
+#endif
+  }
+}
+
+#endif  // defined(_MSC_VER)
+
+void ThreadPool::WorkerThread::Run() {
+  SetupName();
+  pool_->WorkerFunction();
+}
+
+bool ThreadPool::StartWorkers() {
+  if (!queue_.Init()) return false;
+  for (int i = 0; i < num_threads_; ++i) {
+    threads_[i] = new (std::nothrow) WorkerThread(this);
+    if (threads_[i] == nullptr) return false;
+    if (!threads_[i]->Start()) {
+      delete threads_[i];
+      threads_[i] = nullptr;
+      return false;
+    }
+  }
+  return true;
+}
+
+void ThreadPool::WorkerFunction() {
+  LockMutex();
+  while (true) {
+    if (queue_.Empty()) {
+      if (exit_threads_) {
+        break;  // Queue is empty and exit was requested.
+      }
+#if defined(__ANDROID__)
+      // On android, if we go to a conditional wait right away, the CPU
+      // governor kicks in and starts shutting the cores down. So we do a
+      // very small busy wait to see if we get our next job within that
+      // period. This significantly improves the performance of common cases
+      // of tile parallel decoding. If we don't receive a job in the busy
+      // wait time, we then go to an actual conditional wait as usual.
+      UnlockMutex();
+      bool found_job = false;
+      const auto wait_start = Clock::now();
+      while (Clock::now() - wait_start < kBusyWaitDuration) {
+        LockMutex();
+        if (!queue_.Empty()) {
+          found_job = true;
+          break;
+        }
+        UnlockMutex();
+      }
+      // If |found_job| is true, we simply continue since we already hold the
+      // mutex and we know for sure that the |queue_| is not empty.
+      if (found_job) continue;
+      // Since |found_job| was false, the mutex is not being held at this
+      // point.
+      LockMutex();
+      // Ensure that the queue is still empty.
+      if (!queue_.Empty()) continue;
+      if (exit_threads_) {
+        break;  // Queue is empty and exit was requested.
+      }
+#endif  // defined(__ANDROID__)
+      // Queue is still empty, wait for signal or broadcast.
+      Wait();
+    } else {
+      // Take a job from the queue.
+      std::function<void()> job = std::move(queue_.Front());
+      queue_.Pop();
+
+      UnlockMutex();
+      // Note that it is good practice to surround this with a try/catch so
+      // the thread pool doesn't go to hell if the job throws an exception.
+      // This is omitted here because Google3 doesn't like exceptions.
+      std::move(job)();
+      job = nullptr;
+
+      LockMutex();
+    }
+  }
+  UnlockMutex();
+}
+
+void ThreadPool::Shutdown() {
+  // Tell worker threads how to exit.
+  LockMutex();
+  exit_threads_ = true;
+  UnlockMutex();
+  SignalAll();
+
+  // Join all workers. This will block.
+  for (int i = 0; i < num_threads_; ++i) {
+    if (threads_[i] == nullptr) break;
+    threads_[i]->Join();
+    delete threads_[i];
+  }
+}
+
+}  // namespace libgav1
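A completion-waiting sketch, illustrative only and not part of this import: Schedule() returns immediately, so callers that must wait for all jobs need a separate signal. This assumes the BlockingCounter helper declared in src/utils/blocking_counter.h (listed in libgav1_utils.cmake above) offers Decrement() and Wait(); any equivalent synchronization works.

#include <memory>

#include "src/utils/blocking_counter.h"
#include "src/utils/threadpool.h"

bool RunJobs(int n) {
  std::unique_ptr<libgav1::ThreadPool> pool =
      libgav1::ThreadPool::Create("worker", /*num_threads=*/4);
  if (pool == nullptr) return false;
  libgav1::BlockingCounter pending(n);
  for (int i = 0; i < n; ++i) {
    pool->Schedule([&pending]() {
      // ... per-job work goes here ...
      pending.Decrement();
    });
  }
  pending.Wait();  // Blocks until all n jobs have run.
  return true;
}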
+//       pool->Schedule([&my_data]() { MyFunction(&my_data); });
+//     }
+//   }  // ThreadPool gets destroyed only when all jobs are done.
+class ThreadPool : public Executor, public Allocable {
+ public:
+  // Creates the thread pool with the specified number of worker threads.
+  // If num_threads is 1, the closures are run in FIFO order.
+  static std::unique_ptr<ThreadPool> Create(int num_threads);
+
+  // Like the above factory method, but also sets the name prefix for threads.
+  static std::unique_ptr<ThreadPool> Create(const char name_prefix[],
+                                            int num_threads);
+
+  // The destructor will shut down the thread pool; all queued jobs are
+  // executed before it returns. Note that after shutdown, the thread pool
+  // does not accept further jobs.
+  ~ThreadPool() override;
+
+  // Adds the specified "closure" to the queue for processing. If worker
+  // threads are available, "closure" will run immediately. Otherwise
+  // "closure" is queued for later execution.
+  //
+  // NOTE: If the internal queue is full and cannot be resized because of an
+  // out-of-memory error, the current thread runs "closure" before returning
+  // from Schedule(). For our use cases, this seems better than the
+  // alternatives:
+  //   1. Return a failure status.
+  //   2. Have the current thread wait until the queue is not full.
+  void Schedule(std::function<void()> closure) override;
+
+  int num_threads() const;
+
+ private:
+  class WorkerThread;
+
+  // Creates the thread pool with the specified number of worker threads.
+  // If num_threads is 1, the closures are run in FIFO order.
+  ThreadPool(const char name_prefix[],
+             std::unique_ptr<WorkerThread*[]> threads, int num_threads);
+
+  // Starts the worker pool.
+  LIBGAV1_MUST_USE_RESULT bool StartWorkers();
+
+  void WorkerFunction();
+
+  // Shuts down the thread pool, i.e. worker threads finish their work and
+  // pick up new jobs until the queue is empty. This call will block until
+  // the shutdown is complete.
+  //
+  // Note: If a worker encounters an empty queue after this call, it will
+  // exit. Other workers might still be running, and if the queue fills up
+  // again, the thread pool will continue to operate with a decreased number
+  // of workers. It is up to the caller to prevent adding new jobs.
+  void Shutdown();
+
+#if LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  void LockMutex() { queue_mutex_.lock(); }
+  void UnlockMutex() { queue_mutex_.unlock(); }
+
+  void Wait() {
+    std::unique_lock<std::mutex> queue_lock(queue_mutex_, std::adopt_lock);
+    condition_.wait(queue_lock);
+    queue_lock.release();
+  }
+
+  void SignalOne() { condition_.notify_one(); }
+  void SignalAll() { condition_.notify_all(); }
+
+  std::condition_variable condition_;
+  std::mutex queue_mutex_;
+
+#else  // !LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); }
+  void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); }
+  void Wait() { condition_.Wait(&queue_mutex_); }
+  void SignalOne() { condition_.Signal(); }
+  void SignalAll() { condition_.SignalAll(); }
+
+  absl::CondVar condition_;
+  absl::Mutex queue_mutex_;
+
+#endif  // LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+  UnboundedQueue<std::function<void()>> queue_ LIBGAV1_GUARDED_BY(queue_mutex_);
+  // If not all the worker threads are created, the first entry after the
+  // created worker threads is a null pointer.
+  const std::unique_ptr<WorkerThread*[]> threads_;
+
+  bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false;
+  const int num_threads_ = 0;
+  // name_prefix_ is a C string, whose length is restricted to 16 characters,
+  // including the terminating null byte ('\0').
+  // This restriction comes from the Linux pthread_setname_np() function.
+  char name_prefix_[16];
+};
+
+}  // namespace libgav1
+
+#undef LIBGAV1_THREADPOOL_USE_STD_MUTEX
+
+#endif  // LIBGAV1_SRC_UTILS_THREADPOOL_H_
diff --git a/src/utils/types.h b/src/utils/types.h
new file mode 100644
index 0000000..374f06b
--- /dev/null
+++ b/src/utils/types.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+struct MotionVector : public Allocable {
+  static constexpr int kRow = 0;
+  static constexpr int kColumn = 1;
+
+  MotionVector() = default;
+  MotionVector(const MotionVector& mv) = default;
+
+  MotionVector& operator=(const MotionVector& rhs) {
+    mv32 = rhs.mv32;
+    return *this;
+  }
+
+  bool operator==(const MotionVector& rhs) const { return mv32 == rhs.mv32; }
+
+  union {
+    // Motion vectors will always fit in int16_t and using int16_t here
+    // instead of int saves significant memory since some of the frame sized
+    // structures store motion vectors.
+    int16_t mv[2];
+    // A uint32_t view into the |mv| array. Useful for cases where both the
+    // motion vectors have to be copied or compared with a single 32 bit
+    // instruction.
+    uint32_t mv32;
+  };
+};
+
+union CompoundMotionVector {
+  CompoundMotionVector() = default;
+  CompoundMotionVector(const CompoundMotionVector& mv) = default;
+
+  CompoundMotionVector& operator=(const CompoundMotionVector& rhs) {
+    mv64 = rhs.mv64;
+    return *this;
+  }
+
+  bool operator==(const CompoundMotionVector& rhs) const {
+    return mv64 == rhs.mv64;
+  }
+
+  MotionVector mv[2];
+  // A uint64_t view into the |mv| array. Useful for cases where all the
+  // motion vectors have to be copied or compared with a single 64 bit
+  // instruction.
+  uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+  Array2D<MotionVector> mv;
+  Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode portions of an inter block
+// mode info to set the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+  int zero_mv;
+  int reference_mv;
+  int new_mv;
+};
+
+struct PaletteModeInfo {
+  uint8_t size[kNumPlaneTypes];
+  uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
+
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the
+// prediction is computed. The information in this struct is associated with a
+// single block.
+// While both BlockParameters and PredictionParameters store information
+// pertaining to a Block, the only difference is that BlockParameters outlives
+// the block itself (for example, some of the variables in BlockParameters are
+// used to compute the context for reading elements in the subsequent blocks).
+struct PredictionParameters : public Allocable {
+  // Restore the index in the unsorted mv stack from the least 3 bits of
+  // sorted |weight_index_stack|.
+  const MotionVector& reference_mv(int stack_index) const {
+    return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+  }
+  const MotionVector& reference_mv(int stack_index, int mv_index) const {
+    return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+        .mv[mv_index];
+  }
+
+  void IncreaseWeight(ptrdiff_t index, int weight) {
+    weight_index_stack[index] += weight << 3;
+  }
+
+  void SetWeightIndexStackEntry(int index, int weight) {
+    weight_index_stack[index] = (weight << 3) + 7 - index;
+  }
+
+  bool use_filter_intra;
+  FilterIntraPredictor filter_intra_mode;
+  int angle_delta[kNumPlaneTypes];
+  int8_t cfl_alpha_u;
+  int8_t cfl_alpha_v;
+  int max_luma_width;
+  int max_luma_height;
+  Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+  bool use_intra_block_copy;
+  InterIntraMode inter_intra_mode;
+  bool is_wedge_inter_intra;
+  int wedge_index;
+  int wedge_sign;
+  bool mask_is_inverse;
+  MotionMode motion_mode;
+  CompoundPredictionType compound_prediction_type;
+  union {
+    // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+    // construction. reference_mv() must be called to get the correct element.
+    MotionVector ref_mv_stack[kMaxRefMvStackSize];
+    CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+  };
+  // The least 3 bits of |weight_index_stack| store the index information, and
+  // the other bits store the weight. The index information is actually
+  // 7 - index to make the descending order sort stable (preserves the
+  // original order for elements with the same weight). Sorting an int16_t
+  // array is much faster than sorting a struct array with weight and index
+  // stored separately.
+  int16_t weight_index_stack[kMaxRefMvStackSize];
+  // In the spec, the weights of all the nearest mvs are incremented by a
+  // bonus weight which is larger than any natural weight, and later the
+  // weights of the mvs are compared with this bonus weight to determine their
+  // contexts. We replace this procedure by introducing |nearest_mv_count|,
+  // which records the count of the nearest mvs. Since all the nearest mvs are
+  // in the beginning of the mv stack, the index of a mv in the mv stack can
+  // be compared with |nearest_mv_count| to get that mv's context.
+  int nearest_mv_count;
+  int ref_mv_count;
+  int ref_mv_index;
+  MotionVector global_mv[2];
+  int num_warp_samples;
+  int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+};
+
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+  BlockSize size;
+  bool skip;
+  // True means that this block will use some default settings (that
+  // correspond to compound prediction) and so most of the mode info is
+  // skipped. False means that the mode info is not skipped.
+  bool skip_mode;
+  bool is_inter;
+  bool is_explicit_compound_type;  // comp_group_idx in the spec.
+  bool is_compound_type_average;   // compound_idx in the spec.
+  bool is_global_mv_block;
+  bool use_predicted_segment_id;  // only valid with temporal update enabled.
+  int8_t segment_id;              // segment_id is in the range [0, 7].
+  PredictionMode y_mode;
+  PredictionMode uv_mode;
+  TransformSize transform_size;
+  TransformSize uv_transform_size;
+  InterpolationFilter interpolation_filter[2];
+  ReferenceFrameType reference_frame[2];
+  // The index of this array is as follows:
+  //  0 - Y plane vertical filtering.
+  //  1 - Y plane horizontal filtering.
+  //  2 - U plane (both directions).
+  //  3 - V plane (both directions).
+  uint8_t deblock_filter_level[kFrameLfCount];
+  CompoundMotionVector mv;
+  PaletteModeInfo palette_mode_info;
+  // When |Tile::split_parse_and_decode_| is true, each block gets its own
+  // instance of |prediction_parameters|. When it is false, all the blocks
+  // point to |Tile::prediction_parameters_|. This field is valid only as long
+  // as the block is *being* decoded. The lifetime and usage of this field can
+  // be better understood by following its flow in tile.cc.
+  std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// A five dimensional array used to store the wedge masks. The dimensions are:
+//  - block_size_index (returned by GetWedgeBlockSizeIndex() in
+//    prediction.cc).
+//  - flip_sign (0 or 1).
+//  - wedge_index (0 to 15).
+//  - each of those three dimensions is a 2d array of block_width by
+//    block_height.
+using WedgeMaskArray =
+    std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
+
+enum GlobalMotionTransformationType : uint8_t {
+  kGlobalMotionTransformationTypeIdentity,
+  kGlobalMotionTransformationTypeTranslation,
+  kGlobalMotionTransformationTypeRotZoom,
+  kGlobalMotionTransformationTypeAffine,
+  kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and
+// locally adaptive warped motion compensation in video compression", Proc.
+// IEEE International Conference on Image Processing (ICIP), pp. 275-279,
+// Sep. 2017.
+struct GlobalMotion {
+  GlobalMotionTransformationType type;
+  int32_t params[6];
+
+  // Represent two shearing operations. Computed from |params| by
+  // SetupShear().
+  //
+  // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+  // (This means alpha, beta, gamma, and delta could be represented by a
+  // 10-bit signed integer.) The minimum value is INT16_MIN (= -32768) and the
+  // maximum value is 32704 = 0x7fc0, the largest int16_t value whose least
+  // significant six bits are all zeros.
+  //
+  // Valid warp parameters (as validated by SetupShear()) have smaller ranges.
+  // Their absolute values are less than 2^14 (= 16384). (This follows from
+  // the warpValid check at the end of Section 7.11.3.6.)
+  //
+  // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768, which
+  // is outside the range of int16_t. When cast to int16_t, 32768 becomes
+  // -32768. This potential int16_t overflow does not matter because either
+  // 32768 or -32768 causes SetupShear() to return false.
+  int16_t alpha;
+  int16_t beta;
+  int16_t gamma;
+  int16_t delta;
+};
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes.
+// The loop filter process uses |ref_deltas| and |mode_deltas| only when
+// |delta_enabled| is true.
+struct LoopFilter {
+  // Contains loop filter strength values in the range of [0, 63].
+  std::array<int8_t, kFrameLfCount> level;
+  // Indicates the sharpness level in the range of [0, 7].
+  int8_t sharpness;
+  // Whether the filter level depends on the mode and reference frame used to
+  // predict a block.
+  bool delta_enabled;
+  // Whether additional syntax elements were read that specify which mode and
+  // reference frame deltas are to be updated. loop_filter_delta_update field
+  // in Section 5.9.11 of the spec.
+  bool delta_update;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // reference frame, in the range of [-64, 63].
+  std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // mode, in the range of [-64, 63].
+  std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
+struct Delta {
+  bool present;
+  uint8_t scale;
+  bool multi;
+};
+
+struct Cdef {
+  uint8_t damping;  // damping value from the spec + (bitdepth - 8).
+  uint8_t bits;
+  // All the strength values are the values from the spec and left shifted by
+  // (bitdepth - 8).
+  uint8_t y_primary_strength[kMaxCdefStrengths];
+  uint8_t y_secondary_strength[kMaxCdefStrengths];
+  uint8_t uv_primary_strength[kMaxCdefStrengths];
+  uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+  bool uniform_spacing;
+  int sb_rows;
+  int sb_columns;
+  int tile_count;
+  int tile_columns_log2;
+  int tile_columns;
+  int tile_column_start[kMaxTileColumns + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+  int tile_rows_log2;
+  int tile_rows;
+  int tile_row_start[kMaxTileRows + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_row_height_in_superblocks[kMaxTileRows + 1];
+  int16_t context_update_id;
+  uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+  LoopRestorationType type[kMaxPlanes];
+  int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+  // base_index is in the range [0, 255].
+  uint8_t base_index;
+  int8_t delta_dc[kMaxPlanes];
+  // delta_ac[kPlaneY] is always 0.
+  int8_t delta_ac[kMaxPlanes];
+  bool use_matrix;
+  // The |matrix_level| array is used only when |use_matrix| is true.
+  // matrix_level[plane] specifies the level in the quantizer matrix that
+  // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+  // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+  // matrix_level[plane] is 15, the quantizer matrix is not used.
+  int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+  kSegmentFeatureQuantizer,
+  kSegmentFeatureLoopFilterYVertical,
+  kSegmentFeatureLoopFilterYHorizontal,
+  kSegmentFeatureLoopFilterU,
+  kSegmentFeatureLoopFilterV,
+  kSegmentFeatureReferenceFrame,
+  kSegmentFeatureSkip,
+  kSegmentFeatureGlobalMv,
+  kSegmentFeatureMax
+};
+
+struct Segmentation {
+  // 5.11.14.
+  // Returns true if the feature is enabled in the segment.
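+  //
+  // An illustrative note (editorial addition, not in the original header):
+  // FeatureActive(2, kSegmentFeatureQuantizer) reports whether segment 2
+  // overrides the quantizer, i.e. whether segmentation is enabled, the
+  // segment id is valid, and feature_enabled[2][kSegmentFeatureQuantizer] is
+  // set.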
+  bool FeatureActive(int segment_id, SegmentFeature feature) const {
+    return enabled && segment_id < kMaxSegments &&
+           feature_enabled[segment_id][feature];
+  }
+
+  // Returns true if the feature is signed.
+  static bool FeatureSigned(SegmentFeature feature) {
+    // Only the first five segment features are signed, so this comparison
+    // suffices.
+    return feature <= kSegmentFeatureLoopFilterV;
+  }
+
+  bool enabled;
+  bool update_map;
+  bool update_data;
+  bool temporal_update;
+  // True if the segment id will be read before the skip syntax element.
+  // False if the skip syntax element will be read first.
+  bool segment_id_pre_skip;
+  // The highest numbered segment id that has some enabled feature. Used as
+  // the upper bound for decoding segment ids.
+  int8_t last_active_segment_id;
+
+  bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+  int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+  bool lossless[kMaxSegments];
+  // Cached values of get_qindex(1, segmentId), to be consumed by
+  // Tile::ReadTransformType(). The values are in the range [0, 255].
+  uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In the spec, the film grain section uses YCbCr to denote variable
+// names, such as num_cb_points, num_cr_points. To keep it consistent with
+// other parts of the code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+  bool apply_grain;
+  bool update_grain;
+  bool chroma_scaling_from_luma;
+  bool overlap_flag;
+  bool clip_to_restricted_range;
+
+  uint8_t num_y_points;  // [0, 14].
+  uint8_t num_u_points;  // [0, 10].
+  uint8_t num_v_points;  // [0, 10].
+  // Must be [0, 255]. 10/12 bit /= 4 or 16. Must be in increasing order.
+  uint8_t point_y_value[14];
+  uint8_t point_y_scaling[14];
+  uint8_t point_u_value[10];
+  uint8_t point_u_scaling[10];
+  uint8_t point_v_value[10];
+  uint8_t point_v_scaling[10];
+
+  uint8_t chroma_scaling;              // [8, 11].
+  uint8_t auto_regression_coeff_lag;   // [0, 3].
+  int8_t auto_regression_coeff_y[24];  // [-128, 127]
+  int8_t auto_regression_coeff_u[25];  // [-128, 127]
+  int8_t auto_regression_coeff_v[25];  // [-128, 127]
+  // Shift value: auto regression coeffs range
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  uint8_t auto_regression_shift;
+
+  uint16_t grain_seed;
+  int reference_index;
+  int grain_scale_shift;
+  // These multipliers are encoded as nonnegative values by adding 128 first.
+  // The 128 is subtracted during parsing.
+  int8_t u_multiplier;       // [-128, 127]
+  int8_t u_luma_multiplier;  // [-128, 127]
+  // These offsets are encoded as nonnegative values by adding 256 first. The
+  // 256 is subtracted during parsing.
+  int16_t u_offset;          // [-256, 255]
+  int8_t v_multiplier;       // [-128, 127]
+  int8_t v_luma_multiplier;  // [-128, 127]
+  int16_t v_offset;          // [-256, 255]
+};
+
+struct ObuFrameHeader {
+  uint16_t display_frame_id;
+  uint16_t current_frame_id;
+  int64_t frame_offset;
+  uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+  int32_t width;
+  int32_t height;
+  int32_t columns4x4;
+  int32_t rows4x4;
+  // The render size (render_width and render_height) is a hint to the
+  // application about the desired display size. It has no effect on the
+  // decoding process.
+  int32_t render_width;
+  int32_t render_height;
+  int32_t upscaled_width;
+  LoopRestoration loop_restoration;
+  uint32_t buffer_removal_time[kMaxOperatingPoints];
+  uint32_t frame_presentation_time;
+  // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+  TileInfo tile_info;
+  QuantizerParameters quantizer;
+  Segmentation segmentation;
+  bool show_existing_frame;
+  // frame_to_show is in the range [0, 7]. Only used if show_existing_frame
+  // is true.
+  int8_t frame_to_show;
+  FrameType frame_type;
+  bool show_frame;
+  bool showable_frame;
+  bool error_resilient_mode;
+  bool enable_cdf_update;
+  bool frame_size_override_flag;
+  // The order_hint syntax element in the uncompressed header. If
+  // show_existing_frame is false, the OrderHint variable in the spec is equal
+  // to this field, and so this field can be used in place of OrderHint when
+  // show_existing_frame is known to be false, such as during tile decoding.
+  uint8_t order_hint;
+  int8_t primary_reference_frame;
+  bool render_and_frame_size_different;
+  bool use_superres;
+  uint8_t superres_scale_denominator;
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool frame_refs_short_signaling;
+  // A bitmask that specifies which reference frame slots will be updated
+  // with the current frame after it is decoded.
+  uint8_t refresh_frame_flags;
+  static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+                    kNumReferenceFrameTypes,
+                "");
+  bool found_reference;
+  int8_t force_integer_mv;
+  bool allow_high_precision_mv;
+  InterpolationFilter interpolation_filter;
+  bool is_motion_mode_switchable;
+  bool use_ref_frame_mvs;
+  bool enable_frame_end_update_cdf;
+  // True if all segments are losslessly encoded at the coded resolution.
+  bool coded_lossless;
+  // True if all segments are losslessly encoded at the upscaled resolution.
+  bool upscaled_lossless;
+  TxMode tx_mode;
+  // True means that the mode info for inter blocks contains the syntax
+  // element comp_mode that indicates whether to use single or compound
+  // prediction. False means that all inter blocks will use single prediction.
+  bool reference_mode_select;
+  // The frames to use for compound prediction when skip_mode is true.
+  ReferenceFrameType skip_mode_frame[2];
+  bool skip_mode_present;
+  bool reduced_tx_set;
+  bool allow_warped_motion;
+  Delta delta_q;
+  Delta delta_lf;
+  // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+  // indicates an invalid value.
+  int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+  // The ref_order_hint[ i ] syntax element in the uncompressed header.
+  // Specifies the expected output order hint for each reference frame.
+  uint8_t reference_order_hint[kNumReferenceFrameTypes];
+  LoopFilter loop_filter;
+  Cdef cdef;
+  FilmGrainParams film_grain_params;
+};
+
+}  // namespace libgav1
+#endif  // LIBGAV1_SRC_UTILS_TYPES_H_
diff --git a/src/utils/unbounded_queue.h b/src/utils/unbounded_queue.h
new file mode 100644
index 0000000..fa0d303
--- /dev/null
+++ b/src/utils/unbounded_queue.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue of an unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in
+// https://algs4.cs.princeton.edu/code/ and base::circular_deque in Chromium's
+// base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+  UnboundedQueue() = default;
+
+  // Move only.
+  UnboundedQueue(UnboundedQueue&& other)
+      : first_block_(other.first_block_),
+        front_(other.front_),
+        last_block_(other.last_block_),
+        back_(other.back_) {
+    other.first_block_ = nullptr;
+    other.front_ = 0;
+    other.last_block_ = nullptr;
+    other.back_ = 0;
+  }
+  UnboundedQueue& operator=(UnboundedQueue&& other) {
+    if (this != &other) {
+      Destroy();
+      first_block_ = other.first_block_;
+      front_ = other.front_;
+      last_block_ = other.last_block_;
+      back_ = other.back_;
+      other.first_block_ = nullptr;
+      other.front_ = 0;
+      other.last_block_ = nullptr;
+      other.back_ = 0;
+    }
+    return *this;
+  }
+
+  ~UnboundedQueue() { Destroy(); }
+
+  // Allocates two Blocks upfront because most access patterns require at
+  // least two Blocks. Returns false if the allocation of the Blocks failed.
+  LIBGAV1_MUST_USE_RESULT bool Init() {
+    std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+    std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+    if (new_block0 == nullptr || new_block1 == nullptr) return false;
+    first_block_ = last_block_ = new_block0.release();
+    new_block1->next = first_block_;
+    last_block_->next = new_block1.release();
+    return true;
+  }
+
+  // Checks if the queue has room for a new element. If the queue is full,
+  // tries to grow it. Returns false if the queue is full and the attempt to
+  // grow it failed.
+  //
+  // NOTE: GrowIfNeeded() must be called before each call to Push(). This
+  // inconvenient design is necessary to guarantee a successful Push() call.
+  //
+  // Push(T&& value) is often called with the argument std::move(value). The
+  // moved-from object |value| won't be usable afterwards, so it would be
+  // problematic if Push(T&& value) failed and we lost access to the original
+  // |value| object.
+  LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+    assert(last_block_ != nullptr);
+    if (back_ == kBlockCapacity) {
+      if (last_block_->next == first_block_) {
+        // All Blocks are in use.
+        std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+        if (new_block == nullptr) return false;
+        new_block->next = first_block_;
+        last_block_->next = new_block.release();
+      }
+      last_block_ = last_block_->next;
+      back_ = 0;
+    }
+    return true;
+  }
+
+  // Pushes the element |value| to the end of the queue. It is an error to
+  // call Push() when the queue is full.
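+  //
+  // A minimal usage sketch (editorial addition, not in the original header):
+  // pair every Push() with a preceding successful GrowIfNeeded() call.
+  //
+  //   UnboundedQueue<int> queue;
+  //   if (queue.Init()) {
+  //     for (int i = 0; i < 100; ++i) {
+  //       if (!queue.GrowIfNeeded()) break;  // Out of memory.
+  //       queue.Push(i);
+  //     }
+  //     while (!queue.Empty()) {
+  //       const int value = queue.Front();
+  //       static_cast<void>(value);  // Consume |value| here.
+  //       queue.Pop();
+  //     }
+  //   }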
+  void Push(const T& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(value);
+  }
+
+  void Push(T&& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(std::move(value));
+  }
+
+  // Returns the element at the front of the queue. It is an error to call
+  // Front() when the queue is empty.
+  T& Front() {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  const T& Front() const {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  // Removes the element at the front of the queue from the queue. It is an
+  // error to call Pop() when the queue is empty.
+  void Pop() {
+    assert(!Empty());
+    T* elements = reinterpret_cast<T*>(first_block_->buffer);
+    elements[front_++].~T();
+    if (front_ == kBlockCapacity) {
+      // The first block has become empty.
+      front_ = 0;
+      if (first_block_ == last_block_) {
+        // Only one Block is in use. Simply reset back_.
+        back_ = 0;
+      } else {
+        first_block_ = first_block_->next;
+      }
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+  // kBlockCapacity is the maximum number of elements each Block can hold.
+  // sizeof(void*) is subtracted from 2048 to account for the |next| pointer
+  // in the Block struct.
+  //
+  // In Linux x86_64, sizeof(std::function<void()>) is 32, so each Block can
+  // hold 63 std::function<void()> objects.
+  //
+  // NOTE: The corresponding value in libc++ revision
+  // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+  //   template <class _ValueType, class _DiffType>
+  //   struct __deque_block_size {
+  //     static const _DiffType value =
+  //         sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+  //   };
+  //
+  // Note that 4096 / 256 = 16, so apparently this expression is intended to
+  // ensure the block size is at least 4096 bytes and each block can hold at
+  // least 16 elements.
+  static constexpr size_t kBlockCapacity =
+      (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
+
+  struct Block : public Allocable {
+    alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+    Block* next;
+  };
+
+  void Destroy() {
+    if (first_block_ == nullptr) return;  // An uninitialized queue.
+
+    // First free the unused blocks, which are located after last_block_ and
+    // before first_block_.
+    Block* block = last_block_->next;
+    // Cut the circular list open after last_block_.
+    last_block_->next = nullptr;
+    while (block != first_block_) {
+      Block* next = block->next;
+      delete block;
+      block = next;
+    }
+
+    // Then free the used blocks. Destruct the elements in the used blocks.
+    while (block != nullptr) {
+      const size_t begin = (block == first_block_) ? front_ : 0;
+      const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+      T* elements = reinterpret_cast<T*>(block->buffer);
+      for (size_t i = begin; i < end; ++i) {
+        elements[i].~T();
+      }
+      Block* next = block->next;
+      delete block;
+      block = next;
+    }
+  }
+
+  // Blocks are chained in a circular singly-linked list. If the list of
+  // Blocks is empty, both first_block_ and last_block_ are null pointers. If
+  // the list is nonempty, first_block_ points to the first used Block and
+  // last_block_ points to the last used Block.
+  //
+  // Invariant: If Init() is called and succeeds, the queue is always
+  // nonempty.
+  // This allows all methods (except the destructor) to avoid null pointer
+  // checks for first_block_ and last_block_.
+  Block* first_block_ = nullptr;
+  // The index of the element in first_block_ to be removed by Pop().
+  size_t front_ = 0;
+  Block* last_block_ = nullptr;
+  // The index in last_block_ where the new element is inserted by Push().
+  size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
diff --git a/src/utils/vector.h b/src/utils/vector.h
new file mode 100644
index 0000000..e211240
--- /dev/null
+++ b/src/utils/vector.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+  if (value == 0) return 0;
+  --value;
+  for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+  return value + 1;
+}
+
+// Returns the smallest capacity greater or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+  if (value == 0) return 0;
+  if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+  return NextPow2(value);
+}
+
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector but returning false and to its
+// last valid state on memory allocation failure.
+// std::vector with a custom allocator does not fill this need without
+// exceptions.
+
+template <typename T>
+class VectorBase {
+ public:
+  using iterator = T*;
+  using const_iterator = const T*;
+
+  VectorBase() noexcept = default;
+  // Move only.
+  VectorBase(const VectorBase&) = delete;
+  VectorBase& operator=(const VectorBase&) = delete;
+  VectorBase(VectorBase&& other) noexcept
+      : items_(other.items_),
+        capacity_(other.capacity_),
+        num_items_(other.num_items_) {
+    other.items_ = nullptr;
+    other.capacity_ = 0;
+    other.num_items_ = 0;
+  }
+  VectorBase& operator=(VectorBase&& other) noexcept {
+    if (this != &other) {
+      clear();
+      free(items_);
+      items_ = other.items_;
+      capacity_ = other.capacity_;
+      num_items_ = other.num_items_;
+      other.items_ = nullptr;
+      other.capacity_ = 0;
+      other.num_items_ = 0;
+    }
+    return *this;
+  }
+  ~VectorBase() {
+    clear();
+    free(items_);
+  }
+
+  // Reallocates just enough memory if needed so that 'new_cap' items can fit.
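+  //
+  // A hypothetical usage sketch (editorial addition, not in the original
+  // header): reserve() once, then use the unchecked push_back variants
+  // declared further below.
+  //
+  //   Vector<int> v;
+  //   if (v.reserve(3)) {
+  //     for (int i = 0; i < 3; ++i) v.push_back_unchecked(i);
+  //   }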
+  LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+    if (capacity_ < new_cap) {
+      T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+      if (new_items == nullptr) return false;
+      if (num_items_ > 0) {
+        if (std::is_trivial<T>::value) {
+          // Cast |new_items| and |items_| to void* to avoid the GCC
+          // -Wclass-memaccess warning and additionally the
+          // bugprone-undefined-memory-manipulation clang-tidy warning. The
+          // memcpy is safe because T is a trivial type.
+          memcpy(static_cast<void*>(new_items),
+                 static_cast<const void*>(items_), num_items_ * sizeof(T));
+        } else {
+          for (size_t i = 0; i < num_items_; ++i) {
+            new (&new_items[i]) T(std::move(items_[i]));
+            items_[i].~T();
+          }
+        }
+      }
+      free(items_);
+      items_ = new_items;
+      capacity_ = new_cap;
+    }
+    return true;
+  }
+
+  // Reallocates less memory so that only the existing items can fit.
+  bool shrink_to_fit() {
+    if (capacity_ == num_items_) return true;
+    if (num_items_ == 0) {
+      free(items_);
+      items_ = nullptr;
+      capacity_ = 0;
+      return true;
+    }
+    const size_t previous_capacity = capacity_;
+    capacity_ = 0;  // Force reserve() to allocate and copy.
+    if (reserve(num_items_)) return true;
+    capacity_ = previous_capacity;
+    return false;
+  }
+
+  // Constructs a new item by copy constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by copy constructor. reserve() must have been
+  // called with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(const T& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+  }
+
+  // Constructs a new item by move constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by move constructor. reserve() must have been
+  // called with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(T&& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+  }
+
+  // Constructs a new item in place by forwarding the arguments args... to
+  // the constructor. May reallocate.
+  template <typename... Args>
+  LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+    if (num_items_ >= capacity_ &&
+        !reserve(internal::NextCapacity(num_items_ + 1))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::forward<Args>(args)...);
+    ++num_items_;
+    return true;
+  }
+
+  // Destructs the last item.
+  void pop_back() {
+    --num_items_;
+    items_[num_items_].~T();
+  }
+
+  // Destructs the item at 'pos'.
+  void erase(iterator pos) { erase(pos, pos + 1); }
+
+  // Destructs the items in [first,last).
+  void erase(iterator first, iterator last) {
+    for (iterator it = first; it != last; ++it) it->~T();
+    if (last != end()) {
+      if (std::is_trivial<T>::value) {
+        // Cast |first| and |last| to void* to avoid the GCC
+        // -Wclass-memaccess warning and additionally the
+        // bugprone-undefined-memory-manipulation clang-tidy warning.
+        // The memmove is safe because T is a trivial type.
+        memmove(static_cast<void*>(first), static_cast<const void*>(last),
+                (end() - last) * sizeof(T));
+      } else {
+        for (iterator it_src = last, it_dst = first; it_src != end();
+             ++it_src, ++it_dst) {
+          new (it_dst) T(std::move(*it_src));
+          it_src->~T();
+        }
+      }
+    }
+    num_items_ -= std::distance(first, last);
+  }
+
+  // Destructs all the items.
+  void clear() { erase(begin(), end()); }
+
+  // Destroys (including deallocating) all the items.
+  void reset() {
+    clear();
+    if (!shrink_to_fit()) assert(false);
+  }
+
+  // Accessors
+  bool empty() const { return (num_items_ == 0); }
+  size_t size() const { return num_items_; }
+  size_t capacity() const { return capacity_; }
+
+  T* data() { return items_; }
+  T& front() { return items_[0]; }
+  T& back() { return items_[num_items_ - 1]; }
+  T& operator[](size_t i) { return items_[i]; }
+  T& at(size_t i) { return items_[i]; }
+  const T* data() const { return items_; }
+  const T& front() const { return items_[0]; }
+  const T& back() const { return items_[num_items_ - 1]; }
+  const T& operator[](size_t i) const { return items_[i]; }
+  const T& at(size_t i) const { return items_[i]; }
+
+  iterator begin() { return &items_[0]; }
+  const_iterator begin() const { return &items_[0]; }
+  iterator end() { return &items_[num_items_]; }
+  const_iterator end() const { return &items_[num_items_]; }
+
+  void swap(VectorBase& b) {
+    // Although not necessary here, adding "using std::swap;" and then
+    // calling swap() without namespace qualification is recommended. See
+    // Effective C++, Item 25.
+    using std::swap;
+    swap(items_, b.items_);
+    swap(capacity_, b.capacity_);
+    swap(num_items_, b.num_items_);
+  }
+
+ protected:
+  T* items_ = nullptr;
+  size_t capacity_ = 0;
+  size_t num_items_ = 0;
+};
+
+}  // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved to plain old data.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+  // Creates or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows every power-of-two items.
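+  //
+  // A hypothetical sketch (editorial addition, not in the original header):
+  // on success the new items are uninitialized, so the caller fills them in.
+  //
+  //   VectorNoCtor<int> v;
+  //   if (v.resize(4)) {
+  //     for (size_t i = 0; i < v.size(); ++i) v[i] = 0;
+  //   }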
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      super::num_items_ = new_num_items;
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+  // Constructs or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows every power-of-two items.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      while (super::num_items_ < new_num_items) {
+        new (&super::items_[super::num_items_]) T();
+        ++super::num_items_;
+      }
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+  a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+  a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_VECTOR_H_
diff --git a/src/version.cc b/src/version.cc
new file mode 100644
index 0000000..8d1e5a9
--- /dev/null
+++ b/src/version.cc
@@ -0,0 +1,39 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/version.h"
+
+#define LIBGAV1_TOSTRING(x) #x
+#define LIBGAV1_STRINGIFY(x) LIBGAV1_TOSTRING(x)
+#define LIBGAV1_DOT_SEPARATED(M, m, p) M##.##m##.##p
+#define LIBGAV1_DOT_SEPARATED_VERSION(M, m, p) LIBGAV1_DOT_SEPARATED(M, m, p)
+#define LIBGAV1_DOT_VERSION \
+  LIBGAV1_DOT_SEPARATED_VERSION(LIBGAV1_MAJOR_VERSION, LIBGAV1_MINOR_VERSION, \
+                                LIBGAV1_PATCH_VERSION)
+
+#define LIBGAV1_VERSION_STRING LIBGAV1_STRINGIFY(LIBGAV1_DOT_VERSION)
+
+extern "C" {
+
+int Libgav1GetVersion() { return LIBGAV1_VERSION; }
+const char* Libgav1GetVersionString() { return LIBGAV1_VERSION_STRING; }
+
+const char* Libgav1GetBuildConfiguration() {
+  // TODO(jzern): cmake can generate the detail or in other cases we could
+  // produce one based on the known defines along with the defaults based on
+  // the toolchain, e.g., LIBGAV1_ENABLE_NEON from cpu.h.
+  return "Not available.";
+}
+
+}  // extern "C"
diff --git a/src/warp_prediction.cc b/src/warp_prediction.cc
new file mode 100644
index 0000000..dd06317
--- /dev/null
+++ b/src/warp_prediction.cc
@@ -0,0 +1,244 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
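+
+// Background (editorial note, not in the original file; a sketch derived from
+// the SetupShear() code below, under the assumption that params[0..5] encode
+// an affine model in kWarpedModelPrecisionBits fixed-point precision): the
+// model maps a source position (x, y) roughly to
+//   x' = (params[2] * x + params[3] * y + params[0]) / (1 << kWarpedModelPrecisionBits)
+//   y' = (params[4] * x + params[5] * y + params[1]) / (1 << kWarpedModelPrecisionBits)
+// SetupShear() decomposes this matrix into horizontal and vertical shears
+// parameterized by alpha, beta, gamma, delta, approximately:
+//   alpha = params[2] - (1 << kWarpedModelPrecisionBits)
+//   beta  = params[3]
+//   gamma = (params[4] << kWarpedModelPrecisionBits) / params[2]
+//   delta = params[5] - (params[3] * params[4]) / params[2]
+//           - (1 << kWarpedModelPrecisionBits)
+// where the divisions are carried out with the approximate divisor table.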
+ +#include "src/warp_prediction.h" + +#include +#include +#include + +#include "src/tile.h" +#include "src/utils/block_parameters_holder.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace { + +constexpr int kWarpModelTranslationClamp = 1 << 23; +constexpr int kWarpModelAffineClamp = 1 << 13; +constexpr int kLargestMotionVectorDiff = 256; + +constexpr uint16_t kDivisorLookup[257] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192}; + +// Number of fractional bits of lookup in divisor lookup table. +constexpr int kDivisorLookupBits = 8; +// Number of fractional bits of entries in divisor lookup table. +constexpr int kDivisorLookupPrecisionBits = 14; + +// 7.11.3.7. +template +void GenerateApproximateDivisor(T value, int16_t* division_factor, + int16_t* division_shift) { + const int n = FloorLog2(std::abs(value)); + const T e = std::abs(value) - (static_cast(1) << n); + const int entry = (n > kDivisorLookupBits) + ? RightShiftWithRounding(e, n - kDivisorLookupBits) + : static_cast(e << (kDivisorLookupBits - n)); + *division_shift = n + kDivisorLookupPrecisionBits; + *division_factor = + (value < 0) ? -kDivisorLookup[entry] : kDivisorLookup[entry]; +} + +// 7.11.3.8. +int LeastSquareProduct(int a, int b) { return ((a * b) >> 2) + a + b; } + +// 7.11.3.8. +int DiagonalClamp(int32_t value) { + return Clip3(value, + (1 << kWarpedModelPrecisionBits) - kWarpModelAffineClamp + 1, + (1 << kWarpedModelPrecisionBits) + kWarpModelAffineClamp - 1); +} + +// 7.11.3.8. 
+int NonDiagonalClamp(int32_t value) {
+  return Clip3(value, -kWarpModelAffineClamp + 1, kWarpModelAffineClamp - 1);
+}
+
+int16_t GetShearParameter(int value) {
+  return static_cast<int16_t>(
+      LeftShift(RightShiftWithRoundingSigned(
+                    Clip3(value, INT16_MIN, INT16_MAX), kWarpParamRoundingBits),
+                kWarpParamRoundingBits));
+}
+
+}  // namespace
+
+bool SetupShear(GlobalMotion* const warp_params) {
+  int16_t division_shift;
+  int16_t division_factor;
+  const auto* const params = warp_params->params;
+  GenerateApproximateDivisor(params[2], &division_factor, &division_shift);
+  const int alpha = params[2] - (1 << kWarpedModelPrecisionBits);
+  const int beta = params[3];
+  const int64_t v = LeftShift(params[4], kWarpedModelPrecisionBits);
+  const int gamma =
+      RightShiftWithRoundingSigned(v * division_factor, division_shift);
+  const int64_t w = static_cast<int64_t>(params[3]) * params[4];
+  const int delta =
+      params[5] -
+      RightShiftWithRoundingSigned(w * division_factor, division_shift) -
+      (1 << kWarpedModelPrecisionBits);
+
+  warp_params->alpha = GetShearParameter(alpha);
+  warp_params->beta = GetShearParameter(beta);
+  warp_params->gamma = GetShearParameter(gamma);
+  warp_params->delta = GetShearParameter(delta);
+  if ((4 * std::abs(warp_params->alpha) + 7 * std::abs(warp_params->beta) >=
+       (1 << kWarpedModelPrecisionBits)) ||
+      (4 * std::abs(warp_params->gamma) + 4 * std::abs(warp_params->delta) >=
+       (1 << kWarpedModelPrecisionBits))) {
+    return false;  // NOLINT (easier condition to understand).
+  }
+
+  return true;
+}
+
+bool WarpEstimation(const int num_samples, const int block_width4x4,
+                    const int block_height4x4, const int row4x4,
+                    const int column4x4, const MotionVector& mv,
+                    const int candidates[kMaxLeastSquaresSamples][4],
+                    GlobalMotion* const warp_params) {
+  // |a| fits into int32_t. To avoid casts to int64_t in the following
+  // computation, we declare |a| as int64_t.
+  int64_t a[2][2] = {};
+  int bx[2] = {};
+  int by[2] = {};
+
+  // Note: for simplicity, the spec always uses absolute coordinates in the
+  // warp estimation process. subpixel_mid_x, subpixel_mid_y, and candidates
+  // are relative to the top left of the frame. In contrast, libaom uses a
+  // mixture of coordinate systems: in
+  // av1/common/warped_motion.c:find_affine_int(), the coordinates are
+  // relative to the top left of the block.
+  // mid_y/mid_x: the row/column coordinate of the center of the block.
+  const int mid_y = MultiplyBy4(row4x4) + MultiplyBy2(block_height4x4) - 1;
+  const int mid_x = MultiplyBy4(column4x4) + MultiplyBy2(block_width4x4) - 1;
+  const int subpixel_mid_y = MultiplyBy8(mid_y);
+  const int subpixel_mid_x = MultiplyBy8(mid_x);
+  const int reference_subpixel_mid_y =
+      subpixel_mid_y + mv.mv[MotionVector::kRow];
+  const int reference_subpixel_mid_x =
+      subpixel_mid_x + mv.mv[MotionVector::kColumn];
+
+  for (int i = 0; i < num_samples; ++i) {
+    // candidates[][0] and candidates[][1] are the row/column coordinates of
+    // the sample point in this block, relative to the top left of the frame.
+    // candidates[][2] and candidates[][3] are the row/column coordinates of
+    // the sample point in the reference block, relative to the top left of
+    // the frame.
+    // sy/sx: the row/column coordinates of the sample point, with the center
+    // of the block as origin.
+    const int sy = candidates[i][0] - subpixel_mid_y;
+    const int sx = candidates[i][1] - subpixel_mid_x;
+    // dy/dx: the row/column coordinates of the sample point in the reference
+    // block, with the center of the reference block as origin.
+    const int dy = candidates[i][2] - reference_subpixel_mid_y;
+    const int dx = candidates[i][3] - reference_subpixel_mid_x;
+    if (std::abs(sx - dx) < kLargestMotionVectorDiff &&
+        std::abs(sy - dy) < kLargestMotionVectorDiff) {
+      a[0][0] += LeastSquareProduct(sx, sx) + 8;
+      a[0][1] += LeastSquareProduct(sx, sy) + 4;
+      a[1][1] += LeastSquareProduct(sy, sy) + 8;
+      bx[0] += LeastSquareProduct(sx, dx) + 8;
+      bx[1] += LeastSquareProduct(sy, dx) + 4;
+      by[0] += LeastSquareProduct(sx, dy) + 4;
+      by[1] += LeastSquareProduct(sy, dy) + 8;
+    }
+  }
+
+  // a[0][1] == a[1][0], because the matrix is symmetric. We don't have to
+  // compute a[1][0].
+  const int64_t determinant = a[0][0] * a[1][1] - a[0][1] * a[0][1];
+  if (determinant == 0) return false;
+
+  int16_t division_shift;
+  int16_t division_factor;
+  GenerateApproximateDivisor(determinant, &division_factor, &division_shift);
+
+  division_shift -= kWarpedModelPrecisionBits;
+
+  const int64_t params_2 = a[1][1] * bx[0] - a[0][1] * bx[1];
+  const int64_t params_3 = -a[0][1] * bx[0] + a[0][0] * bx[1];
+  const int64_t params_4 = a[1][1] * by[0] - a[0][1] * by[1];
+  const int64_t params_5 = -a[0][1] * by[0] + a[0][0] * by[1];
+  auto* const params = warp_params->params;
+
+  if (division_shift <= 0) {
+    division_factor <<= -division_shift;
+    params[2] = static_cast<int32_t>(params_2) * division_factor;
+    params[3] = static_cast<int32_t>(params_3) * division_factor;
+    params[4] = static_cast<int32_t>(params_4) * division_factor;
+    params[5] = static_cast<int32_t>(params_5) * division_factor;
+  } else {
+    params[2] = RightShiftWithRoundingSigned(params_2 * division_factor,
+                                             division_shift);
+    params[3] = RightShiftWithRoundingSigned(params_3 * division_factor,
+                                             division_shift);
+    params[4] = RightShiftWithRoundingSigned(params_4 * division_factor,
+                                             division_shift);
+    params[5] = RightShiftWithRoundingSigned(params_5 * division_factor,
+                                             division_shift);
+  }
+
+  params[2] = DiagonalClamp(params[2]);
+  params[3] = NonDiagonalClamp(params[3]);
+  params[4] = NonDiagonalClamp(params[4]);
+  params[5] = DiagonalClamp(params[5]);
+
+  const int vx =
+      mv.mv[MotionVector::kColumn] * (1 << (kWarpedModelPrecisionBits - 3)) -
+      (mid_x * (params[2] - (1 << kWarpedModelPrecisionBits)) +
+       mid_y * params[3]);
+  const int vy =
+      mv.mv[MotionVector::kRow] * (1 << (kWarpedModelPrecisionBits - 3)) -
+      (mid_x * params[4] +
+       mid_y * (params[5] - (1 << kWarpedModelPrecisionBits)));
+  params[0] =
+      Clip3(vx, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+  params[1] =
+      Clip3(vy, -kWarpModelTranslationClamp, kWarpModelTranslationClamp - 1);
+
+  params[6] = 0;
+  params[7] = 0;
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/warp_prediction.h b/src/warp_prediction.h
new file mode 100644
index 0000000..6c86df3
--- /dev/null
+++ b/src/warp_prediction.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_WARP_PREDICTION_H_
+#define LIBGAV1_SRC_WARP_PREDICTION_H_
+
+#include "src/obu_parser.h"
+#include "src/utils/constants.h"
+#include "src/utils/types.h"
+
+namespace libgav1 {
+
+// Sets the alpha, beta, gamma, delta fields in warp_params using the
+// warp_params->params array as input (only array entries at indexes 2, 3, 4,
+// 5 are used). Returns whether alpha, beta, gamma, delta are valid.
+bool SetupShear(GlobalMotion* warp_params);  // 7.11.3.6.
+
+// Computes local warp parameters by performing a least squares fit.
+// Returns whether the computed parameters are valid.
+bool WarpEstimation(int num_samples, int block_width4x4, int block_height4x4,
+                    int row4x4, int column4x4, const MotionVector& mv,
+                    const int candidates[kMaxLeastSquaresSamples][4],
+                    GlobalMotion* warp_params);  // 7.11.3.8.
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_WARP_PREDICTION_H_
diff --git a/src/yuv_buffer.cc b/src/yuv_buffer.cc
new file mode 100644
index 0000000..c74e140
--- /dev/null
+++ b/src/yuv_buffer.cc
@@ -0,0 +1,201 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/yuv_buffer.h"
+
+#include <cassert>
+#include <cstddef>
+#include <new>
+
+#include "src/frame_buffer_utils.h"
+#include "src/utils/common.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+
+// Size conventions:
+// * Widths, heights, and border sizes are in pixels.
+// * Strides and plane sizes are in bytes.
+//
+// YuvBuffer objects may be reused through the BufferPool. Realloc() must
+// assume that data members (except buffer_alloc_ and buffer_alloc_size_) may
+// contain stale values from the previous use, and must set all data members
+// from scratch. In particular, Realloc() must not rely on the initial values
+// of data members set by the YuvBuffer constructor.
+bool YuvBuffer::Realloc(int bitdepth, bool is_monochrome, int width, int height,
+                        int8_t subsampling_x, int8_t subsampling_y,
+                        int left_border, int right_border, int top_border,
+                        int bottom_border,
+                        GetFrameBufferCallback get_frame_buffer,
+                        void* callback_private_data,
+                        void** buffer_private_data) {
+  // Only support allocating buffers that have borders that are a multiple of
+  // 2. The border restriction is required because we may subsample the
+  // borders in the chroma planes.
+  if (((left_border | right_border | top_border | bottom_border) & 1) != 0) {
+    LIBGAV1_DLOG(ERROR,
+                 "Borders must be a multiple of 2: left_border = %d, "
+                 "right_border = %d, top_border = %d, bottom_border = %d.",
+                 left_border, right_border, top_border, bottom_border);
+    return false;
+  }
+
+  // Every row in the plane buffers needs to be kFrameBufferRowAlignment-byte
+  // aligned. Since the strides are multiples of kFrameBufferRowAlignment
+  // bytes, it suffices to just make the plane buffers
+  // kFrameBufferRowAlignment-byte aligned.
+  const int plane_align = kFrameBufferRowAlignment;
+  const int uv_width =
+  const int uv_height =
+      is_monochrome ? 0 : SubsampledValue(height, subsampling_y);
+  const int uv_left_border = is_monochrome ? 0 : left_border >> subsampling_x;
+  const int uv_right_border =
+      is_monochrome ? 0 : right_border >> subsampling_x;
+  const int uv_top_border = is_monochrome ? 0 : top_border >> subsampling_y;
+  const int uv_bottom_border =
+      is_monochrome ? 0 : bottom_border >> subsampling_y;
+
+  if (get_frame_buffer != nullptr) {
+    assert(buffer_private_data != nullptr);
+
+    const Libgav1ImageFormat image_format =
+        ComposeImageFormat(is_monochrome, subsampling_x, subsampling_y);
+    FrameBuffer frame_buffer;
+    if (get_frame_buffer(callback_private_data, bitdepth, image_format, width,
+                         height, left_border, right_border, top_border,
+                         bottom_border, kFrameBufferRowAlignment,
+                         &frame_buffer) != kStatusOk) {
+      return false;
+    }
+
+    if (frame_buffer.plane[0] == nullptr ||
+        (!is_monochrome && frame_buffer.plane[1] == nullptr) ||
+        (!is_monochrome && frame_buffer.plane[2] == nullptr)) {
+      assert(false && "The get_frame_buffer callback malfunctioned.");
+      LIBGAV1_DLOG(ERROR, "The get_frame_buffer callback malfunctioned.");
+      return false;
+    }
+
+    stride_[kPlaneY] = frame_buffer.stride[0];
+    stride_[kPlaneU] = frame_buffer.stride[1];
+    stride_[kPlaneV] = frame_buffer.stride[2];
+    buffer_[kPlaneY] = frame_buffer.plane[0];
+    buffer_[kPlaneU] = frame_buffer.plane[1];
+    buffer_[kPlaneV] = frame_buffer.plane[2];
+    *buffer_private_data = frame_buffer.private_data;
+  } else {
+    assert(callback_private_data == nullptr);
+    assert(buffer_private_data == nullptr);
+
+    // Calculate y_stride (in bytes). It is padded to a multiple of
+    // kFrameBufferRowAlignment bytes.
+    int y_stride = width + left_border + right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) y_stride *= sizeof(uint16_t);
+#endif
+    y_stride = Align(y_stride, kFrameBufferRowAlignment);
+    // Size of the Y plane in bytes.
+    const uint64_t y_plane_size = (height + top_border + bottom_border) *
+                                      static_cast<uint64_t>(y_stride) +
+                                  (plane_align - 1);
+
+    // Calculate uv_stride (in bytes). It is padded to a multiple of
+    // kFrameBufferRowAlignment bytes.
+    int uv_stride = uv_width + uv_left_border + uv_right_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) uv_stride *= sizeof(uint16_t);
+#endif
+    uv_stride = Align(uv_stride, kFrameBufferRowAlignment);
+    // Size of the U or V plane in bytes.
+    const uint64_t uv_plane_size =
+        is_monochrome ? 0
+                      : (uv_height + uv_top_border + uv_bottom_border) *
+                                static_cast<uint64_t>(uv_stride) +
+                            (plane_align - 1);
+
+    // Allocate unaligned y_buffer, u_buffer, and v_buffer.
+    uint8_t* y_buffer = nullptr;
+    uint8_t* u_buffer = nullptr;
+    uint8_t* v_buffer = nullptr;
+
+    const uint64_t frame_size = y_plane_size + 2 * uv_plane_size;
+    if (frame_size > buffer_alloc_size_) {
+      // Allocation to hold larger frame, or first allocation.
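+      // |frame_size| was accumulated in uint64_t; the check below rejects
+      // sizes that would not survive the narrowing to size_t (e.g., on
+      // 32-bit targets) before any allocation is attempted.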
+      if (frame_size != static_cast<size_t>(frame_size)) return false;
+
+      buffer_alloc_.reset(new (std::nothrow)
+                              uint8_t[static_cast<size_t>(frame_size)]);
+      if (buffer_alloc_ == nullptr) {
+        buffer_alloc_size_ = 0;
+        return false;
+      }
+
+      buffer_alloc_size_ = static_cast<size_t>(frame_size);
+    }
+
+    y_buffer = buffer_alloc_.get();
+    if (!is_monochrome) {
+      u_buffer = y_buffer + y_plane_size;
+      v_buffer = u_buffer + uv_plane_size;
+    }
+
+    stride_[kPlaneY] = y_stride;
+    stride_[kPlaneU] = stride_[kPlaneV] = uv_stride;
+
+    int left_border_bytes = left_border;
+    int uv_left_border_bytes = uv_left_border;
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    if (bitdepth > 8) {
+      left_border_bytes *= sizeof(uint16_t);
+      uv_left_border_bytes *= sizeof(uint16_t);
+    }
+#endif
+    buffer_[kPlaneY] = AlignAddr(
+        y_buffer + (top_border * y_stride) + left_border_bytes, plane_align);
+    buffer_[kPlaneU] =
+        AlignAddr(u_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+                  plane_align);
+    buffer_[kPlaneV] =
+        AlignAddr(v_buffer + (uv_top_border * uv_stride) + uv_left_border_bytes,
+                  plane_align);
+  }
+
+  y_width_ = width;
+  y_height_ = height;
+  left_border_[kPlaneY] = left_border;
+  right_border_[kPlaneY] = right_border;
+  top_border_[kPlaneY] = top_border;
+  bottom_border_[kPlaneY] = bottom_border;
+
+  uv_width_ = uv_width;
+  uv_height_ = uv_height;
+  left_border_[kPlaneU] = left_border_[kPlaneV] = uv_left_border;
+  right_border_[kPlaneU] = right_border_[kPlaneV] = uv_right_border;
+  top_border_[kPlaneU] = top_border_[kPlaneV] = uv_top_border;
+  bottom_border_[kPlaneU] = bottom_border_[kPlaneV] = uv_bottom_border;
+
+  subsampling_x_ = subsampling_x;
+  subsampling_y_ = subsampling_y;
+
+  bitdepth_ = bitdepth;
+  is_monochrome_ = is_monochrome;
+  assert(!is_monochrome || stride_[kPlaneU] == 0);
+  assert(!is_monochrome || stride_[kPlaneV] == 0);
+  assert(!is_monochrome || buffer_[kPlaneU] == nullptr);
+  assert(!is_monochrome || buffer_[kPlaneV] == nullptr);
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/src/yuv_buffer.h b/src/yuv_buffer.h
new file mode 100644
index 0000000..b9e8cd3
--- /dev/null
+++ b/src/yuv_buffer.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_YUV_BUFFER_H_
+#define LIBGAV1_SRC_YUV_BUFFER_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "src/gav1/frame_buffer.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+class YuvBuffer {
+ public:
+  // Allocates the buffer. Returns true on success. Returns false on failure.
+  //
+  // * |width| and |height| are the image dimensions in pixels.
+  // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
+  //   subsampling of the width and height of the chroma planes, respectively.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively. The four border sizes must all be a
+  //   multiple of 2.
+  // * If |get_frame_buffer| is not null, it is invoked to allocate the memory.
+  //   If |get_frame_buffer| is null, YuvBuffer allocates the memory directly
+  //   and ignores the |callback_private_data| and |buffer_private_data|
+  //   parameters, which should be null.
+  //
+  // NOTE: The strides are a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
+  //
+  // Example: bitdepth=8 width=20 height=6 left/right/top/bottom_border=2. The
+  // diagram below shows how Realloc() allocates the data buffer for the Y
+  // plane.
+  //
+  //   16-byte aligned
+  //   |
+  //   v
+  //   ++++++++++++++++++++++++pppppppp
+  //   ++++++++++++++++++++++++pppppppp
+  //   ++01234567890123456789++pppppppp
+  //   ++11234567890123456789++pppppppp
+  //   ++21234567890123456789++pppppppp
+  //   ++31234567890123456789++pppppppp
+  //   ++41234567890123456789++pppppppp
+  //   ++51234567890123456789++pppppppp
+  //   ++++++++++++++++++++++++pppppppp
+  //   ++++++++++++++++++++++++pppppppp
+  //   |                              |
+  //   |<-- stride (multiple of 16) ->|
+  //
+  // The video frame has 6 rows of 20 pixels each. Each row is shown as the
+  // pattern r1234567890123456789, where |r| is 0, 1, 2, 3, 4, 5.
+  //
+  // Realloc() first adds a border of 2 pixels around the video frame. The
+  // border pixels are shown as '+'.
+  //
+  // Each row is then padded to a multiple of the default alignment in bytes,
+  // which is 16. The padding bytes are shown as lowercase 'p'. (Since
+  // |bitdepth| is 8 in this example, each pixel is one byte.) The padded size
+  // in bytes is the stride. In this example, the stride is 32 bytes.
+  //
+  // Finally, Realloc() aligns the first byte of frame data, which is the '0'
+  // pixel/byte in the upper left corner of the frame, to the default (16-byte)
+  // alignment boundary.
+  //
+  // TODO(wtc): Add a check for width and height limits to defend against
+  // invalid bitstreams.
+  bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
+               int8_t subsampling_x, int8_t subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border,
+               GetFrameBufferCallback get_frame_buffer,
+               void* callback_private_data, void** buffer_private_data);
+
+  int bitdepth() const { return bitdepth_; }
+
+  bool is_monochrome() const { return is_monochrome_; }
+
+  int8_t subsampling_x() const { return subsampling_x_; }
+  int8_t subsampling_y() const { return subsampling_y_; }
+
+  int width(int plane) const {
+    return (plane == kPlaneY) ? y_width_ : uv_width_;
+  }
+  int height(int plane) const {
+    return (plane == kPlaneY) ? y_height_ : uv_height_;
+  }
+
+  // Returns border sizes in pixels.
+  int left_border(int plane) const { return left_border_[plane]; }
+  int right_border(int plane) const { return right_border_[plane]; }
+  int top_border(int plane) const { return top_border_[plane]; }
+  int bottom_border(int plane) const { return bottom_border_[plane]; }
+
+  // Returns the alignment of frame buffer row in bytes.
+  int alignment() const { return kFrameBufferRowAlignment; }
+
+  // Backup the current set of warnings and disable -Warray-bounds for the
+  // following three functions as the compiler cannot, in all cases, determine
+  // whether |plane| is within [0, kMaxPlanes), e.g., with a variable based for
+  // loop.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+  // Returns the data buffer for |plane|.
+  uint8_t* data(int plane) {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+    return buffer_[plane];
+  }
+  const uint8_t* data(int plane) const {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(buffer_)>::value);
+    return buffer_[plane];
+  }
+
+  // Returns the stride in bytes for |plane|.
+  int stride(int plane) const {
+    assert(plane >= 0);
+    assert(static_cast<size_t>(plane) < std::extent<decltype(stride_)>::value);
+    return stride_[plane];
+  }
+  // Restore the previous set of compiler warnings.
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ private:
+  static constexpr int kFrameBufferRowAlignment = 16;
+  int bitdepth_ = 0;
+  bool is_monochrome_ = false;
+
+  // y_width_ and y_height_ are the |width| and |height| arguments passed to
+  // the Realloc() method.
+  //
+  // uv_width_ and uv_height_ are computed from y_width_ and y_height_ as
+  // follows:
+  //   uv_width_ = (y_width_ + subsampling_x_) >> subsampling_x_
+  //   uv_height_ = (y_height_ + subsampling_y_) >> subsampling_y_
+  int y_width_ = 0;
+  int uv_width_ = 0;
+  int y_height_ = 0;
+  int uv_height_ = 0;
+
+  int left_border_[kMaxPlanes] = {};
+  int right_border_[kMaxPlanes] = {};
+  int top_border_[kMaxPlanes] = {};
+  int bottom_border_[kMaxPlanes] = {};
+
+  int stride_[kMaxPlanes] = {};
+  uint8_t* buffer_[kMaxPlanes] = {};
+
+  // buffer_alloc_ and buffer_alloc_size_ are only used if the
+  // get_frame_buffer callback is null and we allocate the buffer ourselves.
+  std::unique_ptr<uint8_t[]> buffer_alloc_;
+  size_t buffer_alloc_size_ = 0;
+
+  int8_t subsampling_x_ = 0;  // 0 or 1.
+  int8_t subsampling_y_ = 0;  // 0 or 1.
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_YUV_BUFFER_H_
diff --git a/tests/fuzzer/decoder_fuzzer.cc b/tests/fuzzer/decoder_fuzzer.cc
new file mode 100644
index 0000000..236fd3c
--- /dev/null
+++ b/tests/fuzzer/decoder_fuzzer.cc
@@ -0,0 +1,87 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/gav1/decoder.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+void Decode(const uint8_t* const data, const size_t size,
+            libgav1::Decoder* const decoder) {
+  decoder->EnqueueFrame(data, size, /*user_private_data=*/0,
+                        /*buffer_private_data=*/nullptr);
+  const libgav1::DecoderBuffer* buffer;
+  decoder->DequeueFrame(&buffer);
+}
+
+}  // namespace
+
+// Always returns 0. Nonzero return values are reserved by libFuzzer for future
+// use.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // Reject large chunks of data to improve fuzzer throughput. + if (size > kMaxDataSize) return 0; + + libgav1::Decoder decoder; + libgav1::DecoderSettings settings = {}; + // Use the low byte of the width to seed the number of threads. + // We use both nibbles of the lower byte as this results in values != 1 much + // more quickly than using the lower nibble alone. + settings.threads = (size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1; + if (decoder.Init(&settings) != libgav1::kStatusOk) return 0; + + // Treat the input as a raw OBU stream. + Decode(data, size, &decoder); + + // Use the first frame from an IVF to bypass any read errors from the parser. + static constexpr size_t kIvfHeaderSize = + libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize; + if (size >= kIvfHeaderSize) { + Decode(data + kIvfHeaderSize, size - kIvfHeaderSize, &decoder); + } + + FuzzerTemporaryFile tempfile(data, size); + auto file_reader = + libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true); + if (file_reader == nullptr) return 0; + + std::vector buffer; + int decoded_frames = 0; + do { + if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break; + Decode(buffer.data(), buffer.size(), &decoder); + if (++decoded_frames >= kMaxFrames) break; + } while (!file_reader->IsEndOfFile()); + + return 0; +} diff --git a/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc new file mode 100644 index 0000000..d1b1c54 --- /dev/null +++ b/tests/fuzzer/decoder_fuzzer_frame_parallel.cc @@ -0,0 +1,139 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "examples/file_reader.h" +#include "examples/file_reader_constants.h" +#include "examples/file_reader_interface.h" +#include "src/gav1/decoder.h" +#include "src/gav1/status_code.h" +#include "tests/fuzzer/fuzzer_temp_file.h" + +namespace { + +#if defined(LIBGAV1_EXHAUSTIVE_FUZZING) +// Set a large upper bound to give more coverage of a single input; this value +// should be larger than most of the frame counts in the corpus. +constexpr size_t kMaxDataSize = 400 * 1024; +#else +constexpr size_t kMaxDataSize = 200 * 1024; +#endif + +using InputBuffer = std::vector; + +struct InputBuffers { + ~InputBuffers() { + for (auto& buffer : free_buffers) { + delete buffer; + } + } + std::deque free_buffers; +}; + +void ReleaseInputBuffer(void* callback_private_data, + void* buffer_private_data) { + auto* const test = static_cast(callback_private_data); + test->free_buffers.push_back(static_cast(buffer_private_data)); +} + +} // namespace + +// Always returns 0. Nonzero return values are reserved by libFuzzer for future +// use. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + // Reject large chunks of data to improve fuzzer throughput. 
+  if (size > kMaxDataSize) return 0;
+
+  // Note that |input_buffers| has to outlive the |decoder| object since the
+  // |release_input_buffer| callback could be called on the |decoder|'s
+  // destructor.
+  InputBuffers input_buffers;
+
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings = {};
+  // Use 33 + the low byte of the width to seed the number of threads. This
+  // ensures that we will trigger the frame-parallel path in most cases.
+  // We use both nibbles of the lower byte as this results in values != 1 much
+  // more quickly than using the lower nibble alone.
+  settings.threads =
+      33 + ((size >= 13) ? ((data[12] >> 4 | data[12]) & 0xF) + 1 : 1);
+
+  settings.frame_parallel = true;
+  settings.blocking_dequeue = true;
+  settings.callback_private_data = &input_buffers;
+  settings.release_input_buffer = ReleaseInputBuffer;
+  if (decoder.Init(&settings) != libgav1::kStatusOk) return 0;
+
+  FuzzerTemporaryFile tempfile(data, size);
+  auto file_reader =
+      libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+  if (file_reader == nullptr) return 0;
+
+  InputBuffer* input_buffer = nullptr;
+  bool dequeue_finished = false;
+
+  do {
+    if (input_buffer == nullptr && !file_reader->IsEndOfFile()) {
+      if (input_buffers.free_buffers.empty()) {
+        auto* const buffer = new (std::nothrow) InputBuffer();
+        if (buffer == nullptr) {
+          break;
+        }
+        input_buffers.free_buffers.push_back(buffer);
+      }
+      input_buffer = input_buffers.free_buffers.front();
+      input_buffers.free_buffers.pop_front();
+      if (!file_reader->ReadTemporalUnit(input_buffer, nullptr)) {
+        break;
+      }
+    }
+
+    if (input_buffer != nullptr) {
+      libgav1::StatusCode status =
+          decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+                               /*user_private_data=*/0,
+                               /*buffer_private_data=*/input_buffer);
+      if (status == libgav1::kStatusOk) {
+        input_buffer = nullptr;
+        // Continue to enqueue frames until we get a kStatusTryAgain status.
+        continue;
+      }
+      if (status != libgav1::kStatusTryAgain) {
+        break;
+      }
+    }
+
+    const libgav1::DecoderBuffer* buffer;
+    libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+    } else if (status == libgav1::kStatusOk) {
+      dequeue_finished = false;
+    } else {
+      break;
+    }
+  } while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
+           !dequeue_finished);
+
+  if (input_buffer != nullptr) {
+    input_buffers.free_buffers.push_back(input_buffer);
+  }
+
+  return 0;
+}
diff --git a/tests/fuzzer/fuzzer_temp_file.h b/tests/fuzzer/fuzzer_temp_file.h
new file mode 100644
index 0000000..5d12bbe
--- /dev/null
+++ b/tests/fuzzer/fuzzer_temp_file.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2020 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+#define LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
+
+// Adapter utility from fuzzer input to a temporary file, for fuzzing APIs that
+// require a file instead of an input buffer.
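+//
+// Typical use from a fuzz target (see decoder_fuzzer.cc in this change):
+//   FuzzerTemporaryFile tempfile(data, size);
+//   auto file_reader =
+//       libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);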
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+// Pure-C interface for creating and cleaning up temporary files.
+
+static char* fuzzer_get_tmpfile_with_suffix(const uint8_t* data, size_t size,
+                                            const char* suffix) {
+  if (suffix == NULL) {  // NOLINT (this could be a C compilation unit)
+    suffix = "";
+  }
+  const size_t suffix_len = strlen(suffix);
+  if (suffix_len > INT_MAX) {  // mkstemps takes int for suffixlen param
+    perror("Suffix too long");
+    abort();
+  }
+
+#ifdef __ANDROID__
+  const char* leading_temp_path =
+      "/data/local/tmp/generate_temporary_file.XXXXXX";
+#else
+  const char* leading_temp_path = "/tmp/generate_temporary_file.XXXXXX";
+#endif
+  const size_t buffer_sz = strlen(leading_temp_path) + suffix_len + 1;
+  char* filename_buffer =
+      (char*)malloc(buffer_sz);  // NOLINT (this could be a C compilation unit)
+  if (!filename_buffer) {
+    perror("Failed to allocate file name buffer.");
+    abort();
+  }
+
+  if (snprintf(filename_buffer, buffer_sz, "%s%s", leading_temp_path,
+               suffix) >= buffer_sz) {
+    perror("File name buffer too short.");
+    abort();
+  }
+
+  const int file_descriptor = mkstemps(filename_buffer, suffix_len);
+  if (file_descriptor < 0) {
+    perror("Failed to make temporary file.");
+    abort();
+  }
+  FILE* file = fdopen(file_descriptor, "wb");
+  if (!file) {
+    perror("Failed to open file descriptor.");
+    close(file_descriptor);
+    abort();
+  }
+  const size_t bytes_written = fwrite(data, sizeof(uint8_t), size, file);
+  if (bytes_written < size) {
+    close(file_descriptor);
+    fprintf(stderr, "Failed to write all bytes to file (%zu out of %zu)",
+            bytes_written, size);
+    abort();
+  }
+  fclose(file);
+  return filename_buffer;
+}
+
+static char* fuzzer_get_tmpfile(
+    const uint8_t* data,
+    size_t size) {  // NOLINT (people include this .inc file directly)
+  return fuzzer_get_tmpfile_with_suffix(data, size, NULL);  // NOLINT
+}
+
+static void fuzzer_release_tmpfile(char* filename) {
+  if (unlink(filename) != 0) {
+    perror("WARNING: Failed to delete temporary file.");
+  }
+  free(filename);
+}
+
+// C++ RAII object for creating temporary files.
+
+#ifdef __cplusplus
+class FuzzerTemporaryFile {
+ public:
+  FuzzerTemporaryFile(const uint8_t* data, size_t size)
+      : original_filename_(fuzzer_get_tmpfile(data, size)) {
+    filename_ = strdup(original_filename_);
+    if (!filename_) {
+      perror("Failed to allocate file name copy.");
+      abort();
+    }
+  }
+
+  FuzzerTemporaryFile(const uint8_t* data, size_t size, const char* suffix)
+      : original_filename_(fuzzer_get_tmpfile_with_suffix(data, size, suffix)) {
+    filename_ = strdup(original_filename_);
+    if (!filename_) {
+      perror("Failed to allocate file name copy.");
+      abort();
+    }
+  }
+
+  ~FuzzerTemporaryFile() {
+    free(filename_);
+    fuzzer_release_tmpfile(original_filename_);
+  }
+
+  FuzzerTemporaryFile(const FuzzerTemporaryFile& other) = delete;
+  FuzzerTemporaryFile operator=(const FuzzerTemporaryFile& other) = delete;
+
+  FuzzerTemporaryFile(const FuzzerTemporaryFile&& other) = delete;
+  FuzzerTemporaryFile operator=(const FuzzerTemporaryFile&& other) = delete;
+
+  const char* filename() const { return filename_; }
+
+  // Returns a mutable pointer to the file name. Should be used sparingly, only
+  // in case the fuzzed API demands it or when making a mutable copy is
+  // inconvenient (e.g., in auto-generated code).
+  char* mutable_filename() const { return filename_; }
+
+ private:
+  char* original_filename_;
+
+  // A mutable copy of the original filename, returned by the accessor. This
+  // guarantees that the original filename can always be used to release the
+  // temporary path.
+  char* filename_;
+};
+#endif  // __cplusplus
+#endif  // LIBGAV1_TESTS_FUZZER_FUZZER_TEMP_FILE_H_
diff --git a/tests/fuzzer/obu_parser_fuzzer.cc b/tests/fuzzer/obu_parser_fuzzer.cc
new file mode 100644
index 0000000..634a802
--- /dev/null
+++ b/tests/fuzzer/obu_parser_fuzzer.cc
@@ -0,0 +1,89 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "examples/file_reader.h"
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_interface.h"
+#include "src/buffer_pool.h"
+#include "src/decoder_impl.h"
+#include "src/decoder_state.h"
+#include "src/internal_frame_buffer_list.h"
+#include "src/obu_parser.h"
+#include "tests/fuzzer/fuzzer_temp_file.h"
+
+namespace {
+
+#if defined(LIBGAV1_EXHAUSTIVE_FUZZING)
+// Set a large upper bound to give more coverage of a single input; this value
+// should be larger than most of the frame counts in the corpus.
+constexpr int kMaxFrames = 100;
+constexpr size_t kMaxDataSize = 400 * 1024;
+#else
+// Restrict the number of frames and obus to improve fuzzer throughput.
+constexpr int kMaxFrames = 5;
+constexpr size_t kMaxDataSize = 200 * 1024;
+#endif
+
+inline void ParseObu(const uint8_t* const data, size_t size) {
+  libgav1::InternalFrameBufferList buffer_list;
+  libgav1::BufferPool buffer_pool(libgav1::OnInternalFrameBufferSizeChanged,
+                                  libgav1::GetInternalFrameBuffer,
+                                  libgav1::ReleaseInternalFrameBuffer,
+                                  &buffer_list);
+  libgav1::DecoderState decoder_state;
+  libgav1::ObuParser parser(data, size, 0, &buffer_pool, &decoder_state);
+  libgav1::RefCountedBufferPtr current_frame;
+  int parsed_frames = 0;
+  while (parser.HasData()) {
+    if (parser.ParseOneFrame(&current_frame) != libgav1::kStatusOk) break;
+    if (++parsed_frames >= kMaxFrames) break;
+  }
+}
+
+}  // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Reject large chunks of data to improve fuzzer throughput.
+  if (size > kMaxDataSize) return 0;
+
+  // Treat the input as a raw OBU stream.
+  ParseObu(data, size);
+
+  // Use the first frame from an IVF to bypass any read errors from the parser.
+  static constexpr size_t kIvfHeaderSize =
+      libgav1::kIvfFileHeaderSize + libgav1::kIvfFrameHeaderSize;
+  if (size >= kIvfHeaderSize) {
+    ParseObu(data + kIvfHeaderSize, size - kIvfHeaderSize);
+  }
+
+  FuzzerTemporaryFile tempfile(data, size);
+  auto file_reader =
+      libgav1::FileReader::Open(tempfile.filename(), /*error_tolerant=*/true);
+  if (file_reader == nullptr) return 0;
+
+  std::vector<uint8_t> buffer;
+  int parsed_frames = 0;
+  do {
+    if (!file_reader->ReadTemporalUnit(&buffer, nullptr)) break;
+    ParseObu(buffer.data(), buffer.size());
+    if (++parsed_frames >= kMaxFrames) break;
+  } while (!file_reader->IsEndOfFile());
+
+  return 0;
+}
-- 
cgit v1.2.3
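The stride arithmetic documented in yuv_buffer.h can be checked in isolation. The following standalone sketch (not part of the patch) reproduces the worked example from the Realloc() comment; the local Align() helper here is a stand-in for the helper of the same name in src/utils/common.h, and the values are the ones from the diagram (bitdepth 8, width 20, borders of 2, 16-byte row alignment).

#include <cassert>

// Stand-in for libgav1's Align() helper: rounds |value| up to the next
// multiple of |alignment|, which must be a power of two.
inline int Align(int value, int alignment) {
  const int alignment_mask = alignment - 1;
  return (value + alignment_mask) & ~alignment_mask;
}

int main() {
  // From the Realloc() comment: at bitdepth 8 each pixel is one byte, so a
  // row holds left_border + width + right_border = 2 + 20 + 2 = 24 payload
  // bytes, which is then padded up to the 16-byte row alignment.
  const int width = 20;
  const int left_border = 2;
  const int right_border = 2;
  int y_stride = width + left_border + right_border;  // 24 bytes at 8bpp.
  y_stride = Align(y_stride, 16);
  assert(y_stride == 32);  // Matches the diagram: the stride is 32 bytes.
  return 0;
}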