/* * Copyright 2019 The libgav1 Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBGAV1_SRC_POST_FILTER_H_ #define LIBGAV1_SRC_POST_FILTER_H_ #include <algorithm> #include <array> #include <atomic> #include <cstddef> #include <cstdint> #include <cstring> #include <type_traits> #include "src/dsp/common.h" #include "src/dsp/dsp.h" #include "src/frame_scratch_buffer.h" #include "src/loop_restoration_info.h" #include "src/obu_parser.h" #include "src/utils/array_2d.h" #include "src/utils/block_parameters_holder.h" #include "src/utils/common.h" #include "src/utils/constants.h" #include "src/utils/memory.h" #include "src/utils/threadpool.h" #include "src/yuv_buffer.h" namespace libgav1 { // This class applies in-loop filtering for each frame after it is // reconstructed. The in-loop filtering contains all post processing filtering // for the reconstructed frame, including deblock filter, CDEF, superres, // and loop restoration. // Historically, for example in libaom, loop filter refers to deblock filter. // To avoid name conflicts, we call this class PostFilter (post processing). // In-loop post filtering order is: // deblock --> CDEF --> super resolution--> loop restoration. // When CDEF and super resolution is not used, we can combine deblock // and restoration together to only filter frame buffer once. class PostFilter { public: // This class does not take ownership of the masks/restoration_info, but it // may change their values. // // The overall flow of data in this class (for both single and multi-threaded // cases) is as follows: // -> Input: |frame_buffer_|. // -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and // |loop_restoration_buffer_|. // -> Deblocking: // * Input: |source_buffer_| // * Output: |source_buffer_| // -> CDEF: // * Input: |source_buffer_| // * Output: |cdef_buffer_| // -> SuperRes: // * Input: |cdef_buffer_| // * Output: |superres_buffer_| // -> Loop Restoration: // * Input: |superres_buffer_| // * Output: |loop_restoration_buffer_|. // -> Now |frame_buffer_| contains the filtered frame. PostFilter(const ObuFrameHeader& frame_header, const ObuSequenceHeader& sequence_header, FrameScratchBuffer* frame_scratch_buffer, YuvBuffer* frame_buffer, const dsp::Dsp* dsp, int do_post_filter_mask); // non copyable/movable. PostFilter(const PostFilter&) = delete; PostFilter& operator=(const PostFilter&) = delete; PostFilter(PostFilter&&) = delete; PostFilter& operator=(PostFilter&&) = delete; // The overall function that applies all post processing filtering with // multiple threads. // * The filtering order is: // deblock --> CDEF --> super resolution--> loop restoration. // * The output of each filter is the input for the following filter. A // special case is that loop restoration needs a few rows of the deblocked // frame and the entire cdef filtered frame: // deblock --> CDEF --> super resolution --> loop restoration. // | ^ // | | // -----------> super resolution ----- // * Any of these filters could be present or absent. // * |frame_buffer_| points to the decoded frame buffer. When // ApplyFilteringThreaded() is called, |frame_buffer_| is modified by each // of the filters as described below. // Filter behavior (multi-threaded): // * Deblock: In-place filtering. The output is written to |source_buffer_|. // If cdef and loop restoration are both on, then 4 rows (as // specified by |kLoopRestorationBorderRows|) in every 64x64 block // is copied into |loop_restoration_border_|. // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_| as // the input and the output is written into |cdef_buffer_| (which is // the same as |source_buffer_|). // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and // |superres_line_buffer_| as the input and the output is written // into |superres_buffer_| (which is just |cdef_buffer_| with a // shift to the top). // * Restoration: Near in-place filtering. // Uses the |superres_buffer_| and |loop_restoration_border_| // as the input and the output is written into // |loop_restoration_buffer_| (which is just |superres_buffer_| // with a shift to the left). void ApplyFilteringThreaded(); // Does the overall post processing filter for one superblock row starting at // |row4x4| with height 4*|sb4x4|. If |do_deblock| is false, deblocking filter // will not be applied. // // Filter behavior (single-threaded): // * Deblock: In-place filtering. The output is written to |source_buffer_|. // If cdef and loop restoration are both on, then 4 rows (as // specified by |kLoopRestorationBorderRows|) in every 64x64 block // is copied into |loop_restoration_border_|. // * Cdef: In-place filtering. The output is written into |cdef_buffer_| // (which is just |source_buffer_| with a shift to the top-left). // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input // and the output is written into |superres_buffer_| (which is // just |cdef_buffer_| with a shift to the top). // * Restoration: Near in-place filtering. // Uses the |superres_buffer_| and |loop_restoration_border_| // as the input and the output is written into // |loop_restoration_buffer_| (which is just |superres_buffer_| // with a shift to the left or top-left). // Returns the index of the last row whose post processing is complete and can // be used for referencing. int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row, bool do_deblock); // Apply deblocking filter in one direction (specified by |loop_filter_type|) // for the superblock row starting at |row4x4_start| for columns starting from // |column4x4_start| in increments of 16 (or 8 for chroma with subsampling) // until the smallest multiple of 16 that is >= |column4x4_end| or until // |frame_header_.columns4x4|, whichever is lower. This function must be // called only if |DoDeblock()| returns true. void ApplyDeblockFilter(LoopFilterType loop_filter_type, int row4x4_start, int column4x4_start, int column4x4_end, int sb4x4); static bool DoCdef(const ObuFrameHeader& frame_header, int do_post_filter_mask) { return (frame_header.cdef.bits > 0 || frame_header.cdef.y_primary_strength[0] > 0 || frame_header.cdef.y_secondary_strength[0] > 0 || frame_header.cdef.uv_primary_strength[0] > 0 || frame_header.cdef.uv_secondary_strength[0] > 0) && (do_post_filter_mask & 0x02) != 0; } bool DoCdef() const { return do_cdef_; } // If filter levels for Y plane (0 for vertical, 1 for horizontal), // are all zero, deblock filter will not be applied. static bool DoDeblock(const ObuFrameHeader& frame_header, uint8_t do_post_filter_mask) { return (frame_header.loop_filter.level[0] > 0 || frame_header.loop_filter.level[1] > 0) && (do_post_filter_mask & 0x01) != 0; } bool DoDeblock() const { return do_deblock_; } uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index, ReferenceFrameType type, int mode_id) const { return deblock_filter_levels_[segment_id][level_index][type][mode_id]; } // Computes the deblock filter levels using |delta_lf| and stores them in // |deblock_filter_levels|. void ComputeDeblockFilterLevels( const int8_t delta_lf[kFrameLfCount], uint8_t deblock_filter_levels[kMaxSegments][kFrameLfCount] [kNumReferenceFrameTypes][2]) const; // Returns true if loop restoration will be performed for the given parameters // and mask. static bool DoRestoration(const LoopRestoration& loop_restoration, uint8_t do_post_filter_mask, int num_planes) { if (num_planes == kMaxPlanesMonochrome) { return loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone && (do_post_filter_mask & 0x08) != 0; } return (loop_restoration.type[kPlaneY] != kLoopRestorationTypeNone || loop_restoration.type[kPlaneU] != kLoopRestorationTypeNone || loop_restoration.type[kPlaneV] != kLoopRestorationTypeNone) && (do_post_filter_mask & 0x08) != 0; } bool DoRestoration() const { return do_restoration_; } // Returns a pointer to the unfiltered buffer. This is used by the Tile class // to determine where to write the output of the tile decoding process taking // in-place filtering offsets into consideration. uint8_t* GetUnfilteredBuffer(int plane) { return source_buffer_[plane]; } const YuvBuffer& frame_buffer() const { return frame_buffer_; } // Returns true if SuperRes will be performed for the given frame header and // mask. static bool DoSuperRes(const ObuFrameHeader& frame_header, uint8_t do_post_filter_mask) { return frame_header.width != frame_header.upscaled_width && (do_post_filter_mask & 0x04) != 0; } bool DoSuperRes() const { return do_superres_; } LoopRestorationInfo* restoration_info() const { return restoration_info_; } uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane, int row, int column) const { return base_buffer + (row >> subsampling_y_[plane]) * stride + ((column >> subsampling_x_[plane]) << pixel_size_log2_); } uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const { return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane), plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4)); } uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const { return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane), plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4)); } uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const { return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane), plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4)); } template <typename Pixel> static void ExtendFrame(Pixel* frame_start, int width, int height, ptrdiff_t stride, int left, int right, int top, int bottom); private: // The type of the HorizontalDeblockFilter and VerticalDeblockFilter member // functions. using DeblockFilter = void (PostFilter::*)(int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end); // Functions common to all post filters. // Extends the frame by setting the border pixel values to the one from its // closest frame boundary. void ExtendFrameBoundary(uint8_t* frame_start, int width, int height, ptrdiff_t stride, int left, int right, int top, int bottom) const; // Extend frame boundary for referencing if the frame will be saved as a // reference frame. void ExtendBordersForReferenceFrame(); // Copies the deblocked pixels needed for loop restoration. void CopyDeblockedPixels(Plane plane, int row4x4); // Copies the border for one superblock row. If |for_loop_restoration| is // true, then it assumes that the border extension is being performed for the // input of the loop restoration process. If |for_loop_restoration| is false, // then it assumes that the border extension is being performed for using the // current frame as a reference frame. In this case, |progress_row_| is also // updated. void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4, bool for_loop_restoration); // Sets up the |loop_restoration_border_| for loop restoration. // This is called when there is no CDEF filter. We copy rows from // |superres_buffer_| and do the line extension. void SetupLoopRestorationBorder(int row4x4_start); // This is called when there is CDEF filter. We copy rows from // |source_buffer_|, apply superres and do the line extension. void SetupLoopRestorationBorder(int row4x4_start, int sb4x4); // Returns true if we can perform border extension in loop (i.e.) without // waiting until the entire frame is decoded. If intra_block_copy is true, we // do in-loop border extension only if the upscaled_width is the same as 4 * // columns4x4. Otherwise, we cannot do in loop border extension since those // pixels may be used by intra block copy. bool DoBorderExtensionInLoop() const { return !frame_header_.allow_intrabc || frame_header_.upscaled_width == MultiplyBy4(frame_header_.columns4x4); } template <typename Pixel> void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height, Pixel* dst, ptrdiff_t dst_stride) { assert(height > 0); do { memcpy(dst, src, width * sizeof(Pixel)); src += src_stride; dst += dst_stride; } while (--height != 0); } // Worker function used for multi-threaded implementation of Deblocking, CDEF // and Loop Restoration. using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic); // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling // thread and returns once all the jobs are completed. void RunJobs(WorkerFunction worker); // Functions for the Deblocking filter. bool GetHorizontalDeblockFilterEdgeInfo(int row4x4, int column4x4, uint8_t* level, int* step, int* filter_length) const; void GetHorizontalDeblockFilterEdgeInfoUV(int row4x4, int column4x4, uint8_t* level_u, uint8_t* level_v, int* step, int* filter_length) const; bool GetVerticalDeblockFilterEdgeInfo(int row4x4, int column4x4, BlockParameters* const* bp_ptr, uint8_t* level, int* step, int* filter_length) const; void GetVerticalDeblockFilterEdgeInfoUV(int column4x4, BlockParameters* const* bp_ptr, uint8_t* level_u, uint8_t* level_v, int* step, int* filter_length) const; void HorizontalDeblockFilter(int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end); void VerticalDeblockFilter(int row4x4_start, int row4x4_end, int column4x4_start, int column4x4_end); // HorizontalDeblockFilter and VerticalDeblockFilter must have the correct // signature. static_assert(std::is_same<decltype(&PostFilter::HorizontalDeblockFilter), DeblockFilter>::value, ""); static_assert(std::is_same<decltype(&PostFilter::VerticalDeblockFilter), DeblockFilter>::value, ""); // Worker function used for multi-threaded deblocking. template <LoopFilterType loop_filter_type> void DeblockFilterWorker(std::atomic<int>* row4x4_atomic); static_assert( std::is_same< decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>), WorkerFunction>::value, ""); static_assert( std::is_same< decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>), WorkerFunction>::value, ""); // Functions for the cdef filter. // Copies the deblocked pixels necessary for use by the multi-threaded cdef // implementation into |cdef_border_|. void SetupCdefBorder(int row4x4); // This function prepares the input source block for cdef filtering. The input // source block contains a 12x12 block, with the inner 8x8 as the desired // filter region. It pads the block if the 12x12 block includes out of frame // pixels with a large value. This achieves the required behavior defined in // section 5.11.52 of the spec. template <typename Pixel> void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4, int column4x4, uint16_t* cdef_source, ptrdiff_t cdef_stride, bool y_plane, const uint8_t border_columns[kMaxPlanes][256], bool use_border_columns); // Applies cdef for one 64x64 block. template <typename Pixel> void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4, int block_height4x4, int row4x4_start, int column4x4_start, uint8_t border_columns[2][kMaxPlanes][256], bool use_border_columns[2][2]); // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code // duplication. void ApplyCdefForOneSuperBlockRowHelper( uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256], int row4x4, int block_height4x4); // Applies CDEF filtering for the superblock row starting at |row4x4| with a // height of 4*|sb4x4|. void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row); // Worker function used for multi-threaded CDEF. void ApplyCdefWorker(std::atomic<int>* row4x4_atomic); static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker), WorkerFunction>::value, ""); // Functions for the SuperRes filter. // Applies super resolution for the |src| for |rows[plane]| rows of each // plane. If |line_buffer_row| is larger than or equal to 0, one more row will // be processed, the line buffer indicated by |line_buffer_row| will be used // as the source. If |dst_is_loop_restoration_border| is true, then it means // that the |dst| pointers come from |loop_restoration_border_| and the // strides will be populated from that buffer. void ApplySuperRes( const std::array<uint8_t*, kMaxPlanes>& src, const std::array<int, kMaxPlanes>& rows, int line_buffer_row, const std::array<uint8_t*, kMaxPlanes>& dst, bool dst_is_loop_restoration_border = false); // Section 7.16. // Applies SuperRes for the superblock row starting at |row4x4| with a height // of 4*|sb4x4|. void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row); void ApplySuperResThreaded(); // Functions for the Loop Restoration filter. // Notes about Loop Restoration: // (1). Loop restoration processing unit size is default to 64x64. // Only when the remaining filtering area is smaller than 64x64, the // processing unit size is the actual area size. // For U/V plane, it is (64 >> subsampling_x) x (64 >> subsampling_y). // (2). Loop restoration unit size can be 64x64, 128x128, 256x256 for Y // plane. The unit size for chroma can be the same or half, depending on // subsampling. If either subsampling_x or subsampling_y is one, unit size // is halved on both x and y sides. // All loop restoration units have the same size for one plane. // One loop restoration unit could contain multiple processing units. // But they share the same sets of loop restoration parameters. // (3). Loop restoration has a row offset, kRestorationUnitOffset = 8. The // size of first row of loop restoration units and processing units is // shrunk by the offset. // (4). Loop restoration units wrap the bottom and the right of the frame, // if the remaining area is small. The criteria is whether the number of // remaining rows/columns is smaller than half of loop restoration unit // size. // For example, if the frame size is 140x140, loop restoration unit size is // 128x128. The size of the first loop restoration unit is 128x(128-8) = // 128 columns x 120 rows. // Since 140 - 120 < 128/2. The remaining 20 rows will be folded to the loop // restoration unit. Similarly, the remaining 12 columns will also be folded // to current loop restoration unit. So, even frame size is 140x140, // there's only one loop restoration unit. Suppose processing unit is 64x64, // then sizes of the first row of processing units are 64x56, 64x56, 12x56, // respectively. The second row is 64x64, 64x64, 12x64. // The third row is 64x20, 64x20, 12x20. // |stride| is shared by |src_buffer| and |dst_buffer|. template <typename Pixel> void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride, Plane plane, int plane_height, int plane_width, int y, int unit_row, int current_process_unit_height, int plane_unit_size, Pixel* dst_buffer); // Applies loop restoration for the superblock row starting at |row4x4_start| // with a height of 4*|sb4x4|. template <typename Pixel> void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4); // Helper function that calls the right variant of // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth. void ApplyLoopRestoration(int row4x4_start, int sb4x4); // Worker function used for multithreaded Loop Restoration. void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic); static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker), WorkerFunction>::value, ""); // The lookup table for picking the deblock filter, according to deblock // filter type. const DeblockFilter deblock_filter_func_[2] = { &PostFilter::VerticalDeblockFilter, &PostFilter::HorizontalDeblockFilter}; const ObuFrameHeader& frame_header_; const LoopRestoration& loop_restoration_; const dsp::Dsp& dsp_; const int8_t bitdepth_; const int8_t subsampling_x_[kMaxPlanes]; const int8_t subsampling_y_[kMaxPlanes]; const int8_t planes_; const int pixel_size_log2_; const uint8_t* const inner_thresh_; const uint8_t* const outer_thresh_; const bool needs_chroma_deblock_; const bool do_cdef_; const bool do_deblock_; const bool do_restoration_; const bool do_superres_; // This stores the deblocking filter levels assuming that the delta is zero. // This will be used by all superblocks whose delta is zero (without having to // recompute them). The dimensions (in order) are: segment_id, level_index // (based on plane and direction), reference_frame and mode_id. uint8_t deblock_filter_levels_[kMaxSegments][kFrameLfCount] [kNumReferenceFrameTypes][2]; // Stores the SuperRes info for the frame. struct { int upscaled_width; int initial_subpixel_x; int step; } super_res_info_[kMaxPlanes]; const Array2D<int8_t>& cdef_index_; const Array2D<uint8_t>& cdef_skip_; const Array2D<TransformSize>& inter_transform_sizes_; LoopRestorationInfo* const restoration_info_; uint8_t* const superres_coefficients_[kNumPlaneTypes]; // Line buffer used by multi-threaded ApplySuperRes(). // In the multi-threaded case, this buffer will store the last downscaled row // input of each thread to avoid overwrites by the first upscaled row output // of the thread below it. YuvBuffer& superres_line_buffer_; const BlockParametersHolder& block_parameters_; // Frame buffer to hold cdef filtered frame. YuvBuffer cdef_filtered_buffer_; // Input frame buffer. YuvBuffer& frame_buffer_; // A view into |frame_buffer_| that points to the input and output of the // deblocking process. uint8_t* source_buffer_[kMaxPlanes]; // A view into |frame_buffer_| that points to the output of the CDEF filtered // planes (to facilitate in-place CDEF filtering). uint8_t* cdef_buffer_[kMaxPlanes]; // A view into |frame_buffer_| that points to the planes after the SuperRes // filter is applied (to facilitate in-place SuperRes). uint8_t* superres_buffer_[kMaxPlanes]; // A view into |frame_buffer_| that points to the output of the Loop Restored // planes (to facilitate in-place Loop Restoration). uint8_t* loop_restoration_buffer_[kMaxPlanes]; YuvBuffer& cdef_border_; // Buffer used to store the border pixels that are necessary for loop // restoration. This buffer will store 4 rows for every 64x64 block (4 rows // for every 32x32 for chroma with subsampling). The indices of the rows that // are stored are specified in |kLoopRestorationBorderRows|. First 4 rows of // this buffer are never populated and never used. // This buffer is used only when both of the following conditions are true: // (1). Loop Restoration is on. // (2). Cdef is on, or multi-threading is enabled for post filter. YuvBuffer& loop_restoration_border_; ThreadPool* const thread_pool_; // Tracks the progress of the post filters. int progress_row_ = -1; // A block buffer to hold the input that is converted to uint16_t before // cdef filtering. Only used in single threaded case. Y plane is processed // separately. U and V planes are processed together. So it is sufficient to // have this buffer to accommodate 2 planes at a time. uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2]; template <int bitdepth, typename Pixel> friend class PostFilterSuperResTest; template <int bitdepth, typename Pixel> friend class PostFilterHelperFuncTest; }; extern template void PostFilter::ExtendFrame<uint8_t>(uint8_t* frame_start, int width, int height, ptrdiff_t stride, int left, int right, int top, int bottom); #if LIBGAV1_MAX_BITDEPTH >= 10 extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start, int width, int height, ptrdiff_t stride, int left, int right, int top, int bottom); #endif } // namespace libgav1 #endif // LIBGAV1_SRC_POST_FILTER_H_