/* * Copyright 2019 The libgav1 Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBGAV1_SRC_DSP_DSP_H_ #define LIBGAV1_SRC_DSP_DSP_H_ #include #include #include #include "src/dsp/common.h" #include "src/dsp/constants.h" #include "src/dsp/film_grain_common.h" #include "src/utils/cpu.h" #include "src/utils/reference_info.h" #include "src/utils/types.h" namespace libgav1 { namespace dsp { #if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS) #define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0 #endif enum IntraPredictor : uint8_t { kIntraPredictorDcFill, kIntraPredictorDcTop, kIntraPredictorDcLeft, kIntraPredictorDc, kIntraPredictorVertical, kIntraPredictorHorizontal, kIntraPredictorPaeth, kIntraPredictorSmooth, kIntraPredictorSmoothVertical, kIntraPredictorSmoothHorizontal, kNumIntraPredictors }; // List of valid 1D transforms. enum Transform1D : uint8_t { k1DTransformDct, // Discrete Cosine Transform. k1DTransformAdst, // Asymmetric Discrete Sine Transform. k1DTransformIdentity, k1DTransformWht, // Walsh Hadamard Transform. kNum1DTransforms }; // List of valid 1D transform sizes. Not all transforms may be available for all // the sizes. enum TransformSize1D : uint8_t { k1DTransformSize4, k1DTransformSize8, k1DTransformSize16, k1DTransformSize32, k1DTransformSize64, kNum1DTransformSizes }; // The maximum width of the loop filter, fewer pixels may be filtered depending // on strength thresholds. 
enum LoopFilterSize : uint8_t { kLoopFilterSize4, kLoopFilterSize6, kLoopFilterSize8, kLoopFilterSize14, kNumLoopFilterSizes }; enum : uint8_t { kRow = 0, kColumn = 1, }; //------------------------------------------------------------------------------ // ToString() // // These functions are meant to be used only in debug logging and within tests. // They are defined inline to avoid including the strings in the release // library when logging is disabled; unreferenced functions will not be added to // any object file in that case. inline const char* ToString(const IntraPredictor predictor) { switch (predictor) { case kIntraPredictorDcFill: return "kIntraPredictorDcFill"; case kIntraPredictorDcTop: return "kIntraPredictorDcTop"; case kIntraPredictorDcLeft: return "kIntraPredictorDcLeft"; case kIntraPredictorDc: return "kIntraPredictorDc"; case kIntraPredictorVertical: return "kIntraPredictorVertical"; case kIntraPredictorHorizontal: return "kIntraPredictorHorizontal"; case kIntraPredictorPaeth: return "kIntraPredictorPaeth"; case kIntraPredictorSmooth: return "kIntraPredictorSmooth"; case kIntraPredictorSmoothVertical: return "kIntraPredictorSmoothVertical"; case kIntraPredictorSmoothHorizontal: return "kIntraPredictorSmoothHorizontal"; case kNumIntraPredictors: return "kNumIntraPredictors"; } abort(); } inline const char* ToString(const Transform1D transform) { switch (transform) { case k1DTransformDct: return "k1DTransformDct"; case k1DTransformAdst: return "k1DTransformAdst"; case k1DTransformIdentity: return "k1DTransformIdentity"; case k1DTransformWht: return "k1DTransformWht"; case kNum1DTransforms: return "kNum1DTransforms"; } abort(); } inline const char* ToString(const TransformSize1D transform_size) { switch (transform_size) { case k1DTransformSize4: return "k1DTransformSize4"; case k1DTransformSize8: return "k1DTransformSize8"; case k1DTransformSize16: return "k1DTransformSize16"; case k1DTransformSize32: return "k1DTransformSize32"; case 
k1DTransformSize64: return "k1DTransformSize64"; case kNum1DTransformSizes: return "kNum1DTransformSizes"; } abort(); } inline const char* ToString(const LoopFilterSize filter_size) { switch (filter_size) { case kLoopFilterSize4: return "kLoopFilterSize4"; case kLoopFilterSize6: return "kLoopFilterSize6"; case kLoopFilterSize8: return "kLoopFilterSize8"; case kLoopFilterSize14: return "kLoopFilterSize14"; case kNumLoopFilterSizes: return "kNumLoopFilterSizes"; } abort(); } inline const char* ToString(const LoopFilterType filter_type) { switch (filter_type) { case kLoopFilterTypeVertical: return "kLoopFilterTypeVertical"; case kLoopFilterTypeHorizontal: return "kLoopFilterTypeHorizontal"; case kNumLoopFilterTypes: return "kNumLoopFilterTypes"; } abort(); } //------------------------------------------------------------------------------ // Intra predictors. Section 7.11.2. // These require access to one or both of the top row and left column. Some may // access the top-left (top[-1]), top-right (top[width+N]), bottom-left // (left[height+N]) or upper-left (left[-1]). // Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11), // 7.11.2.5, 7.11.2.6. // |dst| is an unaligned pointer to the output block. Pixel size is determined // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to // the row above |dst|. |left| is an aligned vector of the column to the left // of |dst|. top-left and bottom-left may be accessed. using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride, const void* top, const void* left); using IntraPredictorFuncs = IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors]; // Directional intra predictor function signature, zone 1 (0 < angle < 90). // Section 7.11.2.4 (#7). // |dst| is an unaligned pointer to the output block. Pixel size is determined // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to // the row above |dst|. 
|width| and |height| give the dimensions of the block. // |xstep| is the scaled starting index to |top| from // kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether // |top| has been upsampled as described in '7.11.2.11. Intra edge upsample // process'. This can occur in cases with |width| + |height| <= 16. top-right // is accessed. using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride, const void* top, int width, int height, int xstep, bool upsampled_top); // Directional intra predictor function signature, zone 2 (90 < angle < 180). // Section 7.11.2.4 (#8). // |dst| is an unaligned pointer to the output block. Pixel size is determined // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to // the row above |dst|. |left| is an aligned vector of the column to the left of // |dst|. |width| and |height| give the dimensions of the block. |xstep| and // |ystep| are the scaled starting index to |top| and |left|, respectively, // from kDirectionalIntraPredictorDerivative. |upsampled_top| and // |upsampled_left| indicate whether |top| and |left| have been upsampled as // described in '7.11.2.11. Intra edge upsample process'. This can occur in // cases with |width| + |height| <= 16. top-left and upper-left are accessed, // up to [-2] in each if |upsampled_top/left| are set. using DirectionalIntraPredictorZone2Func = void (*)( void* dst, ptrdiff_t stride, const void* top, const void* left, int width, int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left); // Directional intra predictor function signature, zone 3 (180 < angle < 270). // Section 7.11.2.4 (#9). // |dst| is an unaligned pointer to the output block. Pixel size is determined // by bitdepth with |stride| given in bytes. |left| is an aligned vector of the // column to the left of |dst|. |width| and |height| give the dimensions of the // block. 
|ystep| is the scaled starting index to |left| from // kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether // |left| has been upsampled as described in '7.11.2.11. Intra edge upsample // process'. This can occur in cases with |width| + |height| <= 16. bottom-left // is accessed. using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride, const void* left, int width, int height, int ystep, bool upsampled_left); // Filter intra predictor function signature. Section 7.11.2.3. // |dst| is an unaligned pointer to the output block. Pixel size is determined // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to // the row above |dst|. |left| is an aligned vector of the column to the left // of |dst|. |width| and |height| are the size of the block in pixels. using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride, const void* top, const void* left, FilterIntraPredictor pred, int width, int height); //------------------------------------------------------------------------------ // Chroma from Luma (Cfl) prediction. Section 7.11.5. // Chroma from Luma (Cfl) intra prediction function signature. |dst| is an // unaligned pointer to the output block. Pixel size is determined by bitdepth // with |stride| given in bytes. |luma| contains subsampled luma pixels with 3 // fractional bits of precision. |alpha| is the signed Cfl alpha value for the // appropriate plane. using CflIntraPredictorFunc = void (*)( void* dst, ptrdiff_t stride, const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha); using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes]; // Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned // pointer to the output block. |src| is an unaligned pointer to the input // block. Pixel size is determined by bitdepth with |stride| given in bytes. 
using CflSubsamplerFunc = void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int max_luma_width, int max_luma_height, const void* source, ptrdiff_t stride); using CflSubsamplerFuncs = CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes]; //------------------------------------------------------------------------------ // Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4. // Intra edge filter function signature. |buffer| is a pointer to the top_row or // left_column that needs to be filtered. Typically the -1'th index of |top_row| // and |left_column| need to be filtered as well, so the caller can merely pass // the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by // bitdepth. |size| is the number of pixels to be filtered. |strength| is the // filter strength. Section 7.11.2.12 in the spec. using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength); // Intra edge upsampler function signature. |buffer| is a pointer to the top_row // or left_column that needs to be upsampled. Pixel size is determined by // bitdepth. |size| is the number of pixels to be upsampled; valid values are: // 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of // the |buffer|. Section 7.11.2.11 in the spec. using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size); //------------------------------------------------------------------------------ // Inverse transform add function signature. // // Steps 2 and 3 of section 7.12.3 (contains the implementation of section // 7.13.3). // Apply the inverse transforms and add the residual to the destination frame // for the transform type and block size |tx_size| starting at position // |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D. // |adjusted_tx_height| is the number of rows to process based on the non-zero // coefficient count in the block. 
It will be 1 (non-zero coefficient count == // 1), 4 or a multiple of 8 up to 32 or the original transform height, // whichever is less. using InverseTransformAddFunc = void (*)(TransformType tx_type, TransformSize tx_size, int adjusted_tx_height, void* src_buffer, int start_x, int start_y, void* dst_frame); // The final dimension holds row and column transforms indexed with kRow and // kColumn. using InverseTransformAddFuncs = InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2]; //------------------------------------------------------------------------------ // Post processing. // Loop filter function signature. Section 7.14. // |dst| is an unaligned pointer to the output block. Pixel size is determined // by bitdepth with |stride| given in bytes. using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh, int inner_thresh, int hev_thresh); using LoopFilterFuncs = LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes]; // Cdef direction function signature. Section 7.15.2. // |src| is a pointer to the source block. Pixel size is determined by bitdepth // with |stride| given in bytes. |direction| and |variance| are output // parameters and must not be nullptr. using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride, uint8_t* direction, int* variance); // Cdef filtering function signature. Section 7.15.3. // |source| is a pointer to the input block padded with kCdefLargeValue if at a // frame border. |source_stride| is given in units of uint16_t. // |block_width|, |block_height| are the width/height of the input block. // |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering // parameters. // |direction| is the filtering direction. // |dest| is the output buffer. |dest_stride| is given in bytes. 
using CdefFilteringFunc = void (*)(const uint16_t* source, ptrdiff_t source_stride, int block_height, int primary_strength, int secondary_strength, int damping, int direction, void* dest, ptrdiff_t dest_stride); // The first index is block width: [0]: 4, [1]: 8. The second is based on // non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]: // |primary_strength| only, [2]: |secondary_strength| only. using CdefFilteringFuncs = CdefFilteringFunc[2][3]; // Upscaling coefficients function signature. Section 7.16. // This is an auxiliary function for SIMD optimizations and has no corresponding // C function. Different SIMD versions may have different outputs. So it must // pair with the corresponding version of SuperResFunc. // |upscaled_width| is the width of the output frame. // |step| is the number of subpixels to move the kernel for the next destination // pixel. // |initial_subpixel_x| is a base offset from which |step| increments. // |coefficients| is the upscale filter used by each pixel in a row. using SuperResCoefficientsFunc = void (*)(int upscaled_width, int initial_subpixel_x, int step, void* coefficients); // Upscaling process function signature. Section 7.16. // |coefficients| is the upscale filter used by each pixel in a row. It is not // used by the C function. // |source| is the input frame buffer. It will be line extended. // |source_stride| is given in pixels. // |dest| is the output buffer. // |dest_stride| is given in pixels. // |height| is the height of the block to be processed. // |downscaled_width| is the width of the input frame. // |upscaled_width| is the width of the output frame. // |step| is the number of subpixels to move the kernel for the next destination // pixel. // |initial_subpixel_x| is a base offset from which |step| increments. 
using SuperResFunc = void (*)(const void* coefficients, void* source, ptrdiff_t source_stride, int height, int downscaled_width, int upscaled_width, int initial_subpixel_x, int step, void* dest, ptrdiff_t dest_stride); // Loop restoration function signature. Sections 7.16, 7.17. // |restoration_info| contains loop restoration information, such as filter // type, strength. // |source| is the input frame buffer, which is deblocked and cdef filtered. // |top_border| and |bottom_border| are the top and bottom borders. // |dest| is the output. // |stride| is given in pixels, and shared by |source| and |dest|. // |top_border_stride| and |bottom_border_stride| are given in pixels. // |restoration_buffer| contains buffers required for self guided filter and // wiener filter. They must be initialized before calling. using LoopRestorationFunc = void (*)( const RestorationUnitInfo& restoration_info, const void* source, ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride, const void* bottom_border, ptrdiff_t bottom_border_stride, int width, int height, RestorationBuffer* restoration_buffer, void* dest); // Index 0 is Wiener Filter. // Index 1 is Self Guided Restoration Filter. // This can be accessed as LoopRestorationType - 2. using LoopRestorationFuncs = LoopRestorationFunc[2]; // Convolve function signature. Section 7.11.3.4. // This function applies a horizontal filter followed by a vertical filter. // |reference| is the input block (reference frame buffer). |reference_stride| // is the corresponding frame stride. // |vertical_filter_index|/|horizontal_filter_index| is the index to // retrieve the type of filter to be applied for vertical/horizontal direction // from the filter lookup table 'kSubPixelFilters'. // |horizontal_filter_id| and |vertical_filter_id| are the filter ids. // |width| and |height| are width and height of the block to be filtered. // |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in // x/y direction. 
// |prediction| is the output block (output frame buffer).
// Rounding precision is derived from the function being called. For horizontal
// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
// be used.
// NOTE(review): the description of this signature mentions |ref_last_x| and
// |ref_last_y|, which are not parameters here; that sentence looks stale --
// confirm.
using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
                              int horizontal_filter_index,
                              int vertical_filter_index,
                              int horizontal_filter_id, int vertical_filter_id,
                              int width, int height, void* prediction,
                              ptrdiff_t pred_stride);

// Convolve functions signature. Each points to one convolve function with
// a specific setting:
// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
// [has_horizontal_filter].
// If is_compound is false, the prediction is clipped to Pixel.
// If is_compound is true, the range of prediction is:
//   8bpp:  [-5132,  9212] (int16_t)
//   10bpp: [ 3988, 61532] (uint16_t)
//   12bpp: [ 3974, 61559] (uint16_t)
// See src/dsp/convolve.cc
using ConvolveFuncs = ConvolveFunc[2][2][2][2];

// Convolve + scale function signature. Section 7.11.3.4.
// This function applies a horizontal filter followed by a vertical filter.
// |reference| is the input block (reference frame buffer). |reference_stride|
// is the corresponding frame stride.
// |vertical_filter_index|/|horizontal_filter_index| is the index to
// retrieve the type of filter to be applied for vertical/horizontal direction
// from the filter lookup table 'kSubPixelFilters'.
// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
// |width| and |height| are width and height of the block to be filtered.
// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
// x/y direction.
// NOTE(review): |ref_last_x| and |ref_last_y| are not parameters of
// ConvolveScaleFunc; the sentence above looks stale -- confirm.
// |prediction| is the output block (output frame buffer).
// Rounding precision is derived from the function being called. For horizontal
// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
// be used.
using ConvolveScaleFunc = void (*)(const void* reference,
                                   ptrdiff_t reference_stride,
                                   int horizontal_filter_index,
                                   int vertical_filter_index, int subpixel_x,
                                   int subpixel_y, int step_x, int step_y,
                                   int width, int height, void* prediction,
                                   ptrdiff_t pred_stride);

// Convolve functions signature for scaling version.
// 0: single predictor. 1: compound predictor.
using ConvolveScaleFuncs = ConvolveScaleFunc[2];

// Weight mask function signature. Section 7.11.3.12.
// |prediction_0| is the first input block.
// |prediction_1| is the second input block. Both blocks are int16_t* when
// bitdepth == 8 and uint16_t* otherwise.
// |width| and |height| are the prediction width and height.
// The stride for the input buffers is equal to |width|.
// The valid range of block size is [8x8, 128x128] for the luma plane.
// |mask| is the output buffer. |mask_stride| is the output buffer stride.
// Note: |width| and |height| are not parameters of this signature; they are
// the width and height indices of WeightMaskFuncs.
using WeightMaskFunc = void (*)(const void* prediction_0,
                                const void* prediction_1, uint8_t* mask,
                                ptrdiff_t mask_stride);

// Weight mask functions signature. The dimensions (in order) are:
//   * Width index (4 => 0, 8 => 1, 16 => 2 and so on).
//   * Height index (4 => 0, 8 => 1, 16 => 2 and so on).
//   * mask_is_inverse.
using WeightMaskFuncs = WeightMaskFunc[6][6][2];

// Average blending function signature.
// Two predictors are averaged to generate the output.
// Input predictor values are int16_t. Output type is uint8_t, with actual
// range of Pixel value.
// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
// |prediction_0| is the first input block.
// |prediction_1| is the second input block.
Both blocks are int16_t* when // bitdepth == 8 and uint16_t* otherwise. // |width| and |height| are the same for the first and second input blocks. // The stride for the input buffers is equal to |width|. // The valid range of block size is [8x8, 128x128] for the luma plane. // |dest| is the output buffer. |dest_stride| is the output buffer stride. using AverageBlendFunc = void (*)(const void* prediction_0, const void* prediction_1, int width, int height, void* dest, ptrdiff_t dest_stride); // Distance weighted blending function signature. // Weights are generated in Section 7.11.3.15. // Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE). // This function takes two blocks (inter frame prediction) and produces a // weighted output. // |prediction_0| is the first input block. // |prediction_1| is the second input block. Both blocks are int16_t* when // bitdepth == 8 and uint16_t* otherwise. // |weight_0| is the weight for the first block. It is derived from the relative // distance of the first reference frame and the current frame. // |weight_1| is the weight for the second block. It is derived from the // relative distance of the second reference frame and the current frame. // |width| and |height| are the same for the first and second input blocks. // The stride for the input buffers is equal to |width|. // The valid range of block size is [8x8, 128x128] for the luma plane. // |dest| is the output buffer. |dest_stride| is the output buffer stride. using DistanceWeightedBlendFunc = void (*)(const void* prediction_0, const void* prediction_1, uint8_t weight_0, uint8_t weight_1, int width, int height, void* dest, ptrdiff_t dest_stride); // Mask blending function signature. Section 7.11.3.14. // This function takes two blocks and produces a blended output stored into the // output block |dest|. The blending is a weighted average process, controlled // by values of the mask. // |prediction_0| is the first input block. 
When prediction mode is inter_intra // (or wedge_inter_intra), this refers to the inter frame prediction. It is // int16_t* when bitdepth == 8 and uint16_t* otherwise. // The stride for |prediction_0| is equal to |width|. // |prediction_1| is the second input block. When prediction mode is inter_intra // (or wedge_inter_intra), this refers to the intra frame prediction and uses // Pixel values. It is only used for intra frame prediction when bitdepth >= 10. // It is int16_t* when bitdepth == 8 and uint16_t* otherwise. // |prediction_stride_1| is the stride, given in units of [u]int16_t. When // |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is // equal to |width|. // |mask| is an integer array, whose value indicates the weight of the blending. // |mask_stride| is corresponding stride. // |width|, |height| are the same for both input blocks. // If it's inter_intra (or wedge_inter_intra), the valid range of block size is // [8x8, 32x32]. Otherwise (including difference weighted prediction and // compound average prediction), the valid range is [8x8, 128x128]. // If there's subsampling, the corresponding width and height are halved for // chroma planes. // |subsampling_x|, |subsampling_y| are the subsampling factors. // |is_inter_intra| stands for the prediction mode. If it is true, one of the // prediction blocks is from intra prediction of current frame. Otherwise, two // prediction blocks are both inter frame predictions. // |is_wedge_inter_intra| indicates if the mask is for the wedge prediction. // |dest| is the output block. // |dest_stride| is the corresponding stride for dest. using MaskBlendFunc = void (*)(const void* prediction_0, const void* prediction_1, ptrdiff_t prediction_stride_1, const uint8_t* mask, ptrdiff_t mask_stride, int width, int height, void* dest, ptrdiff_t dest_stride); // Mask blending functions signature. 
Each points to one function with // a specific setting: // MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra]. using MaskBlendFuncs = MaskBlendFunc[3][2]; // This function is similar to the MaskBlendFunc. It is only used when // |is_inter_intra| is true and |bitdepth| == 8. // |prediction_[01]| are Pixel values (uint8_t). // |prediction_1| is also the output buffer. using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0, uint8_t* prediction_1, ptrdiff_t prediction_stride_1, const uint8_t* mask, ptrdiff_t mask_stride, int width, int height); // InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra // is false, the function at index 0 must be used. Otherwise, the function at // index subsampling_x + subsampling_y must be used. using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3]; // Obmc (overlapped block motion compensation) blending function signature. // Section 7.11.3.10. // This function takes two blocks and produces a blended output stored into the // first input block. The blending is a weighted average process, controlled by // values of the mask. // Obmc is not a compound mode. It is different from other compound blending, // in terms of precision. The current block is computed using convolution with // clipping to the range of pixel values. Its above and left blocks are also // clipped. Therefore obmc blending process doesn't need to clip the output. // |prediction| is the first input block, which will be overwritten. // |prediction_stride| is the stride, given in bytes. // |width|, |height| are the same for both input blocks. // |obmc_prediction| is the second input block. // |obmc_prediction_stride| is its stride, given in bytes. using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride, int width, int height, const void* obmc_prediction, ptrdiff_t obmc_prediction_stride); using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections]; // Warp function signature. 
Section 7.11.3.5. // This function applies warp filtering for each 8x8 block inside the current // coding block. The filtering process is similar to 2d convolve filtering. // The horizontal filter is applied followed by the vertical filter. // The function has to calculate corresponding pixel positions before and // after warping. // |source| is the input reference frame buffer. // |source_stride|, |source_width|, |source_height| are corresponding frame // stride, width, and height. |source_stride| is given in bytes. // |warp_params| is the matrix of warp motion: warp_params[i] = mN. // [x' (m2 m3 m0 [x // z . y' = m4 m5 m1 * y // 1] m6 m7 1) 1] // |subsampling_x/y| is the current frame's plane subsampling factor. // |block_start_x| and |block_start_y| are the starting position the current // coding block. // |block_width| and |block_height| are width and height of the current coding // block. |block_width| and |block_height| are at least 8. // |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the // comments in the definition of struct GlobalMotion for the range of their // values. // |dest| is the output buffer of type Pixel. The output values are clipped to // Pixel values. // |dest_stride| is the stride, in units of bytes. // Rounding precision is derived from the function being called. For horizontal // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be // used. For vertical filtering kInterRoundBitsVertical & // kInterRoundBitsVertical12bpp will be used. // // NOTE: WarpFunc assumes the source frame has left, right, top, and bottom // borders that extend the frame boundary pixels. // * The left and right borders must be at least 13 pixels wide. In addition, // Warp_NEON() may read up to 14 bytes after a row in the |source| buffer. // Therefore, there must be at least one extra padding byte after the right // border of the last row in the source buffer. // * The top and bottom borders must be at least 13 pixels high. 
using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride, int source_width, int source_height, const int* warp_params, int subsampling_x, int subsampling_y, int block_start_x, int block_start_y, int block_width, int block_height, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta, void* dest, ptrdiff_t dest_stride); // Warp for compound predictions. Section 7.11.3.5. // Similar to WarpFunc, but |dest| is a uint16_t predictor buffer, // |dest_stride| is given in units of uint16_t and |inter_round_bits_vertical| // is always 7 (kCompoundInterRoundBitsVertical). // Rounding precision is derived from the function being called. For horizontal // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be // used. For vertical filtering kInterRoundBitsCompondVertical will be used. using WarpCompoundFunc = WarpFunc; constexpr int kNumAutoRegressionLags = 4; // Applies an auto-regressive filter to the white noise in |luma_grain_buffer|. // Section 7.18.3.3, second code block // |params| are parameters read from frame header, mainly providing // auto_regression_coeff_y for the filter and auto_regression_shift to right // shift the filter sum by. Note: This method assumes // params.auto_regression_coeff_lag is not 0. Do not call this method if // params.auto_regression_coeff_lag is 0. using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params, void* luma_grain_buffer); // Function index is auto_regression_coeff_lag - 1. using LumaAutoRegressionFuncs = LumaAutoRegressionFunc[kNumAutoRegressionLags - 1]; // Applies an auto-regressive filter to the white noise in u_grain and v_grain. // Section 7.18.3.3, third code block // The |luma_grain_buffer| provides samples that are added to the autoregressive // sum when num_y_points > 0. 
// |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
// that were generated from the stored Gaussian sequence, and are overwritten
// with the results of the autoregressive filter. |params| are parameters read
// from frame header, mainly providing auto_regression_coeff_u and
// auto_regression_coeff_v for each chroma plane's filter, and
// auto_regression_shift to right shift the filter sums by.
using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
                                          const void* luma_grain_buffer,
                                          int subsampling_x, int subsampling_y,
                                          void* u_grain_buffer,
                                          void* v_grain_buffer);
// First index is use_luma. The second dimension has kNumAutoRegressionLags
// entries; presumably it is indexed directly by auto_regression_coeff_lag
// (unlike the luma table, which is offset by one) -- TODO confirm.
using ChromaAutoRegressionFuncs =
    ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];

// Build an image-wide "stripe" of grain noise for every 32 rows in the image.
// Section 7.18.3.5, first code block.
// Each 32x32 luma block is copied at a random offset specified via
// |grain_seed| from the grain template produced by autoregression, and the same
// is done for chroma grains, subject to subsampling.
// |width| and |height| are the dimensions of the overall image.
// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
// Because this function treats all planes identically and independently, it is
// simplified to take one grain buffer at a time. This means duplicating some
// random number generations, but that work can be reduced in other ways.
using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
                                           int grain_seed, int width,
                                           int height, int subsampling_x,
                                           int subsampling_y,
                                           void* noise_stripes_buffer);
// Function index is overlap_flag.
using ConstructNoiseStripesFuncs =
    ConstructNoiseStripesFunc[/*overlap_flag*/ 2];

// Compute the one or two overlap rows for each stripe copied to the noise
// image.
// Section 7.18.3.5, second code block. |width| and |height| are the
// dimensions of the overall image. |noise_stripes_buffer| points to an
// Array2DView with one row for each stripe. |noise_image_buffer| points to an
// Array2D containing the allocated plane for this frame. Because this function
// treats all planes identically and independently, it is simplified to take one
// grain buffer at a time.
using ConstructNoiseImageOverlapFunc =
    void (*)(const void* noise_stripes_buffer, int width, int height,
             int subsampling_x, int subsampling_y, void* noise_image_buffer);

// Populate a scaling lookup table with interpolated values of a piecewise
// linear function where values in |point_value| are mapped to the values in
// |point_scaling|.
// |num_points| can be between 0 and 15. When 0, the lookup table is set to
// zero.
// |point_value| and |point_scaling| have |num_points| valid elements.
// |scaling_lut| is the output table of kScalingLookupTableSize entries.
using InitializeScalingLutFunc = void (*)(
    int num_points, const uint8_t point_value[],
    const uint8_t point_scaling[], uint8_t scaling_lut[kScalingLookupTableSize]);

// Blend noise with image. Section 7.18.3.5, third code block.
// |width| is the width of each row, while |height| is how many rows to compute.
// |start_height| is an offset for the noise image, to support multithreading.
// |min_value| and |max_value| are computed by the caller of these functions,
// according to the code in the spec. |max_value| presumably carries the spec's
// maximum luma or maximum chroma value, depending on the plane being blended
// -- TODO confirm.
// |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
// frame. They are blended with the film grain noise and written to
// |dest_plane_y| and |dest_plane_uv| as final output for display.
// source_plane_* and dest_plane_* may point to the same buffer, in which case
// the film grain noise is added in place.
// |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
// the frame's raw pixel value, to a scaling factor for the noise sample.
// |scaling_shift| is applied as a right shift after scaling, so that scaling
// down is possible. It is found in FilmGrainParams, but supplied directly to
// BlendNoiseWithImageLumaFunc because it's the only member used.
using BlendNoiseWithImageLumaFunc = void (*)(
    const void* noise_image_ptr, int min_value, int max_value,
    int scaling_shift, int width, int height, int start_height,
    const uint8_t scaling_lut_y[kScalingLookupTableSize],
    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
    ptrdiff_t dest_stride_y);

// Chroma counterpart of BlendNoiseWithImageLumaFunc. It takes the whole
// |params| struct rather than a single |scaling_shift|, the |plane| being
// processed, the subsampling factors, and both the luma and the chroma source
// planes; the result is written to |dest_plane_uv|.
using BlendNoiseWithImageChromaFunc = void (*)(
    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
    int min_value, int max_value, int width, int height, int start_height,
    int subsampling_x, int subsampling_y,
    const uint8_t scaling_lut[kScalingLookupTableSize],
    const void* source_plane_y, ptrdiff_t source_stride_y,
    const void* source_plane_uv, ptrdiff_t source_stride_uv,
    void* dest_plane_uv, ptrdiff_t dest_stride_uv);

// Function index is chroma_scaling_from_luma.
using BlendNoiseWithImageChromaFuncs =
    BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];

//------------------------------------------------------------------------------

// Groups the film grain function pointers and function tables into a single
// table, one member per function type.
struct FilmGrainFuncs {
  LumaAutoRegressionFuncs luma_auto_regression;
  ChromaAutoRegressionFuncs chroma_auto_regression;
  ConstructNoiseStripesFuncs construct_noise_stripes;
  ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
  InitializeScalingLutFunc initialize_scaling_lut;
  BlendNoiseWithImageLumaFunc blend_noise_luma;
  BlendNoiseWithImageChromaFuncs blend_noise_chroma;
};

// Motion field projection function signature. Section 7.9.
// |reference_info| provides reference information for motion field projection.
// |reference_to_current_with_sign| is the precalculated reference frame id
// distance from current frame.
// |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
// |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
// |x8_start| and |x8_end| are the start and end 8x8 columns of the current
// tile.
// |motion_field| is the output which saves the projected motion field
// information.
using MotionFieldProjectionKernelFunc = void (*)(
    const ReferenceInfo& reference_info, int reference_to_current_with_sign,
    int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
    TemporalMotionField* motion_field);

// Compound temporal motion vector projection function signature.
// Sections 7.9.3 and 7.10.2.10.
// |temporal_mvs| is the set of temporal reference motion vectors.
// |temporal_reference_offsets| specifies the number of frames covered by the
// original motion vector.
// |reference_offsets| specifies the number of frames to be covered by the
// projected motion vector; it holds two entries, one per reference of the
// compound pair.
// |count| is the number of temporal motion vectors.
// |candidate_mvs| is the set of projected motion vectors (output; presumably
// |count| entries -- TODO confirm).
using MvProjectionCompoundFunc = void (*)(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    const int reference_offsets[2], int count,
    CompoundMotionVector* candidate_mvs);

// Single temporal motion vector projection function signature.
// Section 7.9.3 and 7.10.2.10.
// |temporal_mvs| is the set of temporal reference motion vectors.
// |temporal_reference_offsets| specifies the number of frames covered by the
// original motion vector.
// |reference_offset| specifies the number of frames to be covered by the
// projected motion vector.
// |count| is the number of the temporal motion vectors.
// |candidate_mvs| is the set of projected motion vectors.
using MvProjectionSingleFunc = void (*)(
    const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
    int reference_offset, int count, MotionVector* candidate_mvs);

// The table of DSP entry points: one member per function pointer or function
// table type declared in this header. GetDspTable() hands out a table for a
// given bitdepth.
struct Dsp {
  AverageBlendFunc average_blend;
  CdefDirectionFunc cdef_direction;
  CdefFilteringFuncs cdef_filters;
  CflIntraPredictorFuncs cfl_intra_predictors;
  CflSubsamplerFuncs cfl_subsamplers;
  ConvolveFuncs convolve;
  ConvolveScaleFuncs convolve_scale;
  DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
  DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
  DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
  DistanceWeightedBlendFunc distance_weighted_blend;
  FilmGrainFuncs film_grain;
  FilterIntraPredictorFunc filter_intra_predictor;
  InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
  IntraEdgeFilterFunc intra_edge_filter;
  IntraEdgeUpsamplerFunc intra_edge_upsampler;
  IntraPredictorFuncs intra_predictors;
  InverseTransformAddFuncs inverse_transforms;
  LoopFilterFuncs loop_filters;
  LoopRestorationFuncs loop_restorations;
  MaskBlendFuncs mask_blend;
  MotionFieldProjectionKernelFunc motion_field_projection_kernel;
  // NOTE(review): the meaning of the index into the two 3-entry arrays below
  // is not documented in this header -- confirm against the callers.
  MvProjectionCompoundFunc mv_projection_compound[3];
  MvProjectionSingleFunc mv_projection_single[3];
  ObmcBlendFuncs obmc_blend;
  SuperResCoefficientsFunc super_res_coefficients;
  SuperResFunc super_res;
  WarpCompoundFunc warp_compound;
  WarpFunc warp;
  WeightMaskFuncs weight_mask;
};

// Initializes function pointers based on build config and runtime
// environment. Must be called once before first use. This function is
// thread-safe.
void DspInit();

// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
// exist.
const Dsp* GetDspTable(int bitdepth);

}  // namespace dsp

namespace dsp_internal {

// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
// functions if /arch:AVX2 is used across all sources.
// When the condition below holds, LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is forced to
// 1, overriding whatever value it had before this point.
#if !LIBGAV1_TARGETING_AVX2 && \
    (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
#endif

// Returns true if a more highly optimized version of |func| is not defined for
// the associated bitdepth or if it is forcibly enabled with
// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
// to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
// with the module.
// |func| is one of:
//   - FunctionName, e.g., SelfGuidedFilter.
//   - [sub-table-index1][...-indexN] e.g.,
//     TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
//     used as lookups with leading 'k' removed.
//
// When one of these macros is used in a preprocessor conditional, a
// LIBGAV1_Dsp<bitdepth>bpp_|func| identifier that was never defined evaluates
// to 0 there (standard #if behavior).
//
// NEON support is the only extension available for ARM and it is always
// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
// true and can be omitted.
#define DSP_ENABLED_8BPP_AVX2(func)    \
  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
#define DSP_ENABLED_10BPP_AVX2(func)   \
  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
#define DSP_ENABLED_8BPP_SSE4_1(func)  \
  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
#define DSP_ENABLED_10BPP_SSE4_1(func) \
  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)

// Initializes C-only function pointers. Note some entries may be set to
// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is 0 (this header always defines
// the macro, defaulting it to 0). This is meant for use in tests only, it is
// not thread-safe.
void DspInit_C();

// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
// exist. This version is meant for use by test or dsp/*Init() functions only.
dsp::Dsp* GetWritableDspTable(int bitdepth);

}  // namespace dsp_internal
}  // namespace libgav1

#endif  // LIBGAV1_SRC_DSP_DSP_H_