diff options
Diffstat (limited to 'src/dsp/warp.cc')
-rw-r--r-- | src/dsp/warp.cc | 475 |
1 files changed, 475 insertions, 0 deletions
diff --git a/src/dsp/warp.cc b/src/dsp/warp.cc new file mode 100644 index 0000000..fbde65a --- /dev/null +++ b/src/dsp/warp.cc @@ -0,0 +1,475 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/warp.h" + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstdlib> +#include <type_traits> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" + +namespace libgav1 { +namespace dsp { +namespace { + +// Number of extra bits of precision in warped filtering. +constexpr int kWarpedDiffPrecisionBits = 10; + +// Warp prediction output ranges from WarpTest.ShowRange. +// Bitdepth: 8 Input range: [ 0, 255] +// 8bpp intermediate offset: 16384. +// intermediate range: [ 4399, 61009] +// first pass output range: [ 550, 7626] +// 8bpp intermediate offset removal: 262144. +// intermediate range: [ -620566, 1072406] +// second pass output range: [ 0, 255] +// compound second pass output range: [ -4848, 8378] +// +// Bitdepth: 10 Input range: [ 0, 1023] +// intermediate range: [ -48081, 179025] +// first pass output range: [ -6010, 22378] +// intermediate range: [-2103516, 4198620] +// second pass output range: [ 0, 1023] +// compound second pass output range: [ 8142, 57378] +// +// Bitdepth: 12 Input range: [ 0, 4095] +// intermediate range: [ -192465, 716625] +// first pass output range: [ -6015, 22395] +// intermediate range: [-2105190, 4201830] +// second pass output range: [ 0, 4095] +// compound second pass output range: [ 8129, 57403] + +template <bool is_compound, int bitdepth, typename Pixel> +void Warp_C(const void* const source, ptrdiff_t source_stride, + const int source_width, const int source_height, + const int* const warp_params, const int subsampling_x, + const int subsampling_y, const int block_start_x, + const int block_start_y, const int block_width, + const int block_height, const int16_t alpha, const int16_t beta, + const int16_t gamma, const int16_t delta, void* dest, + ptrdiff_t dest_stride) { + assert(block_width >= 8 && block_height >= 8); + if (is_compound) { + assert(dest_stride == block_width); + } + constexpr int kRoundBitsHorizontal = (bitdepth == 12) + ? kInterRoundBitsHorizontal12bpp + : kInterRoundBitsHorizontal; + constexpr int kRoundBitsVertical = + is_compound ? kInterRoundBitsCompoundVertical + : (bitdepth == 12) ? kInterRoundBitsVertical12bpp + : kInterRoundBitsVertical; + + // Only used for 8bpp. Allows for keeping the first pass intermediates within + // uint16_t. With 10/12bpp the intermediate value will always require int32_t. + constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0; + constexpr int offset_removal = + (first_pass_offset >> kRoundBitsHorizontal) * 128; + + constexpr int kMaxPixel = (1 << bitdepth) - 1; + union { + // |intermediate_result| is the output of the horizontal filtering and + // rounding. The range is within int16_t. + int16_t intermediate_result[15][8]; // 15 rows, 8 columns. + // In the simple special cases where the samples in each row are all the + // same, store one sample per row in a column vector. + int16_t intermediate_result_column[15]; + }; + const auto* const src = static_cast<const Pixel*>(source); + source_stride /= sizeof(Pixel); + using DestType = + typename std::conditional<is_compound, uint16_t, Pixel>::type; + auto* dst = static_cast<DestType*>(dest); + if (!is_compound) dest_stride /= sizeof(dst[0]); + + assert(block_width >= 8); + assert(block_height >= 8); + + // Warp process applies for each 8x8 block (or smaller). + for (int start_y = block_start_y; start_y < block_start_y + block_height; + start_y += 8) { + for (int start_x = block_start_x; start_x < block_start_x + block_width; + start_x += 8) { + const int src_x = (start_x + 4) << subsampling_x; + const int src_y = (start_y + 4) << subsampling_y; + const int dst_x = + src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0]; + const int dst_y = + src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1]; + const int x4 = dst_x >> subsampling_x; + const int y4 = dst_y >> subsampling_y; + const int ix4 = x4 >> kWarpedModelPrecisionBits; + const int iy4 = y4 >> kWarpedModelPrecisionBits; + + // A prediction block may fall outside the frame's boundaries. If a + // prediction block is calculated using only samples outside the frame's + // boundary, the filtering can be simplified. We can divide the plane + // into several regions and handle them differently. + // + // | | + // 1 | 3 | 1 + // | | + // -------+-----------+------- + // |***********| + // 2 |*****4*****| 2 + // |***********| + // -------+-----------+------- + // | | + // 1 | 3 | 1 + // | | + // + // At the center, region 4 represents the frame and is the general case. + // + // In regions 1 and 2, the prediction block is outside the frame's + // boundary horizontally. Therefore the horizontal filtering can be + // simplified. Furthermore, in the region 1 (at the four corners), the + // prediction is outside the frame's boundary both horizontally and + // vertically, so we get a constant prediction block. + // + // In region 3, the prediction block is outside the frame's boundary + // vertically. Unfortunately because we apply the horizontal filters + // first, by the time we apply the vertical filters, they no longer see + // simple inputs. So the only simplification is that all the rows are + // the same, but we still need to apply all the horizontal and vertical + // filters. + + // Check for two simple special cases, where the horizontal filter can + // be significantly simplified. + // + // In general, for each row, the horizontal filter is calculated as + // follows: + // for (int x = -4; x < 4; ++x) { + // const int offset = ...; + // int sum = first_pass_offset; + // for (int k = 0; k < 8; ++k) { + // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1); + // sum += kWarpedFilters[offset][k] * src_row[column]; + // } + // ... + // } + // The column index before clipping, ix4 + x + k - 3, varies in the range + // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1 + // or ix4 + 7 <= 0, then all the column indexes are clipped to the same + // border index (source_width - 1 or 0, respectively). Then for each x, + // the inner for loop of the horizontal filter is reduced to multiplying + // the border pixel by the sum of the filter coefficients. + if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) { + // Regions 1 and 2. + // Points to the left or right border of the first row of |src|. + const Pixel* first_row_border = + (ix4 + 7 <= 0) ? src : src + source_width - 1; + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 1. + // Every sample used to calculate the prediction block has the same + // value. So the whole prediction block has the same value. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const Pixel row_border_pixel = first_row_border[row * source_stride]; + DestType* dst_row = dst + start_x - block_start_x; + if (is_compound) { + int sum = row_border_pixel + << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical); + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + Memset(dst_row, sum, 8); + } else { + Memset(dst_row, row_border_pixel, 8); + } + const DestType* const first_dst_row = dst_row; + dst_row += dest_stride; + for (int y = 1; y < 8; ++y) { + memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row)); + dst_row += dest_stride; + } + // End of region 1. Continue the |start_x| for loop. + continue; + } + + // Region 2. + // Horizontal filter. + // The input values in this region are generated by extending the border + // which makes them identical in the horizontal direction. This + // computation could be inlined in the vertical pass but most + // implementations will need a transpose of some sort. + // It is not necessary to use the offset values here because the + // horizontal pass is a simple shift and the vertical pass will always + // require using 32 bits. + for (int y = -7; y < 8; ++y) { + // We may over-read up to 13 pixels above the top source row, or up + // to 13 pixels below the bottom source row. This is proved below. + const int row = iy4 + y; + int sum = first_row_border[row * source_stride]; + sum <<= kFilterBits - kRoundBitsHorizontal; + intermediate_result_column[y + 7] = sum; + } + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + int sum = 0; + for (int k = 0; k < 8; ++k) { + sum += + kWarpedFilters[offset][k] * intermediate_result_column[y + k]; + } + sum = RightShiftWithRounding(sum, kRoundBitsVertical); + if (is_compound) { + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + dst_row[x] = static_cast<DestType>(sum); + } else { + dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel)); + } + sy += gamma; + } + dst_row += dest_stride; + sy4 += delta; + } + // End of region 2. Continue the |start_x| for loop. + continue; + } + + // Regions 3 and 4. + // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0. + // It follows that -6 <= ix4 <= source_width + 5. This inequality is + // used below. + + // In general, for y in [-7, 8), the row number iy4 + y is clipped: + // const int row = Clip3(iy4 + y, 0, source_height - 1); + // In two special cases, iy4 + y is clipped to either 0 or + // source_height - 1 for all y. In the rest of the cases, iy4 + y is + // bounded and we can avoid clipping iy4 + y by relying on a reference + // frame's boundary extension on the top and bottom. + if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) { + // Region 3. + // Horizontal filter. + const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1; + const Pixel* const src_row = src + row * source_stride; + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + int sx = sx4 - MultiplyBy4(alpha); + for (int x = -4; x < 4; ++x) { + const int offset = + RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since alpha and beta have been validated by SetupShear(), one + // can prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + // For SIMD optimization: + // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp. + // For 10/12 bit, the range of sum requires 32 bits. + int sum = first_pass_offset; + for (int k = 0; k < 8; ++k) { + // We assume the source frame has left and right borders of at + // least 13 pixels that extend the frame boundary pixels. + // + // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on + // ix4 above, we have + // -13 <= ix4 + x + k - 3 <= source_width + 12, + // or + // -13 <= column <= (source_width - 1) + 13. + // Therefore we may over-read up to 13 pixels before the source + // row, or up to 13 pixels after the source row. + const int column = ix4 + x + k - 3; + sum += kWarpedFilters[offset][k] * src_row[column]; + } + intermediate_result[y + 7][x + 4] = + RightShiftWithRounding(sum, kRoundBitsHorizontal); + sx += alpha; + } + sx4 += beta; + } + } else { + // Region 4. + // Horizontal filter. + // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0. + // It follows that -6 <= iy4 <= source_height + 5. This inequality is + // used below. + int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7; + for (int y = -7; y < 8; ++y) { + // We assume the source frame has top and bottom borders of at least + // 13 pixels that extend the frame boundary pixels. + // + // Since -7 <= y <= 7, using the inequality on iy4 above, we have + // -13 <= iy4 + y <= source_height + 12, + // or + // -13 <= row <= (source_height - 1) + 13. + // Therefore we may over-read up to 13 pixels above the top source + // row, or up to 13 pixels below the bottom source row. + const int row = iy4 + y; + const Pixel* const src_row = src + row * source_stride; + int sx = sx4 - MultiplyBy4(alpha); + for (int x = -4; x < 4; ++x) { + const int offset = + RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since alpha and beta have been validated by SetupShear(), one + // can prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + // For SIMD optimization: + // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp. + // For 10/12 bit, the range of sum requires 32 bits. + int sum = first_pass_offset; + for (int k = 0; k < 8; ++k) { + // We assume the source frame has left and right borders of at + // least 13 pixels that extend the frame boundary pixels. + // + // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on + // ix4 above, we have + // -13 <= ix4 + x + k - 3 <= source_width + 12, + // or + // -13 <= column <= (source_width - 1) + 13. + // Therefore we may over-read up to 13 pixels before the source + // row, or up to 13 pixels after the source row. + const int column = ix4 + x + k - 3; + sum += kWarpedFilters[offset][k] * src_row[column]; + } + intermediate_result[y + 7][x + 4] = + RightShiftWithRounding(sum, kRoundBitsHorizontal) - + offset_removal; + sx += alpha; + } + sx4 += beta; + } + } + + // Regions 3 and 4. + // Vertical filter. + DestType* dst_row = dst + start_x - block_start_x; + int sy4 = + (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta); + // The spec says we should use the following loop condition: + // y < std::min(4, block_start_y + block_height - start_y - 4); + // We can prove that block_start_y + block_height - start_y >= 8, which + // implies std::min(4, block_start_y + block_height - start_y - 4) = 4. + // So the loop condition is simply y < 4. + // + // Proof: + // start_y < block_start_y + block_height + // => block_start_y + block_height - start_y > 0 + // => block_height - (start_y - block_start_y) > 0 + // + // Since block_height >= 8 and is a power of 2, it follows that + // block_height is a multiple of 8. start_y - block_start_y is also a + // multiple of 8. Therefore their difference is a multiple of 8. Since + // their difference is > 0, their difference must be >= 8. + // + // We then add an offset of 4 to y so that the loop starts with y = 0 + // and continues if y < 8. + for (int y = 0; y < 8; ++y) { + int sy = sy4 - MultiplyBy4(gamma); + // The spec says we should use the following loop condition: + // x < std::min(4, block_start_x + block_width - start_x - 4); + // Similar to the above, we can prove that the loop condition can be + // simplified to x < 4. + // + // We then add an offset of 4 to x so that the loop starts with x = 0 + // and continues if x < 8. + for (int x = 0; x < 8; ++x) { + const int offset = + RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) + + kWarpedPixelPrecisionShifts; + // Since gamma and delta have been validated by SetupShear(), one can + // prove that 0 <= offset <= 3 * 2^6. + assert(offset >= 0); + assert(offset < 3 * kWarpedPixelPrecisionShifts + 1); + int sum = 0; + for (int k = 0; k < 8; ++k) { + sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x]; + } + sum -= offset_removal; + sum = RightShiftWithRounding(sum, kRoundBitsVertical); + if (is_compound) { + sum += (bitdepth == 8) ? 0 : kCompoundOffset; + dst_row[x] = static_cast<DestType>(sum); + } else { + dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel)); + } + sy += gamma; + } + dst_row += dest_stride; + sy4 += delta; + } + } + dst += 8 * dest_stride; + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(8); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>; + dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp8bpp_Warp + dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>; +#endif +#ifndef LIBGAV1_Dsp8bpp_WarpCompound + dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(10); + assert(dsp != nullptr); +#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>; + dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>; +#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS + static_cast<void>(dsp); +#ifndef LIBGAV1_Dsp10bpp_Warp + dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>; +#endif +#ifndef LIBGAV1_Dsp10bpp_WarpCompound + dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>; +#endif +#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS +} +#endif + +} // namespace + +void WarpInit_C() { + Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + Init10bpp(); +#endif +} + +} // namespace dsp +} // namespace libgav1 |