diff options
Diffstat (limited to 'src/dsp/arm/intrapred_directional_neon.cc')
-rw-r--r-- | src/dsp/arm/intrapred_directional_neon.cc | 688 |
1 files changed, 411 insertions, 277 deletions
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 3cad4a6..e9bdcf0 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH( } while (++y < height); } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - -// 7.11.2.4 (8) 90 < angle > 180 -// The strategy for these functions (4xH and 8+xH) is to know how many blocks -// can be processed with just pixels from |top_ptr|, then handle mixed blocks, -// then handle only blocks that take from |left_ptr|. Additionally, a fast -// index-shuffle approach is used for pred values from |left_column| in sections -// that permit it. +// 7.11.2.4 (8) 90 < angle > 180 +// The strategy for these functions (4xH and 8+xH) is to know how many blocks +// can be processed with just pixels from |top_ptr|, then handle mixed blocks, +// then handle only blocks that take from |left_ptr|. Additionally, a fast +// index-shuffle approach is used for pred values from |left_column| in +// sections that permit it. inline void DirectionalZone2_4xH( uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, @@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH( assert(xstep >= 3); const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4); - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - // TODO(johannkoenig): Revisit this for |width| == 4. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH( // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. if (min_top_only_x > 0) { - // Round down to the nearest multiple of 8. - // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep, upsampled_top); @@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH( // All rows from |min_left_only_y| down for this set of columns only need // |left_column| to compute. const int min_left_only_y = std::min((4 << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); int xstep_bounds = xstep_bounds_base + xstep_y; int top_x = -xstep - xstep_y; // +8 increment is OK because if height is 4 this only goes once. - for (; y < left_shuffle_stop_y; + for (; y < min_left_only_y; y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { DirectionalZone2FromLeftCol_WxH<4>( dst, stride, min_height, @@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH( upsample_top_shift); } - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<4>( - dst, stride, min_height, - left_column + ((y - left_base_increment) << upsample_left_shift), - base_left_y, -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row, - xstep_bounds, top_x, xstep, - upsample_top_shift); - } // Loop over y for left_only rows. + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); for (; y < height; y += 8, dst += stride8) { DirectionalZone3_WxH<4>( dst, stride, min_height, @@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH( } } -// Process a multiple of 8 |width|. -inline void DirectionalZone2_8( +template <bool shuffle_left_column> +inline void DirectionalZone2_8xH( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint8_t* LIBGAV1_RESTRICT const top_row, - const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, - const int height, const int xstep, const int ystep, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y, const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); - // Helper vector. - const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop incrementers for moving by block (8x8). This function handles blocks // with height 4 as well. They are calculated in one pass so these variables // do not get used. const ptrdiff_t stride8 = stride << 3; const int xstep8 = xstep << 3; - const int ystep8 = ystep << 3; - // Process Wx4 blocks. + // Cover 8x4 case. const int min_height = (height == 4) ? 4 : 8; - // All columns from |min_top_only_x| to the right will only need |top_row| to - // compute and can therefore call the Zone1 functions. This assumes |xstep| is - // at least 3. - assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); + DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep, + upsampled_top); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + const int16_t base_left_y = vgetq_lane_s16(left_y, 0); + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsample_left_shift); + } else { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } + + DirectionalZone1Blend_WxH<8>( + dst_x, stride, min_height, top_row + (x << upsample_top_shift), + xstep_bounds, top_x, xstep, upsample_top_shift); + } + + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_WxH<8>( + dst_x, stride, min_height, + left_column + ((left_offset + y) << upsample_left_shift), base_left_y, + -ystep, upsample_left_shift); + } +} + +// Process a multiple of 8 |width|. +inline void DirectionalZone2_WxH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { + const int ystep8 = ystep << 3; // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -677,90 +696,43 @@ inline void DirectionalZone2_8( // left_y vector omits the portion which is covered under the left_column // offset. Following values need the full ystep as a relative offset. const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); + const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); + // For ystep > 90, at least two sets of 8 columns can be fully computed from + // top_row only. + const int min_top_only_x = std::min((height * xstep) >> 6, width); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. int x = 0; - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. |d| represents the number of pixels that can - // fit in one contiguous vector when stepping by |ystep|. For a given x - // position, the left column values can be obtained by VTBL as long as the - // values at row[x + d] and beyond come from the top row. However, this does - // not guarantee that the vector will also contain all of the values needed - // from top row. - const int d = 16 / ((ystep >> 6) + 1); + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height, + xstep, ystep, x, left_offset, xstep_bounds_base, + left_y, upsampled_top, upsampled_left); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x; - const int max_shuffle_height = - std::min(((x + d) << 6) / xstep, height) & ~7; - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep, - upsampled_top); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - const int16_t base_left_y = vgetq_lane_s16(left_y, 0); - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - - DirectionalZone1Blend_WxH<8>( - dst_x, stride, min_height, top_row + (x << upsample_top_shift), - xstep_bounds, top_x, xstep, upsample_top_shift); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_WxH<8>( - dst_x, stride, min_height, - left_column + ((left_offset + y) << upsample_left_shift), base_left_y, - -ystep, upsample_left_shift); - } + DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep, + ystep, x, left_offset, xstep_bounds_base, left_y, + upsampled_top, upsampled_left); } - // TODO(johannkoenig): May be able to remove this branch. if (x < width) { + const int upsample_top_shift = static_cast<int>(upsampled_top); DirectionalZone1_WxH(dst + x, stride, width - x, height, top_row + (x << upsample_top_shift), -xstep, upsampled_top); @@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON( DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep, upsampled_top, upsampled_left); } else { - DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep, - ystep, upsampled_top, upsampled_left); + DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep, + ystep, upsampled_top, upsampled_left); } } @@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, return vrshrq_n_u16(sum, 5 /*log2(32)*/); } +// Blend two values based on weights that sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16x8_t a_weight, + const uint16x8_t b_weight) { + const uint16x8_t a_product = vmulq_u16(a, a_weight); + const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + // Each element of |dest| contains values associated with one weight value. inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* LIBGAV1_RESTRICT const source, @@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, } } +// For Wx4 blocks, load the source for 2 columns. The source for the second +// column is held in the high half of each vector. +inline void LoadEdgeVals2x4(uint16x8x2_t* dest, + const uint16_t* LIBGAV1_RESTRICT const source_low, + const uint16_t* LIBGAV1_RESTRICT const source_high, + const bool upsampled) { + if (upsampled) { + const uint16x4x2_t low = vld2_u16(source_low); + const uint16x4x2_t high = vld2_u16(source_high); + dest->val[0] = vcombine_u16(low.val[0], high.val[0]); + dest->val[1] = vcombine_u16(low.val[1], high.val[1]); + } else { + dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high)); + dest->val[1] = + vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1)); + } +} + template <bool upsampled> inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, @@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst, } template <bool upsampled> +inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + const uint16x8_t inverter = vdupq_n_u16(32); + + uint16x8x2_t sampled_left_col; + // Compute two columns at a time, then transpose for storage. + uint16x8_t result[4]; + + // The low half of pre-transpose vectors contains columns 0 through 3. + int left_y_low = base_left_y + ystep; + int left_offset_low = left_y_low >> index_scale_bits; + int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + // The high half of pre-transpose vectors contains columns 4 through 7. + int left_y_high = left_y_low + (ystep << 2); + int left_offset_high = left_y_high >> index_scale_bits; + int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + uint16x8_t weights_0 = + vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + uint16x8_t weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_high += ystep; + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + left_y_low += ystep; + left_offset_low = left_y_low >> index_scale_bits; + shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1; + + left_y_high += ystep; + left_offset_high = left_y_high >> index_scale_bits; + shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1; + weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high)); + weights_1 = vsubq_u16(inverter, weights_0); + LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low], + &left[left_offset_high], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + weights_1, weights_0); + + Transpose4x8(result); + Store8(dst, result[0]); + dst += stride; + Store8(dst, result[1]); + dst += stride; + Store8(dst, result[2]); + dst += stride; + Store8(dst, result[3]); +} + +template <bool upsampled> +inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { + const int upsample_shift = static_cast<int>(upsampled); + const int index_scale_bits = 6 - upsample_shift; + + // Compute one column at a time, then transpose for storage. + uint16x8_t result[4]; + + int left_y = base_left_y + ystep; + int left_offset = left_y >> index_scale_bits; + int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + int shift_1 = 32 - shift_0; + uint16x8x2_t sampled_left_col; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + left_y += ystep; + left_offset = left_y >> index_scale_bits; + shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1; + shift_1 = 32 - shift_0; + LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled); + result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1], + shift_1, shift_0); + + Transpose4x8(result); + Store4(dst, vget_low_u16(result[0])); + dst += stride; + Store4(dst, vget_low_u16(result[1])); + dst += stride; + Store4(dst, vget_low_u16(result[2])); + dst += stride; + Store4(dst, vget_low_u16(result[3])); + dst += stride; + Store4(dst, vget_high_u16(result[0])); + dst += stride; + Store4(dst, vget_high_u16(result[1])); + dst += stride; + Store4(dst, vget_high_u16(result[2])); + dst += stride; + Store4(dst, vget_high_u16(result[3])); +} + +template <bool upsampled> inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { + assert(height == 8 || height == 16); const int upsample_shift = static_cast<int>(upsampled); - int y = 0; - do { - DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift), + DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep); + if (height == 16) { + dest += stride << 3; + DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift), ystep); - dest += 4 * stride; - y += 4; - } while (y < height); + } } template <bool upsampled> @@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int width, const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { - int x = 0; - int base_left_y = 0; - do { - // TODO(petersonab): Establish 8x4 transpose to reserve this function for - // 8x4 and 16x4. - DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep, - base_left_y); - base_left_y += 4 * ystep; - x += 4; - } while (x < width); + assert(width <= 16); + if (width == 4) { + DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep); + return; + } + DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep); + if (width == 16) { + const int base_left_y = ystep << 3; + DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left, + ystep, base_left_y); + } } template <bool upsampled> @@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON( } while (y != 0); return; } - if (width == 4) { + if (height == 4) { if (upsampled_left) { - DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); } else { - DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); + DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); } - } else if (height == 4) { + } else if (width == 4) { if (upsampled_left) { - DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<true>(dst, stride, height, left, ystep); } else { - DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep); + DirectionalZone3_4xH<false>(dst, stride, height, left, ystep); } } else { if (upsampled_left) { @@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, return vrshr_n_u16(sum, 5 /*log2(32)*/); } -// Blend two values based on weight pairs that each sum to 32. -inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, - const uint16x8_t a_weight, - const uint16x8_t b_weight) { - const uint16x8_t a_product = vmulq_u16(a, a_weight); - const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); - - return vrshrq_n_u16(sum, 5 /*log2(32)*/); -} - // Because the source values "move backwards" as the row index increases, the // indices derived from ystep are generally negative in localized functions. // This is accommodated by making sure the relative indices are within [-15, 0] @@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH( } while (++y < height); } -inline void DirectionalZone2FromLeftCol_8xH( - uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, +inline void DirectionalZone2FromLeftCol_8x8( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, const bool upsampled) { const int upsample_shift = static_cast<int>(upsampled); @@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH( vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1)); const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0); - int y = 0; - do { + for (int y = 0; y < 8; ++y) { uint16x8_t src_left, src_right; LoadStepwise( left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), @@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH( Store8(dst, val); dst += stride; - } while (++y < height); + } } template <bool upsampled> @@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH( } template <bool upsampled> -inline void DirectionalZone1Blend_8xH( - uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, +inline void DirectionalZone1Blend_8x8( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, const int xstep) { const int upsample_shift = static_cast<int>(upsampled); @@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH( const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7}; uint16x8x2_t top_vals; - int y = height; - do { + for (int y = 0; y < 8; ++y) { const uint16_t* const src = top_row + (top_x >> scale_bits_x); LoadEdgeVals(&top_vals, src, upsampled); @@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH( dest += stride; zone_bounds += xstep; top_x -= xstep; - } while (--y != 0); + } } -// The height at which a load of 16 bytes will not contain enough source pixels -// from |left_column| to supply an accurate row when computing 8 pixels at a -// time. The values are found by inspection. By coincidence, all angles that -// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up -// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices -// that do not correspond to angle derivatives are left at zero. -// Notably, in cases with upsampling, the shuffle-invalid height is always -// greater than the prediction height (which is 8 at maximum). -constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { - 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; - // 7.11.2.4 (8) 90 < angle > 180 // The strategy for these functions (4xH and 8+xH) is to know how many blocks // can be processed with just pixels from |top_ptr|, then handle mixed blocks, @@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH( // computed from the top row. The second stage, comprising two y-loops, covers // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. - // Round down to the nearest multiple of 8. - // TODO(petersonab): Check if rounding to the nearest 4 is okay. - const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + // Round down to the nearest multiple of 8 (or 4, if height is 4). + const int max_top_only_y = + std::min((1 << 6) / xstep, height) & ~(min_height - 1); DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst), stride >> 1, max_top_only_y, top_row, -xstep); @@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH( xstep_bounds, top_x, xstep); } - // Loop over y for left-only rows. - for (; y < height; y += 8, dst += stride8) { - // Angle expected by Zone3 is flipped about the 180 degree vector, which - // is the x-axis. + // Left-only section. |height| - |y| is assumed equivalent to: + // (y == 0) && (height == 4) + if (height - y == 4) { + DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep); + return; + } + if (y < height) { DirectionalZone3_4xH<upsampled_left>( - dst, stride, min_height, left_column + (y << upsample_left_shift), + dst, stride, height - y, left_column + (y << upsample_left_shift), -ystep); } } @@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4( } } +template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_8xH( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const int x, const int left_offset, + const int xstep_bounds_base, const int16x8_t left_y) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Loop incrementers for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + uint8_t* dst_x = dst + x * sizeof(uint16_t); + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_WxH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. Round up to the nearest 8. + const int min_left_only_y = + Align(std::min(((x + 8) << 6) / xstep, height), 8); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + if (shuffle_left_column) { + DirectionalZone2FromLeftCol_8x8( + dst_x, stride, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsampled_left); + } else { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } + + DirectionalZone1Blend_8x8<upsampled_top>( + dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x, + xstep); + } + + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } +} + // Process a multiple of 8 |width|. template <bool upsampled_top, bool upsampled_left> -inline void DirectionalZone2_8( +inline void DirectionalZone2_NEON( uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, const uint16_t* LIBGAV1_RESTRICT const top_row, const uint16_t* LIBGAV1_RESTRICT const left_column, const int width, @@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8( dst, stride, top_row, left_column, width, xstep, ystep); return; } - const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); // Helper vector. const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; - // Loop increments for moving by block (8x8). This function handles blocks - // with height 4 as well. They are calculated in one pass so these variables - // do not get used. - const ptrdiff_t stride8 = stride << 3; - const int xstep8 = xstep << 3; const int ystep8 = ystep << 3; // All columns from |min_top_only_x| to the right will only need |top_row| to // compute and can therefore call the Zone1 functions. This assumes |xstep| is // at least 3. assert(xstep >= 3); - const int min_top_only_x = std::min((height * xstep) >> 6, width); - - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); + const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8); + // Analysis finds that, for most angles (ystep < 132), all segments that use + // both top_row and left_column can compute from left_column using byte + // shuffles from a single vector. For steeper angles, the shuffle is also + // fully reliable when x >= 32. + const int shuffle_left_col_x = (ystep < 132) ? 0 : 32; + const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x); // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8( int16x8_t left_y = vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep); - // This loop treats each set of 4 columns in 3 stages with y-value boundaries. - // The first stage, before the first y-loop, covers blocks that are only - // computed from the top row. The second stage, comprising two y-loops, covers - // blocks that have a mixture of values computed from top or left. The final - // stage covers blocks that are only computed from the left. int x = 0; + for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + DirectionalZone2_8xH<false, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); + } for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { - uint8_t* dst_x = dst + x * sizeof(uint16_t); - - // Round down to the nearest multiple of 8. - const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; - DirectionalZone1_WxH<upsampled_top>( - reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, - top_row + (x << upsample_top_shift), -xstep); - - if (max_top_only_y == height) continue; - - int y = max_top_only_y; - dst_x += stride * y; - const int xstep_y = xstep * y; - - // All rows from |min_left_only_y| down for this set of columns only need - // |left_column| to compute. - const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); - // At high angles such that min_left_only_y < 8, ystep is low and xstep is - // high. This means that max_shuffle_height is unbounded and xstep_bounds - // will overflow in 16 bits. This is prevented by stopping the first - // blending loop at min_left_only_y for such cases, which means we skip over - // the second blending loop as well. - const int left_shuffle_stop_y = - std::min(max_shuffle_height, min_left_only_y); - int xstep_bounds = xstep_bounds_base + xstep_y; - int top_x = -xstep - xstep_y; - - for (; y < left_shuffle_stop_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone2FromLeftCol_8xH( - dst_x, stride, 8, - left_column + ((left_offset + y) << upsample_left_shift), left_y, - upsample_left_shift); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - - // Pick up from the last y-value, using the slower but secure method for - // left prediction. - for (; y < min_left_only_y; - y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - - DirectionalZone1Blend_8xH<upsampled_top>( - dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, - top_x, xstep); - } - // Loop over y for left_only rows. - for (; y < height; y += 8, dst_x += stride8) { - DirectionalZone3_8x8<upsampled_left>( - dst_x, stride, left_column + (y << upsample_left_shift), -ystep, - -ystep * x); - } + DirectionalZone2_8xH<true, upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset, + xstep_bounds_base, left_y); } // Reached |min_top_only_x|. if (x < width) { @@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON( } if (upsampled_top) { if (upsampled_left) { - DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } else if (upsampled_left) { - DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } else { - DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width, - height, xstep, ystep); + DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); } } |