aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/intrapred_directional_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/intrapred_directional_neon.cc')
-rw-r--r--src/dsp/arm/intrapred_directional_neon.cc688
1 files changed, 411 insertions, 277 deletions
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 3cad4a6..e9bdcf0 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH(
} while (++y < height);
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-// 7.11.2.4 (8) 90 < angle > 180
-// The strategy for these functions (4xH and 8+xH) is to know how many blocks
-// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
-// then handle only blocks that take from |left_ptr|. Additionally, a fast
-// index-shuffle approach is used for pred values from |left_column| in sections
-// that permit it.
+// 7.11.2.4 (8) 90 < angle > 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in
+// sections that permit it.
inline void DirectionalZone2_4xH(
uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
const uint8_t* LIBGAV1_RESTRICT const top_row,
@@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH(
assert(xstep >= 3);
const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- // TODO(johannkoenig): Revisit this for |width| == 4.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
-
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH(
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
if (min_top_only_x > 0) {
- // Round down to the nearest multiple of 8.
- // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
- const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
upsampled_top);
@@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH(
// All rows from |min_left_only_y| down for this set of columns only need
// |left_column| to compute.
const int min_left_only_y = std::min((4 << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
int xstep_bounds = xstep_bounds_base + xstep_y;
int top_x = -xstep - xstep_y;
// +8 increment is OK because if height is 4 this only goes once.
- for (; y < left_shuffle_stop_y;
+ for (; y < min_left_only_y;
y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
DirectionalZone2FromLeftCol_WxH<4>(
dst, stride, min_height,
@@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH(
upsample_top_shift);
}
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
- for (; y < min_left_only_y;
- y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_WxH<4>(
- dst, stride, min_height,
- left_column + ((y - left_base_increment) << upsample_left_shift),
- base_left_y, -ystep, upsample_left_shift);
-
- DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
- xstep_bounds, top_x, xstep,
- upsample_top_shift);
- }
// Loop over y for left_only rows.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
for (; y < height; y += 8, dst += stride8) {
DirectionalZone3_WxH<4>(
dst, stride, min_height,
@@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH(
}
}
-// Process a multiple of 8 |width|.
-inline void DirectionalZone2_8(
+template <bool shuffle_left_column>
+inline void DirectionalZone2_8xH(
uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
const uint8_t* LIBGAV1_RESTRICT const top_row,
- const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
- const int height, const int xstep, const int ystep,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y,
const bool upsampled_top, const bool upsampled_left) {
const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
- // Helper vector.
- const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
-
// Loop incrementers for moving by block (8x8). This function handles blocks
// with height 4 as well. They are calculated in one pass so these variables
// do not get used.
const ptrdiff_t stride8 = stride << 3;
const int xstep8 = xstep << 3;
- const int ystep8 = ystep << 3;
- // Process Wx4 blocks.
+ // Cover 8x4 case.
const int min_height = (height == 4) ? 4 : 8;
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute and can therefore call the Zone1 functions. This assumes |xstep| is
- // at least 3.
- assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+ } else {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_WxH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ const int ystep8 = ystep << 3;
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -677,90 +696,43 @@ inline void DirectionalZone2_8(
// left_y vector omits the portion which is covered under the left_column
// offset. Following values need the full ystep as a relative offset.
const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+ // For ystep > 90, at least two sets of 8 columns can be fully computed from
+ // top_row only.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
// This loop treats each set of 4 columns in 3 stages with y-value boundaries.
// The first stage, before the first y-loop, covers blocks that are only
// computed from the top row. The second stage, comprising two y-loops, covers
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
int x = 0;
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling. |d| represents the number of pixels that can
- // fit in one contiguous vector when stepping by |ystep|. For a given x
- // position, the left column values can be obtained by VTBL as long as the
- // values at row[x + d] and beyond come from the top row. However, this does
- // not guarantee that the vector will also contain all of the values needed
- // from top row.
- const int d = 16 / ((ystep >> 6) + 1);
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
+ xstep, ystep, x, left_offset, xstep_bounds_base,
+ left_y, upsampled_top, upsampled_left);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
- const int max_shuffle_height =
- std::min(((x + d) << 6) / xstep, height) & ~7;
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
- top_row + (x << upsample_top_shift), -xstep,
- upsampled_top);
-
- if (max_top_only_y == height) continue;
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
-
- // All rows from |min_left_only_y| down for this set of columns only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- int xstep_bounds = xstep_bounds_base + xstep_y;
- int top_x = -xstep - xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone2FromLeftCol_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), left_y,
- upsample_left_shift);
-
- DirectionalZone1Blend_WxH<8>(
- dst_x, stride, min_height, top_row + (x << upsample_top_shift),
- xstep_bounds, top_x, xstep, upsample_top_shift);
- }
-
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep, upsample_left_shift);
-
- DirectionalZone1Blend_WxH<8>(
- dst_x, stride, min_height, top_row + (x << upsample_top_shift),
- xstep_bounds, top_x, xstep, upsample_top_shift);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep, upsample_left_shift);
- }
+ DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
+ ystep, x, left_offset, xstep_bounds_base, left_y,
+ upsampled_top, upsampled_left);
}
- // TODO(johannkoenig): May be able to remove this branch.
if (x < width) {
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
DirectionalZone1_WxH(dst + x, stride, width - x, height,
top_row + (x << upsample_top_shift), -xstep,
upsampled_top);
@@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON(
DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
upsampled_top, upsampled_left);
} else {
- DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
- ystep, upsampled_top, upsampled_left);
+ DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
+ ystep, upsampled_top, upsampled_left);
}
}
@@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
return vrshrq_n_u16(sum, 5 /*log2(32)*/);
}
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16x8_t a_weight,
+ const uint16x8_t b_weight) {
+ const uint16x8_t a_product = vmulq_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
// Each element of |dest| contains values associated with one weight value.
inline void LoadEdgeVals(uint16x4x2_t* dest,
const uint16_t* LIBGAV1_RESTRICT const source,
@@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest,
}
}
+// For Wx4 blocks, load the source for 2 columns. The source for the second
+// column is held in the high half of each vector.
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source_low,
+ const uint16_t* LIBGAV1_RESTRICT const source_high,
+ const bool upsampled) {
+ if (upsampled) {
+ const uint16x4x2_t low = vld2_u16(source_low);
+ const uint16x4x2_t high = vld2_u16(source_high);
+ dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
+ dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
+ } else {
+ dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
+ dest->val[1] =
+ vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
+ }
+}
+
template <bool upsampled>
inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t stride, const int height,
@@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
}
template <bool upsampled>
+inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const uint16x8_t inverter = vdupq_n_u16(32);
+
+ uint16x8x2_t sampled_left_col;
+ // Compute two columns at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ // The low half of pre-transpose vectors contains columns 0 through 3.
+ int left_y_low = base_left_y + ystep;
+ int left_offset_low = left_y_low >> index_scale_bits;
+ int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ // The high half of pre-transpose vectors contains columns 4 through 7.
+ int left_y_high = left_y_low + (ystep << 2);
+ int left_offset_high = left_y_high >> index_scale_bits;
+ int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ uint16x8_t weights_0 =
+ vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_high += ystep;
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ Transpose4x8(result);
+ Store8(dst, result[0]);
+ dst += stride;
+ Store8(dst, result[1]);
+ dst += stride;
+ Store8(dst, result[2]);
+ dst += stride;
+ Store8(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x8x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x8(result);
+ Store4(dst, vget_low_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[3]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[3]));
+}
+
+template <bool upsampled>
inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
const ptrdiff_t stride, const int height,
const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
+ assert(height == 8 || height == 16);
const int upsample_shift = static_cast<int>(upsampled);
- int y = 0;
- do {
- DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
+ if (height == 16) {
+ dest += stride << 3;
+ DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
ystep);
- dest += 4 * stride;
- y += 4;
- } while (y < height);
+ }
}
template <bool upsampled>
@@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
const ptrdiff_t stride, const int width,
const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
- int x = 0;
- int base_left_y = 0;
- do {
- // TODO(petersonab): Establish 8x4 transpose to reserve this function for
- // 8x4 and 16x4.
- DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
- base_left_y);
- base_left_y += 4 * ystep;
- x += 4;
- } while (x < width);
+ assert(width <= 16);
+ if (width == 4) {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
+ return;
+ }
+ DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
+ if (width == 16) {
+ const int base_left_y = ystep << 3;
+ DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
+ ystep, base_left_y);
+ }
}
template <bool upsampled>
@@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON(
} while (y != 0);
return;
}
- if (width == 4) {
+ if (height == 4) {
if (upsampled_left) {
- DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
} else {
- DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
}
- } else if (height == 4) {
+ } else if (width == 4) {
if (upsampled_left) {
- DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
} else {
- DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
}
} else {
if (upsampled_left) {
@@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
return vrshr_n_u16(sum, 5 /*log2(32)*/);
}
-// Blend two values based on weight pairs that each sum to 32.
-inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
- const uint16x8_t a_weight,
- const uint16x8_t b_weight) {
- const uint16x8_t a_product = vmulq_u16(a, a_weight);
- const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
-
- return vrshrq_n_u16(sum, 5 /*log2(32)*/);
-}
-
// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative in localized functions.
// This is accommodated by making sure the relative indices are within [-15, 0]
@@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH(
} while (++y < height);
}
-inline void DirectionalZone2FromLeftCol_8xH(
- uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+inline void DirectionalZone2FromLeftCol_8x8(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
const bool upsampled) {
const int upsample_shift = static_cast<int>(upsampled);
@@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
- int y = 0;
- do {
+ for (int y = 0; y < 8; ++y) {
uint16x8_t src_left, src_right;
LoadStepwise(
left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
@@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
Store8(dst, val);
dst += stride;
- } while (++y < height);
+ }
}
template <bool upsampled>
@@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH(
}
template <bool upsampled>
-inline void DirectionalZone1Blend_8xH(
- uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+inline void DirectionalZone1Blend_8x8(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
const int xstep) {
const int upsample_shift = static_cast<int>(upsampled);
@@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH(
const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
uint16x8x2_t top_vals;
- int y = height;
- do {
+ for (int y = 0; y < 8; ++y) {
const uint16_t* const src = top_row + (top_x >> scale_bits_x);
LoadEdgeVals(&top_vals, src, upsampled);
@@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH(
dest += stride;
zone_bounds += xstep;
top_x -= xstep;
- } while (--y != 0);
+ }
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
-// that do not correspond to angle derivatives are left at zero.
-// Notably, in cases with upsampling, the shuffle-invalid height is always
-// greater than the prediction height (which is 8 at maximum).
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
// 7.11.2.4 (8) 90 < angle > 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
@@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH(
// computed from the top row. The second stage, comprising two y-loops, covers
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
- // Round down to the nearest multiple of 8.
- // TODO(petersonab): Check if rounding to the nearest 4 is okay.
- const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
stride >> 1, max_top_only_y, top_row,
-xstep);
@@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH(
xstep_bounds, top_x, xstep);
}
- // Loop over y for left-only rows.
- for (; y < height; y += 8, dst += stride8) {
- // Angle expected by Zone3 is flipped about the 180 degree vector, which
- // is the x-axis.
+ // Left-only section. |height| - |y| is assumed equivalent to:
+ // (y == 0) && (height == 4)
+ if (height - y == 4) {
+ DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
+ return;
+ }
+ if (y < height) {
DirectionalZone3_4xH<upsampled_left>(
- dst, stride, min_height, left_column + (y << upsample_left_shift),
+ dst, stride, height - y, left_column + (y << upsample_left_shift),
-ystep);
}
}
@@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4(
}
}
+template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop incrementers for moving by block (8x8). This function handles blocks
+ // with height 4 as well. They are calculated in one pass so these variables
+ // do not get used.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsampled_left);
+ } else {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+
+ DirectionalZone1Blend_8x8<upsampled_top>(
+ dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
+ xstep);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+}
+
// Process a multiple of 8 |width|.
template <bool upsampled_top, bool upsampled_left>
-inline void DirectionalZone2_8(
+inline void DirectionalZone2_NEON(
uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const top_row,
const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
@@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8(
dst, stride, top_row, left_column, width, xstep, ystep);
return;
}
- const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
// Helper vector.
const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
- // Loop increments for moving by block (8x8). This function handles blocks
- // with height 4 as well. They are calculated in one pass so these variables
- // do not get used.
- const ptrdiff_t stride8 = stride << 3;
- const int xstep8 = xstep << 3;
const int ystep8 = ystep << 3;
// All columns from |min_top_only_x| to the right will only need |top_row| to
// compute and can therefore call the Zone1 functions. This assumes |xstep| is
// at least 3.
assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+ const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8(
int16x8_t left_y =
vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
int x = 0;
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x * sizeof(uint16_t);
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_WxH<upsampled_top>(
- reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
- top_row + (x << upsample_top_shift), -xstep);
-
- if (max_top_only_y == height) continue;
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
-
- // All rows from |min_left_only_y| down for this set of columns only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- int xstep_bounds = xstep_bounds_base + xstep_y;
- int top_x = -xstep - xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8xH(
- dst_x, stride, 8,
- left_column + ((left_offset + y) << upsample_left_shift), left_y,
- upsample_left_shift);
-
- DirectionalZone1Blend_8xH<upsampled_top>(
- dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
- top_x, xstep);
- }
-
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_8x8<upsampled_left>(
- dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
- -ystep * x);
-
- DirectionalZone1Blend_8xH<upsampled_top>(
- dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
- top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8x8<upsampled_left>(
- dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
- -ystep * x);
- }
+ DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
}
// Reached |min_top_only_x|.
if (x < width) {
@@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON(
}
if (upsampled_top) {
if (upsampled_left) {
- DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
} else {
- DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
}
} else if (upsampled_left) {
- DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
} else {
- DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
}
}