Diffstat (limited to 'src/dsp/arm/intrapred_directional_neon.cc')
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc | 901
1 file changed, 787 insertions(+), 114 deletions(-)
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 3f5edbd..3cad4a6 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -29,6 +29,7 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
namespace libgav1 {
namespace dsp {
@@ -40,9 +41,9 @@ inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
const uint8x8_t a_weight,
const uint8x8_t b_weight) {
const uint16x8_t a_product = vmull_u8(a, a_weight);
- const uint16x8_t b_product = vmull_u8(b, b_weight);
+ const uint16x8_t sum = vmlal_u8(a_product, b, b_weight);
- return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
+ return vrshrn_n_u16(sum, 5 /*log2(32)*/);
}
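For reference, a scalar sketch of the blend this hunk optimizes (not part of the patch; the helper name is ours): vmlal_u8 folds the second multiply into the accumulator of the first, and vrshrn_n_u16 narrows with round-to-nearest.

#include <cstdint>

// Scalar model of WeightedBlend: the weights sum to 32, so the result is
// (a * a_weight + b * b_weight + 16) >> 5, rounded to nearest.
inline uint8_t WeightedBlendScalar(uint8_t a, uint8_t b, uint8_t a_weight,
                                   uint8_t b_weight) {
  const uint16_t sum = a * a_weight + b * b_weight;  // at most 255 * 32, fits
  return static_cast<uint8_t>((sum + 16) >> 5);      // vrshrn_n_u16(sum, 5)
}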
// For vertical operations the weights are one constant value.
@@ -52,9 +53,9 @@ inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
}
// Fill |left| and |right| with the appropriate values for a given |base_step|.
-inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
- const uint8x8_t right_step, uint8x8_t* left,
- uint8x8_t* right) {
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+ const uint8x8_t left_step, const uint8x8_t right_step,
+ uint8x8_t* left, uint8x8_t* right) {
const uint8x16_t mixed = vld1q_u8(source);
*left = VQTbl1U8(mixed, left_step);
*right = VQTbl1U8(mixed, right_step);
@@ -62,17 +63,18 @@ inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step,
// Handle signed step arguments by ignoring the sign. Negative values are
// considered out of range and overwritten later.
-inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step,
- const int8x8_t right_step, uint8x8_t* left,
- uint8x8_t* right) {
+inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source,
+ const int8x8_t left_step, const int8x8_t right_step,
+ uint8x8_t* left, uint8x8_t* right) {
LoadStepwise(source, vreinterpret_u8_s8(left_step),
vreinterpret_u8_s8(right_step), left, right);
}
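The sign-ignoring trick can be modeled in scalar form (a sketch based on TBL semantics, not part of the patch): a negative int8 step reinterpreted as uint8 lands in [128, 255], which is out of range for a 16-byte table lookup, so TBL returns 0 for that lane and the caller overwrites it later.

#include <cstdint>

// Scalar model of VQTbl1U8 on a 16-byte |table|: indices outside
// [0, 15], including reinterpreted negative steps, produce 0.
inline uint8_t TblLookupScalar(const uint8_t table[16], uint8_t index) {
  return (index < 16) ? table[index] : 0;
}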
// Process 4 or 8 |width| by any |height|.
template <int width>
-inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
- const int height, const uint8_t* const top,
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const top,
const int xstep, const bool upsampled) {
assert(width == 4 || width == 8);
@@ -142,10 +144,11 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
-inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
- const int width, const int height,
- const uint8_t* const top, const int xstep,
- const bool upsampled) {
+inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint8_t* LIBGAV1_RESTRICT const top,
+ const int xstep, const bool upsampled) {
assert(width % 8 == 0);
const int upsample_shift = static_cast<int>(upsampled);
const int scale_bits = 6 - upsample_shift;
@@ -203,14 +206,12 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
} while (++y < height);
}
-void DirectionalIntraPredictorZone1_NEON(void* const dest,
- const ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const uint8_t* const top = static_cast<const uint8_t*>(top_row);
- uint8_t* dst = static_cast<uint8_t*>(dest);
+void DirectionalIntraPredictorZone1_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const int width,
+ const int height, const int xstep, const bool upsampled_top) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
assert(xstep > 0);
@@ -282,11 +283,10 @@ void DirectionalIntraPredictorZone1_NEON(void* const dest,
// Process 4 or 8 |width| by 4 or 8 |height|.
template <int width>
-inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
- const int height,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep,
- const int upsample_shift) {
+inline void DirectionalZone3_WxH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y,
+ const int ystep, const int upsample_shift) {
assert(width == 4 || width == 8);
assert(height == 4 || height == 8);
const int scale_bits = 6 - upsample_shift;
@@ -417,12 +417,10 @@ constexpr int kPositiveIndexOffset = 15;
// Process 4 or 8 |width| by any |height|.
template <int width>
-inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
- const ptrdiff_t stride,
- const int height,
- const uint8_t* const left_column,
- const int16x8_t left_y,
- const int upsample_shift) {
+inline void DirectionalZone2FromLeftCol_WxH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+ const int upsample_shift) {
assert(width == 4 || width == 8);
// The shift argument must be a constant.
@@ -468,12 +466,10 @@ inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst,
// Process 4 or 8 |width| by any |height|.
template <int width>
-inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride,
- const int height,
- const uint8_t* const top_row,
- int zone_bounds, int top_x,
- const int xstep,
- const int upsample_shift) {
+inline void DirectionalZone1Blend_WxH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+ const int xstep, const int upsample_shift) {
assert(width == 4 || width == 8);
const int scale_bits_x = 6 - upsample_shift;
@@ -523,12 +519,12 @@ constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
// then handle only blocks that take from |left_ptr|. Additionally, a fast
// index-shuffle approach is used for pred values from |left_column| in sections
// that permit it.
-inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int height, const int xstep,
- const int ystep, const bool upsampled_top,
- const bool upsampled_left) {
+inline void DirectionalZone2_4xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const bool upsampled_top,
+ const bool upsampled_left) {
const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
@@ -564,8 +560,8 @@ inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
// If the 64 scaling is regarded as a decimal point, the first value of the
// left_y vector omits the portion which is covered under the left_column
// offset. The following values need the full ystep as a relative offset.
- int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
- left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+ const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
// This loop treats each set of 4 columns in 3 stages with y-value boundaries.
// The first stage, before the first y-loop, covers blocks that are only
@@ -639,13 +635,12 @@ inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride,
}
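What the fused vmla computes per lane can be written out in scalar form (illustrative only; the helper is ours): lane i of left_y holds the fixed-point position -ystep_remainder - i * ystep, with the first lane omitting the portion already absorbed by the left_column offset.

#include <cstdint>

// Lane i of |left_y| as produced by vmlaq_n_s16(remainder, zero_to_seven,
// -ystep): a 6-bit fixed-point step down |left_column|.
inline int16_t LeftYLane(int i, int ystep) {
  const int ystep_remainder = ystep & 0x3F;  // fractional part of ystep / 64
  return static_cast<int16_t>(-ystep_remainder - i * ystep);
}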
// Process a multiple of 8 |width|.
-inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep,
- const bool upsampled_top,
- const bool upsampled_left) {
+inline void DirectionalZone2_8(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
@@ -668,12 +663,6 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
assert(xstep >= 3);
const int min_top_only_x = std::min((height * xstep) >> 6, width);
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
-
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -687,8 +676,8 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
// If the 64 scaling is regarded as a decimal point, the first value of the
// left_y vector omits the portion which is covered under the left_column
// offset. Following values need the full ystep as a relative offset.
- int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep);
- left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder));
+ const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
// This loop treats each set of 8 columns in 3 stages with y-value boundaries.
// The first stage, before the first y-loop, covers blocks that are only
@@ -696,12 +685,21 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
int x = 0;
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling. |d| represents the number of pixels that can
+ // fit in one contiguous vector when stepping by |ystep|. For a given x
+ // position, the left column values can be obtained by VTBL as long as the
+ // values at row[x + d] and beyond come from the top row. However, this does
+ // not guarantee that the vector will also contain all of the values needed
+ // from the top row.
+ const int d = 16 / ((ystep >> 6) + 1);
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
uint8_t* dst_x = dst + x;
-
+ const int max_shuffle_height =
+ std::min(((x + d) << 6) / xstep, height) & ~7;
// Round down to the nearest multiple of 8.
const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
@@ -770,14 +768,20 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride,
}
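A scalar sketch of the new per-column bound (mirroring the expressions above; the helper name is ours): |d| is how many left-column pixels one 16-byte TBL source spans at this step rate, and the shuffle path stays valid until row (x + d) * 64 / xstep, rounded down to a multiple of 8.

#include <algorithm>

// Rows for which the fast shuffle may read |left_column|, for the block
// of 8 columns starting at |x|. Mirrors the in-loop computation above.
inline int MaxShuffleHeight(int x, int xstep, int ystep, int height) {
  const int d = 16 / ((ystep >> 6) + 1);
  return std::min(((x + d) << 6) / xstep, height) & ~7;
}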
void DirectionalIntraPredictorZone2_NEON(
- void* const dest, const ptrdiff_t stride, const void* const top_row,
- const void* const left_column, const int width, const int height,
- const int xstep, const int ystep, const bool upsampled_top,
- const bool upsampled_left) {
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
// Increasing the negative buffer for this function allows more rows to be
// processed at a time without branching in an inner loop to check the base.
uint8_t top_buffer[288];
uint8_t left_buffer[288];
+#if LIBGAV1_MSAN
+ memset(top_buffer, 0, sizeof(top_buffer));
+ memset(left_buffer, 0, sizeof(left_buffer));
+#endif // LIBGAV1_MSAN
+
memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
const uint8_t* top_ptr = top_buffer + 144;
@@ -793,12 +797,10 @@ void DirectionalIntraPredictorZone2_NEON(
}
}
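The buffer arithmetic reads as follows (a sketch of the layout with the sizes used above; the helper is ours): 160 edge bytes are copied to offset 128, starting 16 bytes before the published edge, and the working pointer sits at offset 144, so indices -16 through 143 hold real edge data while steeper negative indices still stay inside the allocation.

#include <cstdint>
#include <cstring>

// Sketch of the negative-index headroom set up above. After this call,
// ptr[i] is in bounds for i in [-144, 143], and ptr[-16..143] holds
// copied edge pixels; the rest is slack for steep angles.
inline const uint8_t* SetUpEdgeBuffer(uint8_t buffer[288],
                                      const uint8_t* const edge) {
  memcpy(buffer + 128, edge - 16, 160);
  return buffer + 144;
}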
-void DirectionalIntraPredictorZone3_NEON(void* const dest,
- const ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled_left) {
+void DirectionalIntraPredictorZone3_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int ystep, const bool upsampled_left) {
const auto* const left = static_cast<const uint8_t*>(left_column);
assert(ystep > 0);
@@ -819,7 +821,7 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest,
do {
int x = 0;
do {
- uint8_t* dst = static_cast<uint8_t*>(dest);
+ auto* dst = static_cast<uint8_t*>(dest);
dst += y * stride + x;
uint8x8_t left_v[4], right_v[4], value_v[4];
const int ystep_base = ystep * x;
@@ -886,7 +888,7 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest,
do {
int x = 0;
do {
- uint8_t* dst = static_cast<uint8_t*>(dest);
+ auto* dst = static_cast<uint8_t*>(dest);
dst += y * stride + x;
const int ystep_base = ystep * (x + 1);
@@ -934,7 +936,8 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
}
// Each element of |dest| contains values associated with one weight value.
-inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+inline void LoadEdgeVals(uint16x4x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source,
const bool upsampled) {
if (upsampled) {
*dest = vld2_u16(source);
@@ -945,7 +948,8 @@ inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
}
// Each element of |dest| contains values associated with one weight value.
-inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+inline void LoadEdgeVals(uint16x8x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source,
const bool upsampled) {
if (upsampled) {
*dest = vld2q_u16(source);
@@ -956,8 +960,9 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
}
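A scalar model of LoadEdgeVals (illustrative only; the non-upsampled branch is elided by the hunk, so treating val[1] as the next pixel over is our assumption): with upsampling, vld2 deinterleaves so that val[0] holds even-indexed pixels and val[1] odd-indexed ones. Either way, lane i of the two halves forms the pair blended for output lane i.

#include <cstdint>

// Scalar model of LoadEdgeVals for |count| lanes (4 or 8): pairs[0][i]
// and pairs[1][i] are the two pixels blended for output lane i.
inline void LoadEdgeValsScalar(uint16_t pairs[2][8],
                               const uint16_t* const source,
                               const bool upsampled, const int count) {
  for (int i = 0; i < count; ++i) {
    pairs[0][i] = upsampled ? source[2 * i] : source[i];          // vld2/vld1
    pairs[1][i] = upsampled ? source[2 * i + 1] : source[i + 1];  // assumption
  }
}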
template <bool upsampled>
-inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
- const int height, const uint16_t* const top,
+inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top,
const int xstep) {
const int upsample_shift = static_cast<int>(upsampled);
const int index_scale_bits = 6 - upsample_shift;
@@ -1007,9 +1012,11 @@ inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
template <bool upsampled>
-inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
- const int width, const int height,
- const uint16_t* const top, const int xstep) {
+inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const int xstep) {
assert(width % 8 == 0);
const int upsample_shift = static_cast<int>(upsampled);
const int index_scale_bits = 6 - upsample_shift;
@@ -1068,10 +1075,11 @@ inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
// Process a multiple of 8 |width| by any |height|. Processes horizontally
// before vertically in the hopes of being a little more cache friendly.
-inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
- const int width, const int height,
- const uint16_t* const top, const int xstep,
- const bool upsampled) {
+inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const int xstep, const bool upsampled) {
assert(width % 8 == 0);
const int upsample_shift = static_cast<int>(upsampled);
const int index_scale_bits = 6 - upsample_shift;
@@ -1156,13 +1164,12 @@ inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
}
}
-void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const uint16_t* const top = static_cast<const uint16_t*>(top_row);
- uint16_t* dst = static_cast<uint16_t*>(dest);
+void DirectionalIntraPredictorZone1_NEON(
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row, const int width,
+ const int height, const int xstep, const bool upsampled_top) {
+ const auto* const top = static_cast<const uint16_t*>(top_row);
+ auto* dst = static_cast<uint16_t*>(dest);
stride /= sizeof(top[0]);
assert(xstep > 0);
@@ -1225,9 +1232,10 @@ void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
// 42 52 62 72 60 61 62 63
// 43 53 63 73 70 71 72 73
template <bool upsampled>
-inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
- const uint16_t* const left, const int ystep,
- const int base_left_y = 0) {
+inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
const int upsample_shift = static_cast<int>(upsampled);
const int index_scale_bits = 6 - upsample_shift;
@@ -1278,8 +1286,9 @@ inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
}
template <bool upsampled>
-inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
- const int height, const uint16_t* const left,
+inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
const int upsample_shift = static_cast<int>(upsampled);
int y = 0;
@@ -1292,8 +1301,9 @@ inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
}
template <bool upsampled>
-inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
- const int width, const uint16_t* const left,
+inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride, const int width,
+ const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
int x = 0;
int base_left_y = 0;
@@ -1308,9 +1318,10 @@ inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
}
template <bool upsampled>
-inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
- const uint16_t* const left, const int ystep,
- const int base_left_y = 0) {
+inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
const int upsample_shift = static_cast<int>(upsampled);
const int index_scale_bits = 6 - upsample_shift;
@@ -1400,9 +1411,11 @@ inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
}
template <bool upsampled>
-inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
- const int width, const int height,
- const uint16_t* const left, const int ystep) {
+inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t stride, const int width,
+ const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep) {
const int upsample_shift = static_cast<int>(upsampled);
// Zone3 never runs out of left_column values.
assert((width + height - 1) << upsample_shift > // max_base_y
@@ -1424,14 +1437,12 @@ inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
} while (y < height);
}
-void DirectionalIntraPredictorZone3_NEON(void* const dest,
- const ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled_left) {
- const uint16_t* const left = static_cast<const uint16_t*>(left_column);
- uint8_t* dst = static_cast<uint8_t*>(dest);
+void DirectionalIntraPredictorZone3_NEON(
+ void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int ystep, const bool upsampled_left) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
if (ystep == 64) {
assert(!upsampled_left);
@@ -1472,10 +1483,672 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest,
}
}
+// -----------------------------------------------------------------------------
+// Zone2
+// This function deals with cases not found in zone 1 or zone 3. The extreme
+// angles are 93, which makes for sharp ascents along |left_column| with each
+// successive dest row element until reaching |top_row|, and 177, with a shallow
+// ascent up |left_column| until reaching large jumps along |top_row|. In the
+// extremely steep cases, source vectors can only be loaded one lane at a time.
+
+// Fill |left| and |right| with the appropriate values for a given |base_step|.
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+ const uint8x8_t left_step, const uint8x8_t right_step,
+ uint16x4_t* left, uint16x4_t* right) {
+ const uint8x16x2_t mixed = {
+ vld1q_u8(static_cast<const uint8_t*>(source)),
+ vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+ *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step));
+ *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step));
+}
+
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source,
+ const uint8x8_t left_step_0,
+ const uint8x8_t right_step_0,
+ const uint8x8_t left_step_1,
+ const uint8x8_t right_step_1, uint16x8_t* left,
+ uint16x8_t* right) {
+ const uint8x16x2_t mixed = {
+ vld1q_u8(static_cast<const uint8_t*>(source)),
+ vld1q_u8(static_cast<const uint8_t*>(source) + 16)};
+ const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0));
+ const uint16x4_t left_high =
+ vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1));
+ *left = vcombine_u16(left_low, left_high);
+ const uint16x4_t right_low =
+ vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0));
+ const uint16x4_t right_high =
+ vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1));
+ *right = vcombine_u16(right_low, right_high);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const uint16x4_t a_weight,
+ const uint16x4_t b_weight) {
+ const uint16x4_t a_product = vmul_u16(a, a_weight);
+ const uint16x4_t sum = vmla_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weight pairs that each sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16x8_t a_weight,
+ const uint16x8_t b_weight) {
+ const uint16x8_t a_product = vmulq_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative in localized functions.
+// This is accommodated by making sure the relative indices are within [-15, 0]
+// when the function is called, and sliding them into the inclusive range
+// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is
+// the byte offset for table lookups.
+
+constexpr int kPositiveIndexOffsetPixels = 15;
+constexpr int kPositiveIndexOffsetBytes = 30;
+
+inline void DirectionalZone2FromLeftCol_4xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y,
+ const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+
+ const int index_scale_bits = 6;
+ // The values in |offset_y| are negative, except for the first element, which
+ // is zero.
+ int16x4_t offset_y;
+ int16x4_t shift_upsampled = left_y;
+ // The shift argument must be a constant, otherwise use upsample_shift
+ // directly.
+ if (upsampled) {
+ offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/);
+ shift_upsampled = vshl_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshr_n_s16(left_y, index_scale_bits);
+ }
+ offset_y = vshl_n_s16(offset_y, 1);
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the
+ // right. With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by
+ // doing separate loads for |left_values| and |right_values|. vtbl
+ // supports 2 Q registers as input which would allow for cumulative
+ // offsets of 32.
+ // |sampler_0| indexes the first byte of each 16-bit value.
+ const int16x4_t sampler_0 =
+ vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes));
+ // |sampler_1| indexes the second byte of each 16-bit value.
+ const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1));
+ const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1);
+ const uint8x8_t left_indices =
+ vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1]));
+ const uint8x8_t right_indices =
+ vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t)));
+
+ const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f));
+ const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1));
+ const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0);
+
+ int y = 0;
+ do {
+ uint16x4_t src_left, src_right;
+ LoadStepwise(
+ left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+ left_indices, right_indices, &src_left, &src_right);
+ const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+ Store4(dst, val);
+ dst += stride;
+ } while (++y < height);
+}
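Because TBL indexes bytes while the pixels here are 16 bits wide, the sampler doubles each pixel offset and zips in a +1 partner for the high byte. A scalar sketch of the index construction (the helper is ours):

#include <cstdint>

// Byte indices of the uint16 at (negative) pixel offset |offset_pixels|
// within the 32-byte TBL source, biased by kPositiveIndexOffsetBytes.
inline void SamplerBytesScalar(const int offset_pixels, uint8_t* const lo,
                               uint8_t* const hi) {
  const int byte_index = 2 * offset_pixels + 30;  // kPositiveIndexOffsetBytes
  *lo = static_cast<uint8_t>(byte_index);         // sampler_0 lane
  *hi = static_cast<uint8_t>(byte_index + 1);     // sampler_1 lane
}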
+
+inline void DirectionalZone2FromLeftCol_8xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
+ const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+
+ const int index_scale_bits = 6;
+ // The values in |offset_y| are negative, except for the first element, which
+ // is zero.
+ int16x8_t offset_y = left_y;
+ int16x8_t shift_upsampled = left_y;
+ // The shift argument must be a constant, otherwise use upsample_shift
+ // directly.
+ if (upsampled) {
+ offset_y = vshrq_n_s16(left_y, index_scale_bits - 1);
+ shift_upsampled = vshlq_n_s16(shift_upsampled, 1);
+ } else {
+ offset_y = vshrq_n_s16(left_y, index_scale_bits);
+ }
+ offset_y = vshlq_n_s16(offset_y, 1);
+
+ // Select values to the left of the starting point.
+ // The 15th element (and 16th) will be all the way at the end, to the right.
+ // With a negative ystep everything else will be "left" of them.
+ // This supports cumulative steps up to 15. We could support up to 16 by doing
+ // separate loads for |left_values| and |right_values|. vtbl supports 2 Q
+ // registers as input which would allow for cumulative offsets of 32.
+ // |sampler_0| indexes the first byte of each 16-bit value.
+ const int16x8_t sampler_0 =
+ vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes));
+ // |sampler_1| indexes the second byte of each 16-bit value.
+ const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1));
+ const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1);
+ const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]);
+ const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]);
+ const uint8x8_t right_values_0 =
+ vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t)));
+ const uint8x8_t right_values_1 =
+ vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t)));
+
+ const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f));
+ const uint16x8_t shift_0 =
+ vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
+ const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
+
+ int y = 0;
+ do {
+ uint16x8_t src_left, src_right;
+ LoadStepwise(
+ left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
+ left_values_0, right_values_0, left_values_1, right_values_1, &src_left,
+ &src_right);
+ const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0);
+
+ Store8(dst, val);
+ dst += stride;
+ } while (++y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_4xH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ // Representing positions along the row, which |zone_bounds| will target for
+ // the blending boundary.
+ const int16x4_t indices = {0, 1, 2, 3};
+
+ uint16x4x2_t top_vals;
+ int y = height;
+ do {
+ const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+ LoadEdgeVals(&top_vals, src, upsampled);
+
+ const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ const uint16x4_t val =
+ WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+ const uint16x4_t dst_blend = Load4U16(dest);
+ // |zone_bounds| values can be negative.
+ const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6));
+ const uint16x4_t output = vbsl_u16(blend, val, dst_blend);
+
+ Store4(dest, output);
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ } while (--y != 0);
+}
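The mask-and-select pair at the end of the loop amounts to a per-lane ternary. A scalar model (illustrative only):

#include <cstdint>

// Scalar model of the vcge/vbsl pair above for output lane |i|: lanes at
// or beyond the zone boundary take the Zone 1 value, lanes before it
// keep what the left-column pass already wrote.
inline uint16_t ZoneBlendLane(const int i, const int zone_bounds,
                              const uint16_t zone1_val,
                              const uint16_t existing_val) {
  // |zone_bounds| is 6-bit fixed point and can be negative.
  return (i >= (zone_bounds >> 6)) ? zone1_val : existing_val;
}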
+
+template <bool upsampled>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+ const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ // Representing positions along the row, which |zone_bounds| will target for
+ // the blending boundary.
+ const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ uint16x8x2_t top_vals;
+ int y = height;
+ do {
+ const uint16_t* const src = top_row + (top_x >> scale_bits_x);
+ LoadEdgeVals(&top_vals, src, upsampled);
+
+ const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ const uint16x8_t val =
+ WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0);
+
+ const uint16x8_t dst_blend = Load8U16(dest);
+ // |zone_bounds| values can be negative.
+ const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6));
+ const uint16x8_t output = vbslq_u16(blend, val, dst_blend);
+
+ Store8(dest, output);
+ dest += stride;
+ zone_bounds += xstep;
+ top_x -= xstep;
+ } while (--y != 0);
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
+// that do not correspond to angle derivatives are left at zero.
+// Notably, in cases with upsampling, the shuffle-invalid height is always
+// greater than the prediction height (which is 8 at maximum).
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
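Using the table is a single clamped lookup, as in the 8-bit path above (a sketch; the wrapper name is ours):

#include <algorithm>

// Height below which the 16-byte-shuffle path is safe for this |ystep|,
// clamped to the block height.
inline int ShuffleSafeHeight(const int ystep, const int height) {
  return std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
}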
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in sections
+// that permit it.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_4xH(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+
+ // Helper vector for index computation.
+ const int16x4_t zero_to_three = {0, 1, 2, 3};
+
+ // Loop increments for moving by block (4xN). Vertical still steps by 8. If
+ // it's only 4, it will be finished in the first iteration.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ const int min_height = (height == 4) ? 4 : 8;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. The following values need the full ystep as a relative offset.
+ const int16x4_t left_y =
+ vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep);
+
+ // This loop treats the 4 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ // Round down to the nearest multiple of 8.
+ // TODO(petersonab): Check if rounding to the nearest 4 is okay.
+ const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
+ stride >> 1, max_top_only_y, top_row,
+ -xstep);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ // +8 increment is OK because if height is 4 this only runs once.
+ for (; y < min_left_only_y;
+ y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_4xH(
+ dst, stride, min_height,
+ left_column + ((y - left_base_increment) << upsample_left_shift),
+ left_y, upsampled_left);
+
+ DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row,
+ xstep_bounds, top_x, xstep);
+ }
+
+ // Loop over y for left-only rows.
+ for (; y < height; y += 8, dst += stride8) {
+ // Angle expected by Zone3 is flipped about the 180 degree vector, which
+ // is the x-axis.
+ DirectionalZone3_4xH<upsampled_left>(
+ dst, stride, min_height, left_column + (y << upsample_left_shift),
+ -ystep);
+ }
+}
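The two y boundaries that split the three stages can be summarized in scalar form (a sketch of the bounds computed above for the 4-wide case; the helper is ours): rows before 64 / xstep use only |top_row|, rows from 256 / xstep on use only |left_column|, and the rows between blend the two.

#include <algorithm>

// Stage boundaries for the 4-wide Zone 2 column. Rows [0, *top_only)
// come from |top_row| alone; rows [*left_only, height) come from
// |left_column| alone; rows in between mix both edges.
inline void Zone2StageBounds4(const int xstep, const int height,
                              int* const top_only, int* const left_only) {
  *top_only = std::min((1 << 6) / xstep, height) & ~7;  // multiple of 8
  *left_only = std::min((4 /*width*/ << 6) / xstep, height);
}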
+
+// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and makes it
+// easier to reason about address safety.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_Wx4(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int xstep, const int ystep) {
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int min_top_only_x = std::min((4 * xstep) >> 6, width);
+ int x = 0;
+ for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) {
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3;
+ if (max_top_only_y != 0) {
+ DirectionalZone1_4xH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4,
+ top_row + (x << upsample_top_shift), -xstep);
+ continue;
+ }
+
+ DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep,
+ -ystep * x);
+
+ const int min_left_only_y = ((x + 4) << 6) / xstep;
+ if (min_left_only_y != 0) {
+ const int top_x = -xstep;
+ DirectionalZone1Blend_4xH<upsampled_top>(
+ dst_x, stride, 4, top_row + (x << upsample_top_shift),
+ xstep_bounds_base, top_x, xstep);
+ }
+ }
+ // Reached |min_top_only_x|.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4,
+ top_row + (x << upsample_top_shift), -xstep);
+ }
+}
+
+// Process a multiple of 8 |width|.
+template <bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep) {
+ if (height == 4) {
+ DirectionalZone2_Wx4<upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, width, xstep, ystep);
+ return;
+ }
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Helper vector.
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ // Loop increments for moving by block (8x8). This function handles blocks
+ // with height 4 as well. They are calculated in one pass so these variables
+ // do not get used.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+ const int ystep8 = ystep << 3;
+
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute and can therefore call the Zone1 functions. This assumes |xstep| is
+ // at least 3.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this subject to x.
+ const int max_shuffle_height =
+ std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+
+ // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
+ int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8);
+
+ // If the 64 scaling is regarded as a decimal point, the first value of the
+ // left_y vector omits the portion which is covered under the left_column
+ // offset. Following values need the full ystep as a relative offset.
+ int16x8_t left_y =
+ vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
+
+ // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ int x = 0;
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep);
+
+ if (max_top_only_y == height) continue;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_8xH(
+ dst_x, stride, 8,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+
+ DirectionalZone1Blend_8xH<upsampled_top>(
+ dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
+ top_x, xstep);
+ }
+
+ // Pick up from the last y-value, using the slower but secure method for
+ // left prediction.
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+
+ DirectionalZone1Blend_8xH<upsampled_top>(
+ dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
+ top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+ }
+ // Reached |min_top_only_x|.
+ if (x < width) {
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height,
+ top_row + (x << upsample_top_shift), -xstep);
+ }
+}
+
+// At this angle, neither edge is upsampled.
+// |min_width| is either 4 or 8.
+template <int min_width>
+void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int width, const int height) {
+ // Row y = 0 is simpler than the other rows: it is a plain copy from |top|.
+ memcpy(dst, top - 1, width * sizeof(top[0]));
+ dst += stride;
+
+ // If |height| > |width|, then there is a point at which top_row is no longer
+ // used in each row.
+ const int min_left_only_y = std::min(width, height);
+
+ int y = 1;
+ do {
+ // Example: If y is 4 (min_width), the dest row starts with left[3],
+ // left[2], left[1], left[0], because the angle points up. Therefore, the
+ // load starts at left[0] and is then reversed. If y is 2, the load starts at
+ // left[-2], and is reversed to store left[1], left[0], with negative values
+ // overwritten from |top_row|.
+ const uint16_t* const load_left = left + y - min_width;
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+
+ // Some values will be overwritten when |y| is not a multiple of
+ // |min_width|.
+ if (min_width == 4) {
+ const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left));
+ vst1_u16(dst16, left_toward_corner);
+ } else {
+ int x = 0;
+ do {
+ const uint16x8_t left_toward_corner =
+ vrev64q_u16(vld1q_u16(load_left - x));
+ vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+ vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+ x += 8;
+ } while (x < y);
+ }
+ // Entering |top|.
+ memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0]));
+ dst += stride;
+ } while (++y < min_left_only_y);
+
+ // Left only.
+ for (; y < height; ++y, dst += stride) {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16_t* const load_left = left + y - min_width;
+
+ int x = 0;
+ if (min_width == 4) {
+ const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x));
+ vst1_u16(dst16 + x, left_toward_corner);
+ } else {
+ do {
+ const uint16x8_t left_toward_corner =
+ vrev64q_u16(vld1q_u16(load_left - x));
+ vst1_u16(dst16 + x, vget_high_u16(left_toward_corner));
+ vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner));
+ x += 8;
+ } while (x < width);
+ }
+ }
+}
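A scalar reference for this diagonal (illustrative; it mirrors the reverse-then-copy structure above): every output pixel is a straight copy from one edge, with no blending.

#include <cstdint>

// Scalar model of DirectionalAngle135: the prediction at (x, y) is the
// edge pixel along the 135-degree diagonal. Reversed left values fill
// the start of each row; the top row, sliding one column to the right
// per row, fills the rest.
inline uint16_t Angle135Pixel(const uint16_t* const top,
                              const uint16_t* const left, const int x,
                              const int y) {
  return (x < y) ? left[y - 1 - x] : top[x - y - 1];
}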
+
+void DirectionalIntraPredictorZone2_NEON(
+ void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint16_t top_buffer[288];
+ uint16_t left_buffer[288];
+#if LIBGAV1_MSAN
+ memset(top_buffer, 0, sizeof(top_buffer));
+ memset(left_buffer, 0, sizeof(left_buffer));
+#endif // LIBGAV1_MSAN
+ memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16,
+ 160);
+ const uint16_t* top_ptr = top_buffer + 144;
+ const uint16_t* left_ptr = left_buffer + 144;
+ auto* dst = static_cast<uint8_t*>(dest);
+
+ if (width == 4) {
+ if (xstep == 64) {
+ assert(ystep == 64);
+ DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height);
+ return;
+ }
+ if (upsampled_top) {
+ if (upsampled_left) {
+ DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height,
+ xstep, ystep);
+ } else {
+ DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr,
+ height, xstep, ystep);
+ }
+ } else if (upsampled_left) {
+ DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height,
+ xstep, ystep);
+ } else {
+ DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height,
+ xstep, ystep);
+ }
+ return;
+ }
+
+ if (xstep == 64) {
+ assert(ystep == 64);
+ DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height);
+ return;
+ }
+ if (upsampled_top) {
+ if (upsampled_left) {
+ DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ } else {
+ DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ }
+ } else if (upsampled_left) {
+ DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ } else {
+ DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
+ }
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON;
dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
}