Diffstat (limited to 'src/dsp/arm/intrapred_directional_neon.cc')
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc  594
1 files changed, 586 insertions, 8 deletions
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 805ba81..3f5edbd 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm> // std::min
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
@@ -35,14 +35,14 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// Blend two values based on a 32 bit weight.
+// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
const uint8x8_t a_weight,
const uint8x8_t b_weight) {
const uint16x8_t a_product = vmull_u8(a, a_weight);
const uint16x8_t b_product = vmull_u8(b, b_weight);
- return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
}
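// Per lane this computes (a * a_weight + b * b_weight + 16) >> 5; the +16
// rounding term comes from vrshrn_n_u16 before it narrows back to 8 bits.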
// For vertical operations the weights are one constant value.
@@ -112,7 +112,7 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride,
// 4 wide subsamples the output. 8 wide subsamples the input.
if (width == 4) {
const uint8x8_t left_values = vld1_u8(top + top_base_x);
- const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t right_values = RightShiftVector<8>(left_values);
const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
// If |upsampled| is true then extract every other value for output.
@@ -910,12 +910,590 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const int a_weight, const int b_weight) {
+ const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+ const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16_t a_weight,
+ const uint16_t b_weight) {
+ const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
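+// Both overloads compute the same (a * a_weight + b * b_weight + 16) >> 5 as
+// the 8bpp version but stay in 16 bits throughout: the product of a 10-bit
+// sample and a weight of at most 32 still fits in uint16_t.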
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2_u16(source);
+ } else {
+ dest->val[0] = vld1_u16(source);
+ dest->val[1] = vld1_u16(source + 1);
+ }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2q_u16(source);
+ } else {
+ dest->val[0] = vld1q_u16(source);
+ dest->val[1] = vld1q_u16(source + 1);
+ }
+}
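+// In lane terms: non-upsampled rows give val[0][i] = source[i] and
+// val[1][i] = source[i + 1] (two overlapping loads), while upsampled rows
+// deinterleave with vld2 so val[0][i] = source[2 * i] and
+// val[1][i] = source[2 * i + 1].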
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
+ const int height, const uint16_t* const top,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (4 + height - 1) << upsample_shift;
+ const int16x4_t max_base = vdup_n_s16(max_base_x);
+ const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+ const int16x4_t index_offset = {0, 1, 2, 3};
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+ const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+ uint16x4x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x4_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // Lanes that index at or beyond |max_base_x| take the final top value
+    // instead of the blended result.
+ const uint16x4_t masked_result =
+ vbsl_u16(max_base_mask, combined, final_top_val);
+
+ vst1_u16(dst, masked_result);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+}
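+// Worked example of the base/shift split above (non-upsampled, so
+// index_scale_bits == 6): with e.g. xstep == 80, row y == 2 has
+// top_x == 3 * 80 == 240, top_base_x == 240 >> 6 == 3 and
+// shift_0 == (240 & 0x3F) >> 1 == 24, so the row blends top[3..6]
+// (weight 8) with top[4..7] (weight 24).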
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ do {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+
+ base_x = vaddq_s16(base_x, block_step);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
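+// Illustration of the |min_corner_only_y| cutoff: for an 8x8 block with
+// e.g. xstep == 128 and no upsampling, max_base_index == 15 and
+// xstep_units == 128 >> 6 == 2, so min_corner_only_y == min(15 / 2, 8) == 7;
+// row 7 already indexes past top[15] and is handled by Memset.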
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
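+  // The bound follows from requiring the rightmost sample of a row to stay
+  // in range: top_base_x + base_step * width <= max_base_index, where
+  // top_base_x == ((y + 1) * xstep) >> index_scale_bits.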
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ int x = 0;
+ do {
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ vst1q_u16(dst + x, combined);
+
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+ ~7;
+ for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+ base_x = vaddq_s16(base_x, block_step)) {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+ }
+ // Corner-only section of the row.
+ Memset(dst + x, top[max_base_index], width - x);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+ uint16_t* dst = static_cast<uint16_t*>(dest);
+ stride /= sizeof(top[0]);
+
+ assert(xstep > 0);
+
+ if (xstep == 64) {
+ assert(!upsampled_top);
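+    // With xstep == 64 the 6-bit fraction of |top_x| is always 0, so each
+    // output pixel is an unblended copy of a single top pixel and each row
+    // is the previous row shifted by one pixel: plain memcpys, four rows at
+    // a time.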
+ const uint16_t* top_ptr = top + 1;
+ const int width_bytes = width * sizeof(top[0]);
+ int y = height;
+ do {
+ memcpy(dst, top_ptr, width_bytes);
+ memcpy(dst + stride, top_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ } else {
+ if (width == 4) {
+ if (upsampled_top) {
+ DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+ } else {
+ DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+ }
+ } else if (width >= 32) {
+ if (upsampled_top) {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+ } else {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+ }
+ } else if (upsampled_top) {
+ DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+ } else {
+ DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes row-order
+// vectorization very complicated: at steep angles a single vector may need
+// source pixels spanning 16 or 32 positions, requiring multiple expensive
+// table lookups and checked loads. Rather than work in row order, it is
+// simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30 00 01 02 03
+// 01 11 21 31 10 11 12 13
+// 02 12 22 32 20 21 22 23
+// 03 13 23 33 30 31 32 33
+// ----------- --> -----------
+// 40 50 60 70 40 41 42 43
+// 41 51 61 71 50 51 52 53
+// 42 52 62 72 60 61 62 63
+// 43 53 63 73 70 71 72 73
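+//
+// Note that |dst| is byte-addressed (uint8_t*) and strides stay in bytes;
+// pixel offsets into the uint16_t output are scaled by sizeof(uint16_t)
+// where needed below.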
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x4_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x4x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x4(result);
+ Store4(dst, result[0]);
+ dst += stride;
+ Store4(dst, result[1]);
+ dst += stride;
+ Store4(dst, result[2]);
+ dst += stride;
+ Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
+ const int height, const uint16_t* const left,
+ const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ ystep);
+ dest += 4 * stride;
+ y += 4;
+ } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const uint16_t* const left,
+ const int ystep) {
+ int x = 0;
+ int base_left_y = 0;
+ do {
+ // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+ // 8x4 and 16x4.
+ DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+ base_left_y);
+ base_left_y += 4 * ystep;
+ x += 4;
+ } while (x < width);
+}
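+// Each strip above starts |ystep| further along the left edge per column:
+// strip x uses base_left_y == x * ystep, and the 4x4 kernel adds one more
+// |ystep| for each of its four columns.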
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[8];
+
+ int left_y = base_left_y + ystep;
+ uint16x8x2_t sampled_left_col;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose8x8(result);
+ Store8(dest, result[0]);
+ dest += stride;
+ Store8(dest, result[1]);
+ dest += stride;
+ Store8(dest, result[2]);
+ dest += stride;
+ Store8(dest, result[3]);
+ dest += stride;
+ Store8(dest, result[4]);
+ dest += stride;
+ Store8(dest, result[5]);
+ dest += stride;
+ Store8(dest, result[6]);
+ dest += stride;
+ Store8(dest, result[7]);
+}
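+// Each of the eight column steps above is the same recurrence:
+//   left_y += ystep;
+//   left_offset = left_y >> index_scale_bits;
+//   shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+//   shift_1 = 32 - shift_0;
+// followed by an 8-lane load and blend, written out eight times ahead of the
+// transpose.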
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const left, const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ // Zone3 never runs out of left_column values.
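+  // The deepest read combines the rightmost strip's fractional advance,
+  // ystep * width in 1/64ths, with the bottom block row's offset of
+  // base_step * (height - 1) samples: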
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> (6 - upsample_shift)) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+ int y = 0;
+ do {
+ int x = 0;
+ uint8_t* dst_x = dest + y * stride;
+ do {
+ const int base_left_y = ystep * x;
+ DirectionalZone3_8x8<upsampled>(
+ dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+ dst_x += 8 * sizeof(uint16_t);
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ if (ystep == 64) {
+ assert(!upsampled_left);
+ const int width_bytes = width * sizeof(left[0]);
+ int y = height;
+    const uint16_t* left_ptr = left + 1;
+    do {
+ memcpy(dst, left_ptr, width_bytes);
+ memcpy(dst + stride, left_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ left_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+ if (width == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ } else {
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ }
+ } else if (height == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ } else {
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ }
+ } else {
+ if (upsampled_left) {
+ // |upsampled_left| can only be true if |width| + |height| <= 16,
+ // therefore this is 8x8.
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+ } else {
+ DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+ }
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
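+  // directional_intra_predictor_zone2 is not assigned here, so 10bpp zone 2
+  // keeps the existing non-NEON implementation.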
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {