Diffstat (limited to 'src/dsp/arm/intrapred_directional_neon.cc')
-rw-r--r-- | src/dsp/arm/intrapred_directional_neon.cc | 901
1 file changed, 787 insertions, 114 deletions
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index 3f5edbd..3cad4a6 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -29,6 +29,7 @@ #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" namespace libgav1 { namespace dsp { @@ -40,9 +41,9 @@ inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b, const uint8x8_t a_weight, const uint8x8_t b_weight) { const uint16x8_t a_product = vmull_u8(a, a_weight); - const uint16x8_t b_product = vmull_u8(b, b_weight); + const uint16x8_t sum = vmlal_u8(a_product, b, b_weight); - return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/); + return vrshrn_n_u16(sum, 5 /*log2(32)*/); } // For vertical operations the weights are one constant value. @@ -52,9 +53,9 @@ inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b, } // Fill |left| and |right| with the appropriate values for a given |base_step|. -inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step, - const uint8x8_t right_step, uint8x8_t* left, - uint8x8_t* right) { +inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source, + const uint8x8_t left_step, const uint8x8_t right_step, + uint8x8_t* left, uint8x8_t* right) { const uint8x16_t mixed = vld1q_u8(source); *left = VQTbl1U8(mixed, left_step); *right = VQTbl1U8(mixed, right_step); @@ -62,17 +63,18 @@ inline void LoadStepwise(const uint8_t* const source, const uint8x8_t left_step, // Handle signed step arguments by ignoring the sign. Negative values are // considered out of range and overwritten later. -inline void LoadStepwise(const uint8_t* const source, const int8x8_t left_step, - const int8x8_t right_step, uint8x8_t* left, - uint8x8_t* right) { +inline void LoadStepwise(const uint8_t* LIBGAV1_RESTRICT const source, + const int8x8_t left_step, const int8x8_t right_step, + uint8x8_t* left, uint8x8_t* right) { LoadStepwise(source, vreinterpret_u8_s8(left_step), vreinterpret_u8_s8(right_step), left, right); } // Process 4 or 8 |width| by any |height|. template <int width> -inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, - const int height, const uint8_t* const top, +inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const top, const int xstep, const bool upsampled) { assert(width == 4 || width == 8); @@ -142,10 +144,11 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, // Process a multiple of 8 |width| by any |height|. Processes horizontally // before vertically in the hopes of being a little more cache friendly. 
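// The WeightedBlend change above folds the second vmull_u8 into a
// multiply-accumulate (vmlal_u8), removing the separate vaddq_u16. For
// reference, a minimal scalar sketch of the blend being computed;
// BlendPixel is a hypothetical name, not part of this file:
#include <cstdint>

// Blends |a| and |b| with weights summing to 32, rounding to nearest,
// exactly like vrshrn_n_u16(vmlal_u8(vmull_u8(a, a_weight), b, b_weight), 5).
inline uint8_t BlendPixel(uint8_t a, uint8_t b, uint8_t a_weight,
                          uint8_t b_weight) {
  return static_cast<uint8_t>((a * a_weight + b * b_weight + 16) >> 5);
}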
-inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, - const int width, const int height, - const uint8_t* const top, const int xstep, - const bool upsampled) { +inline void DirectionalZone1_WxH(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const int width, + const int height, + const uint8_t* LIBGAV1_RESTRICT const top, + const int xstep, const bool upsampled) { assert(width % 8 == 0); const int upsample_shift = static_cast<int>(upsampled); const int scale_bits = 6 - upsample_shift; @@ -203,14 +206,12 @@ inline void DirectionalZone1_WxH(uint8_t* dst, const ptrdiff_t stride, } while (++y < height); } -void DirectionalIntraPredictorZone1_NEON(void* const dest, - const ptrdiff_t stride, - const void* const top_row, - const int width, const int height, - const int xstep, - const bool upsampled_top) { - const uint8_t* const top = static_cast<const uint8_t*>(top_row); - uint8_t* dst = static_cast<uint8_t*>(dest); +void DirectionalIntraPredictorZone1_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, const int width, + const int height, const int xstep, const bool upsampled_top) { + const auto* const top = static_cast<const uint8_t*>(top_row); + auto* dst = static_cast<uint8_t*>(dest); assert(xstep > 0); @@ -282,11 +283,10 @@ void DirectionalIntraPredictorZone1_NEON(void* const dest, // Process 4 or 8 |width| by 4 or 8 |height|. template <int width> -inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride, - const int height, - const uint8_t* const left_column, - const int base_left_y, const int ystep, - const int upsample_shift) { +inline void DirectionalZone3_WxH( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int base_left_y, + const int ystep, const int upsample_shift) { assert(width == 4 || width == 8); assert(height == 4 || height == 8); const int scale_bits = 6 - upsample_shift; @@ -417,12 +417,10 @@ constexpr int kPositiveIndexOffset = 15; // Process 4 or 8 |width| by any |height|. template <int width> -inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst, - const ptrdiff_t stride, - const int height, - const uint8_t* const left_column, - const int16x8_t left_y, - const int upsample_shift) { +inline void DirectionalZone2FromLeftCol_WxH( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, + const int upsample_shift) { assert(width == 4 || width == 8); // The shift argument must be a constant. @@ -468,12 +466,10 @@ inline void DirectionalZone2FromLeftCol_WxH(uint8_t* dst, // Process 4 or 8 |width| by any |height|. template <int width> -inline void DirectionalZone1Blend_WxH(uint8_t* dest, const ptrdiff_t stride, - const int height, - const uint8_t* const top_row, - int zone_bounds, int top_x, - const int xstep, - const int upsample_shift) { +inline void DirectionalZone1Blend_WxH( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, + const uint8_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, + const int xstep, const int upsample_shift) { assert(width == 4 || width == 8); const int scale_bits_x = 6 - upsample_shift; @@ -523,12 +519,12 @@ constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { // then handle only blocks that take from |left_ptr|. 
Additionally, a fast // index-shuffle approach is used for pred values from |left_column| in sections // that permit it. -inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride, - const uint8_t* const top_row, - const uint8_t* const left_column, - const int height, const int xstep, - const int ystep, const bool upsampled_top, - const bool upsampled_left) { +inline void DirectionalZone2_4xH( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep, const bool upsampled_top, + const bool upsampled_left) { const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); @@ -564,8 +560,8 @@ inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride, // If the 64 scaling is regarded as a decimal point, the first value of the // left_y vector omits the portion which is covered under the left_column // offset. The following values need the full ystep as a relative offset. - int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep); - left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder)); + const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); + const int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only @@ -639,13 +635,12 @@ inline void DirectionalZone2_4xH(uint8_t* dst, const ptrdiff_t stride, } // Process a multiple of 8 |width|. -inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride, - const uint8_t* const top_row, - const uint8_t* const left_column, - const int width, const int height, - const int xstep, const int ystep, - const bool upsampled_top, - const bool upsampled_left) { +inline void DirectionalZone2_8( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint8_t* LIBGAV1_RESTRICT const top_row, + const uint8_t* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { const int upsample_left_shift = static_cast<int>(upsampled_left); const int upsample_top_shift = static_cast<int>(upsampled_top); @@ -668,12 +663,6 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride, assert(xstep >= 3); const int min_top_only_x = std::min((height * xstep) >> 6, width); - // For steep angles, the source pixels from |left_column| may not fit in a - // 16-byte load for shuffling. - // TODO(petersonab): Find a more precise formula for this subject to x. - const int max_shuffle_height = - std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); - // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; @@ -687,8 +676,8 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride, // If the 64 scaling is regarded as a decimal point, the first value of the // left_y vector omits the portion which is covered under the left_column // offset. Following values need the full ystep as a relative offset. 
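// As in the 4xH function above, |left_y| is built here with a single
// multiply-accumulate (vmlaq_n_s16) instead of a multiply followed by an
// add. A scalar model of the lanes it produces (ComputeLeftY is an
// illustrative helper, not part of this file):
#include <cstdint>

inline void ComputeLeftY(int16_t left_y[8], int ystep) {
  const int ystep_remainder = ystep & 0x3F;  // Fractional part, 1/64 units.
  for (int i = 0; i < 8; ++i) {
    // Lane 0 carries only the remainder; later lanes add the full -ystep.
    left_y[i] = static_cast<int16_t>(-ystep_remainder - i * ystep);
  }
}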
- int16x8_t left_y = vmulq_n_s16(zero_to_seven, -ystep); - left_y = vaddq_s16(left_y, vdupq_n_s16(-ystep_remainder)); + const int16x8_t remainder = vdupq_n_s16(-ystep_remainder); + int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep); // This loop treats each set of 4 columns in 3 stages with y-value boundaries. // The first stage, before the first y-loop, covers blocks that are only @@ -696,12 +685,21 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride, // blocks that have a mixture of values computed from top or left. The final // stage covers blocks that are only computed from the left. int x = 0; + // For steep angles, the source pixels from |left_column| may not fit in a + // 16-byte load for shuffling. |d| represents the number of pixels that can + // fit in one contiguous vector when stepping by |ystep|. For a given x + // position, the left column values can be obtained by VTBL as long as the + // values at row[x + d] and beyond come from the top row. However, this does + // not guarantee that the vector will also contain all of the values needed + // from top row. + const int d = 16 / ((ystep >> 6) + 1); for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, xstep_bounds_base -= (8 << 6), left_y = vsubq_s16(left_y, increment_left8), left_offset -= left_base_increment8) { uint8_t* dst_x = dst + x; - + const int max_shuffle_height = + std::min(((x + d) << 6) / xstep, height) & ~7; // Round down to the nearest multiple of 8. const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y, @@ -770,14 +768,20 @@ inline void DirectionalZone2_8(uint8_t* const dst, const ptrdiff_t stride, } void DirectionalIntraPredictorZone2_NEON( - void* const dest, const ptrdiff_t stride, const void* const top_row, - const void* const left_column, const int width, const int height, - const int xstep, const int ystep, const bool upsampled_top, - const bool upsampled_left) { + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { // Increasing the negative buffer for this function allows more rows to be // processed at a time without branching in an inner loop to check the base. 
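// A worked example of the per-column shuffle bound |d| introduced above,
// with illustrative values (MaxShuffleHeight is not a function in this
// file):
#include <algorithm>

inline int MaxShuffleHeight(int x, int xstep, int ystep, int height) {
  // Number of left-column pixels reachable by one 16-byte table lookup.
  const int d = 16 / ((ystep >> 6) + 1);
  // Rows beyond this bound would index pixels outside the 16-byte load;
  // round down to a multiple of 8 because rows are processed 8 at a time.
  return std::min(((x + d) << 6) / xstep, height) & ~7;
}
// e.g. MaxShuffleHeight(/*x=*/8, /*xstep=*/64, /*ystep=*/192, /*height=*/32)
// yields 8: ystep >> 6 == 3, so d == 4, and ((8 + 4) << 6) / 64 == 12,
// rounded down to 8.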
uint8_t top_buffer[288]; uint8_t left_buffer[288]; +#if LIBGAV1_MSAN + memset(top_buffer, 0, sizeof(top_buffer)); + memset(left_buffer, 0, sizeof(left_buffer)); +#endif // LIBGAV1_MSAN + memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160); memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160); const uint8_t* top_ptr = top_buffer + 144; @@ -793,12 +797,10 @@ void DirectionalIntraPredictorZone2_NEON( } } -void DirectionalIntraPredictorZone3_NEON(void* const dest, - const ptrdiff_t stride, - const void* const left_column, - const int width, const int height, - const int ystep, - const bool upsampled_left) { +void DirectionalIntraPredictorZone3_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int ystep, const bool upsampled_left) { const auto* const left = static_cast<const uint8_t*>(left_column); assert(ystep > 0); @@ -819,7 +821,7 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest, do { int x = 0; do { - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); dst += y * stride + x; uint8x8_t left_v[4], right_v[4], value_v[4]; const int ystep_base = ystep * x; @@ -886,7 +888,7 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest, do { int x = 0; do { - uint8_t* dst = static_cast<uint8_t*>(dest); + auto* dst = static_cast<uint8_t*>(dest); dst += y * stride + x; const int ystep_base = ystep * (x + 1); @@ -934,7 +936,8 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, } // Each element of |dest| contains values associated with one weight value. -inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source, +inline void LoadEdgeVals(uint16x4x2_t* dest, + const uint16_t* LIBGAV1_RESTRICT const source, const bool upsampled) { if (upsampled) { *dest = vld2_u16(source); @@ -945,7 +948,8 @@ inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source, } // Each element of |dest| contains values associated with one weight value. -inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source, +inline void LoadEdgeVals(uint16x8x2_t* dest, + const uint16_t* LIBGAV1_RESTRICT const source, const bool upsampled) { if (upsampled) { *dest = vld2q_u16(source); @@ -956,8 +960,9 @@ inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source, } template <bool upsampled> -inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride, - const int height, const uint16_t* const top, +inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const top, const int xstep) { const int upsample_shift = static_cast<int>(upsampled); const int index_scale_bits = 6 - upsample_shift; @@ -1007,9 +1012,11 @@ inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride, // Process a multiple of 8 |width| by any |height|. Processes horizontally // before vertically in the hopes of being a little more cache friendly. 
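// The 4xH helper above and the width-multiple-of-8 helper below share the
// same per-row arithmetic: one fractional position selects a base index and
// a pair of 5-bit weights for the whole row. A scalar model of one output
// row, a sketch following 7.11.2.4 (PredictZone1Row is not part of this
// file):
#include <cstdint>

inline void PredictZone1Row(uint16_t* dst, const uint16_t* top, int width,
                            int row, int xstep, bool upsampled) {
  const int upsample_shift = static_cast<int>(upsampled);
  const int index_scale_bits = 6 - upsample_shift;
  const int top_x = (row + 1) * xstep;           // Position in 1/64 pels.
  const int base_x = top_x >> index_scale_bits;  // Index of the left sample.
  // Weight of the right-hand neighbor, in 1/32 units.
  const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3F) >> 1;
  const uint16_t shift_1 = 32 - shift_0;
  const int step = 1 << upsample_shift;  // Upsampled edges hold 2x samples.
  for (int x = 0; x < width; ++x) {
    const uint16_t a = top[base_x + x * step];
    const uint16_t b = top[base_x + x * step + 1];
    dst[x] = static_cast<uint16_t>((a * shift_1 + b * shift_0 + 16) >> 5);
  }
}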
template <bool upsampled> -inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride, - const int width, const int height, - const uint16_t* const top, const int xstep) { +inline void DirectionalZone1_WxH(uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const int width, + const int height, + const uint16_t* LIBGAV1_RESTRICT const top, + const int xstep) { assert(width % 8 == 0); const int upsample_shift = static_cast<int>(upsampled); const int index_scale_bits = 6 - upsample_shift; @@ -1068,10 +1075,11 @@ inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride, // Process a multiple of 8 |width| by any |height|. Processes horizontally // before vertically in the hopes of being a little more cache friendly. -inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride, - const int width, const int height, - const uint16_t* const top, const int xstep, - const bool upsampled) { +inline void DirectionalZone1_Large(uint16_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, const int width, + const int height, + const uint16_t* LIBGAV1_RESTRICT const top, + const int xstep, const bool upsampled) { assert(width % 8 == 0); const int upsample_shift = static_cast<int>(upsampled); const int index_scale_bits = 6 - upsample_shift; @@ -1156,13 +1164,12 @@ inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride, } } -void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride, - const void* const top_row, - const int width, const int height, - const int xstep, - const bool upsampled_top) { - const uint16_t* const top = static_cast<const uint16_t*>(top_row); - uint16_t* dst = static_cast<uint16_t*>(dest); +void DirectionalIntraPredictorZone1_NEON( + void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, const int width, + const int height, const int xstep, const bool upsampled_top) { + const auto* const top = static_cast<const uint16_t*>(top_row); + auto* dst = static_cast<uint16_t*>(dest); stride /= sizeof(top[0]); assert(xstep > 0); @@ -1225,9 +1232,10 @@ void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride, // 42 52 62 72 60 61 62 63 // 43 53 63 73 70 71 72 73 template <bool upsampled> -inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride, - const uint16_t* const left, const int ystep, - const int base_left_y = 0) { +inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { const int upsample_shift = static_cast<int>(upsampled); const int index_scale_bits = 6 - upsample_shift; @@ -1278,8 +1286,9 @@ inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride, } template <bool upsampled> -inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride, - const int height, const uint16_t* const left, +inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const left, const int ystep) { const int upsample_shift = static_cast<int>(upsampled); int y = 0; @@ -1292,8 +1301,9 @@ inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride, } template <bool upsampled> -inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride, - const int width, const uint16_t* const left, +inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t stride, const int width, + const uint16_t* 
LIBGAV1_RESTRICT const left, const int ystep) { int x = 0; int base_left_y = 0; @@ -1308,9 +1318,10 @@ inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride, } template <bool upsampled> -inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride, - const uint16_t* const left, const int ystep, - const int base_left_y = 0) { +inline void DirectionalZone3_8x8(uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep, const int base_left_y = 0) { const int upsample_shift = static_cast<int>(upsampled); const int index_scale_bits = 6 - upsample_shift; @@ -1400,9 +1411,11 @@ inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride, } template <bool upsampled> -inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride, - const int width, const int height, - const uint16_t* const left, const int ystep) { +inline void DirectionalZone3_WxH(uint8_t* LIBGAV1_RESTRICT dest, + const ptrdiff_t stride, const int width, + const int height, + const uint16_t* LIBGAV1_RESTRICT const left, + const int ystep) { const int upsample_shift = static_cast<int>(upsampled); // Zone3 never runs out of left_column values. assert((width + height - 1) << upsample_shift > // max_base_y @@ -1424,14 +1437,12 @@ inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride, } while (y < height); } -void DirectionalIntraPredictorZone3_NEON(void* const dest, - const ptrdiff_t stride, - const void* const left_column, - const int width, const int height, - const int ystep, - const bool upsampled_left) { - const uint16_t* const left = static_cast<const uint16_t*>(left_column); - uint8_t* dst = static_cast<uint8_t*>(dest); +void DirectionalIntraPredictorZone3_NEON( + void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int ystep, const bool upsampled_left) { + const auto* const left = static_cast<const uint16_t*>(left_column); + auto* dst = static_cast<uint8_t*>(dest); if (ystep == 64) { assert(!upsampled_left); @@ -1472,10 +1483,672 @@ void DirectionalIntraPredictorZone3_NEON(void* const dest, } } +// ----------------------------------------------------------------------------- +// Zone2 +// This function deals with cases not found in zone 1 or zone 3. The extreme +// angles are 93, which makes for sharp ascents along |left_column| with each +// successive dest row element until reaching |top_row|, and 177, with a shallow +// ascent up |left_column| until reaching large jumps along |top_row|. In the +// extremely steep cases, source vectors can only be loaded one lane at a time. + +// Fill |left| and |right| with the appropriate values for a given |base_step|. 
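// The overloads below gather 16-bit pixels with byte-wise table lookups
// (VQTbl2U8), so each pixel index must first be expanded to a byte pair. A
// scalar sketch of that expansion (ExpandToByteIndices is illustrative, not
// part of this file):
#include <cstdint>

inline void ExpandToByteIndices(const int8_t pixel_index[4],
                                uint8_t byte_index[8]) {
  for (int i = 0; i < 4; ++i) {
    byte_index[2 * i] = static_cast<uint8_t>(2 * pixel_index[i]);  // Low byte.
    byte_index[2 * i + 1] =
        static_cast<uint8_t>(2 * pixel_index[i] + 1);  // High byte.
  }
}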
+inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source, + const uint8x8_t left_step, const uint8x8_t right_step, + uint16x4_t* left, uint16x4_t* right) { + const uint8x16x2_t mixed = { + vld1q_u8(static_cast<const uint8_t*>(source)), + vld1q_u8(static_cast<const uint8_t*>(source) + 16)}; + *left = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step)); + *right = vreinterpret_u16_u8(VQTbl2U8(mixed, right_step)); +} + +inline void LoadStepwise(const void* LIBGAV1_RESTRICT const source, + const uint8x8_t left_step_0, + const uint8x8_t right_step_0, + const uint8x8_t left_step_1, + const uint8x8_t right_step_1, uint16x8_t* left, + uint16x8_t* right) { + const uint8x16x2_t mixed = { + vld1q_u8(static_cast<const uint8_t*>(source)), + vld1q_u8(static_cast<const uint8_t*>(source) + 16)}; + const uint16x4_t left_low = vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_0)); + const uint16x4_t left_high = + vreinterpret_u16_u8(VQTbl2U8(mixed, left_step_1)); + *left = vcombine_u16(left_low, left_high); + const uint16x4_t right_low = + vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_0)); + const uint16x4_t right_high = + vreinterpret_u16_u8(VQTbl2U8(mixed, right_step_1)); + *right = vcombine_u16(right_low, right_high); +} + +// Blend two values based on weight pairs that each sum to 32. +inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b, + const uint16x4_t a_weight, + const uint16x4_t b_weight) { + const uint16x4_t a_product = vmul_u16(a, a_weight); + const uint16x4_t sum = vmla_u16(a_product, b, b_weight); + + return vrshr_n_u16(sum, 5 /*log2(32)*/); +} + +// Blend two values based on weight pairs that each sum to 32. +inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b, + const uint16x8_t a_weight, + const uint16x8_t b_weight) { + const uint16x8_t a_product = vmulq_u16(a, a_weight); + const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight); + + return vrshrq_n_u16(sum, 5 /*log2(32)*/); +} + +// Because the source values "move backwards" as the row index increases, the +// indices derived from ystep are generally negative in localized functions. +// This is accommodated by making sure the relative indices are within [-15, 0] +// when the function is called, and sliding them into the inclusive range +// [0, 15], relative to a lower base address. 15 is the Pixel offset, so 30 is +// the byte offset for table lookups. + +constexpr int kPositiveIndexOffsetPixels = 15; +constexpr int kPositiveIndexOffsetBytes = 30; + +inline void DirectionalZone2FromLeftCol_4xH( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x4_t left_y, + const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + + const int index_scale_bits = 6; + // The values in |offset_y| are negative, except for the first element, which + // is zero. + int16x4_t offset_y; + int16x4_t shift_upsampled = left_y; + // The shift argument must be a constant, otherwise use upsample_shift + // directly. + if (upsampled) { + offset_y = vshr_n_s16(left_y, index_scale_bits - 1 /*upsample_shift*/); + shift_upsampled = vshl_n_s16(shift_upsampled, 1); + } else { + offset_y = vshr_n_s16(left_y, index_scale_bits); + } + offset_y = vshl_n_s16(offset_y, 1); + + // Select values to the left of the starting point. + // The 15th element (and 16th) will be all the way at the end, to the + // right. With a negative ystep everything else will be "left" of them. + // This supports cumulative steps up to 15. 
We could support up to 16 by + // doing separate loads for |left_values| and |right_values|. vtbl + // supports 2 Q registers as input which would allow for cumulative + // offsets of 32. + // |sampler_0| indexes the first byte of each 16-bit value. + const int16x4_t sampler_0 = + vadd_s16(offset_y, vdup_n_s16(kPositiveIndexOffsetBytes)); + // |sampler_1| indexes the second byte of each 16-bit value. + const int16x4_t sampler_1 = vadd_s16(sampler_0, vdup_n_s16(1)); + const int16x4x2_t sampler = vzip_s16(sampler_0, sampler_1); + const uint8x8_t left_indices = + vqmovun_s16(vcombine_s16(sampler.val[0], sampler.val[1])); + const uint8x8_t right_indices = + vadd_u8(left_indices, vdup_n_u8(sizeof(uint16_t))); + + const int16x4_t shift_masked = vand_s16(shift_upsampled, vdup_n_s16(0x3f)); + const uint16x4_t shift_0 = vreinterpret_u16_s16(vshr_n_s16(shift_masked, 1)); + const uint16x4_t shift_1 = vsub_u16(vdup_n_u16(32), shift_0); + + int y = 0; + do { + uint16x4_t src_left, src_right; + LoadStepwise( + left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), + left_indices, right_indices, &src_left, &src_right); + const uint16x4_t val = WeightedBlend(src_left, src_right, shift_1, shift_0); + + Store4(dst, val); + dst += stride; + } while (++y < height); +} + +inline void DirectionalZone2FromLeftCol_8xH( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y, + const bool upsampled) { + const int upsample_shift = static_cast<int>(upsampled); + + const int index_scale_bits = 6; + // The values in |offset_y| are negative, except for the first element, which + // is zero. + int16x8_t offset_y = left_y; + int16x8_t shift_upsampled = left_y; + // The shift argument must be a constant, otherwise use upsample_shift + // directly. + if (upsampled) { + offset_y = vshrq_n_s16(left_y, index_scale_bits - 1); + shift_upsampled = vshlq_n_s16(shift_upsampled, 1); + } else { + offset_y = vshrq_n_s16(left_y, index_scale_bits); + } + offset_y = vshlq_n_s16(offset_y, 1); + + // Select values to the left of the starting point. + // The 15th element (and 16th) will be all the way at the end, to the right. + // With a negative ystep everything else will be "left" of them. + // This supports cumulative steps up to 15. We could support up to 16 by doing + // separate loads for |left_values| and |right_values|. vtbl supports 2 Q + // registers as input which would allow for cumulative offsets of 32. + // |sampler_0| indexes the first byte of each 16-bit value. + const int16x8_t sampler_0 = + vaddq_s16(offset_y, vdupq_n_s16(kPositiveIndexOffsetBytes)); + // |sampler_1| indexes the second byte of each 16-bit value. 
+ const int16x8_t sampler_1 = vaddq_s16(sampler_0, vdupq_n_s16(1)); + const int16x8x2_t sampler = vzipq_s16(sampler_0, sampler_1); + const uint8x8_t left_values_0 = vqmovun_s16(sampler.val[0]); + const uint8x8_t left_values_1 = vqmovun_s16(sampler.val[1]); + const uint8x8_t right_values_0 = + vadd_u8(left_values_0, vdup_n_u8(sizeof(uint16_t))); + const uint8x8_t right_values_1 = + vadd_u8(left_values_1, vdup_n_u8(sizeof(uint16_t))); + + const int16x8_t shift_masked = vandq_s16(shift_upsampled, vdupq_n_s16(0x3f)); + const uint16x8_t shift_0 = + vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1)); + const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0); + + int y = 0; + do { + uint16x8_t src_left, src_right; + LoadStepwise( + left_column - kPositiveIndexOffsetPixels + (y << upsample_shift), + left_values_0, right_values_0, left_values_1, right_values_1, &src_left, + &src_right); + const uint16x8_t val = WeightedBlend(src_left, src_right, shift_1, shift_0); + + Store8(dst, val); + dst += stride; + } while (++y < height); +} + +template <bool upsampled> +inline void DirectionalZone1Blend_4xH( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, + const int xstep) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits_x = 6 - upsample_shift; + + // Representing positions along the row, which |zone_bounds| will target for + // the blending boundary. + const int16x4_t indices = {0, 1, 2, 3}; + + uint16x4x2_t top_vals; + int y = height; + do { + const uint16_t* const src = top_row + (top_x >> scale_bits_x); + LoadEdgeVals(&top_vals, src, upsampled); + + const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + const uint16x4_t val = + WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0); + + const uint16x4_t dst_blend = Load4U16(dest); + // |zone_bounds| values can be negative. + const uint16x4_t blend = vcge_s16(indices, vdup_n_s16(zone_bounds >> 6)); + const uint16x4_t output = vbsl_u16(blend, val, dst_blend); + + Store4(dest, output); + dest += stride; + zone_bounds += xstep; + top_x -= xstep; + } while (--y != 0); +} + +template <bool upsampled> +inline void DirectionalZone1Blend_8xH( + uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height, + const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x, + const int xstep) { + const int upsample_shift = static_cast<int>(upsampled); + const int scale_bits_x = 6 - upsample_shift; + + // Representing positions along the row, which |zone_bounds| will target for + // the blending boundary. + const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7}; + + uint16x8x2_t top_vals; + int y = height; + do { + const uint16_t* const src = top_row + (top_x >> scale_bits_x); + LoadEdgeVals(&top_vals, src, upsampled); + + const uint16_t shift_0 = ((top_x << upsample_shift) & 0x3f) >> 1; + const uint16_t shift_1 = 32 - shift_0; + + const uint16x8_t val = + WeightedBlend(top_vals.val[0], top_vals.val[1], shift_1, shift_0); + + const uint16x8_t dst_blend = Load8U16(dest); + // |zone_bounds| values can be negative. 
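// A scalar model of the select performed by the vcgeq_s16/vbslq_u16 pair
// below: lanes before the zone boundary keep the value already in |dest|
// (computed from the left edge); lanes at or past it take the top-edge
// blend. BlendAtZoneBoundary is an illustrative helper, not part of this
// file:
#include <cstdint>

inline void BlendAtZoneBoundary(uint16_t dest[8], const uint16_t top_val[8],
                                int zone_bounds /* 1/64 units */) {
  // |zone_bounds| may be negative, in which case every lane takes |top_val|.
  for (int i = 0; i < 8; ++i) {
    if (i >= (zone_bounds >> 6)) dest[i] = top_val[i];
  }
}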
+ const uint16x8_t blend = vcgeq_s16(indices, vdupq_n_s16(zone_bounds >> 6)); + const uint16x8_t output = vbslq_u16(blend, val, dst_blend); + + Store8(dest, output); + dest += stride; + zone_bounds += xstep; + top_x -= xstep; + } while (--y != 0); +} + +// The height at which a load of 16 bytes will not contain enough source pixels +// from |left_column| to supply an accurate row when computing 8 pixels at a +// time. The values are found by inspection. By coincidence, all angles that +// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up +// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices +// that do not correspond to angle derivatives are left at zero. +// Notably, in cases with upsampling, the shuffle-invalid height is always +// greater than the prediction height (which is 8 at maximum). +constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = { + 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40}; + +// 7.11.2.4 (8) 90 < angle < 180 +// The strategy for these functions (4xH and 8+xH) is to know how many blocks +// can be processed with just pixels from |top_ptr|, then handle mixed blocks, +// then handle only blocks that take from |left_ptr|. Additionally, a fast +// index-shuffle approach is used for pred values from |left_column| in sections +// that permit it. +template <bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_4xH( + uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int height, + const int xstep, const int ystep) { + const int upsample_left_shift = static_cast<int>(upsampled_left); + + // Helper vector for index computation. + const int16x4_t zero_to_three = {0, 1, 2, 3}; + + // Loop increments for moving by block (4xN). Vertical still steps by 8. If + // it's only 4, it will be finished in the first iteration. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + + const int min_height = (height == 4) ? 4 : 8; + + // All columns from |min_top_only_x| to the right will only need |top_row| to + // compute and can therefore call the Zone1 functions. This assumes |xstep| is + // at least 3. + assert(xstep >= 3); + + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; + + const int left_base_increment = ystep >> 6; + const int ystep_remainder = ystep & 0x3F; + + // If the 64 scaling is regarded as a decimal point, the first value of the + // left_y vector omits the portion which is covered under the left_column + // offset. The following values need the full ystep as a relative offset. + const int16x4_t left_y = + vmla_n_s16(vdup_n_s16(-ystep_remainder), zero_to_three, -ystep); + + // This loop treats the 4 columns in 3 stages with y-value boundaries. + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + // Round down to the nearest multiple of 8. + // TODO(petersonab): Check if rounding to the nearest 4 is okay.
+ const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7; + DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst), + stride >> 1, max_top_only_y, top_row, + -xstep); + + if (max_top_only_y == height) return; + + int y = max_top_only_y; + dst += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. + const int min_left_only_y = std::min((4 /*width*/ << 6) / xstep, height); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + // +8 increment is OK because if height is 4 this only runs once. + for (; y < min_left_only_y; + y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + DirectionalZone2FromLeftCol_4xH( + dst, stride, min_height, + left_column + ((y - left_base_increment) << upsample_left_shift), + left_y, upsampled_left); + + DirectionalZone1Blend_4xH<upsampled_top>(dst, stride, min_height, top_row, + xstep_bounds, top_x, xstep); + } + + // Loop over y for left-only rows. + for (; y < height; y += 8, dst += stride8) { + // Angle expected by Zone3 is flipped about the 180 degree vector, which + // is the x-axis. + DirectionalZone3_4xH<upsampled_left>( + dst, stride, min_height, left_column + (y << upsample_left_shift), + -ystep); + } +} + +// Process 8x4 and 16x4 blocks. This avoids a lot of overhead and simplifies +// address safety. +template <bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_Wx4( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int width, + const int xstep, const int ystep) { + const int upsample_top_shift = static_cast<int>(upsampled_top); + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; + + const int min_top_only_x = std::min((4 * xstep) >> 6, width); + int x = 0; + for (; x < min_top_only_x; x += 4, xstep_bounds_base -= (4 << 6)) { + uint8_t* dst_x = dst + x * sizeof(uint16_t); + + // Round down to the nearest multiple of 4. + const int max_top_only_y = (((x + 1) << 6) / xstep) & ~3; + if (max_top_only_y != 0) { + DirectionalZone1_4xH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 4, + top_row + (x << upsample_top_shift), -xstep); + continue; + } + + DirectionalZone3_4x4<upsampled_left>(dst_x, stride, left_column, -ystep, + -ystep * x); + + const int min_left_only_y = ((x + 4) << 6) / xstep; + if (min_left_only_y != 0) { + const int top_x = -xstep; + DirectionalZone1Blend_4xH<upsampled_top>( + dst_x, stride, 4, top_row + (x << upsample_top_shift), + xstep_bounds_base, top_x, xstep); + } + } + // Reached |min_top_only_x|. + for (; x < width; x += 4) { + DirectionalZone1_4xH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, 4, + top_row + (x << upsample_top_shift), -xstep); + } +} + +// Process a multiple of 8 |width|. 
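// DirectionalZone2_8 below reads kDirectionalZone2ShuffleInvalidHeight at
// index ystep >> 6. With illustrative values: ystep == 448 gives index 7 and
// table value 0, so max_shuffle_height == std::min(0, height) == 0 and the
// fast shuffle path is skipped; ystep == 130 gives index 2 and table value
// 16, so shuffling is usable for any height up to 16.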
+template <bool upsampled_top, bool upsampled_left> +inline void DirectionalZone2_8( + uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top_row, + const uint16_t* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep) { + if (height == 4) { + DirectionalZone2_Wx4<upsampled_top, upsampled_left>( + dst, stride, top_row, left_column, width, xstep, ystep); + return; + } + const int upsample_left_shift = static_cast<int>(upsampled_left); + const int upsample_top_shift = static_cast<int>(upsampled_top); + + // Helper vector. + const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7}; + + // Loop increments for moving by block (8x8). This function handles blocks + // with height 4 as well. They are calculated in one pass so these variables + // do not get used. + const ptrdiff_t stride8 = stride << 3; + const int xstep8 = xstep << 3; + const int ystep8 = ystep << 3; + + // All columns from |min_top_only_x| to the right will only need |top_row| to + // compute and can therefore call the Zone1 functions. This assumes |xstep| is + // at least 3. + assert(xstep >= 3); + const int min_top_only_x = std::min((height * xstep) >> 6, width); + + // For steep angles, the source pixels from |left_column| may not fit in a + // 16-byte load for shuffling. + // TODO(petersonab): Find a more precise formula for this subject to x. + const int max_shuffle_height = + std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height); + + // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1 + int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1; + + const int left_base_increment = ystep >> 6; + const int ystep_remainder = ystep & 0x3F; + + const int left_base_increment8 = ystep8 >> 6; + const int ystep_remainder8 = ystep8 & 0x3F; + const int16x8_t increment_left8 = vdupq_n_s16(ystep_remainder8); + + // If the 64 scaling is regarded as a decimal point, the first value of the + // left_y vector omits the portion which is covered under the left_column + // offset. Following values need the full ystep as a relative offset. + int16x8_t left_y = + vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep); + + // This loop treats each set of 4 columns in 3 stages with y-value boundaries. + // The first stage, before the first y-loop, covers blocks that are only + // computed from the top row. The second stage, comprising two y-loops, covers + // blocks that have a mixture of values computed from top or left. The final + // stage covers blocks that are only computed from the left. + int x = 0; + for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8, + xstep_bounds_base -= (8 << 6), + left_y = vsubq_s16(left_y, increment_left8), + left_offset -= left_base_increment8) { + uint8_t* dst_x = dst + x * sizeof(uint16_t); + + // Round down to the nearest multiple of 8. + const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7; + DirectionalZone1_WxH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y, + top_row + (x << upsample_top_shift), -xstep); + + if (max_top_only_y == height) continue; + + int y = max_top_only_y; + dst_x += stride * y; + const int xstep_y = xstep * y; + + // All rows from |min_left_only_y| down for this set of columns only need + // |left_column| to compute. 
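// With illustrative numbers: xstep == 128 at x == 8 gives
// min_left_only_y == std::min(((8 + 8) << 6) / 128, height)
//                 == std::min(8, height),
// so the two mixed top/left loops below stop at y == 8 and rows from there
// down are handled by the left-only loop.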
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height); + // At high angles such that min_left_only_y < 8, ystep is low and xstep is + // high. This means that max_shuffle_height is unbounded and xstep_bounds + // will overflow in 16 bits. This is prevented by stopping the first + // blending loop at min_left_only_y for such cases, which means we skip over + // the second blending loop as well. + const int left_shuffle_stop_y = + std::min(max_shuffle_height, min_left_only_y); + int xstep_bounds = xstep_bounds_base + xstep_y; + int top_x = -xstep - xstep_y; + + for (; y < left_shuffle_stop_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + DirectionalZone2FromLeftCol_8xH( + dst_x, stride, 8, + left_column + ((left_offset + y) << upsample_left_shift), left_y, + upsample_left_shift); + + DirectionalZone1Blend_8xH<upsampled_top>( + dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, + top_x, xstep); + } + + // Pick up from the last y-value, using the slower but secure method for + // left prediction. + for (; y < min_left_only_y; + y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + + DirectionalZone1Blend_8xH<upsampled_top>( + dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds, + top_x, xstep); + } + // Loop over y for left_only rows. + for (; y < height; y += 8, dst_x += stride8) { + DirectionalZone3_8x8<upsampled_left>( + dst_x, stride, left_column + (y << upsample_left_shift), -ystep, + -ystep * x); + } + } + // Reached |min_top_only_x|. + if (x < width) { + DirectionalZone1_WxH<upsampled_top>( + reinterpret_cast<uint16_t*>(dst) + x, stride >> 1, width - x, height, + top_row + (x << upsample_top_shift), -xstep); + } +} + +// At this angle, neither edges are upsampled. +// |min_width| is either 4 or 8. +template <int min_width> +void DirectionalAngle135(uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, + const uint16_t* LIBGAV1_RESTRICT const top, + const uint16_t* LIBGAV1_RESTRICT const left, + const int width, const int height) { + // y = 0 is more trivial than the other rows. + memcpy(dst, top - 1, width * sizeof(top[0])); + dst += stride; + + // If |height| > |width|, then there is a point at which top_row is no longer + // used in each row. + const int min_left_only_y = std::min(width, height); + + int y = 1; + do { + // Example: If y is 4 (min_width), the dest row starts with left[3], + // left[2], left[1], left[0], because the angle points up. Therefore, load + // starts at left[0] and is then reversed. If y is 2, the load starts at + // left[-2], and is reversed to store left[1], left[0], with negative values + // overwritten from |top_row|. + const uint16_t* const load_left = left + y - min_width; + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + + // Some values will be overwritten when |y| is not a multiple of + // |min_width|. + if (min_width == 4) { + const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left)); + vst1_u16(dst16, left_toward_corner); + } else { + int x = 0; + do { + const uint16x8_t left_toward_corner = + vrev64q_u16(vld1q_u16(load_left - x)); + vst1_u16(dst16 + x, vget_high_u16(left_toward_corner)); + vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner)); + x += 8; + } while (x < y); + } + // Entering |top|. 
+ memcpy(dst16 + y, top - 1, (width - y) * sizeof(top[0])); + dst += stride; + } while (++y < min_left_only_y); + + // Left only. + for (; y < height; ++y, dst += stride) { + auto* dst16 = reinterpret_cast<uint16_t*>(dst); + const uint16_t* const load_left = left + y - min_width; + + int x = 0; + if (min_width == 4) { + const uint16x4_t left_toward_corner = vrev64_u16(vld1_u16(load_left - x)); + vst1_u16(dst16 + x, left_toward_corner); + } else { + do { + const uint16x8_t left_toward_corner = + vrev64q_u16(vld1q_u16(load_left - x)); + vst1_u16(dst16 + x, vget_high_u16(left_toward_corner)); + vst1_u16(dst16 + x + 4, vget_low_u16(left_toward_corner)); + x += 8; + } while (x < width); + } + } +} + +void DirectionalIntraPredictorZone2_NEON( + void* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, + const void* LIBGAV1_RESTRICT const top_row, + const void* LIBGAV1_RESTRICT const left_column, const int width, + const int height, const int xstep, const int ystep, + const bool upsampled_top, const bool upsampled_left) { + // Increasing the negative buffer for this function allows more rows to be + // processed at a time without branching in an inner loop to check the base. + uint16_t top_buffer[288]; + uint16_t left_buffer[288]; +#if LIBGAV1_MSAN + memset(top_buffer, 0, sizeof(top_buffer)); + memset(left_buffer, 0, sizeof(left_buffer)); +#endif // LIBGAV1_MSAN + memcpy(top_buffer + 128, static_cast<const uint16_t*>(top_row) - 16, 160); + memcpy(left_buffer + 128, static_cast<const uint16_t*>(left_column) - 16, + 160); + const uint16_t* top_ptr = top_buffer + 144; + const uint16_t* left_ptr = left_buffer + 144; + auto* dst = static_cast<uint8_t*>(dest); + + if (width == 4) { + if (xstep == 64) { + assert(ystep == 64); + DirectionalAngle135<4>(dst, stride, top_ptr, left_ptr, width, height); + return; + } + if (upsampled_top) { + if (upsampled_left) { + DirectionalZone2_4xH<true, true>(dst, stride, top_ptr, left_ptr, height, + xstep, ystep); + } else { + DirectionalZone2_4xH<true, false>(dst, stride, top_ptr, left_ptr, + height, xstep, ystep); + } + } else if (upsampled_left) { + DirectionalZone2_4xH<false, true>(dst, stride, top_ptr, left_ptr, height, + xstep, ystep); + } else { + DirectionalZone2_4xH<false, false>(dst, stride, top_ptr, left_ptr, height, + xstep, ystep); + } + return; + } + + if (xstep == 64) { + assert(ystep == 64); + DirectionalAngle135<8>(dst, stride, top_ptr, left_ptr, width, height); + return; + } + if (upsampled_top) { + if (upsampled_left) { + DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); + } else { + DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); + } + } else if (upsampled_left) { + DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); + } else { + DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width, + height, xstep, ystep); + } +} + void Init10bpp() { Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); assert(dsp != nullptr); dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON; + dsp->directional_intra_predictor_zone2 = DirectionalIntraPredictorZone2_NEON; dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON; } |
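// A scalar model of the DirectionalAngle135 fast path above, where both
// steps are exactly one pixel per row/column and neither edge is upsampled:
// every output pixel is a copy from the up-left diagonal. A sketch for
// reference (PredictAngle135 is not part of this file; |stride| is in
// pixels here, not bytes):
#include <cstddef>
#include <cstdint>

inline void PredictAngle135(uint16_t* dst, ptrdiff_t stride,
                            const uint16_t* top, const uint16_t* left,
                            int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // Positions left of the diagonal read up the left edge; the rest read
      // the top edge starting at top[-1].
      dst[x] = (x < y) ? left[y - 1 - x] : top[x - y - 1];
    }
    dst += stride;
  }
}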