path: root/src/dsp/arm
author     Boyuan Yang <byang@debian.org>  2022-07-14 15:56:57 -0400
committer  Boyuan Yang <byang@debian.org>  2022-07-14 15:56:57 -0400
commit     d4dbf19f6b0181ee78034bfe4caf189d1c016998 (patch)
tree       47d5d28d2ab770a10e6c48788725c51dffeb84a9 /src/dsp/arm
parent     320ef65362608ee1148c299d8d5d7618af34e470 (diff)
download   libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.gz
           libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.bz2
           libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.zip
New upstream version 0.18.0
Diffstat (limited to 'src/dsp/arm')
-rw-r--r--  src/dsp/arm/common_neon.h                    |   52
-rw-r--r--  src/dsp/arm/convolve_10bit_neon.cc           |  224
-rw-r--r--  src/dsp/arm/distance_weighted_blend_neon.cc  |  105
-rw-r--r--  src/dsp/arm/film_grain_neon.cc               |  218
-rw-r--r--  src/dsp/arm/film_grain_neon.h                |    4
-rw-r--r--  src/dsp/arm/intrapred_directional_neon.cc    |  688
-rw-r--r--  src/dsp/arm/intrapred_neon.cc                |   10
-rw-r--r--  src/dsp/arm/intrapred_smooth_neon.cc         |  339
-rw-r--r--  src/dsp/arm/inverse_transform_10bit_neon.cc  |   28
-rw-r--r--  src/dsp/arm/inverse_transform_neon.cc        |  146
-rw-r--r--  src/dsp/arm/loop_filter_10bit_neon.cc        | 1218
-rw-r--r--  src/dsp/arm/loop_filter_neon.cc              | 1298
-rw-r--r--  src/dsp/arm/loop_filter_neon.h               |    1
-rw-r--r--  src/dsp/arm/loop_restoration_neon.cc         |    8
-rw-r--r--  src/dsp/arm/mask_blend_neon.cc               |  375
-rw-r--r--  src/dsp/arm/obmc_neon.cc                     |  523
-rw-r--r--  src/dsp/arm/warp_neon.cc                     |   97
17 files changed, 2702 insertions, 2632 deletions
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index 9c46525..c0af2c1 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
return dst;
}
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+ const ptrdiff_t over_read_in_bytes) {
+ return vreinterpretq_u16_u8(
+ MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
inline uint8x8_t Load1MsanU8(const uint8_t* const source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(vld1_u8(source), over_read_in_bytes);
@@ -325,20 +331,6 @@ inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
}
-inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source,
- const ptrdiff_t over_read_in_bytes) {
- // Relative source index of elements (2 bytes each):
- // dst.val[0]: 00 02 04 06 08 10 12 14
- // dst.val[1]: 01 03 05 07 09 11 13 15
- uint16x8x2_t dst = vld2q_u16(source);
- dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ(
- vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1));
- dst.val[1] = vreinterpretq_u16_u8(
- MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]),
- (over_read_in_bytes >> 1) + (over_read_in_bytes % 4)));
- return dst;
-}
-
inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
const ptrdiff_t over_read_in_bytes) {
return vreinterpretq_u32_u8(MaskOverreadsQ(
@@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) {
vst1q_u16(static_cast<uint16_t*>(buf), val);
}
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+ // The memory shadow is incorrect for vst4q_u16, only marking the first 16
+ // bytes of the destination as initialized. To avoid missing truly
+ // uninitialized memory, check the input vectors first, before marking the
+ // whole 64 bytes initialized. If any input vector contains unused values, it
+ // should pass through MaskOverreadsQ first.
+ __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+ __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+ __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+ __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+ __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif // LIBGAV1_MSAN
+}
+
//------------------------------------------------------------------------------
// Pointer helpers.
@@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
//------------------------------------------------------------------------------
// Saturation helpers.
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+ const int16x4_t high) {
return vmin_s16(vmax_s16(val, low), high);
}
@@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
return vminq_s16(vmaxq_s16(val, low), high);
}
-inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
const int16x8_t low = vdupq_n_s16(0);
const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
@@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
// Output:
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
-inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
int16x8x2_t b0;
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
return b0;
}
-inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
uint16x8x2_t b0;
b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
vreinterpret_u16_u32(vget_low_u32(a1)));
@@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
inline void Transpose4x4(uint16x4_t a[4]) {
// b:
// 00 10 02 12
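
The new MaskOverreadsQ overload and Store4QMsanS16 above exist so that vectors which were deliberately loaded past the last valid pixel can still be handed to MemorySanitizer-instrumented code: the trailing lanes are masked (or checked and unpoisoned) instead of silencing MSAN wholesale. A minimal usage sketch, assuming common_neon.h with the overload above is included; LoadPartialRowU16 is a hypothetical helper, not part of the upstream header:

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// Hypothetical helper: load 8 uint16 lanes when only |valid| (1..8) of them
// carry meaningful data. The buffer must still be safely readable in full;
// masking only keeps the trailing lanes from being used as real data.
inline uint16x8_t LoadPartialRowU16(const uint16_t* src, int valid) {
  const ptrdiff_t over_read_in_bytes =
      static_cast<ptrdiff_t>(8 - valid) * sizeof(uint16_t);
  return MaskOverreadsQ(vld1q_u16(src), over_read_in_bytes);
}
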
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
index b7205df..389f029 100644
--- a/src/dsp/arm/convolve_10bit_neon.cc
+++ b/src/dsp/arm/convolve_10bit_neon.cc
@@ -45,12 +45,12 @@ namespace {
// Pixel output range: [ 0, 1023]
// Compound output range: [ 3988, 61532]
-template <int filter_index>
+template <int num_taps>
int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
const int16x4_t* const taps) {
const auto* ssrc = reinterpret_cast<const int16x8_t*>(src);
int32x4x2_t sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -65,7 +65,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[3]), taps[3]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[4]), taps[4]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -84,7 +84,7 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[5]), taps[5]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[6]), taps[6]);
sum.val[1] = vmlal_s16(sum.val[1], vget_high_s16(ssrc[7]), taps[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum.val[0] = vmull_s16(vget_low_s16(ssrc[0]), taps[0]);
sum.val[0] = vmlal_s16(sum.val[0], vget_low_s16(ssrc[1]), taps[1]);
@@ -106,12 +106,12 @@ int32x4x2_t SumOnePassTaps(const uint16x8_t* const src,
return sum;
}
-template <int filter_index>
+template <int num_taps>
int32x4_t SumOnePassTaps(const uint16x4_t* const src,
const int16x4_t* const taps) {
const auto* ssrc = reinterpret_cast<const int16x4_t*>(src);
int32x4_t sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
// 6 taps.
sum = vmull_s16(ssrc[0], taps[0]);
sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -119,7 +119,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
sum = vmlal_s16(sum, ssrc[3], taps[3]);
sum = vmlal_s16(sum, ssrc[4], taps[4]);
sum = vmlal_s16(sum, ssrc[5], taps[5]);
- } else if (filter_index == 2) {
+ } else if (num_taps == 8) {
// 8 taps.
sum = vmull_s16(ssrc[0], taps[0]);
sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -129,7 +129,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
sum = vmlal_s16(sum, ssrc[5], taps[5]);
sum = vmlal_s16(sum, ssrc[6], taps[6]);
sum = vmlal_s16(sum, ssrc[7], taps[7]);
- } else if (filter_index == 3) {
+ } else if (num_taps == 2) {
// 2 taps.
sum = vmull_s16(ssrc[0], taps[0]);
sum = vmlal_s16(sum, ssrc[1], taps[1]);
@@ -143,7 +143,7 @@ int32x4_t SumOnePassTaps(const uint16x4_t* const src,
return sum;
}
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -162,15 +162,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
const uint16x8_t src_long_hi = vld1q_u16(s + 8);
uint16x8_t v_src[8];
int32x4x2_t v_sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
v_src[4] = vextq_u16(src_long, src_long_hi, 4);
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+ } else if (num_taps == 8) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
@@ -179,17 +179,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
v_src[6] = vextq_u16(src_long, src_long_hi, 6);
v_src[7] = vextq_u16(src_long, src_long_hi, 7);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
- } else if (filter_index == 3) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ } else if (num_taps == 2) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
- } else { // filter_index > 3
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else { // 4 taps
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
}
const int16x4_t d0 =
@@ -213,15 +213,15 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
const uint16x8_t src_long_hi = vld1q_u16(src + x + 8);
uint16x8_t v_src[8];
int32x4x2_t v_sum;
- if (filter_index < 2) {
+ if (num_taps == 6) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
v_src[4] = vextq_u16(src_long, src_long_hi, 4);
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 1);
+ } else if (num_taps == 8) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
@@ -230,17 +230,17 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
v_src[5] = vextq_u16(src_long, src_long_hi, 5);
v_src[6] = vextq_u16(src_long, src_long_hi, 6);
v_src[7] = vextq_u16(src_long, src_long_hi, 7);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap);
- } else if (filter_index == 3) {
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap);
+ } else if (num_taps == 2) {
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
- } else { // filter_index > 3
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
+ } else { // 4 taps
v_src[0] = src_long;
v_src[1] = vextq_u16(src_long, src_long_hi, 1);
v_src[2] = vextq_u16(src_long, src_long_hi, 2);
v_src[3] = vextq_u16(src_long, src_long_hi, 3);
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
}
if (is_compound) {
const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
@@ -276,7 +276,7 @@ void FilterHorizontalWidth8AndUp(const uint16_t* LIBGAV1_RESTRICT src,
} while (--y != 0);
}
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -291,14 +291,14 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
int32x4_t v_sum;
const uint16x8_t src_long = vld1q_u16(src);
v_src[0] = vget_low_u16(src_long);
- if (filter_index == 3) {
+ if (num_taps == 2) {
v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 3);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 3);
} else {
v_src[1] = vget_low_u16(vextq_u16(src_long, v_zero, 1));
v_src[2] = vget_low_u16(vextq_u16(src_long, v_zero, 2));
v_src[3] = vget_low_u16(vextq_u16(src_long, v_zero, 3));
- v_sum = SumOnePassTaps<filter_index>(v_src, v_tap + 2);
+ v_sum = SumOnePassTaps<num_taps>(v_src, v_tap + 2);
}
if (is_compound || is_2d) {
const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
@@ -321,7 +321,7 @@ void FilterHorizontalWidth4(const uint16_t* LIBGAV1_RESTRICT src,
} while (--y != 0);
}
-template <int filter_index, bool is_2d>
+template <int num_taps, bool is_2d>
void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
@@ -336,7 +336,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
const int16x8_t input1 = vreinterpretq_s16_u16(vld1q_u16(src + src_stride));
const int16x8x2_t input = vzipq_s16(input0, input1);
int32x4_t v_sum;
- if (filter_index == 3) {
+ if (num_taps == 2) {
v_sum = vmull_s16(vget_low_s16(input.val[0]), v_tap[3]);
v_sum = vmlal_s16(v_sum,
vget_low_s16(vextq_s16(input.val[0], input.val[1], 2)),
@@ -387,7 +387,7 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
assert(height % 2 == 1);
const int16x8_t input = vreinterpretq_s16_u16(vld1q_u16(src));
int32x4_t v_sum;
- if (filter_index == 3) {
+ if (num_taps == 2) {
v_sum = vmull_s16(vget_low_s16(input), v_tap[3]);
v_sum =
vmlal_s16(v_sum, vget_low_s16(vextq_s16(input, input, 1)), v_tap[4]);
@@ -406,17 +406,17 @@ void FilterHorizontalWidth2(const uint16_t* LIBGAV1_RESTRICT src,
}
}
-template <int filter_index, bool is_compound, bool is_2d>
+template <int num_taps, bool is_compound, bool is_2d>
void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t pred_stride, const int width,
const int height, const int16x4_t* const v_tap) {
- assert(width < 8 || filter_index <= 3);
+ assert(width < 8 || num_taps != 4);
// Don't simplify the redundant if conditions with the template parameters,
// which helps the compiler generate compact code.
- if (width >= 8 && filter_index <= 3) {
- FilterHorizontalWidth8AndUp<filter_index, is_compound, is_2d>(
+ if (width >= 8 && num_taps != 4) {
+ FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>(
src, src_stride, dest, pred_stride, width, height, v_tap);
return;
}
@@ -424,17 +424,17 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
// Horizontal passes only needs to account for number of taps 2 and 4 when
// |width| <= 4.
assert(width <= 4);
- assert(filter_index >= 3 && filter_index <= 5);
- if (filter_index >= 3 && filter_index <= 5) {
+ assert(num_taps == 2 || num_taps == 4);
+ if (num_taps == 2 || num_taps == 4) {
if (width == 4) {
- FilterHorizontalWidth4<filter_index, is_compound, is_2d>(
+ FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
src, src_stride, dest, pred_stride, height, v_tap);
return;
}
assert(width == 2);
if (!is_compound) {
- FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
- pred_stride, height, v_tap);
+ FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
}
}
}
@@ -455,22 +455,17 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
if (filter_index == 2) { // 8 tap.
- FilterHorizontal<2, is_compound, is_2d>(src, src_stride, dst, dst_stride,
+ FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride,
width, height, v_tap);
- } else if (filter_index == 1) { // 6 tap.
- FilterHorizontal<1, is_compound, is_2d>(src + 1, src_stride, dst,
+ } else if (filter_index < 2) { // 6 tap.
+ FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst,
dst_stride, width, height, v_tap);
- } else if (filter_index == 0) { // 6 tap.
- FilterHorizontal<0, is_compound, is_2d>(src + 1, src_stride, dst,
- dst_stride, width, height, v_tap);
- } else if (filter_index == 4) { // 4 tap.
+ } else if ((filter_index & 0x4) != 0) { // 4 tap.
+ // ((filter_index == 4) | (filter_index == 5))
FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
dst_stride, width, height, v_tap);
- } else if (filter_index == 5) { // 4 tap.
- FilterHorizontal<5, is_compound, is_2d>(src + 2, src_stride, dst,
- dst_stride, width, height, v_tap);
} else { // 2 tap.
- FilterHorizontal<3, is_compound, is_2d>(src + 3, src_stride, dst,
+ FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
dst_stride, width, height, v_tap);
}
}
@@ -510,13 +505,12 @@ void ConvolveCompoundHorizontal_NEON(
filter_index);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int width,
const int height, const int16x4_t* const taps) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
auto* const dst16 = static_cast<uint16_t*>(dst);
@@ -555,7 +549,7 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
srcs[next_row] = vld1q_u16(src_x);
src_x += src_stride;
- const int32x4x2_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+ const int32x4x2_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
if (is_compound) {
const int16x4_t v_compound_offset = vdup_n_s16(kCompoundOffset);
const int16x4_t d0 =
@@ -593,13 +587,12 @@ void FilterVertical(const uint16_t* LIBGAV1_RESTRICT const src,
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
+template <int num_taps, bool is_compound = false>
void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int height,
const int16x4_t* const taps) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -633,8 +626,8 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
srcs[num_taps] = vld1_u16(src);
src += src_stride;
- const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
- const int32x4_t v_sum_1 = SumOnePassTaps<filter_index>(srcs + 1, taps);
+ const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
+ const int32x4_t v_sum_1 = SumOnePassTaps<num_taps>(srcs + 1, taps);
if (is_compound) {
const int16x4_t d0 = vqrshrn_n_s32(v_sum, kInterRoundBitsHorizontal - 1);
const int16x4_t d1 =
@@ -673,13 +666,12 @@ void FilterVertical4xH(const uint16_t* LIBGAV1_RESTRICT src,
} while (y != 0);
}
-template <int filter_index>
+template <int num_taps>
void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
void* LIBGAV1_RESTRICT const dst,
const ptrdiff_t dst_stride, const int height,
const int16x4_t* const taps) {
- const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -718,7 +710,7 @@ void FilterVertical2xH(const uint16_t* LIBGAV1_RESTRICT src,
src += src_stride;
srcs[next_row] = vext_u16(srcs[next_row - 1], srcs[num_taps], 2);
- const int32x4_t v_sum = SumOnePassTaps<filter_index>(srcs, taps);
+ const int32x4_t v_sum = SumOnePassTaps<num_taps>(srcs, taps);
const uint16x4_t d0 =
vmin_u16(vqrshrun_n_s32(v_sum, kFilterBits - 1), v_max_bitdepth);
Store2<0>(dst16, d0);
@@ -1180,13 +1172,13 @@ void ConvolveVertical_NEON(
if (filter_index == 0) { // 6 tap.
if (width == 2) {
- FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else if (width == 4) {
- FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else {
- FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps + 1);
}
} else if ((static_cast<int>(filter_index == 1) &
@@ -1196,33 +1188,33 @@ void ConvolveVertical_NEON(
static_cast<int>(vertical_filter_id == 9) |
static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap.
if (width == 2) {
- FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else if (width == 4) {
- FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height,
taps + 1);
} else {
- FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
taps + 1);
}
} else if (filter_index == 2) { // 8 tap.
if (width == 2) {
- FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
} else {
- FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
taps);
}
} else if (filter_index == 3) { // 2 tap.
if (width == 2) {
- FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height,
taps + 3);
} else if (width == 4) {
- FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height,
taps + 3);
} else {
- FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps + 3);
}
} else {
@@ -1240,13 +1232,13 @@ void ConvolveVertical_NEON(
// treating it as though it has 4.
if (filter_index == 1) src += src_stride;
if (width == 2) {
- FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
taps + 2);
} else if (width == 4) {
- FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
taps + 2);
} else {
- FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps + 2);
}
}
@@ -1274,10 +1266,10 @@ void ConvolveCompoundVertical_NEON(
if (filter_index == 0) { // 6 tap.
if (width == 4) {
- FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 1);
} else {
- FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 1);
}
} else if ((static_cast<int>(filter_index == 1) &
@@ -1287,26 +1279,26 @@ void ConvolveCompoundVertical_NEON(
static_cast<int>(vertical_filter_id == 9) |
static_cast<int>(vertical_filter_id == 15))) != 0) { // 6 tap.
if (width == 4) {
- FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 1);
} else {
- FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 1);
}
} else if (filter_index == 2) { // 8 tap.
if (width == 4) {
- FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps);
} else {
- FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
}
} else if (filter_index == 3) { // 2 tap.
if (width == 4) {
- FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 3);
} else {
- FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 3);
}
} else {
@@ -1323,10 +1315,10 @@ void ConvolveCompoundVertical_NEON(
// treating it as though it has 4.
if (filter_index == 1) src += src_stride;
if (width == 4) {
- FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 2);
} else {
- FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 2);
}
}
@@ -1980,7 +1972,7 @@ inline void ConvolveKernelHorizontal2Tap(
PermuteSrcVals(src_bytes, src_lookup[1])};
vst1_s16(intermediate,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src, taps),
kInterRoundBitsHorizontal - 1));
src_y = AddByteStride(src_y, src_stride);
intermediate += kIntermediateStride;
@@ -2034,13 +2026,12 @@ inline void ConvolveKernelHorizontal2Tap(
const uint16x4_t src_high[2] = {vget_high_u16(src[0]),
vget_high_u16(src[1])};
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/3>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/2>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
@@ -2123,7 +2114,7 @@ inline void ConvolveKernelHorizontalPositive4Tap(
PermuteSrcVals(src_bytes, src_lookup[3])};
vst1_s16(intermediate,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/5>(src, taps),
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
kInterRoundBitsHorizontal - 1));
src_y = AddByteStride(src_y, src_stride);
intermediate += kIntermediateStride;
@@ -2202,7 +2193,7 @@ inline void ConvolveKernelHorizontalSigned4Tap(
PermuteSrcVals(src_bytes, src_lookup[3])};
vst1_s16(intermediate,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/4>(src, taps),
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/4>(src, taps),
kInterRoundBitsHorizontal - 1));
src_y = AddByteStride(src_y, src_stride);
intermediate += kIntermediateStride;
@@ -2297,13 +2288,12 @@ inline void ConvolveKernelHorizontalSigned6Tap(
src_high[i] = vget_high_u16(src_i);
}
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
@@ -2401,13 +2391,12 @@ inline void ConvolveKernelHorizontalMixed6Tap(
src_high[i] = vget_high_u16(src_i);
}
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/0>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/6>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
@@ -2505,13 +2494,12 @@ inline void ConvolveKernelHorizontalSigned8Tap(
src_high[i] = vget_high_u16(src_i);
}
- vst1_s16(intermediate_x, vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(
- src_low, taps_low),
- kInterRoundBitsHorizontal - 1));
- vst1_s16(
- intermediate_x + 4,
- vrshrn_n_s32(SumOnePassTaps</*filter_index=*/2>(src_high, taps_high),
- kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_low, taps_low),
+ kInterRoundBitsHorizontal - 1));
+ vst1_s16(intermediate_x + 4,
+ vrshrn_n_s32(SumOnePassTaps</*num_taps=*/8>(src_high, taps_high),
+ kInterRoundBitsHorizontal - 1));
// Avoid right shifting the stride.
src_x = AddByteStride(src_x, src_stride);
intermediate_x += kIntermediateStride;
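
The convolve changes above fold the tap count directly into the template parameter instead of the filter index, so the dispatch in DoHorizontalPass and the vertical entry points now spells out the mapping that the removed GetNumTapsInFilter calls resolved at run time: filter indices 0 and 1 use 6 taps, index 2 uses 8, index 3 uses 2, and indices 4 and 5 use 4. A compile-time restatement of that mapping, for reference only:

// Filter-index to tap-count mapping implied by the dispatch above.
constexpr int NumTapsForFilterIndex(int filter_index) {
  return (filter_index == 2)                        ? 8
         : (filter_index == 3)                      ? 2
         : (filter_index == 4 || filter_index == 5) ? 4
                                                    : 6;  // indices 0 and 1
}
static_assert(NumTapsForFilterIndex(0) == 6, "6-tap filters");
static_assert(NumTapsForFilterIndex(2) == 8, "8-tap filter");
static_assert(NumTapsForFilterIndex(3) == 2, "2-tap filter");
static_assert(NumTapsForFilterIndex(5) == 4, "4-tap filters");
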
diff --git a/src/dsp/arm/distance_weighted_blend_neon.cc b/src/dsp/arm/distance_weighted_blend_neon.cc
index 7d287c8..6087276 100644
--- a/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -36,44 +36,48 @@ constexpr int kInterPostRoundBit = 4;
namespace low_bitdepth {
namespace {
-inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+inline uint8x8_t ComputeWeightedAverage8(const int16x8_t pred0,
const int16x8_t pred1,
- const int16x4_t weights[2]) {
- // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
- const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
- const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
- const int32x4_t blended_lo =
- vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
- const int32x4_t blended_hi =
- vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
-
- return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
- vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
+ const int16x8_t weight) {
+ // Given: p0,p1 in range [-5132,9212] and w0 = 16 - w1, w1 = 16 - w0
+ // Output: (p0 * w0 + p1 * w1 + 128(=rounding bit)) >>
+ // 8(=kInterPostRoundBit + 4)
+ // The formula is manipulated to avoid lengthening to 32 bits.
+ // p0 * w0 + p1 * w1 = p0 * w0 + (16 - w0) * p1
+ // = (p0 - p1) * w0 + 16 * p1
+ // Maximum value of p0 - p1 is 9212 + 5132 = 0x3808.
+ const int16x8_t diff = vsubq_s16(pred0, pred1);
+ // (((p0 - p1) * (w0 << 11) << 1) >> 16) + ((16 * p1) >> 4)
+ const int16x8_t weighted_diff = vqdmulhq_s16(diff, weight);
+ // ((p0 - p1) * w0 >> 4) + p1
+ const int16x8_t upscaled_average = vaddq_s16(weighted_diff, pred1);
+ // (((p0 - p1) * w0 >> 4) + p1 + (128 >> 4)) >> 4
+ return vqrshrun_n_s16(upscaled_average, kInterPostRoundBit);
}
-template <int width, int height>
+template <int width>
inline void DistanceWeightedBlendSmall_NEON(
const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT prediction_1, const int height,
+ const int16x8_t weight, void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
constexpr int step = 16 / width;
- for (int y = 0; y < height; y += step) {
+ int y = height;
+ do {
const int16x8_t src_00 = vld1q_s16(prediction_0);
const int16x8_t src_10 = vld1q_s16(prediction_1);
prediction_0 += 8;
prediction_1 += 8;
- const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
+ const uint8x8_t result0 = ComputeWeightedAverage8(src_00, src_10, weight);
const int16x8_t src_01 = vld1q_s16(prediction_0);
const int16x8_t src_11 = vld1q_s16(prediction_1);
prediction_0 += 8;
prediction_1 += 8;
- const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
+ const uint8x8_t result1 = ComputeWeightedAverage8(src_01, src_11, weight);
- const uint8x8_t result0 = vqmovun_s16(res0);
- const uint8x8_t result1 = vqmovun_s16(res1);
if (width == 4) {
StoreLo4(dst, result0);
dst += dest_stride;
@@ -90,12 +94,13 @@ inline void DistanceWeightedBlendSmall_NEON(
vst1_u8(dst, result1);
dst += dest_stride;
}
- }
+ y -= step;
+ } while (y != 0);
}
inline void DistanceWeightedBlendLarge_NEON(
const int16_t* LIBGAV1_RESTRICT prediction_0,
- const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x4_t weights[2],
+ const int16_t* LIBGAV1_RESTRICT prediction_1, const int16x8_t weight,
const int width, const int height, void* LIBGAV1_RESTRICT const dest,
const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
@@ -106,16 +111,15 @@ inline void DistanceWeightedBlendLarge_NEON(
do {
const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
- const int16x8_t res_lo =
- ComputeWeightedAverage8(src0_lo, src1_lo, weights);
+ const uint8x8_t res_lo =
+ ComputeWeightedAverage8(src0_lo, src1_lo, weight);
const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
- const int16x8_t res_hi =
- ComputeWeightedAverage8(src0_hi, src1_hi, weights);
+ const uint8x8_t res_hi =
+ ComputeWeightedAverage8(src0_hi, src1_hi, weight);
- const uint8x16_t result =
- vcombine_u8(vqmovun_s16(res_lo), vqmovun_s16(res_hi));
+ const uint8x16_t result = vcombine_u8(res_lo, res_hi);
vst1q_u8(dst + x, result);
x += 16;
} while (x < width);
@@ -128,52 +132,25 @@ inline void DistanceWeightedBlendLarge_NEON(
inline void DistanceWeightedBlend_NEON(
const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1, const uint8_t weight_0,
- const uint8_t weight_1, const int width, const int height,
+ const uint8_t /*weight_1*/, const int width, const int height,
void* LIBGAV1_RESTRICT const dest, const ptrdiff_t dest_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
- // TODO(johannkoenig): Investigate the branching. May be fine to call with a
- // variable height.
+ // Upscale the weight for vqdmulh.
+ const int16x8_t weight = vdupq_n_s16(weight_0 << 11);
if (width == 4) {
- if (height == 4) {
- DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
- dest_stride);
- } else if (height == 8) {
- DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
- dest_stride);
- } else {
- assert(height == 16);
- DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
- dest_stride);
- }
+ DistanceWeightedBlendSmall_NEON<4>(pred_0, pred_1, height, weight, dest,
+ dest_stride);
return;
}
if (width == 8) {
- switch (height) {
- case 4:
- DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
- dest_stride);
- return;
- case 8:
- DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
- dest_stride);
- return;
- case 16:
- DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
- dest_stride);
- return;
- default:
- assert(height == 32);
- DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
- dest_stride);
-
- return;
- }
+ DistanceWeightedBlendSmall_NEON<8>(pred_0, pred_1, height, weight, dest,
+ dest_stride);
+ return;
}
- DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+ DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weight, width, height, dest,
dest_stride);
}
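
ComputeWeightedAverage8 above stays entirely in 16 bits by exploiting w0 + w1 == 16: the blend is rewritten as (p0 - p1) * w0 + 16 * p1, and vqdmulhq_s16 against w0 << 11 returns the product already shifted right by 4 (vqdmulhq_s16(a, b) computes (a * b * 2) >> 16). A scalar walk-through of the same steps, assuming kInterPostRoundBit == 4 as in this file and an arithmetic right shift of negative values (matching the NEON behavior):

#include <algorithm>
#include <cstdint>

// Scalar restatement of ComputeWeightedAverage8:
//   (p0 * w0 + p1 * w1 + 128) >> 8 with w1 == 16 - w0, rewritten as
//   ((p0 - p1) * w0 >> 4) + p1, then a rounding shift by 4 and a clamp.
inline uint8_t WeightedBlendScalar(int16_t p0, int16_t p1, int w0) {
  const int diff = p0 - p1;                         // vsubq_s16
  const int weighted_diff = (diff * w0) >> 4;       // vqdmulhq_s16 vs w0 << 11
  const int upscaled_average = weighted_diff + p1;  // vaddq_s16
  const int rounded = (upscaled_average + 8) >> 4;  // vqrshrun_n_s16 by 4
  return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
}
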
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 0b1b481..76e1151 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -18,23 +18,21 @@
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
-#include <new>
#include "src/dsp/arm/common_neon.h"
-#include "src/dsp/arm/film_grain_neon.h"
-#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/film_grain_common.h"
+#include "src/utils/array_2d.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
-#include "src/utils/logging.h"
+#include "src/utils/constants.h"
#include "src/utils/memory.h"
+#include "src/utils/types.h"
namespace libgav1 {
namespace dsp {
@@ -52,10 +50,8 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) {
return ZeroExtend(vld1_u8(src));
}
-inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return ZeroExtend(Load1MsanU8(src, 0));
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int valid_range) {
+ return ZeroExtend(Load1MsanU8(src, 8 - valid_range));
}
inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
@@ -69,11 +65,8 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) {
return vreinterpretq_s16_u16(vld1q_u16(src));
}
-inline int16x8_t GetSignedSource8Msan(const uint16_t* src,
- int /*valid_range*/) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return vreinterpretq_s16_u16(Load1QMsanU16(src, 0));
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src, int valid_range) {
+ return vreinterpretq_s16_u16(Load1QMsanU16(src, 16 - valid_range));
}
inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
@@ -198,17 +191,13 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
}
inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
- int subsampling_x, int /*valid_range*/) {
+ int subsampling_x, int valid_range) {
if (subsampling_x != 0) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call
- // sites causing test vector failures.
- const uint8x16_t src = Load1QMsanU8(luma, 0);
-
+ const uint8x16_t src = MaskOverreadsQ(vld1q_u8(luma), 16 - valid_range);
+ // MemorySanitizer registers vpaddlq_u8 as a use of the memory.
return vrshrq_n_u16(vpaddlq_u8(src), 1);
}
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return vmovl_u8(Load1MsanU8(luma, 0));
+ return MaskOverreadsQ(vmovl_u8(vld1_u8(luma)), 16 - valid_range);
}
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -252,16 +241,13 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
}
inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
- int subsampling_x, int /*valid_range*/) {
+ int subsampling_x, int valid_range) {
if (subsampling_x != 0) {
- // TODO(b/194217060): restore |valid_range| usage after correcting call
- // sites causing test vector failures.
- const uint16x8x2_t src = Load2QMsanU16(luma, 0);
- return vrhaddq_u16(src.val[0], src.val[1]);
+ const uint16x8x2_t src = vld2q_u16(luma);
+ const uint16x8_t result = vrhaddq_u16(src.val[0], src.val[1]);
+ return MaskOverreadsQ(result, 16 - valid_range);
}
- // TODO(b/194217060): restore |valid_range| usage after correcting call sites
- // causing test vector failures.
- return Load1QMsanU16(luma, 0);
+ return Load1QMsanU16(luma, 16 - valid_range);
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
@@ -614,8 +600,7 @@ void InitializeScalingLookupTable_NEON(int num_points,
}
static_assert(sizeof(scaling_lut[0]) == 2, "");
Memset(scaling_lut, point_scaling[0],
- std::max(static_cast<int>(point_value[0]), 1)
- << (bitdepth - kBitdepth8));
+ (static_cast<int>(point_value[0]) + 1) << (bitdepth - kBitdepth8));
const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
const int32x4_t rounding = vdupq_n_s32(32768);
for (int i = 0; i < num_points - 1; ++i) {
@@ -666,7 +651,7 @@ void InitializeScalingLookupTable_NEON(int num_points,
const int16x8x4_t result = {
start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
- vst4q_s16(&scaling_lut[x_base], result);
+ Store4QMsanS16(&scaling_lut[x_base], result);
} else {
vst1q_s16(&scaling_lut[x_base], full_interp);
}
@@ -696,13 +681,29 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
}
template <int bitdepth, typename Pixel>
-inline int16x8_t GetScalingFactors(
- const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+ const Pixel* source) {
int16_t start_vals[8];
static_assert(bitdepth <= kBitdepth10,
"NEON Film Grain is not yet implemented for 12bpp.");
+#if LIBGAV1_MSAN
+ memset(start_vals, 0, sizeof(start_vals));
+#endif
for (int i = 0; i < 8; ++i) {
- assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+ assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
+ start_vals[i] = scaling_lut[source[i]];
+ }
+ return vld1q_s16(start_vals);
+}
+
+template <int bitdepth, typename Pixel>
+inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
+ const Pixel* source, const int valid_range) {
+ int16_t start_vals[8];
+ static_assert(bitdepth <= kBitdepth10,
+ "NEON Film Grain is not yet implemented for 12bpp.");
+ for (int i = 0; i < valid_range; ++i) {
+ assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
start_vals[i] = scaling_lut[source[i]];
}
return vld1q_s16(start_vals);
@@ -743,10 +744,11 @@ void BlendNoiseWithImageLuma_NEON(
const int16x8_t scaling_shift_vect = vdupq_n_s16(
(bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
+ const int safe_width = width & ~15;
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_width; x += 8) {
// This operation on the unsigned input is safe in 8bpp because the vector
// is widened before it is reinterpreted.
const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
@@ -767,8 +769,8 @@ void BlendNoiseWithImageLuma_NEON(
// This operation on the unsigned input is safe in 8bpp because the vector
// is widened before it is reinterpreted.
const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
- const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>(
- scaling_lut_y, &in_y_row[std::min(x, width)]);
+ const int16x8_t scaling1 =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
@@ -778,8 +780,41 @@ void BlendNoiseWithImageLuma_NEON(
// function for just that case, though the gain would be very small.
StoreUnsigned8(&out_y_row[x],
vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
- x += 8;
- } while (x < width);
+ }
+
+ if (x < width) {
+ assert(width - x < 16);
+ if (x < width - 8) {
+ const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+ const int16x8_t combined = vaddq_s16(orig, noise);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can
+ // replace clipping with vqmovun_s16, but it's not likely to be worth
+ // copying the function for just that case, though the gain would be
+ // very small.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ x += 8;
+ }
+ const int valid_range_pixels = width - x;
+ const int valid_range_bytes = (width - x) * sizeof(in_y_row[0]);
+ const int16x8_t orig =
+ GetSignedSource8Msan(&in_y_row[x], valid_range_bytes);
+ const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut_y, &in_y_row[x], valid_range_pixels);
+ int16x8_t noise =
+ GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
+
+ const int16x8_t combined = vaddq_s16(orig, noise);
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ }
in_y_row += source_stride_y;
out_y_row += dest_stride_y;
} while (++y < height);
@@ -787,13 +822,9 @@ void BlendNoiseWithImageLuma_NEON(
template <int bitdepth, typename GrainType, typename Pixel>
inline int16x8_t BlendChromaValsWithCfl(
- const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
- const int16_t* LIBGAV1_RESTRICT scaling_lut,
const Pixel* LIBGAV1_RESTRICT chroma_cursor,
const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
- const int16x8_t scaling_shift_vect) {
- const int16x8_t scaling =
- GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const int16x8_t scaling, const int16x8_t scaling_shift_vect) {
const int16x8_t orig = GetSignedSource8(chroma_cursor);
int16x8_t noise = GetSignedSource8(noise_image_cursor);
noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
@@ -812,7 +843,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
const int16x8_t floor = vdupq_n_s16(min_value);
const int16x8_t ceiling = vdupq_n_s16(max_chroma);
Pixel luma_buffer[16];
- memset(luma_buffer, 0, sizeof(luma_buffer));
// In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
// for 16 bit signed integers. In higher bitdepths, however, we have to
// expand to 32 to protect the sign bit.
@@ -831,40 +861,45 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const uint16x8_t average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
StoreUnsigned8(average_luma_buffer, average_luma);
+ const int16x8_t scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
const int16x8_t blended =
BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
- average_luma_buffer, scaling_lut, &in_chroma_row[x],
- &(noise_image[y + start_height][x]), scaling_shift_vect);
+ &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+ scaling_shift_vect);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
// function for just that case.
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
- x += 8;
- } while (x < safe_chroma_width);
+ }
if (x < chroma_width) {
const int luma_x = x << subsampling_x;
const int valid_range_pixels = width - luma_x;
+ const int valid_range_chroma_pixels = chroma_width - x;
const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
luma_buffer[valid_range_pixels] = in_y_row[width - 1];
const uint16x8_t average_luma = GetAverageLumaMsan(
- luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]));
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1);
StoreUnsigned8(average_luma_buffer, average_luma);
+ const int16x8_t scaling = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut, average_luma_buffer, valid_range_chroma_pixels);
const int16x8_t blended =
BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
- average_luma_buffer, scaling_lut, &in_chroma_row[x],
- &(noise_image[y + start_height][x]), scaling_shift_vect);
+ &in_chroma_row[x], &(noise_image[y + start_height][x]), scaling,
+ scaling_shift_vect);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
// function for just that case.
@@ -915,7 +950,8 @@ inline int16x8_t BlendChromaValsNoCfl(
const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
- const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
+ const int16x8_t& offset, int luma_multiplier, int chroma_multiplier,
+ bool restrict_scaling_lookup, int valid_range_pixels = 0) {
uint8_t merged_buffer[8];
const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
@@ -925,8 +961,12 @@ inline int16x8_t BlendChromaValsNoCfl(
// 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required.
const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
vst1_u8(merged_buffer, merged);
+
const int16x8_t scaling =
- GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
+ restrict_scaling_lookup
+ ? GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer,
+ valid_range_pixels)
+ : GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
int16x8_t noise = GetSignedSource8(noise_image_cursor);
noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
return vaddq_s16(orig, noise);
@@ -952,34 +992,28 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
const int chroma_width = (width + subsampling_x) >> subsampling_x;
const int safe_chroma_width = chroma_width & ~7;
uint8_t luma_buffer[16];
-#if LIBGAV1_MSAN
- // Quiet msan warnings.
- memset(luma_buffer, 0, sizeof(luma_buffer));
-#endif
const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
start_height >>= subsampling_y;
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
- const int valid_range = width - luma_x;
+ const int valid_range_chroma_pixels = chroma_width - x;
const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
- const int16x8_t average_luma = vreinterpretq_s16_u16(
- GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range));
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ &in_y_row[luma_x], subsampling_x, valid_range_chroma_pixels << 1));
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/false);
// In 8bpp, when params_.clip_to_restricted_range == false, we can
// replace clipping with vqmovun_s16, but the gain would be small.
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-
- x += 8;
- } while (x < safe_chroma_width);
+ }
if (x < chroma_width) {
// Begin right edge iteration. Same as the normal iterations, but the
@@ -988,19 +1022,20 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
const int luma_x = x << subsampling_x;
const int valid_range_pixels = width - luma_x;
const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
luma_buffer[valid_range_pixels] = in_y_row[width - 1];
- const int valid_range_chroma_bytes =
- (chroma_width - x) * sizeof(in_chroma_row[0]);
+ const int valid_range_chroma_pixels = chroma_width - x;
const int16x8_t orig_chroma =
- GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+ GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_pixels);
const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
- luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/true,
+ valid_range_chroma_pixels);
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
// End of right edge iteration.
@@ -1267,7 +1302,8 @@ inline int16x8_t BlendChromaValsNoCfl(
const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
- const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) {
+ const int32x4_t& offset, int luma_multiplier, int chroma_multiplier,
+ bool restrict_scaling_lookup, int valid_range_pixels = 0) {
uint16_t merged_buffer[8];
const int32x4_t weighted_luma_low =
vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
@@ -1287,7 +1323,11 @@ inline int16x8_t BlendChromaValsNoCfl(
vst1q_u16(merged_buffer,
vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
const int16x8_t scaling =
- GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer);
+ restrict_scaling_lookup
+ ? GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer,
+ valid_range_pixels)
+ : GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut,
+ merged_buffer);
const int16x8_t noise = GetSignedSource8(noise_image_cursor);
const int16x8_t scaled_noise =
ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
@@ -1311,11 +1351,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
const int chroma_width = (width + subsampling_x) >> subsampling_x;
const int safe_chroma_width = chroma_width & ~7;
uint16_t luma_buffer[16];
-#if LIBGAV1_MSAN
- // TODO(b/194217060): This can be removed if the range calculations below are
- // fixed.
- memset(luma_buffer, 0, sizeof(luma_buffer));
-#endif
// Offset is added before downshifting in order to take advantage of
// saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));
@@ -1324,7 +1359,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
int y = 0;
do {
int x = 0;
- do {
+ for (; x + 8 <= safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
const int16x8_t average_luma = vreinterpretq_s16_u16(
GetAverageLuma(&in_y_row[luma_x], subsampling_x));
@@ -1332,12 +1367,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/false);
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
-
- x += 8;
- } while (x < safe_chroma_width);
+ }
if (x < chroma_width) {
// Begin right edge iteration. Same as the normal iterations, but the
@@ -1346,19 +1379,22 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
const int luma_x = x << subsampling_x;
const int valid_range_pixels = width - luma_x;
const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ assert(valid_range_pixels < 16);
memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const int valid_range_chroma_pixels = chroma_width - x;
const int valid_range_chroma_bytes =
(chroma_width - x) * sizeof(in_chroma_row[0]);
const int16x8_t orig_chroma =
GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
- luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+ luma_buffer, subsampling_x, valid_range_chroma_pixels << 1));
const int16x8_t blended = BlendChromaValsNoCfl(
scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
- chroma_multiplier);
+ chroma_multiplier, /*restrict_scaling_lookup=*/true,
+ valid_range_chroma_pixels);
StoreUnsigned8(&out_chroma_row[x],
vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
// End of right edge iteration.
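The edge handling above replaces the old blanket memset with explicit valid-range bookkeeping. A simplified standalone sketch of the padding idea (names and the fixed buffer size are illustrative):

#include <cstdint>
#include <cstring>
// Copy the in-range luma tail into a small stack buffer and replicate the
// last sample once. The pairwise luma averaging for the chroma positions that
// are actually kept can touch at most one sample past the valid range; lanes
// beyond the valid chroma count are masked by the *Msan load helpers instead
// of being zero-filled up front.
inline void PadLumaTail(const uint16_t* in_y_row, int width, int luma_x,
                        uint16_t luma_buffer[16]) {
  const int valid_range_pixels = width - luma_x;  // < 16 at the right edge.
  memcpy(luma_buffer, &in_y_row[luma_x],
         valid_range_pixels * sizeof(in_y_row[0]));
  luma_buffer[valid_range_pixels] = in_y_row[width - 1];
}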
@@ -1442,10 +1478,8 @@ void Init10bpp() {
dsp->film_grain.initialize_scaling_lut =
InitializeScalingLookupTable_NEON<kBitdepth10>;
- // TODO(b/194442742): reenable this function after segfault under armv7 ASan
- // is fixed.
- // dsp->film_grain.blend_noise_luma =
- // BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
dsp->film_grain.blend_noise_chroma[1] =
BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
diff --git a/src/dsp/arm/film_grain_neon.h b/src/dsp/arm/film_grain_neon.h
index 3ba2eef..09596e2 100644
--- a/src/dsp/arm/film_grain_neon.h
+++ b/src/dsp/arm/film_grain_neon.h
@@ -39,9 +39,7 @@ void FilmGrainInit_NEON();
#define LIBGAV1_Dsp8bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp10bpp_FilmGrainInitializeScalingLutFunc LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
-// TODO(b/194442742): reenable this function after segfault under armv7 ASan is
-// fixed.
-// #define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_NEON
#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_NEON
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index 3cad4a6..e9bdcf0 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -505,20 +505,12 @@ inline void DirectionalZone1Blend_WxH(
} while (++y < height);
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-// 7.11.2.4 (8) 90 < angle > 180
-// The strategy for these functions (4xH and 8+xH) is to know how many blocks
-// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
-// then handle only blocks that take from |left_ptr|. Additionally, a fast
-// index-shuffle approach is used for pred values from |left_column| in sections
-// that permit it.
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for these functions (4xH and 8+xH) is to know how many blocks
+// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
+// then handle only blocks that take from |left_ptr|. Additionally, a fast
+// index-shuffle approach is used for pred values from |left_column| in
+// sections that permit it.
inline void DirectionalZone2_4xH(
uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
const uint8_t* LIBGAV1_RESTRICT const top_row,
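The comment above describes a per-strip split of the rows into top-only, mixed, and left-only ranges. A standalone sketch of that split for a strip of |w| columns starting at |x| (w is 4 or 8 in these kernels, and the real code additionally rounds these bounds to its block height):

#include <algorithm>
// Rows before |top_only_end| read only |top_row|; rows from |left_only_begin|
// on read only |left_column|; rows in between blend both sources.
struct Zone2RowSplit {
  int top_only_end;
  int left_only_begin;
};
inline Zone2RowSplit SplitZone2Rows(int x, int w, int height, int xstep) {
  Zone2RowSplit split;
  // Column x still maps into top_row while (y + 1) * xstep <= (x + 1) * 64.
  split.top_only_end = std::min(((x + 1) << 6) / xstep, height);
  // Once even the rightmost column of the strip maps left of the top row,
  // every sample comes from left_column.
  split.left_only_begin = std::min(((x + w) << 6) / xstep, height);
  return split;
}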
@@ -544,13 +536,6 @@ inline void DirectionalZone2_4xH(
assert(xstep >= 3);
const int min_top_only_x = std::min((height * xstep) >> 6, /* width */ 4);
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- // TODO(johannkoenig): Revisit this for |width| == 4.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
-
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -569,9 +554,9 @@ inline void DirectionalZone2_4xH(
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
if (min_top_only_x > 0) {
- // Round down to the nearest multiple of 8.
- // TODO(johannkoenig): This never hits for Wx4 blocks but maybe it should.
- const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
DirectionalZone1_WxH<4>(dst, stride, max_top_only_y, top_row, -xstep,
upsampled_top);
@@ -584,18 +569,11 @@ inline void DirectionalZone2_4xH(
// All rows from |min_left_only_y| down for this set of columns only need
// |left_column| to compute.
const int min_left_only_y = std::min((4 << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
int xstep_bounds = xstep_bounds_base + xstep_y;
int top_x = -xstep - xstep_y;
// +8 increment is OK because if height is 4 this only goes once.
- for (; y < left_shuffle_stop_y;
+ for (; y < min_left_only_y;
y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
DirectionalZone2FromLeftCol_WxH<4>(
dst, stride, min_height,
@@ -607,21 +585,8 @@ inline void DirectionalZone2_4xH(
upsample_top_shift);
}
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
- for (; y < min_left_only_y;
- y += 8, dst += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_WxH<4>(
- dst, stride, min_height,
- left_column + ((y - left_base_increment) << upsample_left_shift),
- base_left_y, -ystep, upsample_left_shift);
-
- DirectionalZone1Blend_WxH<4>(dst, stride, min_height, top_row,
- xstep_bounds, top_x, xstep,
- upsample_top_shift);
- }
// Loop over y for left_only rows.
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
for (; y < height; y += 8, dst += stride8) {
DirectionalZone3_WxH<4>(
dst, stride, min_height,
@@ -634,34 +599,88 @@ inline void DirectionalZone2_4xH(
}
}
-// Process a multiple of 8 |width|.
-inline void DirectionalZone2_8(
+template <bool shuffle_left_column>
+inline void DirectionalZone2_8xH(
uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
const uint8_t* LIBGAV1_RESTRICT const top_row,
- const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
- const int height, const int xstep, const int ystep,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y,
const bool upsampled_top, const bool upsampled_left) {
const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
- // Helper vector.
- const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
-
 // Loop incrementers for moving by block (8x8). This function also handles
 // blocks with height 4; those are computed in a single pass, so these
 // increments go unused in that case.
const ptrdiff_t stride8 = stride << 3;
const int xstep8 = xstep << 3;
- const int ystep8 = ystep << 3;
- // Process Wx4 blocks.
+ // Cover 8x4 case.
const int min_height = (height == 4) ? 4 : 8;
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute and can therefore call the Zone1 functions. This assumes |xstep| is
- // at least 3.
- assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
+ DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep,
+ upsampled_top);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsample_left_shift);
+ } else {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+
+ DirectionalZone1Blend_WxH<8>(
+ dst_x, stride, min_height, top_row + (x << upsample_top_shift),
+ xstep_bounds, top_x, xstep, upsample_top_shift);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_WxH<8>(
+ dst_x, stride, min_height,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep, upsample_left_shift);
+ }
+}
+
+// Process a multiple of 8 |width|.
+inline void DirectionalZone2_WxH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint8_t* LIBGAV1_RESTRICT const top_row,
+ const uint8_t* LIBGAV1_RESTRICT const left_column, const int width,
+ const int height, const int xstep, const int ystep,
+ const bool upsampled_top, const bool upsampled_left) {
+ const int ystep8 = ystep << 3;
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -677,90 +696,43 @@ inline void DirectionalZone2_8(
// left_y vector omits the portion which is covered under the left_column
// offset. Following values need the full ystep as a relative offset.
const int16x8_t remainder = vdupq_n_s16(-ystep_remainder);
+ const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
int16x8_t left_y = vmlaq_n_s16(remainder, zero_to_seven, -ystep);
+ // For ystep > 90, at least two sets of 8 columns can be fully computed from
+ // top_row only.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
+
// This loop treats each set of 4 columns in 3 stages with y-value boundaries.
// The first stage, before the first y-loop, covers blocks that are only
// computed from the top row. The second stage, comprising two y-loops, covers
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
int x = 0;
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling. |d| represents the number of pixels that can
- // fit in one contiguous vector when stepping by |ystep|. For a given x
- // position, the left column values can be obtained by VTBL as long as the
- // values at row[x + d] and beyond come from the top row. However, this does
- // not guarantee that the vector will also contain all of the values needed
- // from top row.
- const int d = 16 / ((ystep >> 6) + 1);
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false>(dst, stride, top_row, left_column, height,
+ xstep, ystep, x, left_offset, xstep_bounds_base,
+ left_y, upsampled_top, upsampled_left);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
- const int max_shuffle_height =
- std::min(((x + d) << 6) / xstep, height) & ~7;
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_WxH<8>(dst_x, stride, max_top_only_y,
- top_row + (x << upsample_top_shift), -xstep,
- upsampled_top);
-
- if (max_top_only_y == height) continue;
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
-
- // All rows from |min_left_only_y| down for this set of columns only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- int xstep_bounds = xstep_bounds_base + xstep_y;
- int top_x = -xstep - xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone2FromLeftCol_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), left_y,
- upsample_left_shift);
-
- DirectionalZone1Blend_WxH<8>(
- dst_x, stride, min_height, top_row + (x << upsample_top_shift),
- xstep_bounds, top_x, xstep, upsample_top_shift);
- }
-
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- const int16_t base_left_y = vgetq_lane_s16(left_y, 0);
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep, upsample_left_shift);
-
- DirectionalZone1Blend_WxH<8>(
- dst_x, stride, min_height, top_row + (x << upsample_top_shift),
- xstep_bounds, top_x, xstep, upsample_top_shift);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_WxH<8>(
- dst_x, stride, min_height,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep, upsample_left_shift);
- }
+ DirectionalZone2_8xH<true>(dst, stride, top_row, left_column, height, xstep,
+ ystep, x, left_offset, xstep_bounds_base, left_y,
+ upsampled_top, upsampled_left);
}
- // TODO(johannkoenig): May be able to remove this branch.
if (x < width) {
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
DirectionalZone1_WxH(dst + x, stride, width - x, height,
top_row + (x << upsample_top_shift), -xstep,
upsampled_top);
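A standalone sketch of the column dispatch the two loops above implement; the enum and helper are illustrative only and mirror |min_top_only_x|, |shuffle_left_col_x| and |min_shuffle_x| as computed in this function.

#include <algorithm>
enum class ColumnKind { kLeftViaShuffle, kLeftViaGather, kTopOnly };
inline ColumnKind ClassifyColumnStrip(int x, int width, int height, int xstep,
                                      int ystep) {
  const int min_top_only_x = std::min((height * xstep) >> 6, width);
  // For shallow angles (ystep < 132) a single 16-byte vector always covers the
  // left-column samples an 8-wide strip needs; for steeper angles that only
  // holds once x >= 32.
  const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
  const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
  if (x >= min_top_only_x) return ColumnKind::kTopOnly;
  return (x < min_shuffle_x) ? ColumnKind::kLeftViaGather
                             : ColumnKind::kLeftViaShuffle;
}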
@@ -792,8 +764,8 @@ void DirectionalIntraPredictorZone2_NEON(
DirectionalZone2_4xH(dst, stride, top_ptr, left_ptr, height, xstep, ystep,
upsampled_top, upsampled_left);
} else {
- DirectionalZone2_8(dst, stride, top_ptr, left_ptr, width, height, xstep,
- ystep, upsampled_top, upsampled_left);
+ DirectionalZone2_WxH(dst, stride, top_ptr, left_ptr, width, height, xstep,
+ ystep, upsampled_top, upsampled_left);
}
}
@@ -935,6 +907,16 @@ inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
return vrshrq_n_u16(sum, 5 /*log2(32)*/);
}
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16x8_t a_weight,
+ const uint16x8_t b_weight) {
+ const uint16x8_t a_product = vmulq_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
// Each element of |dest| contains values associated with one weight value.
inline void LoadEdgeVals(uint16x4x2_t* dest,
const uint16_t* LIBGAV1_RESTRICT const source,
@@ -959,6 +941,24 @@ inline void LoadEdgeVals(uint16x8x2_t* dest,
}
}
+// For Wx4 blocks, load the source for 2 columns. The source for the second
+// column is held in the high half of each vector.
+inline void LoadEdgeVals2x4(uint16x8x2_t* dest,
+ const uint16_t* LIBGAV1_RESTRICT const source_low,
+ const uint16_t* LIBGAV1_RESTRICT const source_high,
+ const bool upsampled) {
+ if (upsampled) {
+ const uint16x4x2_t low = vld2_u16(source_low);
+ const uint16x4x2_t high = vld2_u16(source_high);
+ dest->val[0] = vcombine_u16(low.val[0], high.val[0]);
+ dest->val[1] = vcombine_u16(low.val[1], high.val[1]);
+ } else {
+ dest->val[0] = vcombine_u16(vld1_u16(source_low), vld1_u16(source_high));
+ dest->val[1] =
+ vcombine_u16(vld1_u16(source_low + 1), vld1_u16(source_high + 1));
+ }
+}
+
template <bool upsampled>
inline void DirectionalZone1_4xH(uint16_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t stride, const int height,
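The WeightedBlend overloads and LoadEdgeVals2x4 added in the hunk above all feed the same 1/32-step interpolation; its scalar equivalent per lane (illustrative, not part of the file):

#include <cstdint>
// a_weight + b_weight == 32; vrshrq_n_u16(x, 5) is the round-to-nearest
// divide by 32 below.
inline uint16_t WeightedBlendScalar(uint16_t a, uint16_t b, int a_weight,
                                    int b_weight) {
  return static_cast<uint16_t>((a * a_weight + b * b_weight + 16) >> 5);
}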
@@ -1286,18 +1286,162 @@ inline void DirectionalZone3_4x4(uint8_t* LIBGAV1_RESTRICT dst,
}
template <bool upsampled>
+inline void DirectionalZone3_8x4(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const uint16x8_t inverter = vdupq_n_u16(32);
+
+ uint16x8x2_t sampled_left_col;
+ // Compute two columns at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ // The low half of pre-transpose vectors contains columns 0 through 3.
+ int left_y_low = base_left_y + ystep;
+ int left_offset_low = left_y_low >> index_scale_bits;
+ int shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ // The high half of pre-transpose vectors contains columns 4 through 7.
+ int left_y_high = left_y_low + (ystep << 2);
+ int left_offset_high = left_y_high >> index_scale_bits;
+ int shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ uint16x8_t weights_0 =
+ vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ uint16x8_t weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_high += ystep;
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ left_y_low += ystep;
+ left_offset_low = left_y_low >> index_scale_bits;
+ shift_low = (LeftShift(left_y_low, upsample_shift) & 0x3F) >> 1;
+
+ left_y_high += ystep;
+ left_offset_high = left_y_high >> index_scale_bits;
+ shift_high = (LeftShift(left_y_high, upsample_shift) & 0x3F) >> 1;
+ weights_0 = vcombine_u16(vdup_n_u16(shift_low), vdup_n_u16(shift_high));
+ weights_1 = vsubq_u16(inverter, weights_0);
+ LoadEdgeVals2x4(&sampled_left_col, &left[left_offset_low],
+ &left[left_offset_high], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ weights_1, weights_0);
+
+ Transpose4x8(result);
+ Store8(dst, result[0]);
+ dst += stride;
+ Store8(dst, result[1]);
+ dst += stride;
+ Store8(dst, result[2]);
+ dst += stride;
+ Store8(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x8(uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const left,
+ const int ystep, const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x8x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x8(result);
+ Store4(dst, vget_low_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_low_u16(result[3]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[0]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[1]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[2]));
+ dst += stride;
+ Store4(dst, vget_high_u16(result[3]));
+}
+
+template <bool upsampled>
inline void DirectionalZone3_4xH(uint8_t* LIBGAV1_RESTRICT dest,
const ptrdiff_t stride, const int height,
const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
+ assert(height == 8 || height == 16);
const int upsample_shift = static_cast<int>(upsampled);
- int y = 0;
- do {
- DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ DirectionalZone3_4x8<upsampled>(dest, stride, left, ystep);
+ if (height == 16) {
+ dest += stride << 3;
+ DirectionalZone3_4x8<upsampled>(dest, stride, left + (8 << upsample_shift),
ystep);
- dest += 4 * stride;
- y += 4;
- } while (y < height);
+ }
}
template <bool upsampled>
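For the new 8x4/4x8 kernels above, the per-sample math reduces to the scalar form below (standalone sketch; it assumes the positive |ystep| these kernels are called with, so plain shifts stand in for the LeftShift helper).

#include <cstdint>
// Column c selects a fractional position along |left|; row r then walks down
// the edge, two samples per row when the edge is upsampled.
inline uint16_t Zone3SampleScalar(const uint16_t* left, int r, int c,
                                  int ystep, bool upsampled,
                                  int base_left_y = 0) {
  const int upsample_shift = upsampled ? 1 : 0;
  const int left_y = base_left_y + (c + 1) * ystep;
  const int left_offset = left_y >> (6 - upsample_shift);
  // Keep 6 fractional bits, then halve so the weight pair sums to 32.
  const int shift_0 = ((left_y << upsample_shift) & 0x3F) >> 1;
  const uint16_t a = left[left_offset + (r << upsample_shift)];
  const uint16_t b = left[left_offset + (r << upsample_shift) + 1];
  return static_cast<uint16_t>((a * (32 - shift_0) + b * shift_0 + 16) >> 5);
}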
@@ -1305,16 +1449,17 @@ inline void DirectionalZone3_Wx4(uint8_t* LIBGAV1_RESTRICT dest,
const ptrdiff_t stride, const int width,
const uint16_t* LIBGAV1_RESTRICT const left,
const int ystep) {
- int x = 0;
- int base_left_y = 0;
- do {
- // TODO(petersonab): Establish 8x4 transpose to reserve this function for
- // 8x4 and 16x4.
- DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
- base_left_y);
- base_left_y += 4 * ystep;
- x += 4;
- } while (x < width);
+ assert(width <= 16);
+ if (width == 4) {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left, ystep);
+ return;
+ }
+ DirectionalZone3_8x4<upsampled>(dest, stride, left, ystep);
+ if (width == 16) {
+ const int base_left_y = ystep << 3;
+ DirectionalZone3_8x4<upsampled>(dest + 8 * sizeof(uint16_t), stride, left,
+ ystep, base_left_y);
+ }
}
template <bool upsampled>
@@ -1460,17 +1605,17 @@ void DirectionalIntraPredictorZone3_NEON(
} while (y != 0);
return;
}
- if (width == 4) {
+ if (height == 4) {
if (upsampled_left) {
- DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
} else {
- DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
}
- } else if (height == 4) {
+ } else if (width == 4) {
if (upsampled_left) {
- DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
} else {
- DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
}
} else {
if (upsampled_left) {
@@ -1532,16 +1677,6 @@ inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
return vrshr_n_u16(sum, 5 /*log2(32)*/);
}
-// Blend two values based on weight pairs that each sum to 32.
-inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
- const uint16x8_t a_weight,
- const uint16x8_t b_weight) {
- const uint16x8_t a_product = vmulq_u16(a, a_weight);
- const uint16x8_t sum = vmlaq_u16(a_product, b, b_weight);
-
- return vrshrq_n_u16(sum, 5 /*log2(32)*/);
-}
-
// Because the source values "move backwards" as the row index increases, the
// indices derived from ystep are generally negative in localized functions.
// This is accommodated by making sure the relative indices are within [-15, 0]
@@ -1608,8 +1743,8 @@ inline void DirectionalZone2FromLeftCol_4xH(
} while (++y < height);
}
-inline void DirectionalZone2FromLeftCol_8xH(
- uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride, const int height,
+inline void DirectionalZone2FromLeftCol_8x8(
+ uint8_t* LIBGAV1_RESTRICT dst, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const left_column, const int16x8_t left_y,
const bool upsampled) {
const int upsample_shift = static_cast<int>(upsampled);
@@ -1653,8 +1788,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
vreinterpretq_u16_s16(vshrq_n_s16(shift_masked, 1));
const uint16x8_t shift_1 = vsubq_u16(vdupq_n_u16(32), shift_0);
- int y = 0;
- do {
+ for (int y = 0; y < 8; ++y) {
uint16x8_t src_left, src_right;
LoadStepwise(
left_column - kPositiveIndexOffsetPixels + (y << upsample_shift),
@@ -1664,7 +1798,7 @@ inline void DirectionalZone2FromLeftCol_8xH(
Store8(dst, val);
dst += stride;
- } while (++y < height);
+ }
}
template <bool upsampled>
@@ -1704,8 +1838,8 @@ inline void DirectionalZone1Blend_4xH(
}
template <bool upsampled>
-inline void DirectionalZone1Blend_8xH(
- uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride, const int height,
+inline void DirectionalZone1Blend_8x8(
+ uint8_t* LIBGAV1_RESTRICT dest, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const top_row, int zone_bounds, int top_x,
const int xstep) {
const int upsample_shift = static_cast<int>(upsampled);
@@ -1716,8 +1850,7 @@ inline void DirectionalZone1Blend_8xH(
const int16x8_t indices = {0, 1, 2, 3, 4, 5, 6, 7};
uint16x8x2_t top_vals;
- int y = height;
- do {
+ for (int y = 0; y < 8; ++y) {
const uint16_t* const src = top_row + (top_x >> scale_bits_x);
LoadEdgeVals(&top_vals, src, upsampled);
@@ -1736,20 +1869,9 @@ inline void DirectionalZone1Blend_8xH(
dest += stride;
zone_bounds += xstep;
top_x -= xstep;
- } while (--y != 0);
+ }
}
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15. Indices
-// that do not correspond to angle derivatives are left at zero.
-// Notably, in cases with upsampling, the shuffle-invalid height is always
-// greater than the prediction height (which is 8 at maximum).
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
 // 7.11.2.4 (8) 90 < angle < 180
// The strategy for these functions (4xH and 8+xH) is to know how many blocks
// can be processed with just pixels from |top_ptr|, then handle mixed blocks,
@@ -1796,9 +1918,9 @@ inline void DirectionalZone2_4xH(
// computed from the top row. The second stage, comprising two y-loops, covers
// blocks that have a mixture of values computed from top or left. The final
// stage covers blocks that are only computed from the left.
- // Round down to the nearest multiple of 8.
- // TODO(petersonab): Check if rounding to the nearest 4 is okay.
- const int max_top_only_y = std::min((1 << 6) / xstep, height) & ~7;
+ // Round down to the nearest multiple of 8 (or 4, if height is 4).
+ const int max_top_only_y =
+ std::min((1 << 6) / xstep, height) & ~(min_height - 1);
DirectionalZone1_4xH<upsampled_top>(reinterpret_cast<uint16_t*>(dst),
stride >> 1, max_top_only_y, top_row,
-xstep);
@@ -1827,12 +1949,15 @@ inline void DirectionalZone2_4xH(
xstep_bounds, top_x, xstep);
}
- // Loop over y for left-only rows.
- for (; y < height; y += 8, dst += stride8) {
- // Angle expected by Zone3 is flipped about the 180 degree vector, which
- // is the x-axis.
+ // Left-only section. |height| - |y| == 4 can only happen when y == 0 and
+ // height == 4; for taller blocks both |height| and |y| are multiples of 8.
+ if (height - y == 4) {
+ DirectionalZone3_4x4<upsampled_left>(dst, stride, left_column, -ystep);
+ return;
+ }
+ if (y < height) {
DirectionalZone3_4xH<upsampled_left>(
- dst, stride, min_height, left_column + (y << upsample_left_shift),
+ dst, stride, height - y, left_column + (y << upsample_left_shift),
-ystep);
}
}
@@ -1882,9 +2007,75 @@ inline void DirectionalZone2_Wx4(
}
}
+template <bool shuffle_left_column, bool upsampled_top, bool upsampled_left>
+inline void DirectionalZone2_8xH(
+ uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
+ const uint16_t* LIBGAV1_RESTRICT const top_row,
+ const uint16_t* LIBGAV1_RESTRICT const left_column, const int height,
+ const int xstep, const int ystep, const int x, const int left_offset,
+ const int xstep_bounds_base, const int16x8_t left_y) {
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+
+ // Loop incrementers for moving by block (8x8). This function also handles
+ // blocks with height 4; those are computed in a single pass, so these
+ // increments go unused in that case.
+ const ptrdiff_t stride8 = stride << 3;
+ const int xstep8 = xstep << 3;
+
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ uint8_t* dst_x = dst + x * sizeof(uint16_t);
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_WxH<upsampled_top>(
+ reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
+ top_row + (x << upsample_top_shift), -xstep);
+
+ if (max_top_only_y == height) return;
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+
+ // All rows from |min_left_only_y| down for this set of columns only need
+ // |left_column| to compute. Round up to the nearest 8.
+ const int min_left_only_y =
+ Align(std::min(((x + 8) << 6) / xstep, height), 8);
+ int xstep_bounds = xstep_bounds_base + xstep_y;
+ int top_x = -xstep - xstep_y;
+
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
+ if (shuffle_left_column) {
+ DirectionalZone2FromLeftCol_8x8(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y,
+ upsampled_left);
+ } else {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+
+ DirectionalZone1Blend_8x8<upsampled_top>(
+ dst_x, stride, top_row + (x << upsample_top_shift), xstep_bounds, top_x,
+ xstep);
+ }
+
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8x8<upsampled_left>(
+ dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
+ -ystep * x);
+ }
+}
+
// Process a multiple of 8 |width|.
template <bool upsampled_top, bool upsampled_left>
-inline void DirectionalZone2_8(
+inline void DirectionalZone2_NEON(
uint8_t* LIBGAV1_RESTRICT const dst, const ptrdiff_t stride,
const uint16_t* LIBGAV1_RESTRICT const top_row,
const uint16_t* LIBGAV1_RESTRICT const left_column, const int width,
@@ -1894,30 +2085,24 @@ inline void DirectionalZone2_8(
dst, stride, top_row, left_column, width, xstep, ystep);
return;
}
- const int upsample_left_shift = static_cast<int>(upsampled_left);
const int upsample_top_shift = static_cast<int>(upsampled_top);
// Helper vector.
const int16x8_t zero_to_seven = {0, 1, 2, 3, 4, 5, 6, 7};
- // Loop increments for moving by block (8x8). This function handles blocks
- // with height 4 as well. They are calculated in one pass so these variables
- // do not get used.
- const ptrdiff_t stride8 = stride << 3;
- const int xstep8 = xstep << 3;
const int ystep8 = ystep << 3;
// All columns from |min_top_only_x| to the right will only need |top_row| to
// compute and can therefore call the Zone1 functions. This assumes |xstep| is
// at least 3.
assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- // For steep angles, the source pixels from |left_column| may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(kDirectionalZone2ShuffleInvalidHeight[ystep >> 6], height);
+ const int min_top_only_x = Align(std::min((height * xstep) >> 6, width), 8);
+ // Analysis finds that, for most angles (ystep < 132), all segments that use
+ // both top_row and left_column can compute from left_column using byte
+ // shuffles from a single vector. For steeper angles, the shuffle is also
+ // fully reliable when x >= 32.
+ const int shuffle_left_col_x = (ystep < 132) ? 0 : 32;
+ const int min_shuffle_x = std::min(min_top_only_x, shuffle_left_col_x);
// Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
int xstep_bounds_base = (xstep == 64) ? 0 : xstep - 1;
@@ -1935,73 +2120,22 @@ inline void DirectionalZone2_8(
int16x8_t left_y =
vmlaq_n_s16(vdupq_n_s16(-ystep_remainder), zero_to_seven, -ystep);
- // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
int x = 0;
+ for (int left_offset = -left_base_increment; x < min_shuffle_x; x += 8,
+ xstep_bounds_base -= (8 << 6),
+ left_y = vsubq_s16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ DirectionalZone2_8xH<false, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
+ }
for (int left_offset = -left_base_increment; x < min_top_only_x; x += 8,
xstep_bounds_base -= (8 << 6),
left_y = vsubq_s16(left_y, increment_left8),
left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x * sizeof(uint16_t);
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_WxH<upsampled_top>(
- reinterpret_cast<uint16_t*>(dst_x), stride >> 1, 8, max_top_only_y,
- top_row + (x << upsample_top_shift), -xstep);
-
- if (max_top_only_y == height) continue;
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
-
- // All rows from |min_left_only_y| down for this set of columns only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- int xstep_bounds = xstep_bounds_base + xstep_y;
- int top_x = -xstep - xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8xH(
- dst_x, stride, 8,
- left_column + ((left_offset + y) << upsample_left_shift), left_y,
- upsample_left_shift);
-
- DirectionalZone1Blend_8xH<upsampled_top>(
- dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
- top_x, xstep);
- }
-
- // Pick up from the last y-value, using the slower but secure method for
- // left prediction.
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8, xstep_bounds += xstep8, top_x -= xstep8) {
- DirectionalZone3_8x8<upsampled_left>(
- dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
- -ystep * x);
-
- DirectionalZone1Blend_8xH<upsampled_top>(
- dst_x, stride, 8, top_row + (x << upsample_top_shift), xstep_bounds,
- top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8x8<upsampled_left>(
- dst_x, stride, left_column + (y << upsample_left_shift), -ystep,
- -ystep * x);
- }
+ DirectionalZone2_8xH<true, upsampled_top, upsampled_left>(
+ dst, stride, top_row, left_column, height, xstep, ystep, x, left_offset,
+ xstep_bounds_base, left_y);
}
// Reached |min_top_only_x|.
if (x < width) {
@@ -2129,18 +2263,18 @@ void DirectionalIntraPredictorZone2_NEON(
}
if (upsampled_top) {
if (upsampled_left) {
- DirectionalZone2_8<true, true>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<true, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
} else {
- DirectionalZone2_8<true, false>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<true, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
}
} else if (upsampled_left) {
- DirectionalZone2_8<false, true>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<false, true>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
} else {
- DirectionalZone2_8<false, false>(dst, stride, top_ptr, left_ptr, width,
- height, xstep, ystep);
+ DirectionalZone2_NEON<false, false>(dst, stride, top_ptr, left_ptr, width,
+ height, xstep, ystep);
}
}
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index cd47a22..d1adbdf 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -407,13 +407,9 @@ inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
const uint16x8_t top_left_dist_low,
const uint16x8_t top_left_dist_high) {
- // TODO(johannkoenig): cle() should work with vmovn(top_left_dist) instead of
- // using movl(x_dist).
- const uint8x8_t x_le_top_left_low =
- vmovn_u16(vcleq_u16(vmovl_u8(vget_low_u8(x_dist)), top_left_dist_low));
- const uint8x8_t x_le_top_left_high =
- vmovn_u16(vcleq_u16(vmovl_u8(vget_high_u8(x_dist)), top_left_dist_high));
- return vcombine_u8(x_le_top_left_low, x_le_top_left_high);
+ const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+ vqmovn_u16(top_left_dist_high));
+ return vcleq_u8(x_dist, top_left_dist);
}
// Select the closest values and collect them.
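The scalar reasoning behind replacing the widening compares with a saturating narrow (standalone sketch, not part of the file):

#include <cstdint>
// |x_dist| is an 8-bit value, so clamping the 16-bit |top_left_dist| to 255
// before an 8-bit compare cannot change the result of
// x_dist <= top_left_dist.
inline bool XLeTopLeftScalar(uint8_t x_dist, uint16_t top_left_dist) {
  const uint8_t clamped =
      top_left_dist > 255 ? 255 : static_cast<uint8_t>(top_left_dist);
  return x_dist <= clamped;
}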
diff --git a/src/dsp/arm/intrapred_smooth_neon.cc b/src/dsp/arm/intrapred_smooth_neon.cc
index bcda131..d6c1450 100644
--- a/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/src/dsp/arm/intrapred_smooth_neon.cc
@@ -31,7 +31,6 @@
namespace libgav1 {
namespace dsp {
-
namespace low_bitdepth {
namespace {
@@ -42,20 +41,15 @@ constexpr uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
-inline uint16x4_t CalculatePred(const uint16x4_t weighted_top,
- const uint16x4_t weighted_left,
- const uint16x4_t weighted_bl,
- const uint16x4_t weighted_tr) {
- const uint32x4_t pred_0 = vaddl_u16(weighted_top, weighted_left);
- const uint32x4_t pred_1 = vaddl_u16(weighted_bl, weighted_tr);
- const uint32x4_t pred_2 = vaddq_u32(pred_0, pred_1);
- return vrshrn_n_u32(pred_2, kSmoothWeightScale + 1);
+// 256 - v = vneg_s8(v)
+inline uint8x8_t NegateS8(const uint8x8_t v) {
+ return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
}
template <int height>
-inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
constexpr int width = 4;
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
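The NegateS8 helper introduced above leans on modular arithmetic; a standalone check of the identity for the weight range actually used (1..255):

#include <cassert>
#include <cstdint>
// vneg_s8 negates each lane in two's complement; reading the result back as
// unsigned yields (-w) mod 256, which equals 256 - w for w in [1, 255].
inline void CheckNegateS8Identity() {
  for (int w = 1; w <= 255; ++w) {
    const uint8_t negated = static_cast<uint8_t>(-w);
    assert(negated == 256 - w);
  }
}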
@@ -68,47 +62,49 @@ inline void Smooth4xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
const uint8x8_t weights_x_v = Load4(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_y_v)));
- const uint16x4_t weighted_bl =
- vget_low_u16(vmull_u8(scaled_weights_y, bottom_left_v));
-
- const uint16x4_t weighted_top = vget_low_u16(vmull_u8(weights_y_v, top_v));
- const uint16x4_t weighted_left =
- vget_low_u16(vmull_u8(weights_x_v, left_v));
- const uint16x4_t weighted_tr =
- vget_low_u16(vmull_u8(scaled_weights_x, top_right_v));
- const uint16x4_t result =
- CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
-
- StoreLo4(dst, vmovn_u16(vcombine_u16(result, result)));
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_bl, weights_y_v, top_v);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x_v, left_v);
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ const uint8x8_t result = vrshrn_n_u16(avg, kSmoothWeightScale);
+
+ StoreLo4(dst, result);
dst += stride;
}
}
-inline uint8x8_t CalculatePred(const uint16x8_t weighted_top,
- const uint16x8_t weighted_left,
- const uint16x8_t weighted_bl,
- const uint16x8_t weighted_tr) {
- // Maximum value: 0xFF00
- const uint16x8_t pred_0 = vaddq_u16(weighted_top, weighted_bl);
- // Maximum value: 0xFF00
- const uint16x8_t pred_1 = vaddq_u16(weighted_left, weighted_tr);
- const uint16x8_t pred_2 = vhaddq_u16(pred_0, pred_1);
- return vrshrn_n_u16(pred_2, kSmoothWeightScale);
+inline uint8x8_t CalculatePred(const uint16x8_t weighted_top_bl,
+ const uint16x8_t weighted_left_tr) {
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ return vrshrn_n_u16(avg, kSmoothWeightScale);
+}
+
+inline uint8x8_t CalculateWeightsAndPred(
+ const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+ const uint8x8_t bottom_left, const uint8x8_t weights_x,
+ const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+ const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+ const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+ return CalculatePred(weighted_top_bl, weighted_left_tr);
}
template <int height>
-inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
constexpr int width = 8;
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
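CalculatePred above folds the four weighted terms into two 16-bit accumulators and halves before the final rounded shift, since each accumulator can reach 0xFF00 and a plain 16-bit add could overflow. The scalar form it reproduces, matching the removed 32-bit path (standalone sketch):

#include <cstdint>
// Round2 over the four weighted terms; weight_x/weight_y are the 8-bit smooth
// weights. The truncating halving add in the NEON path cannot change this
// result: a mismatch would require an odd sum congruent to 256 mod 512, which
// is impossible.
inline uint8_t SmoothPredScalar(uint8_t top, uint8_t left, uint8_t top_right,
                                uint8_t bottom_left, int weight_x,
                                int weight_y) {
  const int sum = weight_y * top + (256 - weight_y) * bottom_left +
                  weight_x * left + (256 - weight_x) * top_right;
  return static_cast<uint8_t>((sum + 256) >> 9);
}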
@@ -121,21 +117,16 @@ inline void Smooth8xN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
const uint8x8_t weights_x_v = vld1_u8(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x_v)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
- const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
-
- const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
- const uint16x8_t weighted_left = vmull_u8(weights_x_v, left_v);
- const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint8x8_t result =
- CalculatePred(weighted_top, weighted_left, weighted_bl, weighted_tr);
+ CalculateWeightsAndPred(top_v, left_v, weighted_tr, bottom_left_v,
+ weights_x_v, scaled_weights_y, weights_y_v);
vst1_u8(dst, result);
dst += stride;
@@ -146,28 +137,34 @@ inline uint8x16_t CalculateWeightsAndPred(
const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
const uint8x8_t weights_y, const uint8x16_t weights_x,
const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
- const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_top_bl_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
- const uint16x8_t weighted_tr_low =
- vmull_u8(vget_low_u8(scaled_weights_x), top_right);
- const uint8x8_t result_low = CalculatePred(
- weighted_top_low, weighted_left_low, weighted_bl, weighted_tr_low);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_low =
+ CalculatePred(weighted_top_bl_low, weighted_left_tr_low);
- const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_top_bl_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
- const uint16x8_t weighted_tr_high =
- vmull_u8(vget_high_u8(scaled_weights_x), top_right);
- const uint8x8_t result_high = CalculatePred(
- weighted_top_high, weighted_left_high, weighted_bl, weighted_tr_high);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_high =
+ CalculatePred(weighted_top_bl_high, weighted_left_tr_high);
return vcombine_u8(result_low, result_high);
}
+// 256 - v = vneg_s8(v)
+inline uint8x16_t NegateS8(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
template <int width, int height>
-inline void Smooth16PlusxN_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
const uint8_t top_right = top[width - 1];
@@ -188,9 +185,6 @@ inline void Smooth16PlusxN_NEON(
const uint8x8_t top_right_v = vdup_n_u8(top_right);
const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
- // TODO(johannkoenig): Consider re-reading top_v and weights_x_v in the loop.
- // This currently has a performance slope similar to Paeth so it does not
- // appear to be register bound for arm64.
uint8x16_t weights_x_v[4];
weights_x_v[0] = vld1q_u8(kSmoothWeights + width - 4);
if (width > 16) {
@@ -202,23 +196,19 @@ inline void Smooth16PlusxN_NEON(
}
uint8x16_t scaled_weights_x[4];
- scaled_weights_x[0] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[0])));
+ scaled_weights_x[0] = NegateS8(weights_x_v[0]);
if (width > 16) {
- scaled_weights_x[1] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[1])));
+ scaled_weights_x[1] = NegateS8(weights_x_v[1]);
if (width == 64) {
- scaled_weights_x[2] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[2])));
- scaled_weights_x[3] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x_v[3])));
+ scaled_weights_x[2] = NegateS8(weights_x_v[2]);
+ scaled_weights_x[3] = NegateS8(weights_x_v[3]);
}
}
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
vst1q_u8(dst, CalculateWeightsAndPred(top_v[0], left_v, top_right_v,
@@ -246,10 +236,10 @@ inline void Smooth16PlusxN_NEON(
}
template <int width, int height>
-inline void SmoothVertical4Or8xN_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint8_t*>(top_row);
const auto* const left = static_cast<const uint8_t*>(left_column);
const uint8_t bottom_left = left[height - 1];
@@ -267,17 +257,17 @@ inline void SmoothVertical4Or8xN_NEON(
for (int y = 0; y < height; ++y) {
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);
- const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
- const uint16x8_t pred = vaddq_u16(weighted_top, weighted_bl);
- const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_top_bl, kSmoothWeightScale);
if (width == 4) {
- StoreLo4(dst, pred_scaled);
+ StoreLo4(dst, pred);
} else { // width == 8
- vst1_u8(dst, pred_scaled);
+ vst1_u8(dst, pred);
}
dst += stride;
}
@@ -286,10 +276,10 @@ inline void SmoothVertical4Or8xN_NEON(
inline uint8x16_t CalculateVerticalWeightsAndPred(
const uint8x16_t top, const uint8x8_t weights_y,
const uint16x8_t weighted_bl) {
- const uint16x8_t weighted_top_low = vmull_u8(weights_y, vget_low_u8(top));
- const uint16x8_t weighted_top_high = vmull_u8(weights_y, vget_high_u8(top));
- const uint16x8_t pred_low = vaddq_u16(weighted_top_low, weighted_bl);
- const uint16x8_t pred_high = vaddq_u16(weighted_top_high, weighted_bl);
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
const uint8x8_t pred_scaled_high =
vrshrn_n_u16(pred_high, kSmoothWeightScale);
@@ -297,7 +287,7 @@ inline uint8x16_t CalculateVerticalWeightsAndPred(
}
template <int width, int height>
-inline void SmoothVertical16PlusxN_NEON(
+void SmoothVertical16PlusxN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -321,7 +311,7 @@ inline void SmoothVertical16PlusxN_NEON(
for (int y = 0; y < height; ++y) {
const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
- const uint8x8_t scaled_weights_y = vdup_n_u8(256 - weights_y[y]);
+ const uint8x8_t scaled_weights_y = NegateS8(weights_y_v);
const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
const uint8x16_t pred_0 =
@@ -349,7 +339,7 @@ inline void SmoothVertical16PlusxN_NEON(
}
template <int width, int height>
-inline void SmoothHorizontal4Or8xN_NEON(
+void SmoothHorizontal4Or8xN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -361,22 +351,19 @@ inline void SmoothHorizontal4Or8xN_NEON(
const uint8x8_t top_right_v = vdup_n_u8(top_right);
// Over-reads for 4xN but still within the array.
const uint8x8_t weights_x = vld1_u8(kSmoothWeights + width - 4);
- // 256 - weights = vneg_s8(weights)
- const uint8x8_t scaled_weights_x =
- vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(weights_x)));
+ const uint8x8_t scaled_weights_x = NegateS8(weights_x);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
for (int y = 0; y < height; ++y) {
const uint8x8_t left_v = vdup_n_u8(left[y]);
-
- const uint16x8_t weighted_left = vmull_u8(weights_x, left_v);
- const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
- const uint16x8_t pred = vaddq_u16(weighted_left, weighted_tr);
- const uint8x8_t pred_scaled = vrshrn_n_u16(pred, kSmoothWeightScale);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x, left_v);
+ const uint8x8_t pred = vrshrn_n_u16(weighted_left_tr, kSmoothWeightScale);
if (width == 4) {
- StoreLo4(dst, pred_scaled);
+ StoreLo4(dst, pred);
} else { // width == 8
- vst1_u8(dst, pred_scaled);
+ vst1_u8(dst, pred);
}
dst += stride;
}
@@ -386,23 +373,22 @@ inline uint8x16_t CalculateHorizontalWeightsAndPred(
const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
const uint8x16_t scaled_weights_x) {
const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
- const uint16x8_t weighted_tr_low =
- vmull_u8(vget_low_u8(scaled_weights_x), top_right);
- const uint16x8_t pred_low = vaddq_u16(weighted_left_low, weighted_tr_low);
- const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, kSmoothWeightScale);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(weighted_left_tr_low, kSmoothWeightScale);
const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
- const uint16x8_t weighted_tr_high =
- vmull_u8(vget_high_u8(scaled_weights_x), top_right);
- const uint16x8_t pred_high = vaddq_u16(weighted_left_high, weighted_tr_high);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
const uint8x8_t pred_scaled_high =
- vrshrn_n_u16(pred_high, kSmoothWeightScale);
+ vrshrn_n_u16(weighted_left_tr_high, kSmoothWeightScale);
return vcombine_u8(pred_scaled_low, pred_scaled_high);
}
template <int width, int height>
-inline void SmoothHorizontal16PlusxN_NEON(
+void SmoothHorizontal16PlusxN_NEON(
void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const void* LIBGAV1_RESTRICT const top_row,
const void* LIBGAV1_RESTRICT const left_column) {
@@ -424,16 +410,12 @@ inline void SmoothHorizontal16PlusxN_NEON(
}
uint8x16_t scaled_weights_x[4];
- scaled_weights_x[0] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[0])));
+ scaled_weights_x[0] = NegateS8(weights_x[0]);
if (width > 16) {
- scaled_weights_x[1] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[1])));
+ scaled_weights_x[1] = NegateS8(weights_x[1]);
if (width == 64) {
- scaled_weights_x[2] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[2])));
- scaled_weights_x[3] =
- vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(weights_x[3])));
+ scaled_weights_x[2] = NegateS8(weights_x[2]);
+ scaled_weights_x[3] = NegateS8(weights_x[3]);
}
}
@@ -633,10 +615,15 @@ constexpr uint16_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
+// 256 - v = vneg_s8(v)
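+// This holds for the weights used here, which lie in [1, 255]: the low byte
+// of each u16 lane is the weight w and the high byte is 0, so negating the
+// bytes as s8 yields 256 - w. E.g. 32 (0x0020) -> 0x00e0 = 224 = 256 - 32.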
+inline uint16x4_t NegateS8(const uint16x4_t v) {
+ return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
template <int height>
-inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[3];
@@ -647,9 +634,7 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint16x4_t top_v = vld1_u16(top);
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
const uint16x4_t weights_x_v = vld1_u16(kSmoothWeights);
- const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x_v);
-
- // Weighted top right doesn't change with each row.
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x_v);
const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
for (int y = 0; y < height; ++y) {
@@ -670,10 +655,10 @@ inline void Smooth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
// Common code between 8xH and [16|32|64]xH.
inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
- const uint32x4_t& weighted_corners_low,
- const uint32x4_t& weighted_corners_high,
- const uint16x4x2_t& top_vals,
- const uint16x4x2_t& weights_x, const uint16_t left_y,
+ const uint32x4_t weighted_corners_low,
+ const uint32x4_t weighted_corners_high,
+ const uint16x4x2_t top_vals,
+ const uint16x4x2_t weights_x, const uint16_t left_y,
const uint16_t weight_y) {
// Each variable in the running summation is named for the last item to be
// accumulated.
@@ -697,9 +682,9 @@ inline void CalculatePred8(uint16_t* LIBGAV1_RESTRICT dst,
}
template <int height>
-inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[7];
@@ -712,14 +697,12 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
const uint16x4x2_t weights_x = {vld1_u16(kSmoothWeights + 4),
vld1_u16(kSmoothWeights + 8)};
- // Weighted top right doesn't change with each row.
const uint32x4_t weighted_tr_low =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
const uint32x4_t weighted_tr_high =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
const uint32x4_t weighted_corners_low =
@@ -735,9 +718,9 @@ inline void Smooth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
// For width 16 and above.
template <int width, int height>
-inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[width - 1];
@@ -746,23 +729,19 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
auto* dst = static_cast<uint8_t*>(dest);
- const uint16x4_t weight_scaling = vdup_n_u16(256);
// Precompute weighted values that don't vary with |y|.
uint32x4_t weighted_tr_low[width >> 3];
uint32x4_t weighted_tr_high[width >> 3];
for (int i = 0; i < width >> 3; ++i) {
const int x = i << 3;
const uint16x4_t weights_x_low = vld1_u16(kSmoothWeights + width - 4 + x);
- weighted_tr_low[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_low), top_right);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low), top_right);
const uint16x4_t weights_x_high = vld1_u16(kSmoothWeights + width + x);
- weighted_tr_high[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_high), top_right);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high), top_right);
}
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
auto* dst_x = reinterpret_cast<uint16_t*>(dst);
@@ -785,10 +764,9 @@ inline void SmoothWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
}
template <int height>
-inline void SmoothVertical4xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -812,10 +790,10 @@ inline void SmoothVertical4xH_NEON(
}
template <int height>
-inline void SmoothVertical8xH_NEON(
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVertical8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -829,7 +807,6 @@ inline void SmoothVertical8xH_NEON(
for (int y = 0; y < height; ++y) {
auto* dst16 = reinterpret_cast<uint16_t*>(dst);
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
@@ -846,10 +823,10 @@ inline void SmoothVertical8xH_NEON(
// For width 16 and above.
template <int width, int height>
-inline void SmoothVerticalWxH_NEON(
- void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothVerticalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t bottom_left = left[height - 1];
@@ -865,7 +842,6 @@ inline void SmoothVerticalWxH_NEON(
const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
for (int y = 0; y < height; ++y) {
- // |weighted_bl| is invariant across the row.
const uint32x4_t weighted_bl =
vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
@@ -885,10 +861,10 @@ inline void SmoothVerticalWxH_NEON(
}
template <int height>
-inline void SmoothHorizontal4xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[3];
@@ -896,7 +872,7 @@ inline void SmoothHorizontal4xH_NEON(
auto* dst = static_cast<uint8_t*>(dest);
const uint16x4_t weights_x = vld1_u16(kSmoothWeights);
- const uint16x4_t scaled_weights_x = vsub_u16(vdup_n_u16(256), weights_x);
+ const uint16x4_t scaled_weights_x = NegateS8(weights_x);
const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
for (int y = 0; y < height; ++y) {
@@ -909,10 +885,10 @@ inline void SmoothHorizontal4xH_NEON(
}
template <int height>
-inline void SmoothHorizontal8xH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[7];
@@ -923,9 +899,9 @@ inline void SmoothHorizontal8xH_NEON(
vld1_u16(kSmoothWeights + 8)};
const uint32x4_t weighted_tr_low =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[0]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[0]), top_right);
const uint32x4_t weighted_tr_high =
- vmull_n_u16(vsub_u16(vdup_n_u16(256), weights_x.val[1]), top_right);
+ vmull_n_u16(NegateS8(weights_x.val[1]), top_right);
for (int y = 0; y < height; ++y) {
auto* dst16 = reinterpret_cast<uint16_t*>(dst);
@@ -943,18 +919,16 @@ inline void SmoothHorizontal8xH_NEON(
// For width 16 and above.
template <int width, int height>
-inline void SmoothHorizontalWxH_NEON(
- void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
- const void* LIBGAV1_RESTRICT const top_row,
- const void* LIBGAV1_RESTRICT const left_column) {
+void SmoothHorizontalWxH_NEON(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const top = static_cast<const uint16_t*>(top_row);
const auto* const left = static_cast<const uint16_t*>(left_column);
const uint16_t top_right = top[width - 1];
auto* dst = static_cast<uint8_t*>(dest);
- const uint16x4_t weight_scaling = vdup_n_u16(256);
-
uint16x4_t weights_x_low[width >> 3];
uint16x4_t weights_x_high[width >> 3];
uint32x4_t weighted_tr_low[width >> 3];
@@ -962,11 +936,9 @@ inline void SmoothHorizontalWxH_NEON(
for (int i = 0; i < width >> 3; ++i) {
const int x = i << 3;
weights_x_low[i] = vld1_u16(kSmoothWeights + width - 4 + x);
- weighted_tr_low[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_low[i]), top_right);
+ weighted_tr_low[i] = vmull_n_u16(NegateS8(weights_x_low[i]), top_right);
weights_x_high[i] = vld1_u16(kSmoothWeights + width + x);
- weighted_tr_high[i] =
- vmull_n_u16(vsub_u16(weight_scaling, weights_x_high[i]), top_right);
+ weighted_tr_high[i] = vmull_n_u16(NegateS8(weights_x_high[i]), top_right);
}
for (int y = 0; y < height; ++y) {
@@ -1141,6 +1113,7 @@ void Init10bpp() {
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
SmoothHorizontalWxH_NEON<64, 64>;
}
+
} // namespace
} // namespace high_bitdepth
#endif // LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
index 617accc..e6f0d9d 100644
--- a/src/dsp/arm/inverse_transform_10bit_neon.cc
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
const int32x4_t max = vdupq_n_s32((1 << range) - 1);
int32x4_t s[4], x[4];
- LoadSrc<4>(dst, step, 0, x);
if (is_row) {
- Transpose4x4(x, x);
+ assert(step == 4);
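+    // vld4q_s32 de-interleaves with a stride of 4, so y.val[i] receives
+    // column i of the row-major 4x4 block, taking the place of an explicit
+    // Transpose4x4().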
+ int32x4x4_t y = vld4q_s32(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<4>(dst, step, 0, x);
}
// stage 1.
@@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
for (auto& i : s) {
i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
- Transpose4x4(s, s);
+ int32x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+ vst4q_s32(dst, y);
+ } else {
+ StoreDst<4>(dst, step, 0, s);
}
- StoreDst<4>(dst, step, 0, s);
}
template <ButterflyRotationFunc butterfly_rotation,
@@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
int32x4_t s[8];
int32x4_t x[4];
- LoadSrc<4>(dst, step, 0, x);
if (is_row) {
- Transpose4x4(x, x);
+ assert(step == 4);
+ int32x4x4_t y = vld4q_s32(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+ } else {
+ LoadSrc<4>(dst, step, 0, x);
}
// stage 1.
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
- Transpose4x4(x, x);
+ int32x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+ vst4q_s32(dst, y);
+ } else {
+ StoreDst<4>(dst, step, 0, x);
}
- StoreDst<4>(dst, step, 0, x);
}
alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 1c2e111..452f14a 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -41,50 +41,6 @@ namespace {
//------------------------------------------------------------------------------
-// TODO(slavarnway): Move transpose functions to transpose_neon.h or
-// common_neon.h.
-
-LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int16x8_t in[4],
- int16x8_t out[4]) {
- // Swap 16 bit elements. Goes from:
- // a0: 00 01 02 03
- // a1: 10 11 12 13
- // a2: 20 21 22 23
- // a3: 30 31 32 33
- // to:
- // b0.val[0]: 00 10 02 12
- // b0.val[1]: 01 11 03 13
- // b1.val[0]: 20 30 22 32
- // b1.val[1]: 21 31 23 33
- const int16x4_t a0 = vget_low_s16(in[0]);
- const int16x4_t a1 = vget_low_s16(in[1]);
- const int16x4_t a2 = vget_low_s16(in[2]);
- const int16x4_t a3 = vget_low_s16(in[3]);
-
- const int16x4x2_t b0 = vtrn_s16(a0, a1);
- const int16x4x2_t b1 = vtrn_s16(a2, a3);
-
- // Swap 32 bit elements resulting in:
- // c0.val[0]: 00 10 20 30 04 14 24 34
- // c0.val[1]: 02 12 22 32 06 16 26 36
- // c1.val[0]: 01 11 21 31 05 15 25 35
- // c1.val[1]: 03 13 23 33 07 17 27 37
- const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
- vreinterpret_s32_s16(b1.val[0]));
- const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
- vreinterpret_s32_s16(b1.val[1]));
-
- const int16x4_t d0 = vreinterpret_s16_s32(c0.val[0]);
- const int16x4_t d1 = vreinterpret_s16_s32(c1.val[0]);
- const int16x4_t d2 = vreinterpret_s16_s32(c0.val[1]);
- const int16x4_t d3 = vreinterpret_s16_s32(c1.val[1]);
-
- out[0] = vcombine_s16(d0, d0);
- out[1] = vcombine_s16(d1, d1);
- out[2] = vcombine_s16(d2, d2);
- out[3] = vcombine_s16(d3, d3);
-}
-
// Note this is only used in the final stage of Dct32/64 and Adst16 as the in
// place version causes additional stack usage with clang.
LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
@@ -580,16 +536,19 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
if (stage_is_rectangular) {
if (transpose) {
- int16x8_t input[8];
- LoadSrc<8, 8>(dst, step, 0, input);
- Transpose4x8To8x4(input, x);
+ assert(step == 4);
+ int16x8x4_t y = vld4q_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
} else {
LoadSrc<16, 4>(dst, step, 0, x);
}
} else {
- LoadSrc<8, 4>(dst, step, 0, x);
if (transpose) {
- Transpose4x4(x, x);
+ assert(step == 4);
+ int16x4x4_t y = vld4_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = vcombine_s16(y.val[i], y.val[i]);
+ } else {
+ LoadSrc<8, 4>(dst, step, 0, x);
}
}
@@ -604,17 +563,20 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
if (stage_is_rectangular) {
if (transpose) {
- int16x8_t output[8];
- Transpose8x4To4x8(s, output);
- StoreDst<8, 8>(dst, step, 0, output);
+ int16x8x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+ vst4q_s16(dst, y);
} else {
StoreDst<16, 4>(dst, step, 0, s);
}
} else {
if (transpose) {
- Transpose4x4(s, s);
+ int16x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = vget_low_s16(s[i]);
+ vst4_s16(dst, y);
+ } else {
+ StoreDst<8, 4>(dst, step, 0, s);
}
- StoreDst<8, 4>(dst, step, 0, s);
}
}
@@ -1204,45 +1166,41 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).
-template <bool stage_is_rectangular>
+
LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- int32x4_t s[8];
- int16x8_t x[4];
+ int32x4_t s[7];
+ int16x4_t x[4];
- if (stage_is_rectangular) {
- if (transpose) {
- int16x8_t input[8];
- LoadSrc<8, 8>(dst, step, 0, input);
- Transpose4x8To8x4(input, x);
- } else {
- LoadSrc<16, 4>(dst, step, 0, x);
- }
+ if (transpose) {
+ assert(step == 4);
+ int16x4x4_t y = vld4_s16(dst);
+ for (int i = 0; i < 4; ++i) x[i] = y.val[i];
} else {
- LoadSrc<8, 4>(dst, step, 0, x);
- if (transpose) {
- Transpose4x4(x, x);
- }
+ x[0] = vld1_s16(dst);
+ x[1] = vld1_s16(dst + 1 * step);
+ x[2] = vld1_s16(dst + 2 * step);
+ x[3] = vld1_s16(dst + 3 * step);
}
// stage 1.
- s[5] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[1]);
- s[6] = vmull_n_s16(vget_low_s16(x[3]), kAdst4Multiplier[3]);
+ s[5] = vmull_n_s16(x[3], kAdst4Multiplier[1]);
+ s[6] = vmull_n_s16(x[3], kAdst4Multiplier[3]);
// stage 2.
- const int32x4_t a7 = vsubl_s16(vget_low_s16(x[0]), vget_low_s16(x[2]));
- const int32x4_t b7 = vaddw_s16(a7, vget_low_s16(x[3]));
+ const int32x4_t a7 = vsubl_s16(x[0], x[2]);
+ const int32x4_t b7 = vaddw_s16(a7, x[3]);
// stage 3.
- s[0] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[0]);
- s[1] = vmull_n_s16(vget_low_s16(x[0]), kAdst4Multiplier[1]);
+ s[0] = vmull_n_s16(x[0], kAdst4Multiplier[0]);
+ s[1] = vmull_n_s16(x[0], kAdst4Multiplier[1]);
// s[0] = s[0] + s[3]
- s[0] = vmlal_n_s16(s[0], vget_low_s16(x[2]), kAdst4Multiplier[3]);
+ s[0] = vmlal_n_s16(s[0], x[2], kAdst4Multiplier[3]);
// s[1] = s[1] - s[4]
- s[1] = vmlsl_n_s16(s[1], vget_low_s16(x[2]), kAdst4Multiplier[0]);
+ s[1] = vmlsl_n_s16(s[1], x[2], kAdst4Multiplier[0]);
- s[3] = vmull_n_s16(vget_low_s16(x[1]), kAdst4Multiplier[2]);
+ s[3] = vmull_n_s16(x[1], kAdst4Multiplier[2]);
s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
// stage 4.
@@ -1259,24 +1217,20 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
const int16x4_t dst_2 = vqrshrn_n_s32(s[2], 12);
const int16x4_t dst_3 = vqrshrn_n_s32(x3, 12);
- x[0] = vcombine_s16(dst_0, dst_0);
- x[1] = vcombine_s16(dst_1, dst_1);
- x[2] = vcombine_s16(dst_2, dst_2);
- x[3] = vcombine_s16(dst_3, dst_3);
+ x[0] = dst_0;
+ x[1] = dst_1;
+ x[2] = dst_2;
+ x[3] = dst_3;
- if (stage_is_rectangular) {
- if (transpose) {
- int16x8_t output[8];
- Transpose8x4To4x8(x, output);
- StoreDst<8, 8>(dst, step, 0, output);
- } else {
- StoreDst<16, 4>(dst, step, 0, x);
- }
+ if (transpose) {
+ int16x4x4_t y;
+ for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+ vst4_s16(dst, y);
} else {
- if (transpose) {
- Transpose4x4(x, x);
- }
- StoreDst<8, 4>(dst, step, 0, x);
+ vst1_s16(dst, x[0]);
+ vst1_s16(dst + 1 * step, x[1]);
+ vst1_s16(dst + 2 * step, x[2]);
+ vst1_s16(dst + 3 * step, x[3]);
}
}
@@ -2705,7 +2659,7 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
int i = adjusted_tx_height;
auto* data = src;
do {
- Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+ Adst4_NEON(data, /*step=*/4, /*transpose=*/true);
data += 16;
i -= 4;
} while (i != 0);
@@ -2732,7 +2686,7 @@ void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
int i = tx_width;
auto* data = src;
do {
- Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+ Adst4_NEON(data, tx_width, /*transpose=*/false);
data += 4;
i -= 4;
} while (i != 0);
diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc
new file mode 100644
index 0000000..a9dd98f
--- /dev/null
+++ b/src/dsp/arm/loop_filter_10bit_neon.cc
@@ -0,0 +1,1218 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
+ const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+ return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
+ const uint16x4_t q0, const uint16x4_t q1,
+ const uint16_t outer_thresh) {
+ const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+ const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+ const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+ const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+ const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+ return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
+// OuterThreshold()
+inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16x8_t abd_p2p3_q2q3,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterNMasks functions.
+
+inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const hev_mask,
+ uint16x4_t* const needs_filter4_mask) {
+ const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ // This includes cases where NeedsFilter4() is not true and so Filter2() will
+ // not be applied.
+ const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+ // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
+ *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p0p2_q0q2) {
+ constexpr int flat_thresh = 1 << 2;
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, const uint16_t hev_thresh,
+ const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const needs_filter6_mask,
+ uint16x4_t* const is_flat3_mask,
+ uint16x4_t* const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
+ *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+ inner_thresh, outer_mask);
+}
+
+// IsFlat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
+ const uint16x8_t abd_pn1p0_qn1q0,
+ const uint16x8_t abd_pn2p0_qn2q0) {
+ constexpr int flat_thresh = 1 << 2;
+ const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+ const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t* const needs_filter8_mask,
+ uint16x4_t* const is_flat4_mask,
+ uint16x4_t* const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
+ const uint16x4_t is_flat4 =
+ IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
+ *needs_filter8_mask =
+ NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+ inner_thresh, outer_mask);
+ // |is_flat4_mask| is used to decide where to use the result of Filter8.
+ // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+ // overriding the question of whether to use Filter8. Because Filter4 doesn't
+ // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
+ // source value. To be correct, the mask must account for this override.
+ *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// FilterN functions.
+
+// Calculate Filter4() or Filter2() based on |hev_mask|.
+inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+ const uint16x8_t p1q1, const uint16x4_t hev_mask,
+ uint16x8_t* const p1q1_result,
+ uint16x8_t* const p0q0_result) {
+ const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // q0mp0 means "q0 minus p0".
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
+ const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int16x4_t p1mq1_saturated =
+ Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
+ const int16x4_t hev_option =
+ vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // TODO: Revisit this block. It inherits some tricks from the 8bpp code
+  // (which accommodates 8x8 as the smallest 8bpp vector) that may be
+  // unnecessary here.
+
+  // We cannot use a rounding shift because the clamp comes *before* the
+  // shift:
+  //   a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  //   a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
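+  // E.g. with a = 509: Clip3(509 + 4, -512, 511) >> 3 == 511 >> 3 == 63,
+  // whereas a rounding shift would produce (509 + 4) >> 3 == 64.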
+ const int16x4_t plus_four =
+ Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t plus_three =
+ Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+ const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+ // a3 = (a1 + 1) >> 1;
+ const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
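+  // Pairing a3 with -a3 applies p1 += a3 and q1 -= a3 in one vector add.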
+ const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+ const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+ // Need to shift the second term or we end up with a2_ma2.
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+ const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
+ *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
+ *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
+}
+
+void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+
+ const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1)};
+
+ // Adjust thresholds to bitdepth.
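+  // The thresholds are specified for 8-bit pixels and scale by
+  // 1 << (bitdepth - 8), i.e. by 4 for 10 bpp.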
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ // Offset by 2 uint16_t values to load from first p1 position.
+ auto* dst = static_cast<uint8_t*>(dest) - 4;
+ auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
+ auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
+ auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
+
+ uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1)};
+ Transpose4x4(src);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
+
+ // Already integrated the Hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ Transpose4x4(output);
+
+ vst1_u16(dst_p1, output[0]);
+ vst1_u16(dst_p0, output[1]);
+ vst1_u16(dst_q0, output[2]);
+ vst1_u16(dst_q1, output[3]);
+}
+
+inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions.
+ // The formula is regrouped to allow 3 doubling operations to be combined.
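+  // (3 * p2 + 2 * p1 + 2 * p0 + q0 == p2 + 2 * (p2 + p1 + p0) + q0, and
+  // likewise for the q side.)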
+ //
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^^^^^^
+ uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p0q0);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^
+ sum = vshlq_n_u16(sum, 1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^ ^^^^^^
+ // Should dual issue with the left shift.
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+ sum = vaddq_u16(sum, outer_sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+
+ const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
+ vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+ // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // Filter6() does not apply, but Filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ // Left side of the filter window.
+ auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // Overread by 2 values. These overreads become the high halves of src_raw[2]
+ // and src_raw[3] after transpose.
+ uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+ Transpose4x8(src_raw);
+ // p2, p1, p0, q0, q1, q2
+ const uint16x4_t src[6] = {
+ vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
+ vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
+ vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+ };
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
+ // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // Filter6() does not apply, but Filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ Transpose4x4(output);
+
+ // dst_n starts at p2, so adjust to p1.
+ vst1_u16(dst_0 + 1, output[0]);
+ vst1_u16(dst_1 + 1, output[1]);
+ vst1_u16(dst_2 + 1, output[2]);
+ vst1_u16(dst_3 + 1, output[3]);
+}
+
+inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ uint16x8_t* const p2q2_output,
+ uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
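+  // (3 * p3 + 2 * p2 + p1 + p0 + q0 == p3 + 2 * (p3 + p2) + p1 + p0 + q0, and
+  // likewise for the q side.)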
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p2q2_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  // q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+ const uint16x8_t q2p2 = Transpose64(p2q2);
+ sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
+
+void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ const uint16x4_t src[8] = {
+ vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+ const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+ const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+ const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() does not apply, but Filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
+ return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+ // To get desired pairs after transpose, one half should be reversed.
+ uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+
+ // src[0] = p0q0
+ // src[1] = p1q1
+ // src[2] = p2q2
+ // src[3] = p3q3
+ LoopFilterTranspose4x8(src);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask = OuterThreshold(
+ vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+ vget_high_u16(src[1]), outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = src[0];
+ const uint16x8_t p1q1 = src[1];
+ const uint16x8_t p2q2 = src[2];
+ const uint16x8_t p3q3 = src[3];
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() does not apply, but Filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
+ // After transpose, |output| will contain rows of the form:
+ // p0 p1 p2 p3 q0 q1 q2 q3
+ Transpose4x8(output);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, ReverseLowHalf(output[0]));
+ vst1q_u16(dst_1, ReverseLowHalf(output[1]));
+ vst1q_u16(dst_2, ReverseLowHalf(output[2]));
+ vst1q_u16(dst_3, ReverseLowHalf(output[3]));
+}
+
+inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
+ const uint16x8_t p4q4, const uint16x8_t p3q3,
+ const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
+ uint16x8_t* const p4q4_output,
+ uint16x8_t* const p3q3_output,
+ uint16x8_t* const p2q2_output,
+ uint16x8_t* const p1q1_output,
+ uint16x8_t* const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions.
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
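+  // The shift-and-subtract above forms 7 * p6q6 as (p6q6 << 3) - p6q6,
+  // avoiding a full multiply.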
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^^^^^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^^^^^^^^^^^^
+ uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+ sum = vaddq_u16(sum, p6q6_x7);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint16x8_t q0p0 = Transpose64(p0q0);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p5q5_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+ const uint16x8_t q1p1 = Transpose64(p1q1);
+ sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+ const uint16x8_t q2p2 = Transpose64(p2q2);
+ sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+ const uint16x8_t q3p3 = Transpose64(p3q3);
+ sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+ const uint16x8_t q4p4 = Transpose64(p4q4);
+ sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+ const uint16x8_t q5p5 = Transpose64(p5q5);
+ sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrq_n_u16(sum, 4);
+}
+
+void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
+ int outer_thresh, int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest);
+ auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
+ auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
+ auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
+ auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
+ auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
+ auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
+ auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
+ auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+ auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
+ auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
+ auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
+
+ const uint16x4_t src[14] = {
+ vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+ vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+ vld1_u16(dst_q5), vld1_u16(dst_q6)};
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask =
+ OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+ const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+ const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+ const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+ const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+ const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+ const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+ // Mask to choose between the outputs of Filter8 and Filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // Filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6)));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+
+ vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+ vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+ vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+ vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+ vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+ vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
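
The cascade of vbslq_u16 selects in Horizontal14_NEON resolves, per lane, to a simple decision tree. A scalar sketch of that selection (the names FilterChoice and ChooseFilter are hypothetical, for illustration only):

    // Scalar model of the per-lane selection performed by the vbslq_u16 chain.
    // Hypothetical helper, not part of libgav1.
    enum class FilterChoice { kNone, kFilter4, kFilter8, kFilter14 };

    inline FilterChoice ChooseFilter(bool needs_filter, bool is_flat4,
                                     bool is_flat4_outer) {
      if (!needs_filter) return FilterChoice::kNone;       // keep source pixels
      if (!is_flat4) return FilterChoice::kFilter4;        // f4_* outputs
      if (!is_flat4_outer) return FilterChoice::kFilter8;  // f8_* for p2..p0
      return FilterChoice::kFilter14;                      // f14_* for p5..p0
    }

Because |is_flat4_mask| is already ANDed with |needs_filter_mask|, and |is_flat4_outer_mask| with |is_flat4_mask|, the nesting above mirrors how the masks are derived.
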
+
+inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
+ uint16x8x2_t acdb;
+#if defined(__aarch64__)
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+ vreinterpretq_u64_u16(ab), 1));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+ vreinterpretq_u64_u16(ab), 0));
+#endif // defined(__aarch64__)
+ return acdb;
+}
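
In other words, PermuteACDB64 treats each vector as two 64-bit halves and, given ab = [a | b] and cd = [c | d], returns [a | c] and [d | b]. A minimal scalar sketch of the same permutation (PermuteACDB64Model is a hypothetical name, for illustration only):

    #include <array>
    #include <cstdint>

    // Scalar model of PermuteACDB64: each uint16x8_t is viewed as two uint64_t
    // halves. Hypothetical helper, not part of libgav1.
    using Half64Pair = std::array<uint64_t, 2>;  // {low half, high half}

    inline std::array<Half64Pair, 2> PermuteACDB64Model(const Half64Pair& ab,
                                                        const Half64Pair& cd) {
      return {{Half64Pair{ab[0], cd[0]},    // a[b] <- [c]d  => [a | c]
               Half64Pair{cd[1], ab[1]}}};  // [a]b <- c[d]  => [d | b]
    }

With ab = p7q7 and cd = p3q3_output this yields p7p3 and q3q7, matching the vector names used in Vertical14_NEON below.
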
+
+void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
+ int inner_thresh, int hev_thresh) {
+ auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
+ auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
+ auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
+ auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
+ auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
+
+ // Low halves: p7 p6 p5 p4
+ // High halves: p3 p2 p1 p0
+ uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3)};
+ // p7 will be the low half of src_p[0]. Not used until the end.
+ Transpose4x8(src_p);
+
+ // Low halves: q0 q1 q2 q3
+ // High halves: q4 q5 q6 q7
+ uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+ vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
+ // q7 will be the high half of src_q[3]. Not used until the end.
+ Transpose4x8(src_q);
+
+ // Adjust thresholds to bitdepth.
+ outer_thresh <<= 2;
+ inner_thresh <<= 2;
+ hev_thresh <<= 2;
+ const uint16x4_t outer_mask = OuterThreshold(
+ vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+ vget_low_u16(src_q[1]), outer_thresh);
+ const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+ const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+ const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+ const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if defined(__aarch64__)
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // defined(__aarch64__)
+ const uint16x8_t p4q4 =
+ vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+ const uint16x8_t p5q5 =
+ vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+ const uint16x8_t p6q6 =
+ vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+ const uint16x8_t p7q7 =
+ vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+ // Mask to choose between the outputs of Filter8 and Filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // Filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6)));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
+ // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // Filter8() and Filter14() do not apply, but Filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // Filter14() does not apply, but Filter8() and Filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+ // To get the correctly ordered rows from the transpose, we need:
+ // p7p3 p6p2 p5p1 p4p0
+ // q0q4 q1q5 q2q6 q3q7
+ const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
+ const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
+ const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
+ const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
+ uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+ p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
+ Transpose4x8(output_p);
+ uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+ p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
+ Transpose4x8(output_q);
+
+ // The permute and transpose above already restore the original row order:
+ // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ vst1q_u16(dst_0, output_p[0]);
+ vst1q_u16(dst_0 + 8, output_q[0]);
+ vst1q_u16(dst_1, output_p[1]);
+ vst1q_u16(dst_1 + 8, output_q[1]);
+ vst1q_u16(dst_2, output_p[2]);
+ vst1q_u16(dst_2 + 8, output_q[2]);
+ vst1q_u16(dst_3, output_p[3]);
+ vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+} // namespace
+
+void LoopFilterInit10bpp_NEON() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
+ Horizontal4_NEON;
+ dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
+ Horizontal6_NEON;
+ dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
+ Horizontal8_NEON;
+ dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
+ Horizontal14_NEON;
+ dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
+ Vertical14_NEON;
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopFilterInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
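
One recurring detail in the 10bpp file above is the "Adjust thresholds to bitdepth." step, where outer_thresh, inner_thresh and hev_thresh are shifted left by 2. The thresholds are specified at 8-bit scale, so pixel values at 10 bits are 4x larger; a sketch of the general form of this adjustment (ScaleThresholdToBitdepth is a hypothetical helper, for illustration only):

    // General form of the `<<= 2` seen in each 10bpp *_NEON function above,
    // assuming thresholds given at 8-bit scale. Hypothetical helper.
    inline int ScaleThresholdToBitdepth(int thresh_8bit, int bitdepth) {
      return thresh_8bit << (bitdepth - 8);  // bitdepth 10 -> << 2
    }
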
diff --git a/src/dsp/arm/loop_filter_neon.cc b/src/dsp/arm/loop_filter_neon.cc
index 8c03928..a8b236d 100644
--- a/src/dsp/arm/loop_filter_neon.cc
+++ b/src/dsp/arm/loop_filter_neon.cc
@@ -29,7 +29,6 @@
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
namespace {
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
@@ -149,10 +148,6 @@ void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
@@ -209,10 +204,6 @@ void Vertical4_NEON(void* const dest, const ptrdiff_t stride,
needs_filter4_mask = InterleaveLow32(needs_filter4_mask, needs_filter4_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter4_mask) == 0) {
// None of the values will be filtered.
return;
@@ -346,10 +337,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
@@ -420,10 +407,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter6_mask) == 0) {
// None of the values will be filtered.
return;
@@ -600,10 +583,6 @@ void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -679,10 +658,6 @@ void Vertical8_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -863,10 +838,6 @@ void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -1031,10 +1002,6 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
hev_mask = InterleaveLow32(hev_mask, hev_mask);
#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
if (vaddv_u8(needs_filter8_mask) == 0) {
// None of the values will be filtered.
return;
@@ -1158,7 +1125,9 @@ void Vertical14_NEON(void* const dest, const ptrdiff_t stride,
vst1q_u8(dst, output_3);
}
-void Init8bpp() {
+} // namespace
+
+void LoopFilterInit_NEON() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
@@ -1178,1267 +1147,6 @@ void Init8bpp() {
dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
Vertical14_NEON;
}
-} // namespace
-} // namespace low_bitdepth
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-namespace high_bitdepth {
-namespace {
-
-// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
-inline uint16x4_t Hev(const uint16x8_t abd_p0p1_q0q1, const uint16_t thresh) {
- const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
- return vorr_u16(vget_low_u16(a), vget_high_u16(a));
-}
-
-// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
-inline uint16x4_t OuterThreshold(const uint16x4_t p1, const uint16x4_t p0,
- const uint16x4_t q0, const uint16x4_t q1,
- const uint16_t outer_thresh) {
- const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
- const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
- const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
- const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
- const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
- return vcle_u16(sum, vdup_n_u16(outer_thresh));
-}
-
-// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-// OuterThreshold()
-inline uint16x4_t NeedsFilter4(const uint16x8_t abd_p0p1_q0q1,
- const uint16_t inner_thresh,
- const uint16x4_t outer_mask) {
- const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
- const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
- return vand_u16(inner_mask, outer_mask);
-}
-
-// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
-// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
-// OuterThreshold()
-inline uint16x4_t NeedsFilter6(const uint16x8_t abd_p0p1_q0q1,
- const uint16x8_t abd_p1p2_q1q2,
- const uint16_t inner_thresh,
- const uint16x4_t outer_mask) {
- const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
- const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
- const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
- return vand_u16(inner_mask, outer_mask);
-}
-
-// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
-// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
-// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh &&
-// OuterThreshold()
-inline uint16x4_t NeedsFilter8(const uint16x8_t abd_p0p1_q0q1,
- const uint16x8_t abd_p1p2_q1q2,
- const uint16x8_t abd_p2p3_q2q3,
- const uint16_t inner_thresh,
- const uint16x4_t outer_mask) {
- const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
- const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
- const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
- const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
- return vand_u16(inner_mask, outer_mask);
-}
-
-// -----------------------------------------------------------------------------
-// FilterNMasks functions.
-
-inline void Filter4Masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
- const uint16_t hev_thresh, const uint16x4_t outer_mask,
- const uint16_t inner_thresh,
- uint16x4_t* const hev_mask,
- uint16x4_t* const needs_filter4_mask) {
- const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
- // This includes cases where NeedsFilter4() is not true and so Filter2() will
- // not be applied.
- const uint16x4_t hev_tmp_mask = Hev(p0p1_q0q1, hev_thresh);
-
- *needs_filter4_mask = NeedsFilter4(p0p1_q0q1, inner_thresh, outer_mask);
-
- // Filter2() will only be applied if both NeedsFilter4() and Hev() are true.
- *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
-}
-
-// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
-// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
-// |flat_thresh| == 4 for 10 bit decode.
-inline uint16x4_t IsFlat3(const uint16x8_t abd_p0p1_q0q1,
- const uint16x8_t abd_p0p2_q0q2) {
- constexpr int flat_thresh = 1 << 2;
- const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
- const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
- return vand_u16(vget_low_u16(b), vget_high_u16(b));
-}
-
-inline void Filter6Masks(const uint16x8_t p2q2, const uint16x8_t p1q1,
- const uint16x8_t p0q0, const uint16_t hev_thresh,
- const uint16x4_t outer_mask,
- const uint16_t inner_thresh,
- uint16x4_t* const needs_filter6_mask,
- uint16x4_t* const is_flat3_mask,
- uint16x4_t* const hev_mask) {
- const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
- *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
- *is_flat3_mask = IsFlat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2));
- *needs_filter6_mask = NeedsFilter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
- inner_thresh, outer_mask);
-}
-
-// IsFlat4 is used with N=1 for |is_flat4_mask| and with N=4 for
-// |is_flat4_outer_mask|.
-// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
-// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
-// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
-// |flat_thresh| == 4 for 10 bit decode.
-inline uint16x4_t IsFlat4(const uint16x8_t abd_pnp0_qnq0,
- const uint16x8_t abd_pn1p0_qn1q0,
- const uint16x8_t abd_pn2p0_qn2q0) {
- constexpr int flat_thresh = 1 << 2;
- const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
- const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
- const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
- return vand_u16(vget_low_u16(c), vget_high_u16(c));
-}
-
-inline void Filter8Masks(const uint16x8_t p3q3, const uint16x8_t p2q2,
- const uint16x8_t p1q1, const uint16x8_t p0q0,
- const uint16_t hev_thresh, const uint16x4_t outer_mask,
- const uint16_t inner_thresh,
- uint16x4_t* const needs_filter8_mask,
- uint16x4_t* const is_flat4_mask,
- uint16x4_t* const hev_mask) {
- const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
- *hev_mask = Hev(abd_p0p1_q0q1, hev_thresh);
- const uint16x4_t is_flat4 =
- IsFlat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3));
- *needs_filter8_mask =
- NeedsFilter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
- inner_thresh, outer_mask);
- // |is_flat4_mask| is used to decide where to use the result of Filter8.
- // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
- // overriding the question of whether to use Filter8. Because Filter4 doesn't
- // apply to p2q2, |is_flat4_mask| chooses directly between Filter8 and the
- // source value. To be correct, the mask must account for this override.
- *is_flat4_mask = vand_u16(is_flat4, *needs_filter8_mask);
-}
-
-// -----------------------------------------------------------------------------
-// FilterN functions.
-
-// Calculate Filter4() or Filter2() based on |hev_mask|.
-inline void Filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
- const uint16x8_t p1q1, const uint16x4_t hev_mask,
- uint16x8_t* const p1q1_result,
- uint16x8_t* const p0q0_result) {
- const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
- // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
- // q0mp0 means "q0 minus p0".
- const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
- const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
-
- // If this is for Filter2() then include |p1mq1|. Otherwise zero it.
- const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (9 /*bitdepth-1*/)));
- const int16x4_t max_signed_pixel = vdup_n_s16((1 << (9 /*bitdepth-1*/)) - 1);
- const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
- const int16x4_t p1mq1_saturated =
- Clip3S16(p1mq1, min_signed_pixel, max_signed_pixel);
- const int16x4_t hev_option =
- vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
-
- const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
-
- // TODO: Revisit this section; some of these steps look like carry-overs from
- // the 8bpp code, which uses 8x8 as its smallest vector, and may be
- // unnecessary here.
-
- // We cannot use a rounding shift because the clamp comes *before* the shift:
- // a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
- // a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
- const int16x4_t plus_four =
- Clip3S16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
- const int16x4_t plus_three =
- Clip3S16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
- const int16x4_t a1 = vshr_n_s16(plus_four, 3);
- const int16x4_t a2 = vshr_n_s16(plus_three, 3);
-
- // a3 = (a1 + 1) >> 1;
- const int16x4_t a3 = vrshr_n_s16(a1, 1);
-
- const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
- const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
-
- // Need to shift the second term or we end up with a2_ma2.
- const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
- const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
- *p1q1_result = ConvertToUnsignedPixelU16(p1q1_a3, kBitdepth10);
- *p0q0_result = ConvertToUnsignedPixelU16(p0q0_a, kBitdepth10);
-}
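
Restated in scalar form, the high-bitdepth Filter4 above follows the same formulas as its comments, with Clip3 over the 10-bit signed range [-512, 511]. The clipped p1 - q1 term is included only when Hev is true, and p1/q1 are only updated when Hev is false:

    \begin{aligned}
    a   &= 3\,(q_0 - p_0) + \operatorname{Clip3}(p_1 - q_1,\,-512,\,511) \quad (\text{clip term zeroed when } \lnot\text{Hev}),\\
    a_1 &= \operatorname{Clip3}(a + 4,\,-512,\,511) \gg 3, \qquad
    a_2  = \operatorname{Clip3}(a + 3,\,-512,\,511) \gg 3, \qquad
    a_3  = (a_1 + 1) \gg 1,\\
    p_0' &= p_0 + a_2, \qquad q_0' = q_0 - a_1, \qquad
    p_1' = p_1 + a_3, \qquad q_1' = q_1 - a_3 \quad (\text{only when } \lnot\text{Hev}).
    \end{aligned}

In the NEON code both halves are updated in a single add by packing the adjustments as a3_ma3 = [a3, -a3] and a2_ma1 = [a2, -a1].
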
-
-void Horizontal4_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
-
- const uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0),
- vld1_u16(dst_q0), vld1_u16(dst_q1)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
- const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
- Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
- &needs_filter4_mask);
-
-#if defined(__aarch64__)
- // This provides a good speedup for the unit test, but may not come up often
- // enough to warrant it.
- if (vaddv_u16(needs_filter4_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- const uint64x1_t needs_filter4_mask64 =
- vreinterpret_u64_u16(needs_filter4_mask);
- if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter4_mask_8 =
- vcombine_u16(needs_filter4_mask, needs_filter4_mask);
-
- uint16x8_t f_p1q1;
- uint16x8_t f_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
-
- // Already integrated the Hev mask when calculating the filtered values.
- const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
-
- // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
- // with |needs_filter4_mask| previously.
- const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
- const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
-
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-}
-
-void Vertical4_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- // Offset by 2 uint16_t values to load from first p1 position.
- auto* dst = static_cast<uint8_t*>(dest) - 4;
- auto* dst_p1 = reinterpret_cast<uint16_t*>(dst);
- auto* dst_p0 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* dst_q0 = reinterpret_cast<uint16_t*>(dst + stride * 2);
- auto* dst_q1 = reinterpret_cast<uint16_t*>(dst + stride * 3);
-
- uint16x4_t src[4] = {vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
- vld1_u16(dst_q1)};
- Transpose4x4(src);
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[0], src[1], src[2], src[3], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
- const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
- Filter4Masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
- &needs_filter4_mask);
-
-#if defined(__aarch64__)
- // This provides a good speedup for the unit test. Not sure how applicable it
- // is to valid streams though.
- // Consider doing this on armv7 if there is a quick way to check if a vector
- // is zero.
- if (vaddv_u16(needs_filter4_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- const uint64x1_t needs_filter4_mask64 =
- vreinterpret_u64_u16(needs_filter4_mask);
- if (vget_lane_u64(needs_filter4_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter4_mask_8 =
- vcombine_u16(needs_filter4_mask, needs_filter4_mask);
-
- uint16x8_t f_p1q1;
- uint16x8_t f_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f_p1q1, &f_p0q0);
-
- // Already integrated the Hev mask when calculating the filtered values.
- const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
-
- // p1/q1 are unmodified if only Hev() is true. This works because it was and'd
- // with |needs_filter4_mask| previously.
- const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
- const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
-
- uint16x4_t output[4] = {
- vget_low_u16(p1q1_output),
- vget_low_u16(p0q0_output),
- vget_high_u16(p0q0_output),
- vget_high_u16(p1q1_output),
- };
- Transpose4x4(output);
-
- vst1_u16(dst_p1, output[0]);
- vst1_u16(dst_p0, output[1]);
- vst1_u16(dst_q0, output[2]);
- vst1_u16(dst_q1, output[3]);
-}
-
-inline void Filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
- const uint16x8_t p0q0, uint16x8_t* const p1q1_output,
- uint16x8_t* const p0q0_output) {
- // Sum p1 and q1 output from opposite directions.
- // The formula is regrouped to allow 3 doubling operations to be combined.
- //
- // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
- // ^^^^^^^^
- // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
- // ^^^^^^^^
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^^^^^^^
- uint16x8_t sum = vaddq_u16(p2q2, p1q1);
-
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^^
- sum = vaddq_u16(sum, p0q0);
-
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^
- sum = vshlq_n_u16(sum, 1);
-
- // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
- // ^^^^^^ ^^^^^^
- // Should dual issue with the left shift.
- const uint16x8_t q0p0 = Transpose64(p0q0);
- const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
- sum = vaddq_u16(sum, outer_sum);
-
- *p1q1_output = vrshrq_n_u16(sum, 3);
-
- // Convert to p0 and q0 output:
- // p0 = p1 - (2 * p2) + q0 + q1
- // q0 = q1 - (2 * q2) + p0 + p1
- // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
- // ^^^^^^^^
- const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
- // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
- // ^^^^^^^^
- sum = vsubq_u16(sum, p2q2_double);
- const uint16x8_t q1p1 = Transpose64(p1q1);
- sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
-
- *p0q0_output = vrshrq_n_u16(sum, 3);
-}
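
The regrouping described in the Filter6 comments can be summarized as follows (q side symmetric; vrshrq_n_u16(sum, 3) is the rounding shift (S + 4) >> 3):

    \begin{aligned}
    S_{p1} &= p_2 + 2\,(p_2 + p_1 + p_0) + q_0, & p_1' &= (S_{p1} + 4) \gg 3,\\
    S_{p0} &= S_{p1} - 2\,p_2 + q_0 + q_1,      & p_0' &= (S_{p0} + 4) \gg 3.
    \end{aligned}
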
-
-void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
-
- const uint16x4_t src[6] = {vld1_u16(dst_p2), vld1_u16(dst_p1),
- vld1_u16(dst_p0), vld1_u16(dst_q0),
- vld1_u16(dst_q1), vld1_u16(dst_q2)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat3_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
- const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
- const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
- Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat3_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
- // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
- // output is not used.
- uint16x8_t f6_p1q1, f6_p0q0;
- const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
- if (vget_lane_u64(need_filter6, 0) == 0) {
- // Filter6() does not apply, but Filter4() applies to one or more values.
- p0q0_output = p0q0;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
- p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
-}
-
-void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- // Left side of the filter window.
- auto* const dst = static_cast<uint8_t*>(dest) - 3 * sizeof(uint16_t);
- auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- // Overread by 2 values. These overreads become the high halves of src_raw[2]
- // and src_raw[3] after transpose.
- uint16x8_t src_raw[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
- vld1q_u16(dst_3)};
- Transpose4x8(src_raw);
- // p2, p1, p0, q0, q1, q2
- const uint16x4_t src[6] = {
- vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
- vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
- vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
- };
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[1], src[2], src[3], src[4], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat3_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
- const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
- const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
- Filter6Masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat3_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat3_mask| controls whether the needed filter is Filter4 or
- // Filter6. Therefore if it is false when |needs_filter_mask| is true, Filter6
- // output is not used.
- uint16x8_t f6_p1q1, f6_p0q0;
- const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
- if (vget_lane_u64(need_filter6, 0) == 0) {
- // Filter6() does not apply, but Filter4() applies to one or more values.
- p0q0_output = p0q0;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- Filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
- p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- uint16x4_t output[4] = {
- vget_low_u16(p1q1_output),
- vget_low_u16(p0q0_output),
- vget_high_u16(p0q0_output),
- vget_high_u16(p1q1_output),
- };
- Transpose4x4(output);
-
- // dst_n starts at p2, so adjust to p1.
- vst1_u16(dst_0 + 1, output[0]);
- vst1_u16(dst_1 + 1, output[1]);
- vst1_u16(dst_2 + 1, output[2]);
- vst1_u16(dst_3 + 1, output[3]);
-}
-
-inline void Filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
- const uint16x8_t p1q1, const uint16x8_t p0q0,
- uint16x8_t* const p2q2_output,
- uint16x8_t* const p1q1_output,
- uint16x8_t* const p0q0_output) {
- // Sum p2 and q2 output from opposite directions.
- // The formula is regrouped to allow 2 doubling operations to be combined.
- // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
- // ^^^^^^^^
- // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
- // ^^^^^^^^
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^^^^^^
- const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^
- uint16x8_t sum = vshlq_n_u16(p23q23, 1);
-
- // Add two other terms to make dual issue with shift more likely.
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^^^^^^
- const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^^^^^^^^
- sum = vaddq_u16(sum, p01q01);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^
- sum = vaddq_u16(sum, p3q3);
-
- // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
- // ^^^^^^
- const uint16x8_t q0p0 = Transpose64(p0q0);
- sum = vaddq_u16(sum, q0p0);
-
- *p2q2_output = vrshrq_n_u16(sum, 3);
-
- // Convert to p1 and q1 output:
- // p1 = p2 - p3 - p2 + p1 + q1
- // q1 = q2 - q3 - q2 + q1 + p1
- sum = vsubq_u16(sum, p23q23);
- const uint16x8_t q1p1 = Transpose64(p1q1);
- sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
-
- *p1q1_output = vrshrq_n_u16(sum, 3);
-
- // Convert to p0 and q0 output:
- // p0 = p1 - p3 - p1 + p0 + q2
- // q0 = q1 - q3 - q1 + q0 + p2
- sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
- const uint16x8_t q2p2 = Transpose64(p2q2);
- sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
-
- *p0q0_output = vrshrq_n_u16(sum, 3);
-}
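
Likewise for Filter8, whose running sum is built as 2(p3 + p2) + (p1 + p0) + p3 + q0 and then adjusted for each narrower tap (q side symmetric):

    \begin{aligned}
    S_{p2} &= 3p_3 + 2p_2 + p_1 + p_0 + q_0,  & p_2' &= (S_{p2} + 4) \gg 3,\\
    S_{p1} &= S_{p2} - p_3 - p_2 + p_1 + q_1, & p_1' &= (S_{p1} + 4) \gg 3,\\
    S_{p0} &= S_{p1} - p_3 - p_1 + p_0 + q_2, & p_0' &= (S_{p0} + 4) \gg 3.
    \end{aligned}
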
-
-void Horizontal8_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
- auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- const uint16x4_t src[8] = {
- vld1_u16(dst_p3), vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0),
- vld1_u16(dst_q0), vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[2], src[3], src[4], src[5], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
- const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
- const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
- const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() does not apply, but Filter4() applies to one or more values.
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t is_flat4_mask_8 =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- vst1_u16(dst_p2, vget_low_u16(p2q2_output));
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
- vst1_u16(dst_q2, vget_high_u16(p2q2_output));
-}
-
-inline uint16x8_t ReverseLowHalf(const uint16x8_t a) {
- return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
-}
-
-void Vertical8_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest) - 4 * sizeof(uint16_t);
- auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
- // To get the desired pairs after the transpose, one half should be reversed.
- uint16x8_t src[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
- vld1q_u16(dst_3)};
-
- // src[0] = p0q0
- // src[1] = p1q1
- // src[2] = p2q2
- // src[3] = p3q3
- LoopFilterTranspose4x8(src);
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask = OuterThreshold(
- vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
- vget_high_u16(src[1]), outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- const uint16x8_t p0q0 = src[0];
- const uint16x8_t p1q1 = src[1];
- const uint16x8_t p2q2 = src[2];
- const uint16x8_t p3q3 = src[3];
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
-
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() does not apply, but Filter4() applies to one or more values.
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t is_flat4_mask_8 =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
-
- uint16x8_t output[4] = {p0q0_output, p1q1_output, p2q2_output, p3q3};
- // After transpose, |output| will contain rows of the form:
- // p0 p1 p2 p3 q0 q1 q2 q3
- Transpose4x8(output);
-
- // Reverse p values to produce original order:
- // p3 p2 p1 p0 q0 q1 q2 q3
- vst1q_u16(dst_0, ReverseLowHalf(output[0]));
- vst1q_u16(dst_1, ReverseLowHalf(output[1]));
- vst1q_u16(dst_2, ReverseLowHalf(output[2]));
- vst1q_u16(dst_3, ReverseLowHalf(output[3]));
-}
-inline void Filter14(const uint16x8_t p6q6, const uint16x8_t p5q5,
- const uint16x8_t p4q4, const uint16x8_t p3q3,
- const uint16x8_t p2q2, const uint16x8_t p1q1,
- const uint16x8_t p0q0, uint16x8_t* const p5q5_output,
- uint16x8_t* const p4q4_output,
- uint16x8_t* const p3q3_output,
- uint16x8_t* const p2q2_output,
- uint16x8_t* const p1q1_output,
- uint16x8_t* const p0q0_output) {
- // Sum p5 and q5 output from opposite directions.
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^^
- const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^^^^^^^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^^^^^^^^^^^^^
- uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
- sum = vaddq_u16(sum, p6q6_x7);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^
- sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^^^^^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^^^^^^
- sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
-
- // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
- // ^^
- // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
- // ^^
- const uint16x8_t q0p0 = Transpose64(p0q0);
- sum = vaddq_u16(sum, q0p0);
-
- *p5q5_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p4 and q4 output:
- // p4 = p5 - (2 * p6) + p3 + q1
- // q4 = q5 - (2 * q6) + q3 + p1
- sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
- const uint16x8_t q1p1 = Transpose64(p1q1);
- sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
-
- *p4q4_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p3 and q3 output:
- // p3 = p4 - p6 - p5 + p2 + q2
- // q3 = q4 - q6 - q5 + q2 + p2
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
- const uint16x8_t q2p2 = Transpose64(p2q2);
- sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
-
- *p3q3_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p2 and q2 output:
- // p2 = p3 - p6 - p4 + p1 + q3
- // q2 = q3 - q6 - q4 + q1 + p3
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
- const uint16x8_t q3p3 = Transpose64(p3q3);
- sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
-
- *p2q2_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p1 and q1 output:
- // p1 = p2 - p6 - p3 + p0 + q4
- // q1 = q2 - q6 - q3 + q0 + p4
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
- const uint16x8_t q4p4 = Transpose64(p4q4);
- sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
-
- *p1q1_output = vrshrq_n_u16(sum, 4);
-
- // Convert to p0 and q0 output:
- // p0 = p1 - p6 - p2 + q0 + q5
- // q0 = q1 - q6 - q2 + p0 + p5
- sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
- const uint16x8_t q5p5 = Transpose64(p5q5);
- sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
-
- *p0q0_output = vrshrq_n_u16(sum, 4);
-}
-
-void Horizontal14_NEON(void* const dest, const ptrdiff_t stride,
- int outer_thresh, int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest);
- auto* const dst_p6 = reinterpret_cast<uint16_t*>(dst - 7 * stride);
- auto* const dst_p5 = reinterpret_cast<uint16_t*>(dst - 6 * stride);
- auto* const dst_p4 = reinterpret_cast<uint16_t*>(dst - 5 * stride);
- auto* const dst_p3 = reinterpret_cast<uint16_t*>(dst - 4 * stride);
- auto* const dst_p2 = reinterpret_cast<uint16_t*>(dst - 3 * stride);
- auto* const dst_p1 = reinterpret_cast<uint16_t*>(dst - 2 * stride);
- auto* const dst_p0 = reinterpret_cast<uint16_t*>(dst - stride);
- auto* const dst_q0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_q1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_q2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_q3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
- auto* const dst_q4 = reinterpret_cast<uint16_t*>(dst + 4 * stride);
- auto* const dst_q5 = reinterpret_cast<uint16_t*>(dst + 5 * stride);
- auto* const dst_q6 = reinterpret_cast<uint16_t*>(dst + 6 * stride);
-
- const uint16x4_t src[14] = {
- vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
- vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
- vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
- vld1_u16(dst_q5), vld1_u16(dst_q6)};
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask =
- OuterThreshold(src[5], src[6], src[7], src[8], outer_thresh);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
- const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
- const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
- const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
- const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
- const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
- const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
- // Mask to choose between the outputs of Filter8 and Filter14.
- // As with the derivation of |is_flat4_mask|, the question of whether to use
- // Filter14 is only raised where |is_flat4_mask| is true.
- const uint16x4_t is_flat4_outer_mask = vand_u16(
- is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
- vabdq_u16(p0q0, p6q6)));
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- // ZIP1 p0q0, p1q1 may perform better here.
- const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
- p5q5_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() and Filter14() do not apply, but Filter4() applies to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t use_filter8_mask =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
- if (vget_lane_u64(need_filter14, 0) == 0) {
- // Filter14() does not apply, but Filter8() and Filter4() apply to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- } else {
- // All filters may contribute values to final outputs.
- const uint16x8_t use_filter14_mask =
- vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
- uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
- Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
- &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
- p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
- p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
- p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
- p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
- p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
- p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
- p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
- p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
- p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
- }
-
- vst1_u16(dst_p5, vget_low_u16(p5q5_output));
- vst1_u16(dst_p4, vget_low_u16(p4q4_output));
- vst1_u16(dst_p3, vget_low_u16(p3q3_output));
- vst1_u16(dst_p2, vget_low_u16(p2q2_output));
- vst1_u16(dst_p1, vget_low_u16(p1q1_output));
- vst1_u16(dst_p0, vget_low_u16(p0q0_output));
- vst1_u16(dst_q0, vget_high_u16(p0q0_output));
- vst1_u16(dst_q1, vget_high_u16(p1q1_output));
- vst1_u16(dst_q2, vget_high_u16(p2q2_output));
- vst1_u16(dst_q3, vget_high_u16(p3q3_output));
- vst1_u16(dst_q4, vget_high_u16(p4q4_output));
- vst1_u16(dst_q5, vget_high_u16(p5q5_output));
-}
-
-inline uint16x8x2_t PermuteACDB64(const uint16x8_t ab, const uint16x8_t cd) {
- uint16x8x2_t acdb;
-#if defined(__aarch64__)
- // a[b] <- [c]d
- acdb.val[0] = vreinterpretq_u16_u64(
- vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
- // [a]b <- c[d]
- acdb.val[1] = vreinterpretq_u16_u64(
- vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
-#else
- // a[b] <- [c]d
- acdb.val[0] = vreinterpretq_u16_u64(
- vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
- vreinterpretq_u64_u16(ab), 1));
- // [a]b <- c[d]
- acdb.val[1] = vreinterpretq_u16_u64(
- vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
- vreinterpretq_u64_u16(ab), 0));
-#endif // defined(__aarch64__)
- return acdb;
-}
-
-void Vertical14_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
- int inner_thresh, int hev_thresh) {
- auto* const dst = static_cast<uint8_t*>(dest) - 8 * sizeof(uint16_t);
- auto* const dst_0 = reinterpret_cast<uint16_t*>(dst);
- auto* const dst_1 = reinterpret_cast<uint16_t*>(dst + stride);
- auto* const dst_2 = reinterpret_cast<uint16_t*>(dst + 2 * stride);
- auto* const dst_3 = reinterpret_cast<uint16_t*>(dst + 3 * stride);
-
- // Low halves: p7 p6 p5 p4
- // High halves: p3 p2 p1 p0
- uint16x8_t src_p[4] = {vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
- vld1q_u16(dst_3)};
- // p7 will be the low half of src_p[0]. Not used until the end.
- Transpose4x8(src_p);
-
- // Low halves: q0 q1 q2 q3
- // High halves: q4 q5 q6 q7
- uint16x8_t src_q[4] = {vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
- vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8)};
- // q7 will be the high half of src_q[3]. Not used until the end.
- Transpose4x8(src_q);
-
- // Adjust thresholds to bitdepth.
- outer_thresh <<= 2;
- inner_thresh <<= 2;
- hev_thresh <<= 2;
- const uint16x4_t outer_mask = OuterThreshold(
- vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
- vget_low_u16(src_q[1]), outer_thresh);
- const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
- const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
- const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
- const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
- uint16x4_t hev_mask;
- uint16x4_t needs_filter_mask;
- uint16x4_t is_flat4_mask;
- Filter8Masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
- &needs_filter_mask, &is_flat4_mask, &hev_mask);
-
-#if defined(__aarch64__)
- if (vaddv_u16(needs_filter_mask) == 0) {
- // None of the values will be filtered.
- return;
- }
-#else // !defined(__aarch64__)
- // This might be faster than vaddv (latency 3) because mov to general register
- // has latency 2.
- const uint64x1_t needs_filter_mask64 =
- vreinterpret_u64_u16(needs_filter_mask);
- if (vget_lane_u64(needs_filter_mask64, 0) == 0) {
- // None of the values will be filtered.
- return;
- }
-#endif // defined(__aarch64__)
- const uint16x8_t p4q4 =
- vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
- const uint16x8_t p5q5 =
- vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
- const uint16x8_t p6q6 =
- vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
- const uint16x8_t p7q7 =
- vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
- // Mask to choose between the outputs of Filter8 and Filter14.
- // As with the derivation of |is_flat4_mask|, the question of whether to use
- // Filter14 is only raised where |is_flat4_mask| is true.
- const uint16x4_t is_flat4_outer_mask = vand_u16(
- is_flat4_mask, IsFlat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
- vabdq_u16(p0q0, p6q6)));
- // Copy the masks to the high bits for packed comparisons later.
- const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
- const uint16x8_t needs_filter_mask_8 =
- vcombine_u16(needs_filter_mask, needs_filter_mask);
-
- uint16x8_t f4_p1q1;
- uint16x8_t f4_p0q0;
- const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
- Filter4(p0q0, p0q1, p1q1, hev_mask, &f4_p1q1, &f4_p0q0);
- f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
-
- uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
- p5q5_output;
- // Because we did not return after testing |needs_filter_mask| we know it is
- // nonzero. |is_flat4_mask| controls whether the needed filter is Filter4 or
- // Filter8. Therefore if it is false when |needs_filter_mask| is true, Filter8
- // output is not used.
- uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
- const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
- if (vget_lane_u64(need_filter8, 0) == 0) {
- // Filter8() and Filter14() do not apply, but Filter4() applies to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = p2q2;
- p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
- p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
- } else {
- const uint16x8_t use_filter8_mask =
- vcombine_u16(is_flat4_mask, is_flat4_mask);
- Filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
- const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
- if (vget_lane_u64(need_filter14, 0) == 0) {
- // Filter14() does not apply, but Filter8() and Filter4() apply to one or
- // more values.
- p5q5_output = p5q5;
- p4q4_output = p4q4;
- p3q3_output = p3q3;
- p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
- p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- } else {
- // All filters may contribute values to final outputs.
- const uint16x8_t use_filter14_mask =
- vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
- uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
- Filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
- &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
- p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
- p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
- p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
- p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
- p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
- p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
- p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
- p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
- p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
- p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
- p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
- p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
- }
- }
- // To get the correctly ordered rows from the transpose, we need:
- // p7p3 p6p2 p5p1 p4p0
- // q0q4 q1q5 q2q6 q3q7
- const uint16x8x2_t p7p3_q3q7 = PermuteACDB64(p7q7, p3q3_output);
- const uint16x8x2_t p6p2_q2q6 = PermuteACDB64(p6q6, p2q2_output);
- const uint16x8x2_t p5p1_q1q5 = PermuteACDB64(p5q5_output, p1q1_output);
- const uint16x8x2_t p4p0_q0q4 = PermuteACDB64(p4q4_output, p0q0_output);
- uint16x8_t output_p[4] = {p7p3_q3q7.val[0], p6p2_q2q6.val[0],
- p5p1_q1q5.val[0], p4p0_q0q4.val[0]};
- Transpose4x8(output_p);
- uint16x8_t output_q[4] = {p4p0_q0q4.val[1], p5p1_q1q5.val[1],
- p6p2_q2q6.val[1], p7p3_q3q7.val[1]};
- Transpose4x8(output_q);
-
- // Reverse p values to produce original order:
- // p3 p2 p1 p0 q0 q1 q2 q3
- vst1q_u16(dst_0, output_p[0]);
- vst1q_u16(dst_0 + 8, output_q[0]);
- vst1q_u16(dst_1, output_p[1]);
- vst1q_u16(dst_1 + 8, output_q[1]);
- vst1q_u16(dst_2, output_p[2]);
- vst1q_u16(dst_2 + 8, output_q[2]);
- vst1q_u16(dst_3, output_p[3]);
- vst1q_u16(dst_3 + 8, output_q[3]);
-}
-
-void Init10bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
- assert(dsp != nullptr);
- dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeHorizontal] =
- Horizontal4_NEON;
- dsp->loop_filters[kLoopFilterSize4][kLoopFilterTypeVertical] = Vertical4_NEON;
- dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeHorizontal] =
- Horizontal6_NEON;
- dsp->loop_filters[kLoopFilterSize6][kLoopFilterTypeVertical] = Vertical6_NEON;
- dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeHorizontal] =
- Horizontal8_NEON;
- dsp->loop_filters[kLoopFilterSize8][kLoopFilterTypeVertical] = Vertical8_NEON;
- dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeHorizontal] =
- Horizontal14_NEON;
- dsp->loop_filters[kLoopFilterSize14][kLoopFilterTypeVertical] =
- Vertical14_NEON;
-}
-
-} // namespace
-} // namespace high_bitdepth
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
-
-void LoopFilterInit_NEON() {
- low_bitdepth::Init8bpp();
-#if LIBGAV1_MAX_BITDEPTH >= 10
- high_bitdepth::Init10bpp();
-#endif
-}
} // namespace dsp
} // namespace libgav1
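The removed Filter14 output-conversion comments above describe a running-sum trick: the full 16-tap sum that produces p5/q5 is reused for every narrower output by subtracting the taps that drop out and adding the taps that enter (p4 = p5 - 2*p6 + p3 + q1, and so on), instead of recomputing each average from scratch. A minimal scalar sketch of the p-side arithmetic, assuming 10-bit samples; the helper and its names are illustrative and not part of libgav1:

    // Scalar model of the running-sum conversion in the comments above.
    // The q side is symmetric (swap the roles of p[] and q[]).
    #include <array>
    #include <cstdint>

    inline uint16_t RoundShift4(uint32_t v) { return (v + 8) >> 4; }

    // p[0..6] = p0..p6 and q[0..6] = q0..q6, farthest tap last.
    std::array<uint16_t, 6> Filter14SideP(const uint16_t p[7],
                                          const uint16_t q[7]) {
      std::array<uint16_t, 6> out;  // out[k] holds the filtered p_k.
      uint32_t sum = 7 * p[6] + 2 * p[5] + 2 * p[4] +
                     p[3] + p[2] + p[1] + p[0] + q[0];
      out[5] = RoundShift4(sum);
      sum = sum - 2 * p[6] + p[3] + q[1];     // p4 = p5 - 2*p6 + p3 + q1
      out[4] = RoundShift4(sum);
      sum = sum - p[6] - p[5] + p[2] + q[2];  // p3 = p4 - p6 - p5 + p2 + q2
      out[3] = RoundShift4(sum);
      sum = sum - p[6] - p[4] + p[1] + q[3];  // p2 = p3 - p6 - p4 + p1 + q3
      out[2] = RoundShift4(sum);
      sum = sum - p[6] - p[3] + p[0] + q[4];  // p1 = p2 - p6 - p3 + p0 + q4
      out[1] = RoundShift4(sum);
      sum = sum - p[6] - p[2] + q[0] + q[5];  // p0 = p1 - p6 - p2 + q0 + q5
      out[0] = RoundShift4(sum);
      return out;
    }

The NEON version performs the same sequence on packed p/q pairs, using Transpose64 to pull the cross-side tap (q1, q2, ...) into each half.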
diff --git a/src/dsp/arm/loop_filter_neon.h b/src/dsp/arm/loop_filter_neon.h
index 540defc..531cd0d 100644
--- a/src/dsp/arm/loop_filter_neon.h
+++ b/src/dsp/arm/loop_filter_neon.h
@@ -26,6 +26,7 @@ namespace dsp {
// Initializes Dsp::loop_filters, see the defines below for specifics. This
// function is not thread-safe.
void LoopFilterInit_NEON();
+void LoopFilterInit10bpp_NEON();
} // namespace dsp
} // namespace libgav1
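With the declaration added above, the 10bpp loop filters get their own entry point instead of being registered from LoopFilterInit_NEON() as in the block removed earlier in this diff. A sketch of how a caller would presumably wire the two initializers together, mirroring the #if guard from the removed code; the wrapper function here is hypothetical and not libgav1 API:

    #include "src/dsp/arm/loop_filter_neon.h"

    // Hypothetical wrapper showing the split registration; the real call
    // sites live in libgav1's dsp initialization code, outside this diff.
    void RegisterNeonLoopFilters() {
      libgav1::dsp::LoopFilterInit_NEON();       // 8bpp filters
    #if LIBGAV1_MAX_BITDEPTH >= 10
      libgav1::dsp::LoopFilterInit10bpp_NEON();  // 10bpp filters
    #endif
    }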
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index 2db137f..cd8552e 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -1504,7 +1504,6 @@ inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], mas[2];
uint16x8_t sq[2][4], bs[3];
- // TODO(b/194217060): Future msan load.
s[0][0] = vld1q_u8(src0);
s[1][0] = vld1q_u8(src1);
@@ -1599,7 +1598,6 @@ inline void BoxSumFilterPreProcess(
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], ma3[2][2], ma5[2];
uint16x8_t sq[2][4], b3[2][3], b5[3];
- // TODO(b/194217060): Future msan load.
s[0][0] = vld1q_u8(src0);
s[1][0] = vld1q_u8(src1);
@@ -1801,7 +1799,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint8_t* const dst) {
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[4];
- // TODO(b/194217060): Future msan load.
s[0] = vld1q_u8(src0);
BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0],
@@ -1812,7 +1809,6 @@ inline void BoxFilterPass1LastRow(const uint8_t* const src,
uint16x8_t ma[2];
uint8x16_t masx[3];
uint32x4x2_t b[2];
- // TODO(b/194217060): Future msan load.
s[1] = vld1q_u8(src0 + x + 16);
BoxFilterPreProcess5LastRow(s, x + 8, scale, sum5, square_sum5, sq + 1, mas,
@@ -1856,7 +1852,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass2 - width;
uint8x16_t s[2], mas[2];
uint16x8_t sq[4], bs[3];
- // TODO(b/194217060): Future msan load.
s[0] = vld1q_u8(src0);
BoxFilterPreProcess3Lo(&s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
@@ -1915,7 +1910,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilter(
const ptrdiff_t overread_in_bytes = kWideOverreadInBytesPass1 - width;
uint8x16_t s[2][2], ma3[2][2], ma5[2];
uint16x8_t sq[2][4], b3[2][3], b5[3];
- // TODO(b/194217060): Future msan load.
s[0][0] = vld1q_u8(src0);
s[1][0] = vld1q_u8(src1);
@@ -2023,7 +2017,6 @@ inline void BoxFilterLastRow(
uint8x16_t s[2], ma3[2], ma5[2];
uint16x8_t sq[4], ma[3], b3[3], b5[3];
uint32x4x2_t b[3];
- // TODO(b/194217060): Future msan load.
s[0] = vld1q_u8(src0);
BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
@@ -2033,7 +2026,6 @@ inline void BoxFilterLastRow(
do {
uint8x16_t ma3x[3], ma5x[3];
int16x8_t p[2];
- // TODO(b/194217060): Future msan load.
s[1] = vld1q_u8(src0 + x + 16);
BoxFilterPreProcessLastRow(s, x + 8, scales, sum3, sum5, square_sum3,
diff --git a/src/dsp/arm/mask_blend_neon.cc b/src/dsp/arm/mask_blend_neon.cc
index 853f949..ecc67f8 100644
--- a/src/dsp/arm/mask_blend_neon.cc
+++ b/src/dsp/arm/mask_blend_neon.cc
@@ -33,50 +33,40 @@ namespace dsp {
namespace low_bitdepth {
namespace {
-// TODO(b/150461164): Consider combining with GetInterIntraMask4x2().
-// Compound predictors use int16_t values and need to multiply long because the
-// Convolve range * 64 is 20 bits. Unfortunately there is no multiply int16_t by
-// int8_t and accumulate into int32_t instruction.
-template <int subsampling_x, int subsampling_y>
-inline int16x8_t GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride) {
- if (subsampling_x == 1) {
- const int16x4_t mask_val0 = vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask)));
- const int16x4_t mask_val1 = vreinterpret_s16_u16(
- vpaddl_u8(vld1_u8(mask + (mask_stride << subsampling_y))));
- int16x8_t final_val;
- if (subsampling_y == 1) {
- const int16x4_t next_mask_val0 =
- vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride)));
- const int16x4_t next_mask_val1 =
- vreinterpret_s16_u16(vpaddl_u8(vld1_u8(mask + mask_stride * 3)));
- final_val = vaddq_s16(vcombine_s16(mask_val0, mask_val1),
- vcombine_s16(next_mask_val0, next_mask_val1));
- } else {
- final_val = vreinterpretq_s16_u16(
- vpaddlq_u8(vreinterpretq_u8_s16(vcombine_s16(mask_val0, mask_val1))));
- }
- return vrshrq_n_s16(final_val, subsampling_y + 1);
+template <int subsampling_y>
+inline uint8x8_t GetMask4x2(const uint8_t* mask) {
+ if (subsampling_y == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+ const uint32x2_t row_01 = vreinterpret_u32_u8(vget_low_u8(combined_horz));
+ const uint32x2_t row_23 = vreinterpret_u32_u8(vget_high_u8(combined_horz));
+
+ const uint32x2x2_t row_02_13 = vtrn_u32(row_01, row_23);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshr_n_u8(vhadd_u8(vreinterpret_u8_u32(row_02_13.val[0]),
+ vreinterpret_u8_u32(row_02_13.val[1])),
+ 1);
}
- assert(subsampling_y == 0 && subsampling_x == 0);
- const uint8x8_t mask_val0 = Load4(mask);
- const uint8x8_t mask_val = Load4<1>(mask + mask_stride, mask_val0);
- return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+ // subsampling_x == 1
+ const uint8x8x2_t mask_val = vld2_u8(mask);
+ return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
}
template <int subsampling_x, int subsampling_y>
-inline int16x8_t GetMask8(const uint8_t* mask, ptrdiff_t mask_stride) {
+inline uint8x8_t GetMask8(const uint8_t* mask) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ const uint8x16_t combined_horz = vaddq_u8(mask_val.val[0], mask_val.val[1]);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshr_n_u8(
+ vhadd_u8(vget_low_u8(combined_horz), vget_high_u8(combined_horz)), 1);
+ }
if (subsampling_x == 1) {
- int16x8_t mask_val = vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask)));
- if (subsampling_y == 1) {
- const int16x8_t next_mask_val =
- vreinterpretq_s16_u16(vpaddlq_u8(vld1q_u8(mask + mask_stride)));
- mask_val = vaddq_s16(mask_val, next_mask_val);
- }
- return vrshrq_n_s16(mask_val, 1 + subsampling_y);
+ const uint8x8x2_t mask_val = vld2_u8(mask);
+ return vrhadd_u8(mask_val.val[0], mask_val.val[1]);
}
assert(subsampling_y == 0 && subsampling_x == 0);
- const uint8x8_t mask_val = vld1_u8(mask);
- return vreinterpretq_s16_u16(vmovl_u8(mask_val));
+ return vld1_u8(mask);
}
inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
@@ -109,89 +99,162 @@ inline void WriteMaskBlendLine4x2(const int16_t* LIBGAV1_RESTRICT const pred_0,
StoreHi4(dst + dst_stride, result);
}
-template <int subsampling_x, int subsampling_y>
+template <int subsampling_y>
inline void MaskBlending4x4_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
const int16_t* LIBGAV1_RESTRICT pred_1,
const uint8_t* LIBGAV1_RESTRICT mask,
- const ptrdiff_t mask_stride,
uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
+ constexpr int subsampling_x = 1;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const int16x8_t mask_inverter = vdupq_n_s16(64);
- int16x8_t pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ // Compound predictors use int16_t values and need to multiply long because
+ // the Convolve range * 64 is 20 bits. Unfortunately there is no multiply
+ // int16_t by int8_t and accumulate into int32_t instruction.
+ int16x8_t pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- // TODO(b/150461164): Arm tends to do better with load(val); val += stride
- // It may be possible to turn this into a loop with a templated height.
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
}
-template <int subsampling_x, int subsampling_y>
+template <int subsampling_y>
inline void MaskBlending4xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
const int16_t* LIBGAV1_RESTRICT pred_1,
const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
- const ptrdiff_t mask_stride, const int height,
+ const int height,
uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const uint8_t* mask = mask_ptr;
if (height == 4) {
- MaskBlending4x4_NEON<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask, mask_stride, dst, dst_stride);
+ MaskBlending4x4_NEON<subsampling_y>(pred_0, pred_1, mask, dst, dst_stride);
return;
}
+ constexpr int subsampling_x = 1;
+ constexpr ptrdiff_t mask_stride = 4 << subsampling_x;
const int16x8_t mask_inverter = vdupq_n_s16(64);
int y = 0;
do {
int16x8_t pred_mask_0 =
- GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ vreinterpretq_s16_u16(vmovl_u8(GetMask4x2<subsampling_y>(mask)));
int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
- pred_mask_0 = GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride);
+ pred_mask_0 = ZeroExtend(GetMask4x2<subsampling_y>(mask));
pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
WriteMaskBlendLine4x2(pred_0, pred_1, pred_mask_0, pred_mask_1, dst,
dst_stride);
- pred_0 += 4 << 1;
- pred_1 += 4 << 1;
- mask += mask_stride << (1 + subsampling_y);
- dst += dst_stride << 1;
+ pred_0 += 4 << subsampling_x;
+ pred_1 += 4 << subsampling_x;
+ mask += mask_stride << (subsampling_x + subsampling_y);
+ dst += dst_stride << subsampling_x;
y += 8;
} while (y < height);
}
+inline uint8x8_t CombinePred8(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const int16x8_t pred_mask_0,
+ const int16x8_t pred_mask_1) {
+ // First 8 values.
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
+ // int res = (mask_value * prediction_0[x] +
+ // (64 - mask_value) * prediction_1[x]) >> 6;
+ const int32x4_t weighted_pred_lo =
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
+ const int32x4_t weighted_pred_hi =
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
+ const int32x4_t weighted_combo_hi = vmlal_s16(
+ weighted_pred_hi, vget_high_s16(pred_mask_1), vget_high_s16(pred_val_1));
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+ // (1 << kBitdepth8) - 1));
+ return vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
+ vshrn_n_s32(weighted_combo_hi, 6)),
+ 4);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlending8xH_NEON(const int16_t* LIBGAV1_RESTRICT pred_0,
+ const int16_t* LIBGAV1_RESTRICT pred_1,
+ const uint8_t* LIBGAV1_RESTRICT const mask_ptr,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ const int16x8_t mask_inverter = vdupq_n_s16(64);
+ int y = height;
+ do {
+ const int16x8_t pred_mask_0 =
+ ZeroExtend(GetMask8<subsampling_x, subsampling_y>(mask));
+ // 64 - mask
+ const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
+ const uint8x8_t result =
+ CombinePred8(pred_0, pred_1, pred_mask_0, pred_mask_1);
+ vst1_u8(dst, result);
+ dst += dst_stride;
+ mask += 8 << (subsampling_x + subsampling_y);
+ pred_0 += 8;
+ pred_1 += 8;
+ } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline uint8x16_t GetMask16(const uint8_t* mask, const ptrdiff_t mask_stride) {
+ if (subsampling_x == 1 && subsampling_y == 1) {
+ const uint8x16x2_t mask_val0 = vld2q_u8(mask);
+ const uint8x16x2_t mask_val1 = vld2q_u8(mask + mask_stride);
+ const uint8x16_t combined_horz0 =
+ vaddq_u8(mask_val0.val[0], mask_val0.val[1]);
+ const uint8x16_t combined_horz1 =
+ vaddq_u8(mask_val1.val[0], mask_val1.val[1]);
+ // Use a halving add to work around the case where all |mask| values are 64.
+ return vrshrq_n_u8(vhaddq_u8(combined_horz0, combined_horz1), 1);
+ }
+ if (subsampling_x == 1) {
+ const uint8x16x2_t mask_val = vld2q_u8(mask);
+ return vrhaddq_u8(mask_val.val[0], mask_val.val[1]);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ return vld1q_u8(mask);
+}
+
template <int subsampling_x, int subsampling_y>
inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
const void* LIBGAV1_RESTRICT prediction_1,
@@ -204,8 +267,13 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
if (width == 4) {
- MaskBlending4xH_NEON<subsampling_x, subsampling_y>(
- pred_0, pred_1, mask_ptr, mask_stride, height, dst, dst_stride);
+ MaskBlending4xH_NEON<subsampling_y>(pred_0, pred_1, mask_ptr, height, dst,
+ dst_stride);
+ return;
+ }
+ if (width == 8) {
+ MaskBlending8xH_NEON<subsampling_x, subsampling_y>(pred_0, pred_1, mask_ptr,
+ height, dst, dst_stride);
return;
}
const uint8_t* mask = mask_ptr;
@@ -214,35 +282,24 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
do {
int x = 0;
do {
- const int16x8_t pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ const uint8x16_t pred_mask_0 = GetMask16<subsampling_x, subsampling_y>(
mask + (x << subsampling_x), mask_stride);
+ const int16x8_t pred_mask_0_lo = ZeroExtend(vget_low_u8(pred_mask_0));
+ const int16x8_t pred_mask_0_hi = ZeroExtend(vget_high_u8(pred_mask_0));
// 64 - mask
- const int16x8_t pred_mask_1 = vsubq_s16(mask_inverter, pred_mask_0);
- const int16x8_t pred_val_0 = vld1q_s16(pred_0 + x);
- const int16x8_t pred_val_1 = vld1q_s16(pred_1 + x);
+ const int16x8_t pred_mask_1_lo = vsubq_s16(mask_inverter, pred_mask_0_lo);
+ const int16x8_t pred_mask_1_hi = vsubq_s16(mask_inverter, pred_mask_0_hi);
+
uint8x8_t result;
- // int res = (mask_value * prediction_0[x] +
- // (64 - mask_value) * prediction_1[x]) >> 6;
- const int32x4_t weighted_pred_0_lo =
- vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
- const int32x4_t weighted_pred_0_hi =
- vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
- const int32x4_t weighted_combo_lo =
- vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1),
- vget_low_s16(pred_val_1));
- const int32x4_t weighted_combo_hi =
- vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
- vget_high_s16(pred_val_1));
-
- // dst[x] = static_cast<Pixel>(
- // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
- // (1 << kBitdepth8) - 1));
- result = vqrshrun_n_s16(vcombine_s16(vshrn_n_s32(weighted_combo_lo, 6),
- vshrn_n_s32(weighted_combo_hi, 6)),
- 4);
+ result =
+ CombinePred8(pred_0 + x, pred_1 + x, pred_mask_0_lo, pred_mask_1_lo);
vst1_u8(dst + x, result);
- x += 8;
+ result = CombinePred8(pred_0 + x + 8, pred_1 + x + 8, pred_mask_0_hi,
+ pred_mask_1_hi);
+ vst1_u8(dst + x + 8, result);
+
+ x += 16;
} while (x < width);
dst += dst_stride;
pred_0 += width;
@@ -251,63 +308,19 @@ inline void MaskBlend_NEON(const void* LIBGAV1_RESTRICT prediction_0,
} while (++y < height);
}
-// TODO(b/150461164): This is much faster for inter_intra (input is Pixel
-// values) but regresses compound versions (input is int16_t). Try to
-// consolidate these.
template <int subsampling_x, int subsampling_y>
inline uint8x8_t GetInterIntraMask4x2(const uint8_t* mask,
ptrdiff_t mask_stride) {
if (subsampling_x == 1) {
- const uint8x8_t mask_val =
- vpadd_u8(vld1_u8(mask), vld1_u8(mask + (mask_stride << subsampling_y)));
- if (subsampling_y == 1) {
- const uint8x8_t next_mask_val = vpadd_u8(vld1_u8(mask + mask_stride),
- vld1_u8(mask + mask_stride * 3));
-
- // Use a saturating add to work around the case where all |mask| values
- // are 64. Together with the rounding shift this ensures the correct
- // result.
- const uint8x8_t sum = vqadd_u8(mask_val, next_mask_val);
- return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
- }
-
- return vrshr_n_u8(mask_val, /*subsampling_x=*/1);
+ return GetMask4x2<subsampling_y>(mask);
}
-
+ // When using intra or difference weighted masks, the function doesn't use
+ // subsampling, so |mask_stride| may be 4 or 8.
assert(subsampling_y == 0 && subsampling_x == 0);
const uint8x8_t mask_val0 = Load4(mask);
- // TODO(b/150461164): Investigate the source of |mask| and see if the stride
- // can be removed.
- // TODO(b/150461164): The unit tests start at 8x8. Does this get run?
return Load4<1>(mask + mask_stride, mask_val0);
}
-template <int subsampling_x, int subsampling_y>
-inline uint8x8_t GetInterIntraMask8(const uint8_t* mask,
- ptrdiff_t mask_stride) {
- if (subsampling_x == 1) {
- const uint8x16_t mask_val = vld1q_u8(mask);
- const uint8x8_t mask_paired =
- vpadd_u8(vget_low_u8(mask_val), vget_high_u8(mask_val));
- if (subsampling_y == 1) {
- const uint8x16_t next_mask_val = vld1q_u8(mask + mask_stride);
- const uint8x8_t next_mask_paired =
- vpadd_u8(vget_low_u8(next_mask_val), vget_high_u8(next_mask_val));
-
- // Use a saturating add to work around the case where all |mask| values
- // are 64. Together with the rounding shift this ensures the correct
- // result.
- const uint8x8_t sum = vqadd_u8(mask_paired, next_mask_paired);
- return vrshr_n_u8(sum, /*subsampling_x=*/1 + subsampling_y);
- }
-
- return vrshr_n_u8(mask_paired, /*subsampling_x=*/1);
- }
-
- assert(subsampling_y == 0 && subsampling_x == 0);
- return vld1_u8(mask);
-}
-
inline void InterIntraWriteMaskBlendLine8bpp4x2(
const uint8_t* LIBGAV1_RESTRICT const pred_0,
uint8_t* LIBGAV1_RESTRICT const pred_1, const ptrdiff_t pred_stride_1,
@@ -374,6 +387,32 @@ inline void InterIntraMaskBlending8bpp4xH_NEON(
}
template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlending8bpp8xH_NEON(
+ const uint8_t* LIBGAV1_RESTRICT pred_0, uint8_t* LIBGAV1_RESTRICT pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* LIBGAV1_RESTRICT mask,
+ const ptrdiff_t mask_stride, const int height) {
+ const uint8x8_t mask_inverter = vdup_n_u8(64);
+ int y = height;
+ do {
+ const uint8x8_t pred_mask_1 = GetMask8<subsampling_x, subsampling_y>(mask);
+ // 64 - mask
+ const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0 = vld1_u8(pred_0);
+ const uint8x8_t pred_val_1 = vld1_u8(pred_1);
+ const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo =
+ vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
+ const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
+ vst1_u8(pred_1, result);
+
+ pred_0 += 8;
+ pred_1 += pred_stride_1;
+ mask += mask_stride << subsampling_y;
+ } while (--y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
inline void InterIntraMaskBlend8bpp_NEON(
const uint8_t* LIBGAV1_RESTRICT prediction_0,
uint8_t* LIBGAV1_RESTRICT prediction_1, const ptrdiff_t prediction_stride_1,
@@ -385,30 +424,46 @@ inline void InterIntraMaskBlend8bpp_NEON(
height);
return;
}
+ if (width == 8) {
+ InterIntraMaskBlending8bpp8xH_NEON<subsampling_x, subsampling_y>(
+ prediction_0, prediction_1, prediction_stride_1, mask_ptr, mask_stride,
+ height);
+ return;
+ }
const uint8_t* mask = mask_ptr;
- const uint8x8_t mask_inverter = vdup_n_u8(64);
+ const uint8x16_t mask_inverter = vdupq_n_u8(64);
int y = 0;
do {
int x = 0;
do {
- // TODO(b/150461164): Consider a 16 wide specialization (at least for the
- // unsampled version) to take advantage of vld1q_u8().
- const uint8x8_t pred_mask_1 =
- GetInterIntraMask8<subsampling_x, subsampling_y>(
- mask + (x << subsampling_x), mask_stride);
+ const uint8x16_t pred_mask_1 = GetMask16<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride);
// 64 - mask
- const uint8x8_t pred_mask_0 = vsub_u8(mask_inverter, pred_mask_1);
- const uint8x8_t pred_val_0 = vld1_u8(prediction_0);
+ const uint8x16_t pred_mask_0 = vsubq_u8(mask_inverter, pred_mask_1);
+ const uint8x8_t pred_val_0_lo = vld1_u8(prediction_0);
+ prediction_0 += 8;
+ const uint8x8_t pred_val_0_hi = vld1_u8(prediction_0);
prediction_0 += 8;
- const uint8x8_t pred_val_1 = vld1_u8(prediction_1 + x);
- const uint16x8_t weighted_pred_0 = vmull_u8(pred_mask_0, pred_val_0);
+ // Ensure armv7 build combines the load.
+ const uint8x16_t pred_val_1 = vld1q_u8(prediction_1 + x);
+ const uint8x8_t pred_val_1_lo = vget_low_u8(pred_val_1);
+ const uint8x8_t pred_val_1_hi = vget_high_u8(pred_val_1);
+ const uint16x8_t weighted_pred_0_lo =
+ vmull_u8(vget_low_u8(pred_mask_0), pred_val_0_lo);
// weighted_pred0 + weighted_pred1
- const uint16x8_t weighted_combo =
- vmlal_u8(weighted_pred_0, pred_mask_1, pred_val_1);
- const uint8x8_t result = vrshrn_n_u16(weighted_combo, 6);
- vst1_u8(prediction_1 + x, result);
+ const uint16x8_t weighted_combo_lo =
+ vmlal_u8(weighted_pred_0_lo, vget_low_u8(pred_mask_1), pred_val_1_lo);
+ const uint8x8_t result_lo = vrshrn_n_u16(weighted_combo_lo, 6);
+ vst1_u8(prediction_1 + x, result_lo);
+ const uint16x8_t weighted_pred_0_hi =
+ vmull_u8(vget_high_u8(pred_mask_0), pred_val_0_hi);
+ // weighted_pred0 + weighted_pred1
+ const uint16x8_t weighted_combo_hi = vmlal_u8(
+ weighted_pred_0_hi, vget_high_u8(pred_mask_1), pred_val_1_hi);
+ const uint8x8_t result_hi = vrshrn_n_u16(weighted_combo_hi, 6);
+ vst1_u8(prediction_1 + x + 8, result_hi);
- x += 8;
+ x += 16;
} while (x < width);
prediction_1 += prediction_stride_1;
mask += mask_stride << subsampling_y;
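The blend arithmetic in the mask_blend hunks above is spelled out in the comments: res = (mask_value * prediction_0[x] + (64 - mask_value) * prediction_1[x]) >> 6, followed by a rounded right shift by the inter post-round bits (4, matching the vqrshrun_n_s16(..., 4) in CombinePred8) and a clip to the 8-bit pixel range. A scalar reference of that math, assuming 8bpp compound inputs; the function name is illustrative, and the negative-value handling only approximates the saturating NEON narrowing:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the compound blend done with vmull/vmlal + vqrshrun.
    inline uint8_t MaskBlendPixel8bpp(int16_t pred_0, int16_t pred_1,
                                      int mask) {
      // |mask| is in [0, 64]; the predictions are 16-bit compound values.
      const int res = (mask * pred_0 + (64 - mask) * pred_1) >> 6;
      // RightShiftWithRounding(res, 4), then Clip3(..., 0, 255).
      const int rounded = (res + 8) >> 4;
      return static_cast<uint8_t>(std::clamp(rounded, 0, 255));
    }

The GetMask4x2/GetMask8/GetMask16 helpers feed this with |mask| already collapsed to one value per output pixel; when both subsampling directions are active, four mask samples (each up to 64) could sum to 256 and overflow a uint8 lane, which is why the new code adds pairs first and then uses a halving add plus a rounding shift by one rather than a plain sum followed by a shift by two.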
diff --git a/src/dsp/arm/obmc_neon.cc b/src/dsp/arm/obmc_neon.cc
index 659ed8e..271bbaa 100644
--- a/src/dsp/arm/obmc_neon.cc
+++ b/src/dsp/arm/obmc_neon.cc
@@ -52,6 +52,17 @@ inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
StoreLo4(pred, result);
}
+inline void WriteObmcLine8(uint8_t* LIBGAV1_RESTRICT const pred,
+ const uint8x8_t obmc_pred_val,
+ const uint8x8_t pred_mask,
+ const uint8x8_t obmc_pred_mask) {
+ const uint8x8_t pred_val = vld1_u8(pred);
+ const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
+ const uint8x8_t result =
+ vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ vst1_u8(pred, result);
+}
+
inline void OverlapBlendFromLeft2xH_NEON(
uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
@@ -99,24 +110,25 @@ inline void OverlapBlendFromLeft4xH_NEON(
inline void OverlapBlendFromLeft8xH_NEON(
uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
const uint8x8_t mask_inverter = vdup_n_u8(64);
const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
+ constexpr int obmc_prediction_stride = 8;
// 64 - mask
const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
int y = 0;
do {
- const uint8x8_t pred_val = vld1_u8(pred);
- const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
- const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
- const uint8x8_t result =
- vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
+ WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask, obmc_pred_mask);
+ pred += prediction_stride;
- vst1_u8(pred, result);
+ WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (++y != height);
+
+ obmc_pred += obmc_prediction_stride << 1;
+ y += 2;
+ } while (y != height);
}
void OverlapBlendFromLeft_NEON(
@@ -140,8 +152,7 @@ void OverlapBlendFromLeft_NEON(
return;
}
if (width == 8) {
- OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
const uint8x16_t mask_inverter = vdupq_n_u8(64);
@@ -262,26 +273,31 @@ inline void OverlapBlendFromTop4xH_NEON(
inline void OverlapBlendFromTop8xH_NEON(
uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 8;
const uint8x8_t mask_inverter = vdup_n_u8(64);
const uint8_t* mask = kObmcMask + height - 2;
const int compute_height = height - (height >> 2);
int y = 0;
do {
- const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
+ const uint8x8_t pred_mask0 = vdup_n_u8(mask[y]);
// 64 - mask
- const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
- const uint8x8_t pred_val = vld1_u8(pred);
- const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
- const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
- const uint8x8_t result =
- vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
+ const uint8x8_t obmc_pred_mask0 = vsub_u8(mask_inverter, pred_mask0);
+ const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
- vst1_u8(pred, result);
+ WriteObmcLine8(pred, vget_low_u8(obmc_pred_val), pred_mask0,
+ obmc_pred_mask0);
pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
- } while (++y != compute_height);
+ ++y;
+
+ const uint8x8_t pred_mask1 = vdup_n_u8(mask[y]);
+ // 64 - mask
+ const uint8x8_t obmc_pred_mask1 = vsub_u8(mask_inverter, pred_mask1);
+ WriteObmcLine8(pred, vget_high_u8(obmc_pred_val), pred_mask1,
+ obmc_pred_mask1);
+ pred += prediction_stride;
+ obmc_pred += obmc_prediction_stride << 1;
+ } while (++y < compute_height);
}
void OverlapBlendFromTop_NEON(
@@ -301,8 +317,7 @@ void OverlapBlendFromTop_NEON(
}
if (width == 8) {
- OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
@@ -371,26 +386,23 @@ constexpr uint16_t kObmcMask[62] = {
33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58,
59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64};
-inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+inline uint16x4_t BlendObmc2Or4(uint16_t* const pred,
+ const uint16x4_t obmc_pred_val,
const uint16x4_t pred_mask,
const uint16x4_t obmc_pred_mask) {
- const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x4_t obmc_pred_val =
- vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+ const uint16x4_t pred_val = vld1_u16(pred);
const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
const uint16x4_t result =
vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
return result;
}
-inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
+inline uint16x8_t BlendObmc8(uint16_t* LIBGAV1_RESTRICT const pred,
+ const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
const uint16x8_t pred_mask,
const uint16x8_t obmc_pred_mask) {
- const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x8_t obmc_pred_val =
- vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+ const uint16x8_t pred_val = vld1q_u16(pred);
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
const uint16x8_t result =
vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
@@ -398,27 +410,29 @@ inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
}
inline void OverlapBlendFromLeft2xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 2;
const uint16x4_t mask_inverter = vdup_n_u16(64);
// Second two lanes unused.
const uint16x4_t pred_mask = vld1_u16(kObmcMask);
const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
int y = 0;
do {
+ const uint16x4_t obmc_pred_0 = vld1_u16(obmc_pred);
const uint16x4_t result_0 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0);
+ BlendObmc2Or4(pred, obmc_pred_0, pred_mask, obmc_pred_mask);
+ Store2<0>(pred, result_0);
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
+ const uint16x4_t obmc_pred_1 = vld1_u16(obmc_pred);
const uint16x4_t result_1 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1);
+ BlendObmc2Or4(pred, obmc_pred_1, pred_mask, obmc_pred_mask);
+ Store2<0>(pred, result_1);
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
y += 2;
@@ -426,26 +440,26 @@ inline void OverlapBlendFromLeft2xH_NEON(
}
inline void OverlapBlendFromLeft4xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
+ constexpr int obmc_prediction_stride = 4;
const uint16x4_t mask_inverter = vdup_n_u16(64);
const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
// 64 - mask
const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
int y = 0;
do {
- const uint16x4_t result_0 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- const uint16x4_t result_1 =
- BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ const uint16x4_t result_0 = BlendObmc2Or4(pred, vget_low_u16(obmc_pred_val),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result_0);
+ pred = AddByteStride(pred, prediction_stride);
+
+ const uint16x4_t result_1 = BlendObmc2Or4(
+ pred, vget_high_u16(obmc_pred_val), pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result_1);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
y += 2;
} while (y != height);
@@ -456,52 +470,47 @@ void OverlapBlendFromLeft_NEON(
const int width, const int height,
const void* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
- auto* pred = static_cast<uint8_t*>(prediction);
- const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
assert(width >= 2);
assert(height >= 4);
if (width == 2) {
- OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 4) {
- OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
const uint16x8_t mask_inverter = vdupq_n_u16(64);
const uint16_t* mask = kObmcMask + width - 2;
int x = 0;
do {
- pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x);
- obmc_pred = reinterpret_cast<const uint8_t*>(
- static_cast<const uint16_t*>(obmc_prediction) + x);
+ uint16_t* pred_x = pred + x;
+ const uint16_t* obmc_pred_x = obmc_pred + x;
const uint16x8_t pred_mask = vld1q_u16(mask + x);
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
int y = 0;
do {
const uint16x8_t result =
- BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ BlendObmc8(pred_x, obmc_pred_x, pred_mask, obmc_pred_mask);
+ vst1q_u16(pred_x, result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred_x = AddByteStride(pred_x, prediction_stride);
+ obmc_pred_x = AddByteStride(obmc_pred_x, obmc_prediction_stride);
} while (++y < height);
x += 8;
} while (x < width);
}
template <int lane>
-inline uint16x4_t BlendObmcFromTop4(
- uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
- const uint16x8_t obmc_pred_mask) {
- const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x4_t obmc_pred_val =
- vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+inline uint16x4_t BlendObmcFromTop4(uint16_t* const pred,
+ const uint16x4_t obmc_pred_val,
+ const uint16x8_t pred_mask,
+ const uint16x8_t obmc_pred_mask) {
+ const uint16x4_t pred_val = vld1_u16(pred);
const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
const uint16x4_t result = vrshr_n_u16(
VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
@@ -510,12 +519,11 @@ inline uint16x4_t BlendObmcFromTop4(
template <int lane>
inline uint16x8_t BlendObmcFromTop8(
- uint8_t* LIBGAV1_RESTRICT const pred,
- const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
- const uint16x8_t obmc_pred_mask) {
- const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
- const uint16x8_t obmc_pred_val =
- vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
+ uint16_t* LIBGAV1_RESTRICT const pred,
+ const uint16_t* LIBGAV1_RESTRICT const obmc_pred,
+ const uint16x8_t pred_mask, const uint16x8_t obmc_pred_mask) {
+ const uint16x8_t pred_val = vld1q_u16(pred);
+ const uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
const uint16x8_t result = vrshrq_n_u16(
VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
@@ -523,41 +531,43 @@ inline uint16x8_t BlendObmcFromTop8(
}
inline void OverlapBlendFromTop4x2Or4_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride, const int height) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
+ constexpr int obmc_prediction_stride = 4;
const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
const uint16x8_t mask_inverter = vdupq_n_u16(64);
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
- uint16x4_t result =
- BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ const uint16x8_t obmc_pred_val_0 = vld1q_u16(obmc_pred);
+ uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val_0),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
if (height == 2) {
// Mask value is 64, meaning |pred| is unchanged.
return;
}
- result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val_0), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
- result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
+ const uint16x4_t obmc_pred_val_2 = vld1_u16(obmc_pred);
+ result =
+ BlendObmcFromTop4<2>(pred, obmc_pred_val_2, pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
}
inline void OverlapBlendFromTop4xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const int height, const uint16_t* LIBGAV1_RESTRICT obmc_pred) {
if (height < 8) {
- OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred,
- obmc_prediction_stride, height);
+ OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred, height);
return;
}
+ constexpr int obmc_prediction_stride = 4;
const uint16_t* mask = kObmcMask + height - 2;
const uint16x8_t mask_inverter = vdupq_n_u16(64);
int y = 0;
@@ -566,36 +576,44 @@ inline void OverlapBlendFromTop4xH_NEON(
do {
const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
- uint16x4_t result =
- BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
-
- result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ // Load obmc row 0, 1.
+ uint16x8_t obmc_pred_val = vld1q_u16(obmc_pred);
+ uint16x4_t result = BlendObmcFromTop4<0>(pred, vget_low_u16(obmc_pred_val),
+ pred_mask, obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<1>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ // Load obmc row 2, 3.
+ obmc_pred_val = vld1q_u16(obmc_pred);
+ result = BlendObmcFromTop4<2>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<3>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
+
+ // Load obmc row 4, 5.
+ obmc_pred_val = vld1q_u16(obmc_pred);
+ result = BlendObmcFromTop4<4>(pred, vget_low_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+
+ result = BlendObmcFromTop4<5>(pred, vget_high_u16(obmc_pred_val), pred_mask,
+ obmc_pred_mask);
+ vst1_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred += obmc_prediction_stride << 1;
// Increment for the right mask index.
y += 6;
@@ -603,147 +621,147 @@ inline void OverlapBlendFromTop4xH_NEON(
}
inline void OverlapBlendFromTop8xH_NEON(
- uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
- const uint8_t* LIBGAV1_RESTRICT obmc_pred,
- const ptrdiff_t obmc_prediction_stride, const int height) {
+ uint16_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
+ const uint16_t* LIBGAV1_RESTRICT obmc_pred, const int height) {
const uint16_t* mask = kObmcMask + height - 2;
const uint16x8_t mask_inverter = vdupq_n_u16(64);
uint16x8_t pred_mask = vld1q_u16(mask);
uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
uint16x8_t result =
BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 2) return;
- pred += prediction_stride;
+ constexpr int obmc_prediction_stride = 8;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 4) return;
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 8) return;
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
pred_mask = vld1q_u16(&mask[8]);
obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
if (height == 16) return;
- pred += prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
pred_mask = vld1q_u16(&mask[16]);
obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
- pred += prediction_stride;
+ vst1q_u16(pred, result);
+ pred = AddByteStride(pred, prediction_stride);
obmc_pred += obmc_prediction_stride;
result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
- vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
+ vst1q_u16(pred, result);
}
void OverlapBlendFromTop_NEON(
@@ -751,20 +769,18 @@ void OverlapBlendFromTop_NEON(
const int width, const int height,
const void* LIBGAV1_RESTRICT const obmc_prediction,
const ptrdiff_t obmc_prediction_stride) {
- auto* pred = static_cast<uint8_t*>(prediction);
- const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
assert(width >= 4);
assert(height >= 2);
if (width == 4) {
- OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
- obmc_prediction_stride);
+ OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred);
return;
}
if (width == 8) {
- OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred,
- obmc_prediction_stride, height);
+ OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred, height);
return;
}
@@ -773,19 +789,16 @@ void OverlapBlendFromTop_NEON(
const uint16x8_t pred_mask = vld1q_u16(mask);
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
-#define OBMC_ROW_FROM_TOP(n) \
- do { \
- int x = 0; \
- do { \
- const uint16x8_t result = BlendObmcFromTop8<n>( \
- reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \
- reinterpret_cast<const uint8_t*>( \
- reinterpret_cast<const uint16_t*>(obmc_pred) + x), \
- pred_mask, obmc_pred_mask); \
- vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result); \
- \
- x += 8; \
- } while (x < width); \
+#define OBMC_ROW_FROM_TOP(n) \
+ do { \
+ int x = 0; \
+ do { \
+ const uint16x8_t result = BlendObmcFromTop8<n>( \
+ pred + x, obmc_pred + x, pred_mask, obmc_pred_mask); \
+ vst1q_u16(pred + x, result); \
+ \
+ x += 8; \
+ } while (x < width); \
} while (false)
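
OBMC_ROW_FROM_TOP(n) walks a single row of the block eight pixels at a time, calling BlendObmcFromTop8<n> on each chunk with the row-n weights. The helper's body lies outside this diff; a hedged NEON sketch of the blend it is assumed to perform, result = (pred * w + obmc * (64 - w) + 32) >> 6 with w taken from lane n of the weight vector (the name below is illustrative):

#include <arm_neon.h>
#include <cstdint>

// Hypothetical sketch of the 8-wide weighted blend.
template <int n>
inline uint16x8_t BlendRow8Sketch(const uint16_t* pred,
                                  const uint16_t* obmc_pred,
                                  const uint16x8_t pred_mask,
                                  const uint16x8_t obmc_pred_mask) {
  const uint16x8_t pred_val = vld1q_u16(pred);
  const uint16x8_t obmc_val = vld1q_u16(obmc_pred);
  const uint16x4_t w = vdup_n_u16(vgetq_lane_u16(pred_mask, n));
  const uint16x4_t w_inv = vdup_n_u16(vgetq_lane_u16(obmc_pred_mask, n));
  const uint32x4_t lo = vmlal_u16(vmull_u16(vget_low_u16(pred_val), w),
                                  vget_low_u16(obmc_val), w_inv);
  const uint32x4_t hi = vmlal_u16(vmull_u16(vget_high_u16(pred_val), w),
                                  vget_high_u16(obmc_val), w_inv);
  // Rounded shift by 6 narrows back to 16 bits (the weights sum to 64).
  return vcombine_u16(vrshrn_n_u32(lo, 6), vrshrn_n_u32(hi, 6));
}
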
// Compute 1 row.
@@ -797,11 +810,11 @@ void OverlapBlendFromTop_NEON(
// Compute 3 rows.
if (height == 4) {
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
return;
}
@@ -809,20 +822,20 @@ void OverlapBlendFromTop_NEON(
// Compute 6 rows.
if (height == 8) {
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(4);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(5);
return;
}
@@ -830,42 +843,42 @@ void OverlapBlendFromTop_NEON(
// Compute 12 rows.
if (height == 16) {
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(4);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(5);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(6);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(7);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
return;
}
@@ -879,29 +892,29 @@ void OverlapBlendFromTop_NEON(
// 64 - mask
const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
OBMC_ROW_FROM_TOP(0);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(1);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(2);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(3);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(4);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(5);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(6);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
OBMC_ROW_FROM_TOP(7);
- pred += prediction_stride;
- obmc_pred += obmc_prediction_stride;
+ pred = AddByteStride(pred, prediction_stride);
+ obmc_pred = AddByteStride(obmc_pred, obmc_prediction_stride);
y += 8;
} while (y < compute_height);
diff --git a/src/dsp/arm/warp_neon.cc b/src/dsp/arm/warp_neon.cc
index 71e0a43..da380b1 100644
--- a/src/dsp/arm/warp_neon.cc
+++ b/src/dsp/arm/warp_neon.cc
@@ -147,14 +147,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
do {
const int src_x = (start_x + 4) << subsampling_x;
const int src_y = (start_y + 4) << subsampling_y;
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
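
GetWarpFilterParams gathers the projection math that the deleted lines computed inline. A sketch consistent with those deleted lines; only the arithmetic is taken from the removed code, while the struct layout and the constant definition below are assumptions (the real declarations live in the shared dsp headers):

constexpr int kWarpedModelPrecisionBits = 16;  // Mirrors the library constant.

struct WarpFilterParams {
  int x4;
  int y4;
  int ix4;
  int iy4;
};

inline WarpFilterParams GetWarpFilterParams(const int src_x, const int src_y,
                                            const int subsampling_x,
                                            const int subsampling_y,
                                            const int* warp_params) {
  WarpFilterParams params;
  const int dst_x =
      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
  const int dst_y =
      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
  params.x4 = dst_x >> subsampling_x;
  params.y4 = dst_y >> subsampling_y;
  params.ix4 = params.x4 >> kWarpedModelPrecisionBits;
  params.iy4 = params.y4 >> kWarpedModelPrecisionBits;
  return params;
}
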
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -207,22 +201,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
// Regions 1 and 2.
// Points to the left or right border of the first row of |src|.
const uint8_t* first_row_border =
- (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 1.
// Every sample used to calculate the prediction block has the same
// value. So the whole prediction block has the same value.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint8_t row_border_pixel =
first_row_border[row * source_stride];
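
The ix4/iy4 comparisons above and in the following hunks implement the four-region split described in the comments: region 1 reduces to a constant block, region 2 to a clipped border column, region 3 to a single border row, and region 4 is the general case. An illustrative restatement of that classification in scalar form (the enum and function are explanatory only, not part of the source):

// Explanatory helper using the same ix4/iy4 tests as the code above.
enum class WarpRegion { kConstant, kBorderColumn, kBorderRow, kGeneral };

inline WarpRegion ClassifyWarpRegion(const int ix4, const int iy4,
                                     const int source_width,
                                     const int source_height) {
  const bool columns_outside =
      ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0;   // Regions 1 and 2.
  const bool rows_outside =
      iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0;  // Regions 1 and 3.
  if (columns_outside && rows_outside) return WarpRegion::kConstant;  // 1
  if (columns_outside) return WarpRegion::kBorderColumn;              // 2
  if (rows_outside) return WarpRegion::kBorderRow;                    // 3
  return WarpRegion::kGeneral;                                        // 4
}
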
@@ -256,15 +252,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
int sum = first_row_border[row * source_stride];
sum <<= (kFilterBits - kInterRoundBitsHorizontal);
intermediate_result_column[y + 7] = sum;
}
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
#if defined(__aarch64__)
@@ -341,10 +337,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 3.
// Horizontal filter.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint8_t* const src_row = src + row * source_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -354,11 +351,12 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// has left and right borders of at least 13 bytes that extend the
// frame boundary pixels. We also assume there is at least one extra
// padding byte after the right border of the last source row.
- const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ const uint8x16_t src_row_v = vld1q_u8(&src_row[filter_params.ix4 - 7]);
// Convert src_row_v to int8 (subtract 128).
const int8x16_t src_row_centered =
vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
HorizontalFilter(sx4, alpha, src_row_centered,
intermediate_result[y + 7]);
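
The subtract-128 step above re-centers the unsigned 8-bit samples so they can feed signed 8x8->16 multiplies against the signed filter taps; the constant offset this introduces (128 times the sum of the taps) must be compensated downstream, presumably within HorizontalFilter's accumulation, which is not shown in this hunk. A minimal standalone illustration of the centering itself:

#include <arm_neon.h>

// (u - 128) mod 256 reinterpreted as int8 equals u - 128 for u in [0, 255],
// so ordering is preserved while the values fit a signed 8-bit lane.
inline int8x16_t CenterSamples(const uint8x16_t samples) {
  return vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
}
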
@@ -367,12 +365,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
} else {
// Region 4.
// Horizontal filter.
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
const uint8_t* const src_row = src + row * source_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -382,7 +381,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// has left and right borders of at least 13 bytes that extend the
// frame boundary pixels. We also assume there is at least one extra
// padding byte after the right border of the last source row.
- const uint8x16_t src_row_v = vld1q_u8(&src_row[ix4 - 7]);
+ const uint8x16_t src_row_v =
+ vld1q_u8(&src_row[filter_params.ix4 - 7]);
// Convert src_row_v to int8 (subtract 128).
const int8x16_t src_row_centered =
vreinterpretq_s8_u8(vsubq_u8(src_row_v, vdupq_n_u8(128)));
@@ -395,8 +395,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// Regions 3 and 4.
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
int16x8_t filter[8];
@@ -574,14 +574,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
do {
const int src_x = (start_x + 4) << subsampling_x;
const int src_y = (start_y + 4) << subsampling_y;
- const int dst_x =
- src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
- const int dst_y =
- src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
- const int x4 = dst_x >> subsampling_x;
- const int y4 = dst_y >> subsampling_y;
- const int ix4 = x4 >> kWarpedModelPrecisionBits;
- const int iy4 = y4 >> kWarpedModelPrecisionBits;
+ const WarpFilterParams filter_params = GetWarpFilterParams(
+ src_x, src_y, subsampling_x, subsampling_y, warp_params);
// A prediction block may fall outside the frame's boundaries. If a
// prediction block is calculated using only samples outside the frame's
// boundary, the filtering can be simplified. We can divide the plane
@@ -634,22 +628,24 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// border index (source_width - 1 or 0, respectively). Then for each x,
// the inner for loop of the horizontal filter is reduced to multiplying
// the border pixel by the sum of the filter coefficients.
- if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
+ if (filter_params.ix4 - 7 >= source_width - 1 ||
+ filter_params.ix4 + 7 <= 0) {
// Regions 1 and 2.
// Points to the left or right border of the first row of |src|.
const uint16_t* first_row_border =
- (ix4 + 7 <= 0) ? src : src + source_width - 1;
+ (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
// In general, for y in [-7, 8), the row number iy4 + y is clipped:
// const int row = Clip3(iy4 + y, 0, source_height - 1);
// In two special cases, iy4 + y is clipped to either 0 or
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 1.
// Every sample used to calculate the prediction block has the same
// value. So the whole prediction block has the same value.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint16_t row_border_pixel = first_row_border[row * src_stride];
DestType* dst_row = dst + start_x - block_start_x;
@@ -684,15 +680,15 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
int sum = first_row_border[row * src_stride];
sum <<= (kFilterBits - kInterRoundBitsHorizontal);
intermediate_result_column[y + 7] = sum;
}
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
#if defined(__aarch64__)
@@ -782,10 +778,11 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// source_height - 1 for all y. In the rest of the cases, iy4 + y is
// bounded and we can avoid clipping iy4 + y by relying on a reference
// frame's boundary extension on the top and bottom.
- if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
+ if (filter_params.iy4 - 7 >= source_height - 1 ||
+ filter_params.iy4 + 7 <= 0) {
// Region 3.
// Horizontal filter.
- const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
+ const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
const uint16_t* const src_row = src + row * src_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -795,8 +792,10 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// has left and right borders of at least 13 pixels that extend the
// frame boundary pixels. We also assume there is at least one extra
// padding pixel after the right border of the last source row.
- const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ const uint16x8x2_t src_row_v =
+ LoadSrcRow(&src_row[filter_params.ix4 - 7]);
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
sx4 += beta;
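
In the 10-bit path the row is read as two uint16x8_t halves via LoadSrcRow rather than a single 16-byte load. Its body is not part of this hunk; a minimal sketch of what such a 16-sample loader presumably does (the name below is illustrative):

#include <arm_neon.h>
#include <cstdint>

// Read 16 consecutive 16-bit samples as two q registers; the caller ignores
// the 16th sample, per the comment above.
inline uint16x8x2_t LoadSixteenSamples(const uint16_t* src) {
  uint16x8x2_t result;
  result.val[0] = vld1q_u16(src);
  result.val[1] = vld1q_u16(src + 8);
  return result;
}
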
@@ -804,12 +803,13 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
} else {
// Region 4.
// Horizontal filter.
- int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
+ int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ beta * 7;
for (int y = -7; y < 8; ++y) {
// We may over-read up to 13 pixels above the top source row, or up
// to 13 pixels below the bottom source row. This is proved in
// warp.cc.
- const int row = iy4 + y;
+ const int row = filter_params.iy4 + y;
const uint16_t* const src_row = src + row * src_stride;
// Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
// read but is ignored.
@@ -819,7 +819,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// frame has left and right borders of at least 13 pixels that extend
// the frame boundary pixels. We also assume there is at least one
// extra padding pixel after the right border of the last source row.
- const uint16x8x2_t src_row_v = LoadSrcRow(&src_row[ix4 - 7]);
+ const uint16x8x2_t src_row_v =
+ LoadSrcRow(&src_row[filter_params.ix4 - 7]);
HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
sx4 += beta;
}
@@ -828,8 +829,8 @@ void Warp_NEON(const void* LIBGAV1_RESTRICT const source,
// Regions 3 and 4.
// Vertical filter.
DestType* dst_row = dst + start_x - block_start_x;
- int sy4 =
- (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
+ int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
+ MultiplyBy4(delta);
for (int y = 0; y < 8; ++y) {
int sy = sy4 - MultiplyBy4(gamma);
int16x8_t filter[8];