aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/film_grain_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/film_grain_neon.cc')
-rw-r--r--src/dsp/arm/film_grain_neon.cc739
1 file changed, 515 insertions, 224 deletions
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 8ee3745..0b1b481 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -34,6 +34,7 @@
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/logging.h"
+#include "src/utils/memory.h"
namespace libgav1 {
namespace dsp {
@@ -51,6 +52,12 @@ inline int16x8_t GetSignedSource8(const uint8_t* src) {
return ZeroExtend(vld1_u8(src));
}
+inline int16x8_t GetSignedSource8Msan(const uint8_t* src, int /*valid_range*/) {
+ // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+ // causing test vector failures.
+ return ZeroExtend(Load1MsanU8(src, 0));
+}
+
inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
vst1_u8(dest, vmovn_u16(data));
}
@@ -62,6 +69,13 @@ inline int16x8_t GetSignedSource8(const uint16_t* src) {
return vreinterpretq_s16_u16(vld1q_u16(src));
}
+inline int16x8_t GetSignedSource8Msan(const uint16_t* src,
+ int /*valid_range*/) {
+ // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+ // causing test vector failures.
+ return vreinterpretq_s16_u16(Load1QMsanU16(src, 0));
+}
+
inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
vst1q_u16(dest, data);
}
@@ -84,8 +98,10 @@ inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
// compute pixels that come after in the row, we have to finish the calculations
// one at a time.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
- const int8_t* coeffs, int pos, int shift) {
+inline void WriteFinalAutoRegression(int8_t* LIBGAV1_RESTRICT grain_cursor,
+ int32x4x2_t sum,
+ const int8_t* LIBGAV1_RESTRICT coeffs,
+ int pos, int shift) {
int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
@@ -99,8 +115,10 @@ inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
#if LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
- const int8_t* coeffs, int pos, int shift) {
+inline void WriteFinalAutoRegression(int16_t* LIBGAV1_RESTRICT grain_cursor,
+ int32x4x2_t sum,
+ const int8_t* LIBGAV1_RESTRICT coeffs,
+ int pos, int shift) {
int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
@@ -117,12 +135,11 @@ inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
// compute pixels that come after in the row, we have to finish the calculations
// one at a time.
template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
- int8_t* v_grain_cursor,
- int32x4x2_t sum_u, int32x4x2_t sum_v,
- const int8_t* coeffs_u,
- const int8_t* coeffs_v, int pos,
- int shift) {
+inline void WriteFinalAutoRegressionChroma(
+ int8_t* LIBGAV1_RESTRICT u_grain_cursor,
+ int8_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+ int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+ const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
u_grain_cursor, sum_u, coeffs_u, pos, shift);
WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
@@ -131,12 +148,11 @@ inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
#if LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, int auto_regression_coeff_lag, int lane>
-inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
- int16_t* v_grain_cursor,
- int32x4x2_t sum_u, int32x4x2_t sum_v,
- const int8_t* coeffs_u,
- const int8_t* coeffs_v, int pos,
- int shift) {
+inline void WriteFinalAutoRegressionChroma(
+ int16_t* LIBGAV1_RESTRICT u_grain_cursor,
+ int16_t* LIBGAV1_RESTRICT v_grain_cursor, int32x4x2_t sum_u,
+ int32x4x2_t sum_v, const int8_t* LIBGAV1_RESTRICT coeffs_u,
+ const int8_t* LIBGAV1_RESTRICT coeffs_v, int pos, int shift) {
WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
u_grain_cursor, sum_u, coeffs_u, pos, shift);
WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
@@ -181,6 +197,20 @@ inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
return vmovl_u8(vld1_u8(luma));
}
+inline uint16x8_t GetAverageLumaMsan(const uint8_t* const luma,
+ int subsampling_x, int /*valid_range*/) {
+ if (subsampling_x != 0) {
+ // TODO(b/194217060): restore |valid_range| usage after correcting call
+ // sites causing test vector failures.
+ const uint8x16_t src = Load1QMsanU8(luma, 0);
+
+ return vrshrq_n_u16(vpaddlq_u8(src), 1);
+ }
+ // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+ // causing test vector failures.
+ return vmovl_u8(Load1MsanU8(luma, 0));
+}
+
#if LIBGAV1_MAX_BITDEPTH >= 10
// Computes subsampled luma for use with chroma, by averaging in the x direction
// or y direction when applicable.
@@ -220,16 +250,28 @@ inline uint16x8_t GetAverageLuma(const uint16_t* const luma,
}
return vld1q_u16(luma);
}
+
+inline uint16x8_t GetAverageLumaMsan(const uint16_t* const luma,
+ int subsampling_x, int /*valid_range*/) {
+ if (subsampling_x != 0) {
+ // TODO(b/194217060): restore |valid_range| usage after correcting call
+ // sites causing test vector failures.
+ const uint16x8x2_t src = Load2QMsanU16(luma, 0);
+ return vrhaddq_u16(src.val[0], src.val[1]);
+ }
+ // TODO(b/194217060): restore |valid_range| usage after correcting call sites
+ // causing test vector failures.
+ return Load1QMsanU16(luma, 0);
+}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, typename GrainType, int auto_regression_coeff_lag,
bool use_luma>
-void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params,
- const void* luma_grain_buffer,
- int subsampling_x,
- int subsampling_y,
- void* u_grain_buffer,
- void* v_grain_buffer) {
+void ApplyAutoRegressiveFilterToChromaGrains_NEON(
+ const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT luma_grain_buffer, int subsampling_x,
+ int subsampling_y, void* LIBGAV1_RESTRICT u_grain_buffer,
+ void* LIBGAV1_RESTRICT v_grain_buffer) {
static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag.");
const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer);
auto* u_grain = static_cast<GrainType*>(u_grain_buffer);
@@ -558,49 +600,93 @@ void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params,
#undef ACCUMULATE_WEIGHTED_GRAIN
}
-void InitializeScalingLookupTable_NEON(
- int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
- uint8_t scaling_lut[kScalingLookupTableSize]) {
+template <int bitdepth>
+void InitializeScalingLookupTable_NEON(int num_points,
+ const uint8_t point_value[],
+ const uint8_t point_scaling[],
+ int16_t* scaling_lut,
+ const int scaling_lut_length) {
+ static_assert(bitdepth < kBitdepth12,
+ "NEON Scaling lookup table only supports 8bpp and 10bpp.");
if (num_points == 0) {
- memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize);
+ memset(scaling_lut, 0, sizeof(scaling_lut[0]) * scaling_lut_length);
return;
}
- static_assert(sizeof(scaling_lut[0]) == 1, "");
- memset(scaling_lut, point_scaling[0], point_value[0]);
- const uint32x4_t steps = vmovl_u16(vcreate_u16(0x0003000200010000));
- const uint32x4_t offset = vdupq_n_u32(32768);
+ static_assert(sizeof(scaling_lut[0]) == 2, "");
+ Memset(scaling_lut, point_scaling[0],
+ std::max(static_cast<int>(point_value[0]), 1)
+ << (bitdepth - kBitdepth8));
+ const int32x4_t steps = vmovl_s16(vcreate_s16(0x0003000200010000));
+ const int32x4_t rounding = vdupq_n_s32(32768);
for (int i = 0; i < num_points - 1; ++i) {
const int delta_y = point_scaling[i + 1] - point_scaling[i];
const int delta_x = point_value[i + 1] - point_value[i];
+ // |delta| corresponds to b, for the function y = a + b*x.
const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
const int delta4 = delta << 2;
- const uint8x8_t base_point = vdup_n_u8(point_scaling[i]);
- uint32x4_t upscaled_points0 = vmlaq_n_u32(offset, steps, delta);
- const uint32x4_t line_increment4 = vdupq_n_u32(delta4);
+ // vmull_n_u16 will not work here because |delta| typically exceeds the
+ // range of uint16_t.
+ int32x4_t upscaled_points0 = vmlaq_n_s32(rounding, steps, delta);
+ const int32x4_t line_increment4 = vdupq_n_s32(delta4);
// Get the second set of 4 points by adding 4 steps to the first set.
- uint32x4_t upscaled_points1 = vaddq_u32(upscaled_points0, line_increment4);
+ int32x4_t upscaled_points1 = vaddq_s32(upscaled_points0, line_increment4);
// We obtain the next set of 8 points by adding 8 steps to each of the
// current 8 points.
- const uint32x4_t line_increment8 = vshlq_n_u32(line_increment4, 1);
+ const int32x4_t line_increment8 = vshlq_n_s32(line_increment4, 1);
+ const int16x8_t base_point = vdupq_n_s16(point_scaling[i]);
int x = 0;
+ // Derive and write 8 values (or 32 values, for 10bpp).
do {
- const uint16x4_t interp_points0 = vshrn_n_u32(upscaled_points0, 16);
- const uint16x4_t interp_points1 = vshrn_n_u32(upscaled_points1, 16);
- const uint8x8_t interp_points =
- vmovn_u16(vcombine_u16(interp_points0, interp_points1));
+ const int16x4_t interp_points0 = vshrn_n_s32(upscaled_points0, 16);
+ const int16x4_t interp_points1 = vshrn_n_s32(upscaled_points1, 16);
+ const int16x8_t interp_points =
+ vcombine_s16(interp_points0, interp_points1);
// The spec guarantees that the max value of |point_value[i]| + x is 255.
- // Writing 8 bytes starting at the final table byte, leaves 7 bytes of
+ // Writing 8 values starting at the final table byte, leaves 7 values of
// required padding.
- vst1_u8(&scaling_lut[point_value[i] + x],
- vadd_u8(interp_points, base_point));
- upscaled_points0 = vaddq_u32(upscaled_points0, line_increment8);
- upscaled_points1 = vaddq_u32(upscaled_points1, line_increment8);
+ const int16x8_t full_interp = vaddq_s16(interp_points, base_point);
+ const int x_base = (point_value[i] + x) << (bitdepth - kBitdepth8);
+ if (bitdepth == kBitdepth10) {
+ const int16x8_t next_val = vaddq_s16(
+ base_point,
+ vdupq_n_s16((vgetq_lane_s32(upscaled_points1, 3) + delta) >> 16));
+ const int16x8_t start = full_interp;
+ const int16x8_t end = vextq_s16(full_interp, next_val, 1);
+ // lut[i << 2] = start;
+ // lut[(i << 2) + 1] = start + RightShiftWithRounding(start - end, 2)
+ // lut[(i << 2) + 2] = start +
+ // RightShiftWithRounding(2 * (start - end), 2)
+ // lut[(i << 2) + 3] = start +
+ // RightShiftWithRounding(3 * (start - end), 2)
+ const int16x8_t delta = vsubq_s16(end, start);
+ const int16x8_t double_delta = vshlq_n_s16(delta, 1);
+ const int16x8_t delta2 = vrshrq_n_s16(double_delta, 2);
+ const int16x8_t delta3 =
+ vrshrq_n_s16(vaddq_s16(delta, double_delta), 2);
+ const int16x8x4_t result = {
+ start, vaddq_s16(start, vrshrq_n_s16(delta, 2)),
+ vaddq_s16(start, delta2), vaddq_s16(start, delta3)};
+ vst4q_s16(&scaling_lut[x_base], result);
+ } else {
+ vst1q_s16(&scaling_lut[x_base], full_interp);
+ }
+ upscaled_points0 = vaddq_s32(upscaled_points0, line_increment8);
+ upscaled_points1 = vaddq_s32(upscaled_points1, line_increment8);
x += 8;
} while (x < delta_x);
}
- const uint8_t last_point_value = point_value[num_points - 1];
- memset(&scaling_lut[last_point_value], point_scaling[num_points - 1],
- kScalingLookupTableSize - last_point_value);
+ const int16_t last_point_value = point_value[num_points - 1];
+ const int x_base = last_point_value << (bitdepth - kBitdepth8);
+ Memset(&scaling_lut[x_base], point_scaling[num_points - 1],
+ scaling_lut_length - x_base);
+ if (bitdepth == kBitdepth10 && x_base > 0) {
+ const int start = scaling_lut[x_base - 4];
+ const int end = point_scaling[num_points - 1];
+ const int delta = end - start;
+ scaling_lut[x_base - 3] = start + RightShiftWithRounding(delta, 2);
+ scaling_lut[x_base - 2] = start + RightShiftWithRounding(2 * delta, 2);
+ scaling_lut[x_base - 1] = start + RightShiftWithRounding(3 * delta, 2);
+ }
}
inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
@@ -611,86 +697,38 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
template <int bitdepth, typename Pixel>
inline int16x8_t GetScalingFactors(
- const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+ const int16_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
int16_t start_vals[8];
- if (bitdepth == 8) {
- start_vals[0] = scaling_lut[source[0]];
- start_vals[1] = scaling_lut[source[1]];
- start_vals[2] = scaling_lut[source[2]];
- start_vals[3] = scaling_lut[source[3]];
- start_vals[4] = scaling_lut[source[4]];
- start_vals[5] = scaling_lut[source[5]];
- start_vals[6] = scaling_lut[source[6]];
- start_vals[7] = scaling_lut[source[7]];
- return vld1q_s16(start_vals);
+ static_assert(bitdepth <= kBitdepth10,
+ "NEON Film Grain is not yet implemented for 12bpp.");
+ for (int i = 0; i < 8; ++i) {
+ assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+ start_vals[i] = scaling_lut[source[i]];
}
- int16_t end_vals[8];
- // TODO(petersonab): Precompute this into a larger table for direct lookups.
- int index = source[0] >> 2;
- start_vals[0] = scaling_lut[index];
- end_vals[0] = scaling_lut[index + 1];
- index = source[1] >> 2;
- start_vals[1] = scaling_lut[index];
- end_vals[1] = scaling_lut[index + 1];
- index = source[2] >> 2;
- start_vals[2] = scaling_lut[index];
- end_vals[2] = scaling_lut[index + 1];
- index = source[3] >> 2;
- start_vals[3] = scaling_lut[index];
- end_vals[3] = scaling_lut[index + 1];
- index = source[4] >> 2;
- start_vals[4] = scaling_lut[index];
- end_vals[4] = scaling_lut[index + 1];
- index = source[5] >> 2;
- start_vals[5] = scaling_lut[index];
- end_vals[5] = scaling_lut[index + 1];
- index = source[6] >> 2;
- start_vals[6] = scaling_lut[index];
- end_vals[6] = scaling_lut[index + 1];
- index = source[7] >> 2;
- start_vals[7] = scaling_lut[index];
- end_vals[7] = scaling_lut[index + 1];
- const int16x8_t start = vld1q_s16(start_vals);
- const int16x8_t end = vld1q_s16(end_vals);
- int16x8_t remainder = GetSignedSource8(source);
- remainder = vandq_s16(remainder, vdupq_n_s16(3));
- const int16x8_t delta = vmulq_s16(vsubq_s16(end, start), remainder);
- return vaddq_s16(start, vrshrq_n_s16(delta, 2));
+ return vld1q_s16(start_vals);
}
+template <int bitdepth>
inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
const int16x8_t scaling_shift_vect) {
- const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
- return vrshlq_s16(upscaled_noise, scaling_shift_vect);
-}
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling,
- const int32x4_t scaling_shift_vect) {
- // TODO(petersonab): Try refactoring scaling lookup table to int16_t and
- // upscaling by 7 bits to permit high half multiply. This would eliminate
- // the intermediate 32x4 registers. Also write the averaged values directly
- // into the table so it doesn't have to be done for every pixel in
- // the frame.
- const int32x4_t upscaled_noise_lo =
- vmull_s16(vget_low_s16(noise), vget_low_s16(scaling));
- const int32x4_t upscaled_noise_hi =
- vmull_s16(vget_high_s16(noise), vget_high_s16(scaling));
- const int16x4_t noise_lo =
- vmovn_s32(vrshlq_s32(upscaled_noise_lo, scaling_shift_vect));
- const int16x4_t noise_hi =
- vmovn_s32(vrshlq_s32(upscaled_noise_hi, scaling_shift_vect));
- return vcombine_s16(noise_lo, noise_hi);
+ if (bitdepth == kBitdepth8) {
+ const int16x8_t upscaled_noise = vmulq_s16(noise, scaling);
+ return vrshlq_s16(upscaled_noise, scaling_shift_vect);
+ }
+ // Scaling shift is in the range [8, 11]. The doubling multiply returning high
+ // half is equivalent to a right shift by 15, so |scaling_shift_vect| should
+ // provide a left shift equal to 15 - s, where s is the original shift
+ // parameter.
+ const int16x8_t scaling_up = vshlq_s16(scaling, scaling_shift_vect);
+ return vqrdmulhq_s16(noise, scaling_up);
}
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageLuma_NEON(
- const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
- int width, int height, int start_height,
- const uint8_t scaling_lut_y[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
- ptrdiff_t dest_stride_y) {
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+ int scaling_shift, int width, int height, int start_height,
+ const int16_t* scaling_lut_y, const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
const auto* noise_image =
static_cast<const Array2D<GrainType>*>(noise_image_ptr);
const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
@@ -702,10 +740,8 @@ void BlendNoiseWithImageLuma_NEON(
// In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
// for 16 bit signed integers. In higher bitdepths, however, we have to
// expand to 32 to protect the sign bit.
- const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
-#if LIBGAV1_MAX_BITDEPTH >= 10
- const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(
+ (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
int y = 0;
do {
@@ -713,25 +749,35 @@ void BlendNoiseWithImageLuma_NEON(
do {
// This operation on the unsigned input is safe in 8bpp because the vector
// is widened before it is reinterpreted.
- const int16x8_t orig = GetSignedSource8(&in_y_row[x]);
- const int16x8_t scaling =
+ const int16x8_t orig0 = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling0 =
GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
int16x8_t noise =
GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
- if (bitdepth == 8) {
- noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
- } else {
-#if LIBGAV1_MAX_BITDEPTH >= 10
- noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
-#endif // LIBGAV1_MAX_BITDEPTH >= 10
- }
- const int16x8_t combined = vaddq_s16(orig, noise);
+ noise = ScaleNoise<bitdepth>(noise, scaling0, scaling_shift_vect);
+ const int16x8_t combined0 = vaddq_s16(orig0, noise);
+ // In 8bpp, when params_.clip_to_restricted_range == false, we can replace
+ // clipping with vqmovun_s16, but it's not likely to be worth copying the
+ // function for just that case, though the gain would be very small.
+ StoreUnsigned8(&out_y_row[x],
+ vreinterpretq_u16_s16(Clip3(combined0, floor, ceiling)));
+ x += 8;
+
+ // This operation on the unsigned input is safe in 8bpp because the vector
+ // is widened before it is reinterpreted.
+ const int16x8_t orig1 = GetSignedSource8(&in_y_row[x]);
+ const int16x8_t scaling1 = GetScalingFactors<bitdepth, Pixel>(
+ scaling_lut_y, &in_y_row[std::min(x, width)]);
+ noise = GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling1, scaling_shift_vect);
+ const int16x8_t combined1 = vaddq_s16(orig1, noise);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
// function for just that case, though the gain would be very small.
StoreUnsigned8(&out_y_row[x],
- vreinterpretq_u16_s16(Clip3(combined, floor, ceiling)));
+ vreinterpretq_u16_s16(Clip3(combined1, floor, ceiling)));
x += 8;
} while (x < width);
in_y_row += source_stride_y;
@@ -741,20 +787,16 @@ void BlendNoiseWithImageLuma_NEON(
template <int bitdepth, typename GrainType, typename Pixel>
inline int16x8_t BlendChromaValsWithCfl(
- const Pixel* average_luma_buffer,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
- const int16x8_t scaling_shift_vect16,
- const int32x4_t scaling_shift_vect32) {
+ const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+ const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
+ const int16x8_t scaling_shift_vect) {
const int16x8_t scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
const int16x8_t orig = GetSignedSource8(chroma_cursor);
int16x8_t noise = GetSignedSource8(noise_image_cursor);
- if (bitdepth == 8) {
- noise = ScaleNoise(noise, scaling, scaling_shift_vect16);
- } else {
- noise = ScaleNoise(noise, scaling, scaling_shift_vect32);
- }
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift_vect);
return vaddq_s16(orig, noise);
}
@@ -763,10 +805,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
int width, int height, int start_height, int subsampling_x,
int subsampling_y, int scaling_shift,
- const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
- ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
- ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
- ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+ Pixel* out_chroma_row, ptrdiff_t dest_stride) {
const int16x8_t floor = vdupq_n_s16(min_value);
const int16x8_t ceiling = vdupq_n_s16(max_chroma);
Pixel luma_buffer[16];
@@ -774,8 +816,8 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
// In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
// for 16 bit signed integers. In higher bitdepths, however, we have to
// expand to 32 to protect the sign bit.
- const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift);
- const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift);
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(
+ (bitdepth == kBitdepth10) ? 15 - scaling_shift : -scaling_shift);
const int chroma_height = (height + subsampling_y) >> subsampling_y;
const int chroma_width = (width + subsampling_x) >> subsampling_x;
@@ -791,8 +833,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
int x = 0;
do {
const int luma_x = x << subsampling_x;
- // TODO(petersonab): Consider specializing by subsampling_x. In the 444
- // case &in_y_row[x] can be passed to GetScalingFactors directly.
const uint16x8_t average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
StoreUnsigned8(average_luma_buffer, average_luma);
@@ -800,8 +840,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
const int16x8_t blended =
BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
average_luma_buffer, scaling_lut, &in_chroma_row[x],
- &(noise_image[y + start_height][x]), scaling_shift_vect16,
- scaling_shift_vect32);
+ &(noise_image[y + start_height][x]), scaling_shift_vect);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
@@ -813,18 +852,19 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
if (x < chroma_width) {
const int luma_x = x << subsampling_x;
- const int valid_range = width - luma_x;
- memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
- luma_buffer[valid_range] = in_y_row[width - 1];
- const uint16x8_t average_luma =
- GetAverageLuma(luma_buffer, subsampling_x);
+ const int valid_range_pixels = width - luma_x;
+ const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+ luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const uint16x8_t average_luma = GetAverageLumaMsan(
+ luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0]));
+
StoreUnsigned8(average_luma_buffer, average_luma);
const int16x8_t blended =
BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
average_luma_buffer, scaling_lut, &in_chroma_row[x],
- &(noise_image[y + start_height][x]), scaling_shift_vect16,
- scaling_shift_vect32);
+ &(noise_image[y + start_height][x]), scaling_shift_vect);
// In 8bpp, when params_.clip_to_restricted_range == false, we can replace
// clipping with vqmovun_s16, but it's not likely to be worth copying the
// function for just that case.
@@ -842,11 +882,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON(
// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageChromaWithCfl_NEON(
- Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
- int min_value, int max_chroma, int width, int height, int start_height,
- int subsampling_x, int subsampling_y,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y,
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_uv, ptrdiff_t source_stride_uv,
void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
const auto* noise_image =
@@ -872,12 +912,11 @@ namespace low_bitdepth {
namespace {
inline int16x8_t BlendChromaValsNoCfl(
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const uint8_t* chroma_cursor, const int8_t* noise_image_cursor,
+ const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
+ const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) {
uint8_t merged_buffer[8];
- const int16x8_t orig = GetSignedSource8(chroma_cursor);
const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier);
const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier);
// Maximum value of |combined_u| is 127*255 = 0x7E81.
@@ -887,9 +926,9 @@ inline int16x8_t BlendChromaValsNoCfl(
const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4);
vst1_u8(merged_buffer, merged);
const int16x8_t scaling =
- GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
int16x8_t noise = GetSignedSource8(noise_image_cursor);
- noise = ScaleNoise(noise, scaling, scaling_shift_vect);
+ noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift_vect);
return vaddq_s16(orig, noise);
}
@@ -898,10 +937,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
int width, int height, int start_height, int subsampling_x,
int subsampling_y, int scaling_shift, int chroma_offset,
int chroma_multiplier, int luma_multiplier,
- const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
- ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
- ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
- ptrdiff_t dest_stride) {
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
const int16x8_t floor = vdupq_n_s16(min_value);
const int16x8_t ceiling = vdupq_n_s16(max_chroma);
// In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe
@@ -913,6 +952,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
const int chroma_width = (width + subsampling_x) >> subsampling_x;
const int safe_chroma_width = chroma_width & ~7;
uint8_t luma_buffer[16];
+#if LIBGAV1_MSAN
+ // Quiet msan warnings.
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+#endif
const int16x8_t offset = vdupq_n_s16(chroma_offset << 5);
start_height >>= subsampling_y;
@@ -921,10 +964,13 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
int x = 0;
do {
const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+
+ const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
const int16x8_t average_luma = vreinterpretq_s16_u16(
- GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+ GetAverageLumaMsan(&in_y_row[luma_x], subsampling_x, valid_range));
const int16x8_t blended = BlendChromaValsNoCfl(
- scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
chroma_multiplier);
// In 8bpp, when params_.clip_to_restricted_range == false, we can
@@ -940,14 +986,19 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
// |average_luma| computation requires a duplicated luma value at the
// end.
const int luma_x = x << subsampling_x;
- const int valid_range = width - luma_x;
- memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
- luma_buffer[valid_range] = in_y_row[width - 1];
-
- const int16x8_t average_luma =
- vreinterpretq_s16_u16(GetAverageLuma(luma_buffer, subsampling_x));
+ const int valid_range_pixels = width - luma_x;
+ const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+ luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const int valid_range_chroma_bytes =
+ (chroma_width - x) * sizeof(in_chroma_row[0]);
+
+ const int16x8_t orig_chroma =
+ GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
const int16x8_t blended = BlendChromaValsNoCfl(
- scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]),
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
average_luma, scaling_shift_vect, offset, luma_multiplier,
chroma_multiplier);
StoreUnsigned8(&out_chroma_row[x],
@@ -963,11 +1014,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON(
// This function is for the case params_.chroma_scaling_from_luma == false.
void BlendNoiseWithImageChroma8bpp_NEON(
- Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
- int min_value, int max_chroma, int width, int height, int start_height,
- int subsampling_x, int subsampling_y,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y,
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_uv, ptrdiff_t source_stride_uv,
void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
assert(plane == kPlaneU || plane == kPlaneV);
@@ -989,12 +1040,11 @@ void BlendNoiseWithImageChroma8bpp_NEON(
in_uv, source_stride_uv, out_uv, dest_stride_uv);
}
-inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
- const int8_t* noise_stripe_row_prev,
- int plane_width,
- const int8x8_t grain_coeff,
- const int8x8_t old_coeff,
- int8_t* noise_image_row) {
+inline void WriteOverlapLine8bpp_NEON(
+ const int8_t* LIBGAV1_RESTRICT noise_stripe_row,
+ const int8_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+ const int8x8_t grain_coeff, const int8x8_t old_coeff,
+ int8_t* LIBGAV1_RESTRICT noise_image_row) {
int x = 0;
do {
// Note that these reads may exceed noise_stripe_row's width by up to 7
@@ -1009,10 +1059,10 @@ inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row,
} while (x < plane_width);
}
-void ConstructNoiseImageOverlap8bpp_NEON(const void* noise_stripes_buffer,
- int width, int height,
- int subsampling_x, int subsampling_y,
- void* noise_image_buffer) {
+void ConstructNoiseImageOverlap8bpp_NEON(
+ const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_image_buffer) {
const auto* noise_stripes =
static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer);
auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer);
@@ -1077,41 +1127,45 @@ void Init8bpp() {
// LumaAutoRegressionFunc
dsp->film_grain.luma_auto_regression[0] =
- ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 1>;
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 1>;
dsp->film_grain.luma_auto_regression[1] =
- ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 2>;
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 2>;
dsp->film_grain.luma_auto_regression[2] =
- ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 3>;
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth8, int8_t, 3>;
// ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag]
// Chroma autoregression should never be called when lag is 0 and use_luma
// is false.
dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
dsp->film_grain.chroma_auto_regression[0][1] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, false>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1,
+ false>;
dsp->film_grain.chroma_auto_regression[0][2] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, false>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2,
+ false>;
dsp->film_grain.chroma_auto_regression[0][3] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, false>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3,
+ false>;
dsp->film_grain.chroma_auto_regression[1][0] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 0, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 0, true>;
dsp->film_grain.chroma_auto_regression[1][1] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 1, true>;
dsp->film_grain.chroma_auto_regression[1][2] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 2, true>;
dsp->film_grain.chroma_auto_regression[1][3] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth8, int8_t, 3, true>;
dsp->film_grain.construct_noise_image_overlap =
ConstructNoiseImageOverlap8bpp_NEON;
- dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_NEON<kBitdepth8>;
dsp->film_grain.blend_noise_luma =
- BlendNoiseWithImageLuma_NEON<8, int8_t, uint8_t>;
+ BlendNoiseWithImageLuma_NEON<kBitdepth8, int8_t, uint8_t>;
dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON;
dsp->film_grain.blend_noise_chroma[1] =
- BlendNoiseWithImageChromaWithCfl_NEON<8, int8_t, uint8_t>;
+ BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth8, int8_t, uint8_t>;
}
} // namespace
@@ -1121,43 +1175,280 @@ void Init8bpp() {
namespace high_bitdepth {
namespace {
+inline void WriteOverlapLine10bpp_NEON(
+ const int16_t* LIBGAV1_RESTRICT noise_stripe_row,
+ const int16_t* LIBGAV1_RESTRICT noise_stripe_row_prev, int plane_width,
+ const int16x8_t grain_coeff, const int16x8_t old_coeff,
+ int16_t* LIBGAV1_RESTRICT noise_image_row) {
+ int x = 0;
+ do {
+ // Note that these reads may exceed noise_stripe_row's width by up to 7
+ // values.
+ const int16x8_t source_grain = vld1q_s16(noise_stripe_row + x);
+ const int16x8_t source_old = vld1q_s16(noise_stripe_row_prev + x);
+ // Maximum product is 511 * 27 = 0x35E5.
+ const int16x8_t weighted_grain = vmulq_s16(grain_coeff, source_grain);
+ // Maximum sum is 511 * (22 + 23) = 0x59D3.
+ const int16x8_t grain_sum =
+ vmlaq_s16(weighted_grain, old_coeff, source_old);
+ // Note that this write may exceed noise_image_row's width by up to 7
+ // values.
+ const int16x8_t grain = Clip3S16(vrshrq_n_s16(grain_sum, 5),
+ vdupq_n_s16(GetGrainMin<kBitdepth10>()),
+ vdupq_n_s16(GetGrainMax<kBitdepth10>()));
+ vst1q_s16(noise_image_row + x, grain);
+ x += 8;
+ } while (x < plane_width);
+}
+
+void ConstructNoiseImageOverlap10bpp_NEON(
+ const void* LIBGAV1_RESTRICT noise_stripes_buffer, int width, int height,
+ int subsampling_x, int subsampling_y,
+ void* LIBGAV1_RESTRICT noise_image_buffer) {
+ const auto* noise_stripes =
+ static_cast<const Array2DView<int16_t>*>(noise_stripes_buffer);
+ auto* noise_image = static_cast<Array2D<int16_t>*>(noise_image_buffer);
+ const int plane_width = (width + subsampling_x) >> subsampling_x;
+ const int plane_height = (height + subsampling_y) >> subsampling_y;
+ const int stripe_height = 32 >> subsampling_y;
+ const int stripe_mask = stripe_height - 1;
+ int y = stripe_height;
+ int luma_num = 1;
+ if (subsampling_y == 0) {
+ const int16x8_t first_row_grain_coeff = vdupq_n_s16(17);
+ const int16x8_t first_row_old_coeff = vdupq_n_s16(27);
+ const int16x8_t second_row_grain_coeff = first_row_old_coeff;
+ const int16x8_t second_row_old_coeff = first_row_grain_coeff;
+ for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
+ const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine10bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ // Either one partial stripe remains (remaining_height > 0),
+ // OR image is less than one stripe high (remaining_height < 0),
+ // OR all stripes are completed (remaining_height == 0).
+ const int remaining_height = plane_height - y;
+ if (remaining_height <= 0) {
+ return;
+ }
+ const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine10bpp_NEON(
+ noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+
+ if (remaining_height > 1) {
+ WriteOverlapLine10bpp_NEON(&noise_stripe[plane_width],
+ &noise_stripe_prev[(32 + 1) * plane_width],
+ plane_width, second_row_grain_coeff,
+ second_row_old_coeff, (*noise_image)[y + 1]);
+ }
+ } else { // subsampling_y == 1
+ const int16x8_t first_row_grain_coeff = vdupq_n_s16(22);
+ const int16x8_t first_row_old_coeff = vdupq_n_s16(23);
+ for (; y < plane_height; ++luma_num, y += stripe_height) {
+ const int16_t* noise_stripe = (*noise_stripes)[luma_num];
+ const int16_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1];
+ WriteOverlapLine10bpp_NEON(
+ noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width,
+ first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]);
+ }
+ }
+}
+
+inline int16x8_t BlendChromaValsNoCfl(
+ const int16_t* LIBGAV1_RESTRICT scaling_lut, const int16x8_t orig,
+ const int16_t* LIBGAV1_RESTRICT noise_image_cursor,
+ const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect,
+ const int32x4_t& offset, int luma_multiplier, int chroma_multiplier) {
+ uint16_t merged_buffer[8];
+ const int32x4_t weighted_luma_low =
+ vmull_n_s16(vget_low_s16(average_luma), luma_multiplier);
+ const int32x4_t weighted_luma_high =
+ vmull_n_s16(vget_high_s16(average_luma), luma_multiplier);
+ // Maximum value of combined is 127 * 1023 = 0x1FB81.
+ const int32x4_t combined_low =
+ vmlal_n_s16(weighted_luma_low, vget_low_s16(orig), chroma_multiplier);
+ const int32x4_t combined_high =
+ vmlal_n_s16(weighted_luma_high, vget_high_s16(orig), chroma_multiplier);
+ // Maximum value of offset is (255 << 8) = 0xFF00. Offset may be negative.
+ const uint16x4_t merged_low =
+ vqshrun_n_s32(vaddq_s32(offset, combined_low), 6);
+ const uint16x4_t merged_high =
+ vqshrun_n_s32(vaddq_s32(offset, combined_high), 6);
+ const uint16x8_t max_pixel = vdupq_n_u16((1 << kBitdepth10) - 1);
+ vst1q_u16(merged_buffer,
+ vminq_u16(vcombine_u16(merged_low, merged_high), max_pixel));
+ const int16x8_t scaling =
+ GetScalingFactors<kBitdepth10, uint16_t>(scaling_lut, merged_buffer);
+ const int16x8_t noise = GetSignedSource8(noise_image_cursor);
+ const int16x8_t scaled_noise =
+ ScaleNoise<kBitdepth10>(noise, scaling, scaling_shift_vect);
+ return vaddq_s16(orig, scaled_noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane10bpp_NEON(
+ const Array2D<int16_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const uint16_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint16_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint16_t* out_chroma_row, ptrdiff_t dest_stride) {
+ const int16x8_t floor = vdupq_n_s16(min_value);
+ const int16x8_t ceiling = vdupq_n_s16(max_chroma);
+ const int16x8_t scaling_shift_vect = vdupq_n_s16(15 - scaling_shift);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ const int safe_chroma_width = chroma_width & ~7;
+ uint16_t luma_buffer[16];
+#if LIBGAV1_MSAN
+ // TODO(b/194217060): This can be removed if the range calculations below are
+ // fixed.
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+#endif
+ // Offset is added before downshifting in order to take advantage of
+ // saturation, so it has to be upscaled by 6 bits, plus 2 bits for 10bpp.
+ const int32x4_t offset = vdupq_n_s32(chroma_offset << (6 + 2));
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ do {
+ const int luma_x = x << subsampling_x;
+ const int16x8_t average_luma = vreinterpretq_s16_u16(
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x));
+ const int16x8_t orig_chroma = GetSignedSource8(&in_chroma_row[x]);
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+
+ x += 8;
+ } while (x < safe_chroma_width);
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range_pixels = width - luma_x;
+ const int valid_range_bytes = valid_range_pixels * sizeof(in_y_row[0]);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range_bytes);
+ luma_buffer[valid_range_pixels] = in_y_row[width - 1];
+ const int valid_range_chroma_bytes =
+ (chroma_width - x) * sizeof(in_chroma_row[0]);
+ const int16x8_t orig_chroma =
+ GetSignedSource8Msan(&in_chroma_row[x], valid_range_chroma_bytes);
+
+ const int16x8_t average_luma = vreinterpretq_s16_u16(GetAverageLumaMsan(
+ luma_buffer, subsampling_x, valid_range_bytes + sizeof(in_y_row[0])));
+ const int16x8_t blended = BlendChromaValsNoCfl(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, scaling_shift_vect, offset, luma_multiplier,
+ chroma_multiplier);
+ StoreUnsigned8(&out_chroma_row[x],
+ vreinterpretq_u16_s16(Clip3(blended, floor, ceiling)));
+ // End of right edge iteration.
+ }
+
+ in_y_row = AddByteStride(in_y_row, source_stride_y << subsampling_y);
+ in_chroma_row = AddByteStride(in_chroma_row, source_stride_chroma);
+ out_chroma_row = AddByteStride(out_chroma_row, dest_stride);
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma10bpp_NEON(
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* LIBGAV1_RESTRICT scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int16_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint16_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint16_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint16_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane10bpp_NEON(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+ source_stride_uv, out_uv, dest_stride_uv);
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
// LumaAutoRegressionFunc
dsp->film_grain.luma_auto_regression[0] =
- ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 1>;
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 1>;
dsp->film_grain.luma_auto_regression[1] =
- ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 2>;
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 2>;
dsp->film_grain.luma_auto_regression[2] =
- ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 3>;
+ ApplyAutoRegressiveFilterToLumaGrain_NEON<kBitdepth10, int16_t, 3>;
// ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling]
// Chroma autoregression should never be called when lag is 0 and use_luma
// is false.
dsp->film_grain.chroma_auto_regression[0][0] = nullptr;
dsp->film_grain.chroma_auto_regression[0][1] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, false>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+ false>;
dsp->film_grain.chroma_auto_regression[0][2] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, false>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+ false>;
dsp->film_grain.chroma_auto_regression[0][3] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, false>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+ false>;
dsp->film_grain.chroma_auto_regression[1][0] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 0, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 0,
+ true>;
dsp->film_grain.chroma_auto_regression[1][1] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 1,
+ true>;
dsp->film_grain.chroma_auto_regression[1][2] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 2,
+ true>;
dsp->film_grain.chroma_auto_regression[1][3] =
- ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, true>;
+ ApplyAutoRegressiveFilterToChromaGrains_NEON<kBitdepth10, int16_t, 3,
+ true>;
- dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON;
+ dsp->film_grain.construct_noise_image_overlap =
+ ConstructNoiseImageOverlap10bpp_NEON;
- dsp->film_grain.blend_noise_luma =
- BlendNoiseWithImageLuma_NEON<10, int16_t, uint16_t>;
+ dsp->film_grain.initialize_scaling_lut =
+ InitializeScalingLookupTable_NEON<kBitdepth10>;
+
+ // TODO(b/194442742): reenable this function after segfault under armv7 ASan
+ // is fixed.
+ // dsp->film_grain.blend_noise_luma =
+ // BlendNoiseWithImageLuma_NEON<kBitdepth10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma10bpp_NEON;
dsp->film_grain.blend_noise_chroma[1] =
- BlendNoiseWithImageChromaWithCfl_NEON<10, int16_t, uint16_t>;
+ BlendNoiseWithImageChromaWithCfl_NEON<kBitdepth10, int16_t, uint16_t>;
}
} // namespace