Diffstat (limited to 'src/dsp/arm/film_grain_neon.cc')
-rw-r--r-- | src/dsp/arm/film_grain_neon.cc | 1188
1 file changed, 1188 insertions, 0 deletions
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc new file mode 100644 index 0000000..2612466 --- /dev/null +++ b/src/dsp/arm/film_grain_neon.cc @@ -0,0 +1,1188 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/film_grain.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_ENABLE_NEON +#include <arm_neon.h> + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <new> + +#include "src/dsp/arm/common_neon.h" +#include "src/dsp/arm/film_grain_neon.h" +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +// These functions are overloaded for both possible sizes in order to simplify +// loading and storing to and from intermediate value types from within a +// template function. +inline int16x8_t GetSignedSource8(const int8_t* src) { + return vmovl_s8(vld1_s8(src)); +} + +inline int16x8_t GetSignedSource8(const uint8_t* src) { + return ZeroExtend(vld1_u8(src)); +} + +inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) { + vst1_u8(dest, vmovn_u16(data)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); } + +inline int16x8_t GetSignedSource8(const uint16_t* src) { + return vreinterpretq_s16_u16(vld1q_u16(src)); +} + +inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) { + vst1q_u16(dest, data); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// Each element in |sum| represents one destination value's running +// autoregression formula. The fixed source values in |grain_lo| and |grain_hi| +// allow for a sliding window in successive calls to this function. +template <int position_offset> +inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo, + const int16x8_t grain_hi, + int16_t coeff, int32x4x2_t sum) { + const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset); + sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff); + sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff); + return sum; +} + +// Because the autoregressive filter requires the output of each pixel to +// compute pixels that come after in the row, we have to finish the calculations +// one at a time. 
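+// Each call extracts one lane's vector partial sum, adds the remaining
+// same-row neighbor products as scalars (they depend on samples written by
+// the previous calls), then rounds, shifts, clips to the grain range, and
+// stores the result in place.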
+template <int bitdepth, int auto_regression_coeff_lag, int lane> +inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum, + const int8_t* coeffs, int pos, int shift) { + int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3); + + for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) { + result += grain_cursor[lane + delta_col] * coeffs[pos]; + ++pos; + } + grain_cursor[lane] = + Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift), + GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>()); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +template <int bitdepth, int auto_regression_coeff_lag, int lane> +inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum, + const int8_t* coeffs, int pos, int shift) { + int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3); + + for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) { + result += grain_cursor[lane + delta_col] * coeffs[pos]; + ++pos; + } + grain_cursor[lane] = + Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift), + GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>()); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// Because the autoregressive filter requires the output of each pixel to +// compute pixels that come after in the row, we have to finish the calculations +// one at a time. +template <int bitdepth, int auto_regression_coeff_lag, int lane> +inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor, + int8_t* v_grain_cursor, + int32x4x2_t sum_u, int32x4x2_t sum_v, + const int8_t* coeffs_u, + const int8_t* coeffs_v, int pos, + int shift) { + WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( + u_grain_cursor, sum_u, coeffs_u, pos, shift); + WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( + v_grain_cursor, sum_v, coeffs_v, pos, shift); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +template <int bitdepth, int auto_regression_coeff_lag, int lane> +inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor, + int16_t* v_grain_cursor, + int32x4x2_t sum_u, int32x4x2_t sum_v, + const int8_t* coeffs_u, + const int8_t* coeffs_v, int pos, + int shift) { + WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( + u_grain_cursor, sum_u, coeffs_u, pos, shift); + WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( + v_grain_cursor, sum_v, coeffs_v, pos, shift); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +inline void SetZero(int32x4x2_t* v) { + v->val[0] = vdupq_n_s32(0); + v->val[1] = vdupq_n_s32(0); +} + +// Computes subsampled luma for use with chroma, by averaging in the x direction +// or y direction when applicable. +int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x, + int subsampling_y, ptrdiff_t stride) { + if (subsampling_y != 0) { + assert(subsampling_x != 0); + const int8x16_t src0 = vld1q_s8(luma); + const int8x16_t src1 = vld1q_s8(luma + stride); + const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)), + vpaddl_s8(vget_high_s8(src0))); + const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)), + vpaddl_s8(vget_high_s8(src1))); + return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2); + } + if (subsampling_x != 0) { + const int8x16_t src = vld1q_s8(luma); + return vrshrq_n_s16( + vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))), + 1); + } + return vmovl_s8(vld1_s8(luma)); +} + +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. 
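+// When |subsampling_x| is set, adjacent luma samples are combined with a
+// pairwise add and a rounding halving shift; otherwise the row is widened
+// and used directly.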
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + const uint8x16_t src = vld1q_u8(luma); + return vrshrq_n_u16(vpaddlq_u8(src), 1); + } + return vmovl_u8(vld1_u8(luma)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// Computes subsampled luma for use with chroma, by averaging in the x direction +// or y direction when applicable. +int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x, + int subsampling_y, ptrdiff_t stride) { + if (subsampling_y != 0) { + assert(subsampling_x != 0); + int16x8_t src0_lo = vld1q_s16(luma); + int16x8_t src0_hi = vld1q_s16(luma + 8); + const int16x8_t src1_lo = vld1q_s16(luma + stride); + const int16x8_t src1_hi = vld1q_s16(luma + stride + 8); + const int16x8_t src0 = + vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)), + vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi))); + const int16x8_t src1 = + vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)), + vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi))); + return vrshrq_n_s16(vaddq_s16(src0, src1), 2); + } + if (subsampling_x != 0) { + const int16x8_t src_lo = vld1q_s16(luma); + const int16x8_t src_hi = vld1q_s16(luma + 8); + const int16x8_t ret = + vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)), + vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi))); + return vrshrq_n_s16(ret, 1); + } + return vld1q_s16(luma); +} + +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline uint16x8_t GetAverageLuma(const uint16_t* const luma, + int subsampling_x) { + if (subsampling_x != 0) { + const uint16x8x2_t src = vld2q_u16(luma); + return vrhaddq_u16(src.val[0], src.val[1]); + } + return vld1q_u16(luma); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template <int bitdepth, typename GrainType, int auto_regression_coeff_lag, + bool use_luma> +void ApplyAutoRegressiveFilterToChromaGrains_NEON(const FilmGrainParams& params, + const void* luma_grain_buffer, + int subsampling_x, + int subsampling_y, + void* u_grain_buffer, + void* v_grain_buffer) { + static_assert(auto_regression_coeff_lag <= 3, "Invalid autoregression lag."); + const auto* luma_grain = static_cast<const GrainType*>(luma_grain_buffer); + auto* u_grain = static_cast<GrainType*>(u_grain_buffer); + auto* v_grain = static_cast<GrainType*>(v_grain_buffer); + const int auto_regression_shift = params.auto_regression_shift; + const int chroma_width = + (subsampling_x == 0) ? kMaxChromaWidth : kMinChromaWidth; + const int chroma_height = + (subsampling_y == 0) ? kMaxChromaHeight : kMinChromaHeight; + // When |chroma_width| == 44, we write 8 at a time from x in [3, 34], + // leaving [35, 40] to write at the end. + const int chroma_width_remainder = + (chroma_width - 2 * kAutoRegressionBorder) & 7; + + int y = kAutoRegressionBorder; + luma_grain += kLumaWidth * y; + u_grain += chroma_width * y; + v_grain += chroma_width * y; + do { + // Each row is computed 8 values at a time in the following loop. At the + // end of the loop, 4 values remain to write. They are given a special + // reduced iteration at the end. 
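+    // (The remainder is |chroma_width_remainder| values: 4 for the
+    // unsubsampled width, or 6 when |chroma_width| == 44.)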
+ int x = kAutoRegressionBorder; + int luma_x = kAutoRegressionBorder; + do { + int pos = 0; + int32x4x2_t sum_u; + int32x4x2_t sum_v; + SetZero(&sum_u); + SetZero(&sum_v); + + if (auto_regression_coeff_lag > 0) { + for (int delta_row = -auto_regression_coeff_lag; delta_row < 0; + ++delta_row) { + // These loads may overflow to the next row, but they are never called + // on the final row of a grain block. Therefore, they will never + // exceed the block boundaries. + // Note: this could be slightly optimized to a single load in 8bpp, + // but requires making a special first iteration and accumulate + // function that takes an int8x16_t. + const int16x8_t u_grain_lo = + GetSignedSource8(u_grain + x + delta_row * chroma_width - + auto_regression_coeff_lag); + const int16x8_t u_grain_hi = + GetSignedSource8(u_grain + x + delta_row * chroma_width - + auto_regression_coeff_lag + 8); + const int16x8_t v_grain_lo = + GetSignedSource8(v_grain + x + delta_row * chroma_width - + auto_regression_coeff_lag); + const int16x8_t v_grain_hi = + GetSignedSource8(v_grain + x + delta_row * chroma_width - + auto_regression_coeff_lag + 8); +#define ACCUMULATE_WEIGHTED_GRAIN(offset) \ + sum_u = AccumulateWeightedGrain<offset>( \ + u_grain_lo, u_grain_hi, params.auto_regression_coeff_u[pos], sum_u); \ + sum_v = AccumulateWeightedGrain<offset>( \ + v_grain_lo, v_grain_hi, params.auto_regression_coeff_v[pos++], sum_v) + + ACCUMULATE_WEIGHTED_GRAIN(0); + ACCUMULATE_WEIGHTED_GRAIN(1); + ACCUMULATE_WEIGHTED_GRAIN(2); + // The horizontal |auto_regression_coeff_lag| loop is replaced with + // if-statements to give vextq_s16 an immediate param. + if (auto_regression_coeff_lag > 1) { + ACCUMULATE_WEIGHTED_GRAIN(3); + ACCUMULATE_WEIGHTED_GRAIN(4); + } + if (auto_regression_coeff_lag > 2) { + assert(auto_regression_coeff_lag == 3); + ACCUMULATE_WEIGHTED_GRAIN(5); + ACCUMULATE_WEIGHTED_GRAIN(6); + } + } + } + + if (use_luma) { + const int16x8_t luma = GetSubsampledLuma( + luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth); + + // Luma samples get the final coefficient in the formula, but are best + // computed all at once before the final row. + const int coeff_u = + params.auto_regression_coeff_u[pos + auto_regression_coeff_lag]; + const int coeff_v = + params.auto_regression_coeff_v[pos + auto_regression_coeff_lag]; + + sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u); + sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u); + sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v); + sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v); + } + // At this point in the filter, the source addresses and destination + // addresses overlap. Because this is an auto-regressive filter, the + // higher lanes cannot be computed without the results of the lower lanes. + // Each call to WriteFinalAutoRegression incorporates preceding values + // on the final row, and writes a single sample. This allows the next + // pixel's value to be computed in the next call. 
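+      // At this point |pos| indexes the first coefficient of the current
+      // row, i.e. the taps for the |auto_regression_coeff_lag| samples to
+      // the left of each output.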
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \ + WriteFinalAutoRegressionChroma<bitdepth, auto_regression_coeff_lag, lane>( \ + u_grain + x, v_grain + x, sum_u, sum_v, params.auto_regression_coeff_u, \ + params.auto_regression_coeff_v, pos, auto_regression_shift) + + WRITE_AUTO_REGRESSION_RESULT(0); + WRITE_AUTO_REGRESSION_RESULT(1); + WRITE_AUTO_REGRESSION_RESULT(2); + WRITE_AUTO_REGRESSION_RESULT(3); + WRITE_AUTO_REGRESSION_RESULT(4); + WRITE_AUTO_REGRESSION_RESULT(5); + WRITE_AUTO_REGRESSION_RESULT(6); + WRITE_AUTO_REGRESSION_RESULT(7); + + x += 8; + luma_x += 8 << subsampling_x; + } while (x < chroma_width - kAutoRegressionBorder - chroma_width_remainder); + + // This is the "final iteration" of the above loop over width. We fill in + // the remainder of the width, which is less than 8. + int pos = 0; + int32x4x2_t sum_u; + int32x4x2_t sum_v; + SetZero(&sum_u); + SetZero(&sum_v); + + for (int delta_row = -auto_regression_coeff_lag; delta_row < 0; + ++delta_row) { + // These loads may overflow to the next row, but they are never called on + // the final row of a grain block. Therefore, they will never exceed the + // block boundaries. + const int16x8_t u_grain_lo = GetSignedSource8( + u_grain + x + delta_row * chroma_width - auto_regression_coeff_lag); + const int16x8_t u_grain_hi = + GetSignedSource8(u_grain + x + delta_row * chroma_width - + auto_regression_coeff_lag + 8); + const int16x8_t v_grain_lo = GetSignedSource8( + v_grain + x + delta_row * chroma_width - auto_regression_coeff_lag); + const int16x8_t v_grain_hi = + GetSignedSource8(v_grain + x + delta_row * chroma_width - + auto_regression_coeff_lag + 8); + + ACCUMULATE_WEIGHTED_GRAIN(0); + ACCUMULATE_WEIGHTED_GRAIN(1); + ACCUMULATE_WEIGHTED_GRAIN(2); + // The horizontal |auto_regression_coeff_lag| loop is replaced with + // if-statements to give vextq_s16 an immediate param. + if (auto_regression_coeff_lag > 1) { + ACCUMULATE_WEIGHTED_GRAIN(3); + ACCUMULATE_WEIGHTED_GRAIN(4); + } + if (auto_regression_coeff_lag > 2) { + assert(auto_regression_coeff_lag == 3); + ACCUMULATE_WEIGHTED_GRAIN(5); + ACCUMULATE_WEIGHTED_GRAIN(6); + } + } + + if (use_luma) { + const int16x8_t luma = GetSubsampledLuma( + luma_grain + luma_x, subsampling_x, subsampling_y, kLumaWidth); + + // Luma samples get the final coefficient in the formula, but are best + // computed all at once before the final row. + const int coeff_u = + params.auto_regression_coeff_u[pos + auto_regression_coeff_lag]; + const int coeff_v = + params.auto_regression_coeff_v[pos + auto_regression_coeff_lag]; + + sum_u.val[0] = vmlal_n_s16(sum_u.val[0], vget_low_s16(luma), coeff_u); + sum_u.val[1] = vmlal_n_s16(sum_u.val[1], vget_high_s16(luma), coeff_u); + sum_v.val[0] = vmlal_n_s16(sum_v.val[0], vget_low_s16(luma), coeff_v); + sum_v.val[1] = vmlal_n_s16(sum_v.val[1], vget_high_s16(luma), coeff_v); + } + + WRITE_AUTO_REGRESSION_RESULT(0); + WRITE_AUTO_REGRESSION_RESULT(1); + WRITE_AUTO_REGRESSION_RESULT(2); + WRITE_AUTO_REGRESSION_RESULT(3); + if (chroma_width_remainder == 6) { + WRITE_AUTO_REGRESSION_RESULT(4); + WRITE_AUTO_REGRESSION_RESULT(5); + } + + luma_grain += kLumaWidth << subsampling_y; + u_grain += chroma_width; + v_grain += chroma_width; + } while (++y < chroma_height); +#undef ACCUMULATE_WEIGHTED_GRAIN +#undef WRITE_AUTO_REGRESSION_RESULT +} + +// Applies an auto-regressive filter to the white noise in luma_grain. 
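+// Filtering starts at row and column |kAutoRegressionBorder|; the bordering
+// noise is left untouched as seed values for the filter.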
+template <int bitdepth, typename GrainType, int auto_regression_coeff_lag> +void ApplyAutoRegressiveFilterToLumaGrain_NEON(const FilmGrainParams& params, + void* luma_grain_buffer) { + static_assert(auto_regression_coeff_lag > 0, ""); + const int8_t* const auto_regression_coeff_y = params.auto_regression_coeff_y; + const uint8_t auto_regression_shift = params.auto_regression_shift; + + int y = kAutoRegressionBorder; + auto* luma_grain = + static_cast<GrainType*>(luma_grain_buffer) + kLumaWidth * y; + do { + // Each row is computed 8 values at a time in the following loop. At the + // end of the loop, 4 values remain to write. They are given a special + // reduced iteration at the end. + int x = kAutoRegressionBorder; + do { + int pos = 0; + int32x4x2_t sum; + SetZero(&sum); + for (int delta_row = -auto_regression_coeff_lag; delta_row < 0; + ++delta_row) { + // These loads may overflow to the next row, but they are never called + // on the final row of a grain block. Therefore, they will never exceed + // the block boundaries. + const int16x8_t src_grain_lo = + GetSignedSource8(luma_grain + x + delta_row * kLumaWidth - + auto_regression_coeff_lag); + const int16x8_t src_grain_hi = + GetSignedSource8(luma_grain + x + delta_row * kLumaWidth - + auto_regression_coeff_lag + 8); + + // A pictorial representation of the auto-regressive filter for + // various values of params.auto_regression_coeff_lag. The letter 'O' + // represents the current sample. (The filter always operates on the + // current sample with filter coefficient 1.) The letters 'X' + // represent the neighboring samples that the filter operates on, below + // their corresponding "offset" number. + // + // params.auto_regression_coeff_lag == 3: + // 0 1 2 3 4 5 6 + // X X X X X X X + // X X X X X X X + // X X X X X X X + // X X X O + // params.auto_regression_coeff_lag == 2: + // 0 1 2 3 4 + // X X X X X + // X X X X X + // X X O + // params.auto_regression_coeff_lag == 1: + // 0 1 2 + // X X X + // X O + // params.auto_regression_coeff_lag == 0: + // O + // The function relies on the caller to skip the call in the 0 lag + // case. + +#define ACCUMULATE_WEIGHTED_GRAIN(offset) \ + sum = AccumulateWeightedGrain<offset>(src_grain_lo, src_grain_hi, \ + auto_regression_coeff_y[pos++], sum) + ACCUMULATE_WEIGHTED_GRAIN(0); + ACCUMULATE_WEIGHTED_GRAIN(1); + ACCUMULATE_WEIGHTED_GRAIN(2); + // The horizontal |auto_regression_coeff_lag| loop is replaced with + // if-statements to give vextq_s16 an immediate param. + if (auto_regression_coeff_lag > 1) { + ACCUMULATE_WEIGHTED_GRAIN(3); + ACCUMULATE_WEIGHTED_GRAIN(4); + } + if (auto_regression_coeff_lag > 2) { + assert(auto_regression_coeff_lag == 3); + ACCUMULATE_WEIGHTED_GRAIN(5); + ACCUMULATE_WEIGHTED_GRAIN(6); + } + } + // At this point in the filter, the source addresses and destination + // addresses overlap. Because this is an auto-regressive filter, the + // higher lanes cannot be computed without the results of the lower lanes. + // Each call to WriteFinalAutoRegression incorporates preceding values + // on the final row, and writes a single sample. This allows the next + // pixel's value to be computed in the next call. 
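+      // |lane| must be a compile-time constant for vgetq_lane_s32, so the
+      // eight per-lane writes are unrolled through a macro rather than a
+      // loop.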
+#define WRITE_AUTO_REGRESSION_RESULT(lane) \ + WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>( \ + luma_grain + x, sum, auto_regression_coeff_y, pos, \ + auto_regression_shift) + + WRITE_AUTO_REGRESSION_RESULT(0); + WRITE_AUTO_REGRESSION_RESULT(1); + WRITE_AUTO_REGRESSION_RESULT(2); + WRITE_AUTO_REGRESSION_RESULT(3); + WRITE_AUTO_REGRESSION_RESULT(4); + WRITE_AUTO_REGRESSION_RESULT(5); + WRITE_AUTO_REGRESSION_RESULT(6); + WRITE_AUTO_REGRESSION_RESULT(7); + x += 8; + // Leave the final four pixels for the special iteration below. + } while (x < kLumaWidth - kAutoRegressionBorder - 4); + + // Final 4 pixels in the row. + int pos = 0; + int32x4x2_t sum; + SetZero(&sum); + for (int delta_row = -auto_regression_coeff_lag; delta_row < 0; + ++delta_row) { + const int16x8_t src_grain_lo = GetSignedSource8( + luma_grain + x + delta_row * kLumaWidth - auto_regression_coeff_lag); + const int16x8_t src_grain_hi = + GetSignedSource8(luma_grain + x + delta_row * kLumaWidth - + auto_regression_coeff_lag + 8); + + ACCUMULATE_WEIGHTED_GRAIN(0); + ACCUMULATE_WEIGHTED_GRAIN(1); + ACCUMULATE_WEIGHTED_GRAIN(2); + // The horizontal |auto_regression_coeff_lag| loop is replaced with + // if-statements to give vextq_s16 an immediate param. + if (auto_regression_coeff_lag > 1) { + ACCUMULATE_WEIGHTED_GRAIN(3); + ACCUMULATE_WEIGHTED_GRAIN(4); + } + if (auto_regression_coeff_lag > 2) { + assert(auto_regression_coeff_lag == 3); + ACCUMULATE_WEIGHTED_GRAIN(5); + ACCUMULATE_WEIGHTED_GRAIN(6); + } + } + // delta_row == 0 + WRITE_AUTO_REGRESSION_RESULT(0); + WRITE_AUTO_REGRESSION_RESULT(1); + WRITE_AUTO_REGRESSION_RESULT(2); + WRITE_AUTO_REGRESSION_RESULT(3); + luma_grain += kLumaWidth; + } while (++y < kLumaHeight); + +#undef WRITE_AUTO_REGRESSION_RESULT +#undef ACCUMULATE_WEIGHTED_GRAIN +} + +void InitializeScalingLookupTable_NEON( + int num_points, const uint8_t point_value[], const uint8_t point_scaling[], + uint8_t scaling_lut[kScalingLookupTableSize]) { + if (num_points == 0) { + memset(scaling_lut, 0, sizeof(scaling_lut[0]) * kScalingLookupTableSize); + return; + } + static_assert(sizeof(scaling_lut[0]) == 1, ""); + memset(scaling_lut, point_scaling[0], point_value[0]); + const uint32x4_t steps = vmovl_u16(vcreate_u16(0x0003000200010000)); + const uint32x4_t offset = vdupq_n_u32(32768); + for (int i = 0; i < num_points - 1; ++i) { + const int delta_y = point_scaling[i + 1] - point_scaling[i]; + const int delta_x = point_value[i + 1] - point_value[i]; + const int delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x); + const int delta4 = delta << 2; + const uint8x8_t base_point = vdup_n_u8(point_scaling[i]); + uint32x4_t upscaled_points0 = vmlaq_n_u32(offset, steps, delta); + const uint32x4_t line_increment4 = vdupq_n_u32(delta4); + // Get the second set of 4 points by adding 4 steps to the first set. + uint32x4_t upscaled_points1 = vaddq_u32(upscaled_points0, line_increment4); + // We obtain the next set of 8 points by adding 8 steps to each of the + // current 8 points. + const uint32x4_t line_increment8 = vshlq_n_u32(line_increment4, 1); + int x = 0; + do { + const uint16x4_t interp_points0 = vshrn_n_u32(upscaled_points0, 16); + const uint16x4_t interp_points1 = vshrn_n_u32(upscaled_points1, 16); + const uint8x8_t interp_points = + vmovn_u16(vcombine_u16(interp_points0, interp_points1)); + // The spec guarantees that the max value of |point_value[i]| + x is 255. + // Writing 8 bytes starting at the final table byte, leaves 7 bytes of + // required padding. 
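+      // |delta| is the segment slope in 16.16 fixed point, computed with a
+      // rounded division by |delta_x|; the initial 32768 bias rounds the
+      // final shift down by 16.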
+ vst1_u8(&scaling_lut[point_value[i] + x], + vadd_u8(interp_points, base_point)); + upscaled_points0 = vaddq_u32(upscaled_points0, line_increment8); + upscaled_points1 = vaddq_u32(upscaled_points1, line_increment8); + x += 8; + } while (x < delta_x); + } + const uint8_t last_point_value = point_value[num_points - 1]; + memset(&scaling_lut[last_point_value], point_scaling[num_points - 1], + kScalingLookupTableSize - last_point_value); +} + +inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low, + const int16x8_t high) { + const int16x8_t clipped_to_ceiling = vminq_s16(high, value); + return vmaxq_s16(low, clipped_to_ceiling); +} + +template <int bitdepth, typename Pixel> +inline int16x8_t GetScalingFactors( + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { + int16_t start_vals[8]; + if (bitdepth == 8) { + start_vals[0] = scaling_lut[source[0]]; + start_vals[1] = scaling_lut[source[1]]; + start_vals[2] = scaling_lut[source[2]]; + start_vals[3] = scaling_lut[source[3]]; + start_vals[4] = scaling_lut[source[4]]; + start_vals[5] = scaling_lut[source[5]]; + start_vals[6] = scaling_lut[source[6]]; + start_vals[7] = scaling_lut[source[7]]; + return vld1q_s16(start_vals); + } + int16_t end_vals[8]; + // TODO(petersonab): Precompute this into a larger table for direct lookups. + int index = source[0] >> 2; + start_vals[0] = scaling_lut[index]; + end_vals[0] = scaling_lut[index + 1]; + index = source[1] >> 2; + start_vals[1] = scaling_lut[index]; + end_vals[1] = scaling_lut[index + 1]; + index = source[2] >> 2; + start_vals[2] = scaling_lut[index]; + end_vals[2] = scaling_lut[index + 1]; + index = source[3] >> 2; + start_vals[3] = scaling_lut[index]; + end_vals[3] = scaling_lut[index + 1]; + index = source[4] >> 2; + start_vals[4] = scaling_lut[index]; + end_vals[4] = scaling_lut[index + 1]; + index = source[5] >> 2; + start_vals[5] = scaling_lut[index]; + end_vals[5] = scaling_lut[index + 1]; + index = source[6] >> 2; + start_vals[6] = scaling_lut[index]; + end_vals[6] = scaling_lut[index + 1]; + index = source[7] >> 2; + start_vals[7] = scaling_lut[index]; + end_vals[7] = scaling_lut[index + 1]; + const int16x8_t start = vld1q_s16(start_vals); + const int16x8_t end = vld1q_s16(end_vals); + int16x8_t remainder = GetSignedSource8(source); + remainder = vandq_s16(remainder, vdupq_n_s16(3)); + const int16x8_t delta = vmulq_s16(vsubq_s16(end, start), remainder); + return vaddq_s16(start, vrshrq_n_s16(delta, 2)); +} + +inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling, + const int16x8_t scaling_shift_vect) { + const int16x8_t upscaled_noise = vmulq_s16(noise, scaling); + return vrshlq_s16(upscaled_noise, scaling_shift_vect); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +inline int16x8_t ScaleNoise(const int16x8_t noise, const int16x8_t scaling, + const int32x4_t scaling_shift_vect) { + // TODO(petersonab): Try refactoring scaling lookup table to int16_t and + // upscaling by 7 bits to permit high half multiply. This would eliminate + // the intermediate 32x4 registers. Also write the averaged values directly + // into the table so it doesn't have to be done for every pixel in + // the frame. 
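+  // The noise-by-scaling product can exceed the int16_t range at higher
+  // bitdepths, so the multiply widens to 32 bits and the rounded shift is
+  // applied there before narrowing back.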
+ const int32x4_t upscaled_noise_lo = + vmull_s16(vget_low_s16(noise), vget_low_s16(scaling)); + const int32x4_t upscaled_noise_hi = + vmull_s16(vget_high_s16(noise), vget_high_s16(scaling)); + const int16x4_t noise_lo = + vmovn_s32(vrshlq_s32(upscaled_noise_lo, scaling_shift_vect)); + const int16x4_t noise_hi = + vmovn_s32(vrshlq_s32(upscaled_noise_hi, scaling_shift_vect)); + return vcombine_s16(noise_lo, noise_hi); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageLuma_NEON( + const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift, + int width, int height, int start_height, + const uint8_t scaling_lut_y[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y, + ptrdiff_t dest_stride_y) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y_row = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + auto* out_y_row = static_cast<Pixel*>(dest_plane_y); + dest_stride_y /= sizeof(Pixel); + const int16x8_t floor = vdupq_n_s16(min_value); + const int16x8_t ceiling = vdupq_n_s16(max_luma); + // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe + // for 16 bit signed integers. In higher bitdepths, however, we have to + // expand to 32 to protect the sign bit. + const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift); +#if LIBGAV1_MAX_BITDEPTH >= 10 + const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + + int y = 0; + do { + int x = 0; + do { + // This operation on the unsigned input is safe in 8bpp because the vector + // is widened before it is reinterpreted. + const int16x8_t orig = GetSignedSource8(&in_y_row[x]); + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + int16x8_t noise = + GetSignedSource8(&(noise_image[kPlaneY][y + start_height][x])); + + if (bitdepth == 8) { + noise = ScaleNoise(noise, scaling, scaling_shift_vect16); + } else { +#if LIBGAV1_MAX_BITDEPTH >= 10 + noise = ScaleNoise(noise, scaling, scaling_shift_vect32); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + } + const int16x8_t combined = vaddq_s16(orig, noise); + // In 8bpp, when params_.clip_to_restricted_range == false, we can replace + // clipping with vqmovun_s16, but it's not likely to be worth copying the + // function for just that case, though the gain would be very small. 
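+      // Clip3 bounds the sum to [min_value, max_luma], both of which are
+      // non-negative, so the reinterpret to unsigned is safe.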
+ StoreUnsigned8(&out_y_row[x], + vreinterpretq_u16_s16(Clip3(combined, floor, ceiling))); + x += 8; + } while (x < width); + in_y_row += source_stride_y; + out_y_row += dest_stride_y; + } while (++y < height); +} + +template <int bitdepth, typename GrainType, typename Pixel> +inline int16x8_t BlendChromaValsWithCfl( + const Pixel* average_luma_buffer, + const uint8_t scaling_lut[kScalingLookupTableSize], + const Pixel* chroma_cursor, const GrainType* noise_image_cursor, + const int16x8_t scaling_shift_vect16, + const int32x4_t scaling_shift_vect32) { + const int16x8_t scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const int16x8_t orig = GetSignedSource8(chroma_cursor); + int16x8_t noise = GetSignedSource8(noise_image_cursor); + if (bitdepth == 8) { + noise = ScaleNoise(noise, scaling, scaling_shift_vect16); + } else { + noise = ScaleNoise(noise, scaling, scaling_shift_vect32); + } + return vaddq_s16(orig, noise); +} + +template <int bitdepth, typename GrainType, typename Pixel> +LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_NEON( + const Array2D<GrainType>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row, + ptrdiff_t source_stride_y, const Pixel* in_chroma_row, + ptrdiff_t source_stride_chroma, Pixel* out_chroma_row, + ptrdiff_t dest_stride) { + const int16x8_t floor = vdupq_n_s16(min_value); + const int16x8_t ceiling = vdupq_n_s16(max_chroma); + Pixel luma_buffer[16]; + memset(luma_buffer, 0, sizeof(luma_buffer)); + // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe + // for 16 bit signed integers. In higher bitdepths, however, we have to + // expand to 32 to protect the sign bit. + const int16x8_t scaling_shift_vect16 = vdupq_n_s16(-scaling_shift); + const int32x4_t scaling_shift_vect32 = vdupq_n_s32(-scaling_shift); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + const int safe_chroma_width = chroma_width & ~7; + + // Writing to this buffer avoids the cost of doing 8 lane lookups in a row + // in GetScalingFactors. + Pixel average_luma_buffer[8]; + assert(start_height % 2 == 0); + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + do { + const int luma_x = x << subsampling_x; + // TODO(petersonab): Consider specializing by subsampling_x. In the 444 + // case &in_y_row[x] can be passed to GetScalingFactors directly. + const uint16x8_t average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + StoreUnsigned8(average_luma_buffer, average_luma); + + const int16x8_t blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), scaling_shift_vect16, + scaling_shift_vect32); + + // In 8bpp, when params_.clip_to_restricted_range == false, we can replace + // clipping with vqmovun_s16, but it's not likely to be worth copying the + // function for just that case. 
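+      // As in the luma path, the clip guarantees a non-negative result
+      // before the reinterpret to unsigned.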
+ StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + x += 8; + } while (x < safe_chroma_width); + + if (x < chroma_width) { + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const uint16x8_t average_luma = + GetAverageLuma(luma_buffer, subsampling_x); + StoreUnsigned8(average_luma_buffer, average_luma); + + const int16x8_t blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), scaling_shift_vect16, + scaling_shift_vect32); + // In 8bpp, when params_.clip_to_restricted_range == false, we can replace + // clipping with vqmovun_s16, but it's not likely to be worth copying the + // function for just that case. + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == true. +// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. +template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageChromaWithCfl_NEON( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + + const auto* in_uv = static_cast<const Pixel*>(source_plane_uv); + source_stride_uv /= sizeof(Pixel); + auto* out_uv = static_cast<Pixel*>(dest_plane_uv); + dest_stride_uv /= sizeof(Pixel); + // Looping over one plane at a time is faster in higher resolutions, despite + // re-computing luma. + BlendChromaPlaneWithCfl_NEON<bitdepth, GrainType, Pixel>( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y, + source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +} // namespace + +namespace low_bitdepth { +namespace { + +inline int16x8_t BlendChromaValsNoCfl( + const uint8_t scaling_lut[kScalingLookupTableSize], + const uint8_t* chroma_cursor, const int8_t* noise_image_cursor, + const int16x8_t& average_luma, const int16x8_t& scaling_shift_vect, + const int16x8_t& offset, int luma_multiplier, int chroma_multiplier) { + uint8_t merged_buffer[8]; + const int16x8_t orig = GetSignedSource8(chroma_cursor); + const int16x8_t weighted_luma = vmulq_n_s16(average_luma, luma_multiplier); + const int16x8_t weighted_chroma = vmulq_n_s16(orig, chroma_multiplier); + // Maximum value of |combined_u| is 127*255 = 0x7E81. + const int16x8_t combined = vhaddq_s16(weighted_luma, weighted_chroma); + // Maximum value of u_offset is (255 << 5) = 0x1FE0. + // 0x7E81 + 0x1FE0 = 0x9E61, therefore another halving add is required. 
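+  // Together, the two halving adds and the shift by 4 implement
+  // ((weighted_luma + weighted_chroma) >> 6) + offset, and vqshrun_n_s16
+  // saturates the resulting scaling-table index into [0, 255].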
+ const uint8x8_t merged = vqshrun_n_s16(vhaddq_s16(offset, combined), 4); + vst1_u8(merged_buffer, merged); + const int16x8_t scaling = + GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer); + int16x8_t noise = GetSignedSource8(noise_image_cursor); + noise = ScaleNoise(noise, scaling, scaling_shift_vect); + return vaddq_s16(orig, noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_NEON( + const Array2D<int8_t>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, + const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row, + ptrdiff_t source_stride_y, const uint8_t* in_chroma_row, + ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row, + ptrdiff_t dest_stride) { + const int16x8_t floor = vdupq_n_s16(min_value); + const int16x8_t ceiling = vdupq_n_s16(max_chroma); + // In 8bpp, the maximum upscaled noise is 127*255 = 0x7E81, which is safe + // for 16 bit signed integers. In higher bitdepths, however, we have to + // expand to 32 to protect the sign bit. + const int16x8_t scaling_shift_vect = vdupq_n_s16(-scaling_shift); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + const int safe_chroma_width = chroma_width & ~7; + uint8_t luma_buffer[16]; + const int16x8_t offset = vdupq_n_s16(chroma_offset << 5); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + do { + const int luma_x = x << subsampling_x; + const int16x8_t average_luma = vreinterpretq_s16_u16( + GetAverageLuma(&in_y_row[luma_x], subsampling_x)); + const int16x8_t blended = BlendChromaValsNoCfl( + scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]), + average_luma, scaling_shift_vect, offset, luma_multiplier, + chroma_multiplier); + // In 8bpp, when params_.clip_to_restricted_range == false, we can + // replace clipping with vqmovun_s16, but the gain would be small. + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + + x += 8; + } while (x < safe_chroma_width); + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + + const int16x8_t average_luma = + vreinterpretq_s16_u16(GetAverageLuma(luma_buffer, subsampling_x)); + const int16x8_t blended = BlendChromaValsNoCfl( + scaling_lut, &in_chroma_row[x], &(noise_image[y + start_height][x]), + average_luma, scaling_shift_vect, offset, luma_multiplier, + chroma_multiplier); + StoreUnsigned8(&out_chroma_row[x], + vreinterpretq_u16_s16(Clip3(blended, floor, ceiling))); + // End of right edge iteration. + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. 
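+// Each plane carries its own offset and multipliers, and the scaling-table
+// index is computed from a weighted blend of luma and chroma rather than
+// from luma alone.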
+void BlendNoiseWithImageChroma8bpp_NEON( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast<const Array2D<int8_t>*>(noise_image_ptr); + const auto* in_y = static_cast<const uint8_t*>(source_plane_y); + const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv); + auto* out_uv = static_cast<uint8_t*>(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier; + BlendChromaPlane8bpp_NEON(noise_image[plane], min_value, max_chroma, width, + height, start_height, subsampling_x, subsampling_y, + params.chroma_scaling, offset, multiplier, + luma_multiplier, scaling_lut, in_y, source_stride_y, + in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +inline void WriteOverlapLine8bpp_NEON(const int8_t* noise_stripe_row, + const int8_t* noise_stripe_row_prev, + int plane_width, + const int8x8_t grain_coeff, + const int8x8_t old_coeff, + int8_t* noise_image_row) { + int x = 0; + do { + // Note that these reads may exceed noise_stripe_row's width by up to 7 + // bytes. + const int8x8_t source_grain = vld1_s8(noise_stripe_row + x); + const int8x8_t source_old = vld1_s8(noise_stripe_row_prev + x); + const int16x8_t weighted_grain = vmull_s8(grain_coeff, source_grain); + const int16x8_t grain = vmlal_s8(weighted_grain, old_coeff, source_old); + // Note that this write may exceed noise_image_row's width by up to 7 bytes. 
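+    // vqrshrn_n_s16 rounds the weighted sum, scales it by 1/32, and
+    // saturates it back into the int8 grain range.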
+ vst1_s8(noise_image_row + x, vqrshrn_n_s16(grain, 5)); + x += 8; + } while (x < plane_width); +} + +void ConstructNoiseImageOverlap8bpp_NEON(const void* noise_stripes_buffer, + int width, int height, + int subsampling_x, int subsampling_y, + void* noise_image_buffer) { + const auto* noise_stripes = + static_cast<const Array2DView<int8_t>*>(noise_stripes_buffer); + auto* noise_image = static_cast<Array2D<int8_t>*>(noise_image_buffer); + const int plane_width = (width + subsampling_x) >> subsampling_x; + const int plane_height = (height + subsampling_y) >> subsampling_y; + const int stripe_height = 32 >> subsampling_y; + const int stripe_mask = stripe_height - 1; + int y = stripe_height; + int luma_num = 1; + if (subsampling_y == 0) { + const int8x8_t first_row_grain_coeff = vdup_n_s8(17); + const int8x8_t first_row_old_coeff = vdup_n_s8(27); + const int8x8_t second_row_grain_coeff = first_row_old_coeff; + const int8x8_t second_row_old_coeff = first_row_grain_coeff; + for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) { + const int8_t* noise_stripe = (*noise_stripes)[luma_num]; + const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine8bpp_NEON( + noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + + WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width], + &noise_stripe_prev[(32 + 1) * plane_width], + plane_width, second_row_grain_coeff, + second_row_old_coeff, (*noise_image)[y + 1]); + } + // Either one partial stripe remains (remaining_height > 0), + // OR image is less than one stripe high (remaining_height < 0), + // OR all stripes are completed (remaining_height == 0). + const int remaining_height = plane_height - y; + if (remaining_height <= 0) { + return; + } + const int8_t* noise_stripe = (*noise_stripes)[luma_num]; + const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine8bpp_NEON( + noise_stripe, &noise_stripe_prev[32 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + + if (remaining_height > 1) { + WriteOverlapLine8bpp_NEON(&noise_stripe[plane_width], + &noise_stripe_prev[(32 + 1) * plane_width], + plane_width, second_row_grain_coeff, + second_row_old_coeff, (*noise_image)[y + 1]); + } + } else { // subsampling_y == 1 + const int8x8_t first_row_grain_coeff = vdup_n_s8(22); + const int8x8_t first_row_old_coeff = vdup_n_s8(23); + for (; y < plane_height; ++luma_num, y += stripe_height) { + const int8_t* noise_stripe = (*noise_stripes)[luma_num]; + const int8_t* noise_stripe_prev = (*noise_stripes)[luma_num - 1]; + WriteOverlapLine8bpp_NEON( + noise_stripe, &noise_stripe_prev[16 * plane_width], plane_width, + first_row_grain_coeff, first_row_old_coeff, (*noise_image)[y]); + } + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 1>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 2>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_NEON<8, int8_t, 3>; + + // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag] + // Chroma autoregression should never be called when lag is 0 and use_luma + // is false. 
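+  // With lag 0 and use_luma == true, the filter reduces to the single luma
+  // term, so that entry is still populated.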
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<8, int8_t, 3, true>; + + dsp->film_grain.construct_noise_image_overlap = + ConstructNoiseImageOverlap8bpp_NEON; + + dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON; + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_NEON<8, int8_t, uint8_t>; + dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_NEON; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_NEON<8, int8_t, uint8_t>; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + // LumaAutoRegressionFunc + dsp->film_grain.luma_auto_regression[0] = + ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 1>; + dsp->film_grain.luma_auto_regression[1] = + ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 2>; + dsp->film_grain.luma_auto_regression[2] = + ApplyAutoRegressiveFilterToLumaGrain_NEON<10, int16_t, 3>; + + // ChromaAutoRegressionFunc[use_luma][auto_regression_coeff_lag][subsampling] + // Chroma autoregression should never be called when lag is 0 and use_luma + // is false. 
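+  // As in Init8bpp, the [0][0] entry stays nullptr. Entries not assigned
+  // here, such as construct_noise_image_overlap and blend_noise_chroma[0],
+  // keep their portable C implementations.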
+ dsp->film_grain.chroma_auto_regression[0][0] = nullptr; + dsp->film_grain.chroma_auto_regression[0][1] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, false>; + dsp->film_grain.chroma_auto_regression[0][2] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, false>; + dsp->film_grain.chroma_auto_regression[0][3] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, false>; + dsp->film_grain.chroma_auto_regression[1][0] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 0, true>; + dsp->film_grain.chroma_auto_regression[1][1] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 1, true>; + dsp->film_grain.chroma_auto_regression[1][2] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 2, true>; + dsp->film_grain.chroma_auto_regression[1][3] = + ApplyAutoRegressiveFilterToChromaGrains_NEON<10, int16_t, 3, true>; + + dsp->film_grain.initialize_scaling_lut = InitializeScalingLookupTable_NEON; + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_NEON<10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_NEON<10, int16_t, uint16_t>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace film_grain + +void FilmGrainInit_NEON() { + film_grain::low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + film_grain::high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_NEON + +namespace libgav1 { +namespace dsp { + +void FilmGrainInit_NEON() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_ENABLE_NEON |