aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/x86/film_grain_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/film_grain_sse4.cc')
-rw-r--r--src/dsp/x86/film_grain_sse4.cc106
1 files changed, 43 insertions, 63 deletions
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc
index 745c1ca..9ece947 100644
--- a/src/dsp/x86/film_grain_sse4.cc
+++ b/src/dsp/x86/film_grain_sse4.cc
@@ -126,30 +126,16 @@ inline __m128i Clip3(const __m128i value, const __m128i low,
}
template <int bitdepth, typename Pixel>
-inline __m128i GetScalingFactors(
- const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+inline __m128i GetScalingFactors(const int16_t* scaling_lut,
+ const Pixel* source) {
alignas(16) int16_t start_vals[8];
- if (bitdepth == 8) {
- // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut.
- // Currently this code results in a series of movzbl.
- for (int i = 0; i < 8; ++i) {
- start_vals[i] = scaling_lut[source[i]];
- }
- return LoadAligned16(start_vals);
- }
- alignas(16) int16_t end_vals[8];
- // TODO(petersonab): Precompute this into a larger table for direct lookups.
+ static_assert(bitdepth <= kBitdepth10,
+ "SSE4 Film Grain is not yet implemented for 12bpp.");
for (int i = 0; i < 8; ++i) {
- const int index = source[i] >> 2;
- start_vals[i] = scaling_lut[index];
- end_vals[i] = scaling_lut[index + 1];
+ assert(source[i] < kScalingLookupTableSize << (bitdepth - 2));
+ start_vals[i] = scaling_lut[source[i]];
}
- const __m128i start = LoadAligned16(start_vals);
- const __m128i end = LoadAligned16(end_vals);
- __m128i remainder = LoadSource(source);
- remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1);
- const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder);
- return _mm_add_epi16(start, delta);
+ return LoadAligned16(start_vals);
}
// |scaling_shift| is in range [8,11].
@@ -162,11 +148,10 @@ inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageLuma_SSE4_1(
- const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
- int width, int height, int start_height,
- const uint8_t scaling_lut_y[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
- ptrdiff_t dest_stride_y) {
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma,
+ int scaling_shift, int width, int height, int start_height,
+ const int16_t* scaling_lut_y, const void* source_plane_y,
+ ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) {
const auto* noise_image =
static_cast<const Array2D<GrainType>*>(noise_image_ptr);
const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
@@ -181,7 +166,6 @@ void BlendNoiseWithImageLuma_SSE4_1(
do {
int x = 0;
for (; x < safe_width; x += 8) {
- // TODO(b/133525232): Make 16-pixel version of loop body.
const __m128i orig = LoadSource(&in_y_row[x]);
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
@@ -216,9 +200,9 @@ void BlendNoiseWithImageLuma_SSE4_1(
template <int bitdepth, typename GrainType, typename Pixel>
inline __m128i BlendChromaValsWithCfl(
- const Pixel* average_luma_buffer,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+ const Pixel* LIBGAV1_RESTRICT average_luma_buffer,
+ const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor,
+ const GrainType* LIBGAV1_RESTRICT noise_image_cursor,
const __m128i scaling_shift) {
const __m128i scaling =
GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
@@ -232,11 +216,10 @@ template <int bitdepth, typename GrainType, typename Pixel>
LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
int width, int height, int start_height, int subsampling_x,
- int subsampling_y, int scaling_shift,
- const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
- ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
- ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
- ptrdiff_t dest_stride) {
+ int subsampling_y, int scaling_shift, const int16_t* scaling_lut,
+ const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma,
+ Pixel* out_chroma_row, ptrdiff_t dest_stride) {
const __m128i floor = _mm_set1_epi16(min_value);
const __m128i ceiling = _mm_set1_epi16(max_chroma);
alignas(16) Pixel luma_buffer[16];
@@ -258,8 +241,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
int x = 0;
for (; x < safe_chroma_width; x += 8) {
const int luma_x = x << subsampling_x;
- // TODO(petersonab): Consider specializing by subsampling_x. In the 444
- // case &in_y_row[x] can be passed to GetScalingFactors directly.
const __m128i average_luma =
GetAverageLuma(&in_y_row[luma_x], subsampling_x);
StoreUnsigned(average_luma_buffer, average_luma);
@@ -277,7 +258,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
// Prevent huge indices from entering GetScalingFactors due to
// uninitialized values. This is not a problem in 8bpp because the table
// is made larger than 255 values.
- if (bitdepth > 8) {
+ if (bitdepth > kBitdepth8) {
memset(luma_buffer, 0, sizeof(luma_buffer));
}
const int luma_x = x << subsampling_x;
@@ -306,11 +287,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
template <int bitdepth, typename GrainType, typename Pixel>
void BlendNoiseWithImageChromaWithCfl_SSE4_1(
- Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
- int min_value, int max_chroma, int width, int height, int start_height,
- int subsampling_x, int subsampling_y,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y,
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_uv, ptrdiff_t source_stride_uv,
void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
const auto* noise_image =
@@ -335,10 +316,10 @@ namespace {
// |offset| is 32x4 packed to add with the result of _mm_madd_epi16.
inline __m128i BlendChromaValsNoCfl8bpp(
- const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig,
- const int8_t* noise_image_cursor, const __m128i& average_luma,
- const __m128i& scaling_shift, const __m128i& offset,
- const __m128i& weights) {
+ const int16_t* scaling_lut, const __m128i& orig,
+ const int8_t* LIBGAV1_RESTRICT noise_image_cursor,
+ const __m128i& average_luma, const __m128i& scaling_shift,
+ const __m128i& offset, const __m128i& weights) {
uint8_t merged_buffer[8];
const __m128i combined_lo =
_mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
@@ -351,9 +332,9 @@ inline __m128i BlendChromaValsNoCfl8bpp(
StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
const __m128i scaling =
- GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer);
__m128i noise = LoadSource(noise_image_cursor);
- noise = ScaleNoise<8>(noise, scaling, scaling_shift);
+ noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift);
return _mm_add_epi16(orig, noise);
}
@@ -361,11 +342,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
int width, int height, int start_height, int subsampling_x,
int subsampling_y, int scaling_shift, int chroma_offset,
- int chroma_multiplier, int luma_multiplier,
- const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
- ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
- ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
- ptrdiff_t dest_stride) {
+ int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut,
+ const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y,
+ const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma,
+ uint8_t* out_chroma_row, ptrdiff_t dest_stride) {
const __m128i floor = _mm_set1_epi16(min_value);
const __m128i ceiling = _mm_set1_epi16(max_chroma);
@@ -432,11 +412,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
// This function is for the case params_.chroma_scaling_from_luma == false.
void BlendNoiseWithImageChroma8bpp_SSE4_1(
- Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
- int min_value, int max_chroma, int width, int height, int start_height,
- int subsampling_x, int subsampling_y,
- const uint8_t scaling_lut[kScalingLookupTableSize],
- const void* source_plane_y, ptrdiff_t source_stride_y,
+ Plane plane, const FilmGrainParams& params,
+ const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, const int16_t* scaling_lut,
+ const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y,
const void* source_plane_uv, ptrdiff_t source_stride_uv,
void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
assert(plane == kPlaneU || plane == kPlaneV);
@@ -463,10 +443,10 @@ void Init8bpp() {
assert(dsp != nullptr);
dsp->film_grain.blend_noise_luma =
- BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>;
+ BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>;
dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
dsp->film_grain.blend_noise_chroma[1] =
- BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>;
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>;
}
} // namespace
@@ -481,9 +461,9 @@ void Init10bpp() {
assert(dsp != nullptr);
dsp->film_grain.blend_noise_luma =
- BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>;
+ BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>;
dsp->film_grain.blend_noise_chroma[1] =
- BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>;
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>;
}
} // namespace