diff options
Diffstat (limited to 'src/dsp/x86/film_grain_sse4.cc')
-rw-r--r-- | src/dsp/x86/film_grain_sse4.cc | 106 |
1 files changed, 43 insertions, 63 deletions
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc index 745c1ca..9ece947 100644 --- a/src/dsp/x86/film_grain_sse4.cc +++ b/src/dsp/x86/film_grain_sse4.cc @@ -126,30 +126,16 @@ inline __m128i Clip3(const __m128i value, const __m128i low, } template <int bitdepth, typename Pixel> -inline __m128i GetScalingFactors( - const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { +inline __m128i GetScalingFactors(const int16_t* scaling_lut, + const Pixel* source) { alignas(16) int16_t start_vals[8]; - if (bitdepth == 8) { - // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut. - // Currently this code results in a series of movzbl. - for (int i = 0; i < 8; ++i) { - start_vals[i] = scaling_lut[source[i]]; - } - return LoadAligned16(start_vals); - } - alignas(16) int16_t end_vals[8]; - // TODO(petersonab): Precompute this into a larger table for direct lookups. + static_assert(bitdepth <= kBitdepth10, + "SSE4 Film Grain is not yet implemented for 12bpp."); for (int i = 0; i < 8; ++i) { - const int index = source[i] >> 2; - start_vals[i] = scaling_lut[index]; - end_vals[i] = scaling_lut[index + 1]; + assert(source[i] < kScalingLookupTableSize << (bitdepth - 2)); + start_vals[i] = scaling_lut[source[i]]; } - const __m128i start = LoadAligned16(start_vals); - const __m128i end = LoadAligned16(end_vals); - __m128i remainder = LoadSource(source); - remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1); - const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder); - return _mm_add_epi16(start, delta); + return LoadAligned16(start_vals); } // |scaling_shift| is in range [8,11]. @@ -162,11 +148,10 @@ inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling, template <int bitdepth, typename GrainType, typename Pixel> void BlendNoiseWithImageLuma_SSE4_1( - const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift, - int width, int height, int start_height, - const uint8_t scaling_lut_y[kScalingLookupTableSize], - const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y, - ptrdiff_t dest_stride_y) { + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_luma, + int scaling_shift, int width, int height, int start_height, + const int16_t* scaling_lut_y, const void* source_plane_y, + ptrdiff_t source_stride_y, void* dest_plane_y, ptrdiff_t dest_stride_y) { const auto* noise_image = static_cast<const Array2D<GrainType>*>(noise_image_ptr); const auto* in_y_row = static_cast<const Pixel*>(source_plane_y); @@ -181,7 +166,6 @@ void BlendNoiseWithImageLuma_SSE4_1( do { int x = 0; for (; x < safe_width; x += 8) { - // TODO(b/133525232): Make 16-pixel version of loop body. const __m128i orig = LoadSource(&in_y_row[x]); const __m128i scaling = GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); @@ -216,9 +200,9 @@ void BlendNoiseWithImageLuma_SSE4_1( template <int bitdepth, typename GrainType, typename Pixel> inline __m128i BlendChromaValsWithCfl( - const Pixel* average_luma_buffer, - const uint8_t scaling_lut[kScalingLookupTableSize], - const Pixel* chroma_cursor, const GrainType* noise_image_cursor, + const Pixel* LIBGAV1_RESTRICT average_luma_buffer, + const int16_t* scaling_lut, const Pixel* LIBGAV1_RESTRICT chroma_cursor, + const GrainType* LIBGAV1_RESTRICT noise_image_cursor, const __m128i scaling_shift) { const __m128i scaling = GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); @@ -232,11 +216,10 @@ template <int bitdepth, typename GrainType, typename Pixel> LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( const Array2D<GrainType>& noise_image, int min_value, int max_chroma, int width, int height, int start_height, int subsampling_x, - int subsampling_y, int scaling_shift, - const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row, - ptrdiff_t source_stride_y, const Pixel* in_chroma_row, - ptrdiff_t source_stride_chroma, Pixel* out_chroma_row, - ptrdiff_t dest_stride) { + int subsampling_y, int scaling_shift, const int16_t* scaling_lut, + const Pixel* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const Pixel* in_chroma_row, ptrdiff_t source_stride_chroma, + Pixel* out_chroma_row, ptrdiff_t dest_stride) { const __m128i floor = _mm_set1_epi16(min_value); const __m128i ceiling = _mm_set1_epi16(max_chroma); alignas(16) Pixel luma_buffer[16]; @@ -258,8 +241,6 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( int x = 0; for (; x < safe_chroma_width; x += 8) { const int luma_x = x << subsampling_x; - // TODO(petersonab): Consider specializing by subsampling_x. In the 444 - // case &in_y_row[x] can be passed to GetScalingFactors directly. const __m128i average_luma = GetAverageLuma(&in_y_row[luma_x], subsampling_x); StoreUnsigned(average_luma_buffer, average_luma); @@ -277,7 +258,7 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( // Prevent huge indices from entering GetScalingFactors due to // uninitialized values. This is not a problem in 8bpp because the table // is made larger than 255 values. - if (bitdepth > 8) { + if (bitdepth > kBitdepth8) { memset(luma_buffer, 0, sizeof(luma_buffer)); } const int luma_x = x << subsampling_x; @@ -306,11 +287,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( // This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. template <int bitdepth, typename GrainType, typename Pixel> void BlendNoiseWithImageChromaWithCfl_SSE4_1( - Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, - int min_value, int max_chroma, int width, int height, int start_height, - int subsampling_x, int subsampling_y, - const uint8_t scaling_lut[kScalingLookupTableSize], - const void* source_plane_y, ptrdiff_t source_stride_y, + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, const void* source_plane_uv, ptrdiff_t source_stride_uv, void* dest_plane_uv, ptrdiff_t dest_stride_uv) { const auto* noise_image = @@ -335,10 +316,10 @@ namespace { // |offset| is 32x4 packed to add with the result of _mm_madd_epi16. inline __m128i BlendChromaValsNoCfl8bpp( - const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig, - const int8_t* noise_image_cursor, const __m128i& average_luma, - const __m128i& scaling_shift, const __m128i& offset, - const __m128i& weights) { + const int16_t* scaling_lut, const __m128i& orig, + const int8_t* LIBGAV1_RESTRICT noise_image_cursor, + const __m128i& average_luma, const __m128i& scaling_shift, + const __m128i& offset, const __m128i& weights) { uint8_t merged_buffer[8]; const __m128i combined_lo = _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights); @@ -351,9 +332,9 @@ inline __m128i BlendChromaValsNoCfl8bpp( StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged)); const __m128i scaling = - GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer); + GetScalingFactors<kBitdepth8, uint8_t>(scaling_lut, merged_buffer); __m128i noise = LoadSource(noise_image_cursor); - noise = ScaleNoise<8>(noise, scaling, scaling_shift); + noise = ScaleNoise<kBitdepth8>(noise, scaling, scaling_shift); return _mm_add_epi16(orig, noise); } @@ -361,11 +342,10 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( const Array2D<int8_t>& noise_image, int min_value, int max_chroma, int width, int height, int start_height, int subsampling_x, int subsampling_y, int scaling_shift, int chroma_offset, - int chroma_multiplier, int luma_multiplier, - const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row, - ptrdiff_t source_stride_y, const uint8_t* in_chroma_row, - ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row, - ptrdiff_t dest_stride) { + int chroma_multiplier, int luma_multiplier, const int16_t* scaling_lut, + const uint8_t* LIBGAV1_RESTRICT in_y_row, ptrdiff_t source_stride_y, + const uint8_t* in_chroma_row, ptrdiff_t source_stride_chroma, + uint8_t* out_chroma_row, ptrdiff_t dest_stride) { const __m128i floor = _mm_set1_epi16(min_value); const __m128i ceiling = _mm_set1_epi16(max_chroma); @@ -432,11 +412,11 @@ LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( // This function is for the case params_.chroma_scaling_from_luma == false. void BlendNoiseWithImageChroma8bpp_SSE4_1( - Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, - int min_value, int max_chroma, int width, int height, int start_height, - int subsampling_x, int subsampling_y, - const uint8_t scaling_lut[kScalingLookupTableSize], - const void* source_plane_y, ptrdiff_t source_stride_y, + Plane plane, const FilmGrainParams& params, + const void* LIBGAV1_RESTRICT noise_image_ptr, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, const int16_t* scaling_lut, + const void* LIBGAV1_RESTRICT source_plane_y, ptrdiff_t source_stride_y, const void* source_plane_uv, ptrdiff_t source_stride_uv, void* dest_plane_uv, ptrdiff_t dest_stride_uv) { assert(plane == kPlaneU || plane == kPlaneV); @@ -463,10 +443,10 @@ void Init8bpp() { assert(dsp != nullptr); dsp->film_grain.blend_noise_luma = - BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>; + BlendNoiseWithImageLuma_SSE4_1<kBitdepth8, int8_t, uint8_t>; dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1; dsp->film_grain.blend_noise_chroma[1] = - BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>; + BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth8, int8_t, uint8_t>; } } // namespace @@ -481,9 +461,9 @@ void Init10bpp() { assert(dsp != nullptr); dsp->film_grain.blend_noise_luma = - BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>; + BlendNoiseWithImageLuma_SSE4_1<kBitdepth10, int16_t, uint16_t>; dsp->film_grain.blend_noise_chroma[1] = - BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>; + BlendNoiseWithImageChromaWithCfl_SSE4_1<kBitdepth10, int16_t, uint16_t>; } } // namespace |