diff options
Diffstat (limited to 'src/dsp/x86/film_grain_sse4.cc')
-rw-r--r-- | src/dsp/x86/film_grain_sse4.cc | 514 |
1 files changed, 514 insertions, 0 deletions
diff --git a/src/dsp/x86/film_grain_sse4.cc b/src/dsp/x86/film_grain_sse4.cc new file mode 100644 index 0000000..745c1ca --- /dev/null +++ b/src/dsp/x86/film_grain_sse4.cc @@ -0,0 +1,514 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/film_grain.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_SSE4_1 +#include <smmintrin.h> + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "src/dsp/common.h" +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/film_grain_common.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/logging.h" + +namespace libgav1 { +namespace dsp { +namespace film_grain { +namespace { + +// Load 8 values from source, widening to int16_t intermediate value size. +// The function is overloaded for each type and bitdepth for simplicity. +inline __m128i LoadSource(const int8_t* src) { + return _mm_cvtepi8_epi16(LoadLo8(src)); +} + +// Load 8 values from source, widening to int16_t intermediate value size. +inline __m128i LoadSource(const uint8_t* src) { + return _mm_cvtepu8_epi16(LoadLo8(src)); +} + +inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) { + return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range)); +} + +// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value. +inline void StoreUnsigned(uint8_t* dest, const __m128i data) { + StoreLo8(dest, _mm_packus_epi16(data, data)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// Load 8 values from source. +inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); } + +// Load 8 values from source. +inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); } + +// Store 8 values to dest. +inline void StoreUnsigned(uint16_t* dest, const __m128i data) { + StoreUnaligned16(dest, data); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16(luma); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8(luma)); +} + +inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range); + + return RightShiftWithRounding_U16( + _mm_hadd_epi16(_mm_cvtepu8_epi16(src), + _mm_unpackhi_epi8(src, _mm_setzero_si128())), + 1); + } + return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range)); +} + +#if LIBGAV1_MAX_BITDEPTH >= 10 +// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed. +inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1); + } + return LoadUnaligned16(luma); +} + +inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x, + int valid_range) { + if (subsampling_x != 0) { + return RightShiftWithRounding_U16( + _mm_hadd_epi16( + LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)), + LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))), + 1); + } + return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)); +} +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +inline __m128i Clip3(const __m128i value, const __m128i low, + const __m128i high) { + const __m128i clipped_to_ceiling = _mm_min_epi16(high, value); + return _mm_max_epi16(low, clipped_to_ceiling); +} + +template <int bitdepth, typename Pixel> +inline __m128i GetScalingFactors( + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) { + alignas(16) int16_t start_vals[8]; + if (bitdepth == 8) { + // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut. + // Currently this code results in a series of movzbl. + for (int i = 0; i < 8; ++i) { + start_vals[i] = scaling_lut[source[i]]; + } + return LoadAligned16(start_vals); + } + alignas(16) int16_t end_vals[8]; + // TODO(petersonab): Precompute this into a larger table for direct lookups. + for (int i = 0; i < 8; ++i) { + const int index = source[i] >> 2; + start_vals[i] = scaling_lut[index]; + end_vals[i] = scaling_lut[index + 1]; + } + const __m128i start = LoadAligned16(start_vals); + const __m128i end = LoadAligned16(end_vals); + __m128i remainder = LoadSource(source); + remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1); + const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder); + return _mm_add_epi16(start, delta); +} + +// |scaling_shift| is in range [8,11]. +template <int bitdepth> +inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling, + const __m128i scaling_shift) { + const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift); + return _mm_mulhrs_epi16(noise, shifted_scale_factors); +} + +template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageLuma_SSE4_1( + const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift, + int width, int height, int start_height, + const uint8_t scaling_lut_y[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y, + ptrdiff_t dest_stride_y) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y_row = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + auto* out_y_row = static_cast<Pixel*>(dest_plane_y); + dest_stride_y /= sizeof(Pixel); + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_luma); + const int safe_width = width & ~7; + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + int y = 0; + do { + int x = 0; + for (; x < safe_width; x += 8) { + // TODO(b/133525232): Make 16-pixel version of loop body. + const __m128i orig = LoadSource(&in_y_row[x]); + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]); + __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift); + const __m128i combined = _mm_add_epi16(orig, noise); + StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling)); + } + + if (x < width) { + Pixel luma_buffer[8]; + // Prevent arbitrary indices from entering GetScalingFactors. + memset(luma_buffer, 0, sizeof(luma_buffer)); + const int valid_range = width - x; + memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const __m128i orig = LoadSource(&in_y_row[x]); + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer); + __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x])); + + noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift); + const __m128i combined = _mm_add_epi16(orig, noise); + StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling)); + } + in_y_row += source_stride_y; + out_y_row += dest_stride_y; + } while (++y < height); + out_y_row = static_cast<Pixel*>(dest_plane_y); +} + +template <int bitdepth, typename GrainType, typename Pixel> +inline __m128i BlendChromaValsWithCfl( + const Pixel* average_luma_buffer, + const uint8_t scaling_lut[kScalingLookupTableSize], + const Pixel* chroma_cursor, const GrainType* noise_image_cursor, + const __m128i scaling_shift) { + const __m128i scaling = + GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer); + const __m128i orig = LoadSource(chroma_cursor); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); +} + +template <int bitdepth, typename GrainType, typename Pixel> +LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1( + const Array2D<GrainType>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, + const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row, + ptrdiff_t source_stride_y, const Pixel* in_chroma_row, + ptrdiff_t source_stride_chroma, Pixel* out_chroma_row, + ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + alignas(16) Pixel luma_buffer[16]; + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final pixel will + // need to be guarded from overread, even if |chroma_width| is divisible by 8. + const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + + // Writing to this buffer avoids the cost of doing 8 lane lookups in a row + // in GetScalingFactors. + Pixel average_luma_buffer[8]; + assert(start_height % 2 == 0); + start_height >>= subsampling_y; + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + // TODO(petersonab): Consider specializing by subsampling_x. In the 444 + // case &in_y_row[x] can be passed to GetScalingFactors directly. + const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + StoreUnsigned(average_luma_buffer, average_luma); + + const __m128i blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), derived_scaling_shift); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + // This section only runs if width % (8 << sub_x) != 0. It should never run + // on 720p and above. + if (x < chroma_width) { + // Prevent huge indices from entering GetScalingFactors due to + // uninitialized values. This is not a problem in 8bpp because the table + // is made larger than 255 values. + if (bitdepth > 8) { + memset(luma_buffer, 0, sizeof(luma_buffer)); + } + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + StoreUnsigned(average_luma_buffer, average_luma); + + const __m128i blended = + BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>( + average_luma_buffer, scaling_lut, &in_chroma_row[x], + &(noise_image[y + start_height][x]), derived_scaling_shift); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == true. +// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y. +template <int bitdepth, typename GrainType, typename Pixel> +void BlendNoiseWithImageChromaWithCfl_SSE4_1( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + const auto* noise_image = + static_cast<const Array2D<GrainType>*>(noise_image_ptr); + const auto* in_y = static_cast<const Pixel*>(source_plane_y); + source_stride_y /= sizeof(Pixel); + + const auto* in_uv = static_cast<const Pixel*>(source_plane_uv); + source_stride_uv /= sizeof(Pixel); + auto* out_uv = static_cast<Pixel*>(dest_plane_uv); + dest_stride_uv /= sizeof(Pixel); + BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y, + source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv); +} + +} // namespace + +namespace low_bitdepth { +namespace { + +// |offset| is 32x4 packed to add with the result of _mm_madd_epi16. +inline __m128i BlendChromaValsNoCfl8bpp( + const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig, + const int8_t* noise_image_cursor, const __m128i& average_luma, + const __m128i& scaling_shift, const __m128i& offset, + const __m128i& weights) { + uint8_t merged_buffer[8]; + const __m128i combined_lo = + _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights); + const __m128i combined_hi = + _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights); + const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6), + _mm_srai_epi32((combined_hi), 6)); + + const __m128i merged = _mm_add_epi16(merged_base, offset); + + StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged)); + const __m128i scaling = + GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer); + __m128i noise = LoadSource(noise_image_cursor); + noise = ScaleNoise<8>(noise, scaling, scaling_shift); + return _mm_add_epi16(orig, noise); +} + +LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1( + const Array2D<int8_t>& noise_image, int min_value, int max_chroma, + int width, int height, int start_height, int subsampling_x, + int subsampling_y, int scaling_shift, int chroma_offset, + int chroma_multiplier, int luma_multiplier, + const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row, + ptrdiff_t source_stride_y, const uint8_t* in_chroma_row, + ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row, + ptrdiff_t dest_stride) { + const __m128i floor = _mm_set1_epi16(min_value); + const __m128i ceiling = _mm_set1_epi16(max_chroma); + + const int chroma_height = (height + subsampling_y) >> subsampling_y; + const int chroma_width = (width + subsampling_x) >> subsampling_x; + // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel + // will need to be guarded from overread, even if |chroma_width| is a + // multiple of 8. + const int safe_chroma_width = (chroma_width - (width & 1)) & ~7; + alignas(16) uint8_t luma_buffer[16]; + const __m128i offset = _mm_set1_epi16(chroma_offset); + const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) | + (luma_multiplier & 0xFFFF)); + const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift); + + start_height >>= subsampling_y; + int y = 0; + do { + int x = 0; + for (; x < safe_chroma_width; x += 8) { + const int luma_x = x << subsampling_x; + const __m128i average_luma = + GetAverageLuma(&in_y_row[luma_x], subsampling_x); + const __m128i orig_chroma = LoadSource(&in_chroma_row[x]); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + } + + if (x < chroma_width) { + // Begin right edge iteration. Same as the normal iterations, but the + // |average_luma| computation requires a duplicated luma value at the + // end. + const int luma_x = x << subsampling_x; + const int valid_range = width - luma_x; + assert(valid_range < 16); + // There is no need to pre-initialize this buffer, because merged values + // used as indices are saturated in the 8bpp case. Uninitialized values + // are written outside the frame. + memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0])); + luma_buffer[valid_range] = in_y_row[width - 1]; + const int valid_range_chroma = chroma_width - x; + uint8_t chroma_buffer[8]; + memcpy(chroma_buffer, &in_chroma_row[x], + valid_range_chroma * sizeof(in_chroma_row[0])); + + const __m128i average_luma = + GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1); + const __m128i orig_chroma = + LoadSourceMsan(chroma_buffer, valid_range_chroma); + const __m128i blended = BlendChromaValsNoCfl8bpp( + scaling_lut, orig_chroma, &(noise_image[y + start_height][x]), + average_luma, derived_scaling_shift, offset, multipliers); + StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling)); + // End of right edge iteration. + } + + in_y_row += source_stride_y << subsampling_y; + in_chroma_row += source_stride_chroma; + out_chroma_row += dest_stride; + } while (++y < chroma_height); +} + +// This function is for the case params_.chroma_scaling_from_luma == false. +void BlendNoiseWithImageChroma8bpp_SSE4_1( + Plane plane, const FilmGrainParams& params, const void* noise_image_ptr, + int min_value, int max_chroma, int width, int height, int start_height, + int subsampling_x, int subsampling_y, + const uint8_t scaling_lut[kScalingLookupTableSize], + const void* source_plane_y, ptrdiff_t source_stride_y, + const void* source_plane_uv, ptrdiff_t source_stride_uv, + void* dest_plane_uv, ptrdiff_t dest_stride_uv) { + assert(plane == kPlaneU || plane == kPlaneV); + const auto* noise_image = + static_cast<const Array2D<int8_t>*>(noise_image_ptr); + const auto* in_y = static_cast<const uint8_t*>(source_plane_y); + const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv); + auto* out_uv = static_cast<uint8_t*>(dest_plane_uv); + + const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset; + const int luma_multiplier = + (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier; + const int multiplier = + (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier; + BlendChromaPlane8bpp_SSE4_1( + noise_image[plane], min_value, max_chroma, width, height, start_height, + subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier, + luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv, + source_stride_uv, out_uv, dest_stride_uv); +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>; + dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>; +} + +} // namespace +} // namespace low_bitdepth + +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void Init10bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + + dsp->film_grain.blend_noise_luma = + BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>; + dsp->film_grain.blend_noise_chroma[1] = + BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +} // namespace film_grain + +void FilmGrainInit_SSE4_1() { + film_grain::low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + film_grain::high_bitdepth::Init10bpp(); +#endif // LIBGAV1_MAX_BITDEPTH >= 10 +} + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_ENABLE_SSE4_1 + +namespace libgav1 { +namespace dsp { + +void FilmGrainInit_SSE4_1() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_SSE4_1 |