diff options
Diffstat (limited to 'src/dsp/arm/super_res_neon.cc')
-rw-r--r-- | src/dsp/arm/super_res_neon.cc | 151 |
1 files changed, 143 insertions, 8 deletions
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc index 1680450..91537c4 100644 --- a/src/dsp/arm/super_res_neon.cc +++ b/src/dsp/arm/super_res_neon.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "src/dsp/arm/common_neon.h" #include "src/dsp/super_res.h" #include "src/utils/cpu.h" @@ -20,6 +19,7 @@ #include <arm_neon.h> +#include "src/dsp/arm/common_neon.h" #include "src/dsp/constants.h" #include "src/dsp/dsp.h" #include "src/utils/common.h" @@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps], } void SuperRes_NEON(const void* const coefficients, void* const source, - const ptrdiff_t stride, const int height, + const ptrdiff_t source_stride, const int height, const int downscaled_width, const int upscaled_width, const int initial_subpixel_x, const int step, - void* const dest) { + void* const dest, const ptrdiff_t dest_stride) { auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps); auto* dst = static_cast<uint8_t*>(dest); int y = height; @@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source, int x = RightShiftWithCeiling(upscaled_width, 4); // The below code calculates up to 15 extra upscaled // pixels which will over-read up to 15 downscaled pixels in the end of each - // row. kSuperResHorizontalBorder accounts for this. + // row. kSuperResHorizontalPadding accounts for this. do { for (int i = 0; i < 8; ++i, subpixel_x += step) { sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]); @@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source, vst1q_u8(dst_ptr, vcombine_u8(d0, d1)); dst_ptr += 16; } while (--x != 0); - src += stride; - dst += stride; + src += source_stride; + dst += dest_stride; } while (--y != 0); } @@ -149,12 +149,147 @@ void Init8bpp() { } // namespace } // namespace low_bitdepth -void SuperResInit_NEON() { low_bitdepth::Init8bpp(); } +//------------------------------------------------------------------------------ +#if LIBGAV1_MAX_BITDEPTH >= 10 +namespace high_bitdepth { +namespace { + +void SuperResCoefficients_NEON(const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const coefficients) { + auto* dst = static_cast<uint16_t*>(coefficients); + int subpixel_x = initial_subpixel_x; + int x = RightShiftWithCeiling(upscaled_width, 3); + do { + uint16x8_t filter[8]; + for (int i = 0; i < 8; ++i, subpixel_x += step) { + const uint8x8_t filter_8 = + vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >> + kSuperResExtraBits]); + // uint8_t -> uint16_t + filter[i] = vmovl_u8(filter_8); + } + + Transpose8x8(filter); + + vst1q_u16(dst, filter[0]); + dst += 8; + vst1q_u16(dst, filter[1]); + dst += 8; + vst1q_u16(dst, filter[2]); + dst += 8; + vst1q_u16(dst, filter[3]); + dst += 8; + vst1q_u16(dst, filter[4]); + dst += 8; + vst1q_u16(dst, filter[5]); + dst += 8; + vst1q_u16(dst, filter[6]); + dst += 8; + vst1q_u16(dst, filter[7]); + dst += 8; + } while (--x != 0); +} + +// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then +// subtracting all negative with saturation will clip to zero. +// 0 1 2 3 4 5 6 7 +// tap sign: - + - + + - + - +inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps], + const uint16_t** coefficients, int bitdepth) { + uint16x8_t f[kSuperResFilterTaps]; + for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) { + f[i] = vld1q_u16(*coefficients); + } + + uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4])); + res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6])); + + uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5])); + temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7])); + + res_lo = vqsubq_u32(res_lo, temp_lo); + + uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4])); + res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6])); + uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5])); + temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7])); + + res_hi = vqsubq_u32(res_hi, temp_hi); + + const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits), + vqrshrn_n_u32(res_hi, kFilterBits)); + + // Clip the result at (1 << bd) - 1. + return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1)); +} + +template <int bitdepth> +void SuperRes_NEON(const void* const coefficients, void* const source, + const ptrdiff_t source_stride, const int height, + const int downscaled_width, const int upscaled_width, + const int initial_subpixel_x, const int step, + void* const dest, const ptrdiff_t dest_stride) { + auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps); + auto* dst = static_cast<uint16_t*>(dest); + int y = height; + do { + const auto* filter = static_cast<const uint16_t*>(coefficients); + uint16_t* dst_ptr = dst; + ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width, + kSuperResHorizontalBorder, kSuperResHorizontalBorder); + int subpixel_x = initial_subpixel_x; + uint16x8_t sr[8]; + int x = RightShiftWithCeiling(upscaled_width, 3); + // The below code calculates up to 7 extra upscaled + // pixels which will over-read up to 7 downscaled pixels in the end of each + // row. kSuperResHorizontalBorder accounts for this. + do { + for (int i = 0; i < 8; ++i, subpixel_x += step) { + sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]); + } + + Transpose8x8(sr); + + const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth); + vst1q_u16(dst_ptr, d0); + dst_ptr += 8; + } while (--x != 0); + src += source_stride; + dst += dest_stride; + } while (--y != 0); +} + +void Init10bpp() { + Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10); + assert(dsp != nullptr); + dsp->super_res_coefficients = SuperResCoefficients_NEON; + dsp->super_res = SuperRes_NEON<10>; +} + +} // namespace +} // namespace high_bitdepth +#endif // LIBGAV1_MAX_BITDEPTH >= 10 + +void SuperResInit_NEON() { + low_bitdepth::Init8bpp(); +#if LIBGAV1_MAX_BITDEPTH >= 10 + high_bitdepth::Init10bpp(); +#endif +} } // namespace dsp } // namespace libgav1 -#else // !LIBGAV1_ENABLE_NEON +#else // !LIBGAV1_ENABLE_NEON namespace libgav1 { namespace dsp { |