aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/super_res_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/super_res_neon.cc')
-rw-r--r--src/dsp/arm/super_res_neon.cc151
1 files changed, 143 insertions, 8 deletions
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
index 1680450..91537c4 100644
--- a/src/dsp/arm/super_res_neon.cc
+++ b/src/dsp/arm/super_res_neon.cc
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/arm/common_neon.h"
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
@@ -20,6 +19,7 @@
#include <arm_neon.h>
+#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
@@ -82,10 +82,10 @@ inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
}
void SuperRes_NEON(const void* const coefficients, void* const source,
- const ptrdiff_t stride, const int height,
+ const ptrdiff_t source_stride, const int height,
const int downscaled_width, const int upscaled_width,
const int initial_subpixel_x, const int step,
- void* const dest) {
+ void* const dest, const ptrdiff_t dest_stride) {
auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<uint8_t*>(dest);
int y = height;
@@ -100,7 +100,7 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
int x = RightShiftWithCeiling(upscaled_width, 4);
// The below code calculates up to 15 extra upscaled
// pixels which will over-read up to 15 downscaled pixels in the end of each
- // row. kSuperResHorizontalBorder accounts for this.
+ // row. kSuperResHorizontalPadding accounts for this.
do {
for (int i = 0; i < 8; ++i, subpixel_x += step) {
sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
@@ -135,8 +135,8 @@ void SuperRes_NEON(const void* const coefficients, void* const source,
vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
dst_ptr += 16;
} while (--x != 0);
- src += stride;
- dst += stride;
+ src += source_stride;
+ dst += dest_stride;
} while (--y != 0);
}
@@ -149,12 +149,147 @@ void Init8bpp() {
} // namespace
} // namespace low_bitdepth
-void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint16x8_t filter[8];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t filter_8 =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ // uint8_t -> uint16_t
+ filter[i] = vmovl_u8(filter_8);
+ }
+
+ Transpose8x8(filter);
+
+ vst1q_u16(dst, filter[0]);
+ dst += 8;
+ vst1q_u16(dst, filter[1]);
+ dst += 8;
+ vst1q_u16(dst, filter[2]);
+ dst += 8;
+ vst1q_u16(dst, filter[3]);
+ dst += 8;
+ vst1q_u16(dst, filter[4]);
+ dst += 8;
+ vst1q_u16(dst, filter[5]);
+ dst += 8;
+ vst1q_u16(dst, filter[6]);
+ dst += 8;
+ vst1q_u16(dst, filter[7]);
+ dst += 8;
+ } while (--x != 0);
+}
+
+// The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then
+// subtracting all negative with saturation will clip to zero.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+ const uint16_t** coefficients, int bitdepth) {
+ uint16x8_t f[kSuperResFilterTaps];
+ for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+ f[i] = vld1q_u16(*coefficients);
+ }
+
+ uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+ uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+ res_lo = vqsubq_u32(res_lo, temp_lo);
+
+ uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+ uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+ res_hi = vqsubq_u32(res_hi, temp_hi);
+
+ const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+ vqrshrn_n_u32(res_hi, kFilterBits));
+
+ // Clip the result at (1 << bd) - 1.
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint16x8_t sr[8];
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ // The below code calculates up to 7 extra upscaled
+ // pixels which will over-read up to 7 downscaled pixels in the end of each
+ // row. kSuperResHorizontalBorder accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+
+ Transpose8x8(sr);
+
+ const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+ vst1q_u16(dst_ptr, d0);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON<10>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {