diff options
author | Boyuan Yang <byang@debian.org> | 2022-07-14 15:56:57 -0400 |
---|---|---|
committer | Boyuan Yang <byang@debian.org> | 2022-07-14 15:56:57 -0400 |
commit | d4dbf19f6b0181ee78034bfe4caf189d1c016998 (patch) | |
tree | 47d5d28d2ab770a10e6c48788725c51dffeb84a9 /src/dsp/arm/inverse_transform_10bit_neon.cc | |
parent | 320ef65362608ee1148c299d8d5d7618af34e470 (diff) | |
download | libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.gz libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.tar.bz2 libgav1-d4dbf19f6b0181ee78034bfe4caf189d1c016998.zip |
New upstream version 0.18.0
Diffstat (limited to 'src/dsp/arm/inverse_transform_10bit_neon.cc')
-rw-r--r-- | src/dsp/arm/inverse_transform_10bit_neon.cc | 28 |
1 file changed, 20 insertions, 8 deletions
```diff
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
index 617accc..e6f0d9d 100644
--- a/src/dsp/arm/inverse_transform_10bit_neon.cc
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -282,9 +282,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
   const int32x4_t max = vdupq_n_s32((1 << range) - 1);
   int32x4_t s[4], x[4];
 
-  LoadSrc<4>(dst, step, 0, x);
   if (is_row) {
-    Transpose4x4(x, x);
+    assert(step == 4);
+    int32x4x4_t y = vld4q_s32(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    LoadSrc<4>(dst, step, 0, x);
   }
 
   // stage 1.
@@ -301,9 +304,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
     for (auto& i : s) {
       i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
     }
-    Transpose4x4(s, s);
+    int32x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = s[i];
+    vst4q_s32(dst, y);
+  } else {
+    StoreDst<4>(dst, step, 0, s);
   }
-  StoreDst<4>(dst, step, 0, s);
 }
 
 template <ButterflyRotationFunc butterfly_rotation,
@@ -937,9 +943,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
   int32x4_t s[8];
   int32x4_t x[4];
 
-  LoadSrc<4>(dst, step, 0, x);
   if (is_row) {
-    Transpose4x4(x, x);
+    assert(step == 4);
+    int32x4x4_t y = vld4q_s32(dst);
+    for (int i = 0; i < 4; ++i) x[i] = y.val[i];
+  } else {
+    LoadSrc<4>(dst, step, 0, x);
   }
 
   // stage 1.
@@ -981,9 +990,12 @@ LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
     x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
     x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
     x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
-    Transpose4x4(x, x);
+    int32x4x4_t y;
+    for (int i = 0; i < 4; ++i) y.val[i] = x[i];
+    vst4q_s32(dst, y);
+  } else {
+    StoreDst<4>(dst, step, 0, x);
   }
-  StoreDst<4>(dst, step, 0, x);
 }
 
 alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
```