aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/inverse_transform_10bit_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/inverse_transform_10bit_neon.cc')
-rw-r--r--src/dsp/arm/inverse_transform_10bit_neon.cc728
1 files changed, 485 insertions, 243 deletions
diff --git a/src/dsp/arm/inverse_transform_10bit_neon.cc b/src/dsp/arm/inverse_transform_10bit_neon.cc
index ff184a1..617accc 100644
--- a/src/dsp/arm/inverse_transform_10bit_neon.cc
+++ b/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -67,7 +67,8 @@ LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
//------------------------------------------------------------------------------
template <int store_count>
-LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* LIBGAV1_RESTRICT dst,
+ int32_t stride, int32_t idx,
const int32x4_t* const s) {
assert(store_count % 4 == 0);
for (int i = 0; i < store_count; i += 4) {
@@ -79,8 +80,8 @@ LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
}
template <int load_count>
-LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
- int32_t idx, int32x4_t* x) {
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* LIBGAV1_RESTRICT src,
+ int32_t stride, int32_t idx, int32x4_t* x) {
assert(load_count % 4 == 0);
for (int i = 0; i < load_count; i += 4) {
x[i] = vld1q_s32(&src[i * stride + idx]);
@@ -168,8 +169,8 @@ LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
}
LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
- bool flip, const int32x4_t* min,
- const int32x4_t* max) {
+ bool flip, const int32x4_t min,
+ const int32x4_t max) {
int32x4_t x, y;
if (flip) {
y = vqaddq_s32(*b, *a);
@@ -178,8 +179,8 @@ LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
x = vqaddq_s32(*a, *b);
y = vqsubq_s32(*a, *b);
}
- *a = vmaxq_s32(vminq_s32(x, *max), *min);
- *b = vmaxq_s32(vminq_s32(y, *max), *min);
+ *a = vmaxq_s32(vminq_s32(x, max), min);
+ *b = vmaxq_s32(vminq_s32(y, max), min);
}
using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
@@ -248,8 +249,8 @@ LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
template <ButterflyRotationFunc butterfly_rotation,
bool is_fast_butterfly = false>
-LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
- const int32x4_t* max,
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
const bool is_last_stage) {
// stage 12.
if (is_fast_butterfly) {
@@ -293,12 +294,12 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
s[2] = x[1];
s[3] = x[3];
- Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+ Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
if (is_row) {
const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
- for (int i = 0; i < 4; ++i) {
- s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ for (auto& i : s) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
Transpose4x4(s, s);
}
@@ -307,8 +308,8 @@ LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
template <ButterflyRotationFunc butterfly_rotation,
bool is_fast_butterfly = false>
-LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
- const int32x4_t* max,
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
const bool is_last_stage) {
// stage 8.
if (is_fast_butterfly) {
@@ -370,13 +371,13 @@ LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
s[6] = x[3];
s[7] = x[7];
- Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
- Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+ Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
if (is_row) {
const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
- for (int i = 0; i < 8; ++i) {
- s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ for (auto& i : s) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
Transpose4x4(&s[0], &s[0]);
Transpose4x4(&s[4], &s[4]);
@@ -389,8 +390,8 @@ LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
template <ButterflyRotationFunc butterfly_rotation,
bool is_fast_butterfly = false>
-LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
- const int32x4_t* max,
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
const bool is_last_stage) {
// stage 5.
if (is_fast_butterfly) {
@@ -487,14 +488,14 @@ LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
s[14] = x[7];
s[15] = x[15];
- Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
- Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
- Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+ Dct4Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/false);
+ Dct16Stages<butterfly_rotation>(s, min, max, /*is_last_stage=*/true);
if (is_row) {
const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
- for (int i = 0; i < 16; ++i) {
- s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ for (auto& i : s) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
for (int idx = 0; idx < 16; idx += 8) {
Transpose4x4(&s[idx], &s[idx]);
@@ -509,8 +510,8 @@ LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
template <ButterflyRotationFunc butterfly_rotation,
bool is_fast_butterfly = false>
-LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
- const int32x4_t* max,
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t min,
+ const int32x4_t max,
const bool is_last_stage) {
// stage 3
if (is_fast_butterfly) {
@@ -677,10 +678,10 @@ LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
s[30] = x[15];
s[31] = x[31];
- Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
- Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
- Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
- Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);
+ Dct4Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4>(s, min, max, /*is_last_stage=*/true);
if (is_row) {
const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
@@ -688,8 +689,8 @@ LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
int32x4_t output[8];
Transpose4x4(&s[idx], &output[0]);
Transpose4x4(&s[idx + 4], &output[4]);
- for (int i = 0; i < 8; ++i) {
- output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ for (auto& o : output) {
+ o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift)));
}
StoreDst<4>(dst, step, idx, &output[0]);
StoreDst<4>(dst, step, idx + 4, &output[4]);
@@ -764,13 +765,13 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
s[62] = x[31];
Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
- s, &min, &max, /*is_last_stage=*/false);
+ s, min, max, /*is_last_stage=*/false);
Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
- s, &min, &max, /*is_last_stage=*/false);
+ s, min, max, /*is_last_stage=*/false);
Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
- s, &min, &max, /*is_last_stage=*/false);
+ s, min, max, /*is_last_stage=*/false);
Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
- s, &min, &max, /*is_last_stage=*/false);
+ s, min, max, /*is_last_stage=*/false);
//-- start dct 64 stages
// stage 2.
@@ -792,22 +793,22 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
// stage 4.
- HadamardRotation(&s[32], &s[33], false, &min, &max);
- HadamardRotation(&s[34], &s[35], true, &min, &max);
- HadamardRotation(&s[36], &s[37], false, &min, &max);
- HadamardRotation(&s[38], &s[39], true, &min, &max);
- HadamardRotation(&s[40], &s[41], false, &min, &max);
- HadamardRotation(&s[42], &s[43], true, &min, &max);
- HadamardRotation(&s[44], &s[45], false, &min, &max);
- HadamardRotation(&s[46], &s[47], true, &min, &max);
- HadamardRotation(&s[48], &s[49], false, &min, &max);
- HadamardRotation(&s[50], &s[51], true, &min, &max);
- HadamardRotation(&s[52], &s[53], false, &min, &max);
- HadamardRotation(&s[54], &s[55], true, &min, &max);
- HadamardRotation(&s[56], &s[57], false, &min, &max);
- HadamardRotation(&s[58], &s[59], true, &min, &max);
- HadamardRotation(&s[60], &s[61], false, &min, &max);
- HadamardRotation(&s[62], &s[63], true, &min, &max);
+ HadamardRotation(&s[32], &s[33], false, min, max);
+ HadamardRotation(&s[34], &s[35], true, min, max);
+ HadamardRotation(&s[36], &s[37], false, min, max);
+ HadamardRotation(&s[38], &s[39], true, min, max);
+ HadamardRotation(&s[40], &s[41], false, min, max);
+ HadamardRotation(&s[42], &s[43], true, min, max);
+ HadamardRotation(&s[44], &s[45], false, min, max);
+ HadamardRotation(&s[46], &s[47], true, min, max);
+ HadamardRotation(&s[48], &s[49], false, min, max);
+ HadamardRotation(&s[50], &s[51], true, min, max);
+ HadamardRotation(&s[52], &s[53], false, min, max);
+ HadamardRotation(&s[54], &s[55], true, min, max);
+ HadamardRotation(&s[56], &s[57], false, min, max);
+ HadamardRotation(&s[58], &s[59], true, min, max);
+ HadamardRotation(&s[60], &s[61], false, min, max);
+ HadamardRotation(&s[62], &s[63], true, min, max);
// stage 7.
ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
@@ -820,22 +821,22 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
// stage 11.
- HadamardRotation(&s[32], &s[35], false, &min, &max);
- HadamardRotation(&s[33], &s[34], false, &min, &max);
- HadamardRotation(&s[36], &s[39], true, &min, &max);
- HadamardRotation(&s[37], &s[38], true, &min, &max);
- HadamardRotation(&s[40], &s[43], false, &min, &max);
- HadamardRotation(&s[41], &s[42], false, &min, &max);
- HadamardRotation(&s[44], &s[47], true, &min, &max);
- HadamardRotation(&s[45], &s[46], true, &min, &max);
- HadamardRotation(&s[48], &s[51], false, &min, &max);
- HadamardRotation(&s[49], &s[50], false, &min, &max);
- HadamardRotation(&s[52], &s[55], true, &min, &max);
- HadamardRotation(&s[53], &s[54], true, &min, &max);
- HadamardRotation(&s[56], &s[59], false, &min, &max);
- HadamardRotation(&s[57], &s[58], false, &min, &max);
- HadamardRotation(&s[60], &s[63], true, &min, &max);
- HadamardRotation(&s[61], &s[62], true, &min, &max);
+ HadamardRotation(&s[32], &s[35], false, min, max);
+ HadamardRotation(&s[33], &s[34], false, min, max);
+ HadamardRotation(&s[36], &s[39], true, min, max);
+ HadamardRotation(&s[37], &s[38], true, min, max);
+ HadamardRotation(&s[40], &s[43], false, min, max);
+ HadamardRotation(&s[41], &s[42], false, min, max);
+ HadamardRotation(&s[44], &s[47], true, min, max);
+ HadamardRotation(&s[45], &s[46], true, min, max);
+ HadamardRotation(&s[48], &s[51], false, min, max);
+ HadamardRotation(&s[49], &s[50], false, min, max);
+ HadamardRotation(&s[52], &s[55], true, min, max);
+ HadamardRotation(&s[53], &s[54], true, min, max);
+ HadamardRotation(&s[56], &s[59], false, min, max);
+ HadamardRotation(&s[57], &s[58], false, min, max);
+ HadamardRotation(&s[60], &s[63], true, min, max);
+ HadamardRotation(&s[61], &s[62], true, min, max);
// stage 16.
ButterflyRotation_4(&s[61], &s[34], 56, true);
@@ -848,22 +849,22 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
// stage 21.
- HadamardRotation(&s[32], &s[39], false, &min, &max);
- HadamardRotation(&s[33], &s[38], false, &min, &max);
- HadamardRotation(&s[34], &s[37], false, &min, &max);
- HadamardRotation(&s[35], &s[36], false, &min, &max);
- HadamardRotation(&s[40], &s[47], true, &min, &max);
- HadamardRotation(&s[41], &s[46], true, &min, &max);
- HadamardRotation(&s[42], &s[45], true, &min, &max);
- HadamardRotation(&s[43], &s[44], true, &min, &max);
- HadamardRotation(&s[48], &s[55], false, &min, &max);
- HadamardRotation(&s[49], &s[54], false, &min, &max);
- HadamardRotation(&s[50], &s[53], false, &min, &max);
- HadamardRotation(&s[51], &s[52], false, &min, &max);
- HadamardRotation(&s[56], &s[63], true, &min, &max);
- HadamardRotation(&s[57], &s[62], true, &min, &max);
- HadamardRotation(&s[58], &s[61], true, &min, &max);
- HadamardRotation(&s[59], &s[60], true, &min, &max);
+ HadamardRotation(&s[32], &s[39], false, min, max);
+ HadamardRotation(&s[33], &s[38], false, min, max);
+ HadamardRotation(&s[34], &s[37], false, min, max);
+ HadamardRotation(&s[35], &s[36], false, min, max);
+ HadamardRotation(&s[40], &s[47], true, min, max);
+ HadamardRotation(&s[41], &s[46], true, min, max);
+ HadamardRotation(&s[42], &s[45], true, min, max);
+ HadamardRotation(&s[43], &s[44], true, min, max);
+ HadamardRotation(&s[48], &s[55], false, min, max);
+ HadamardRotation(&s[49], &s[54], false, min, max);
+ HadamardRotation(&s[50], &s[53], false, min, max);
+ HadamardRotation(&s[51], &s[52], false, min, max);
+ HadamardRotation(&s[56], &s[63], true, min, max);
+ HadamardRotation(&s[57], &s[62], true, min, max);
+ HadamardRotation(&s[58], &s[61], true, min, max);
+ HadamardRotation(&s[59], &s[60], true, min, max);
// stage 25.
ButterflyRotation_4(&s[59], &s[36], 48, true);
@@ -876,22 +877,22 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
ButterflyRotation_4(&s[52], &s[43], 112, true);
// stage 28.
- HadamardRotation(&s[32], &s[47], false, &min, &max);
- HadamardRotation(&s[33], &s[46], false, &min, &max);
- HadamardRotation(&s[34], &s[45], false, &min, &max);
- HadamardRotation(&s[35], &s[44], false, &min, &max);
- HadamardRotation(&s[36], &s[43], false, &min, &max);
- HadamardRotation(&s[37], &s[42], false, &min, &max);
- HadamardRotation(&s[38], &s[41], false, &min, &max);
- HadamardRotation(&s[39], &s[40], false, &min, &max);
- HadamardRotation(&s[48], &s[63], true, &min, &max);
- HadamardRotation(&s[49], &s[62], true, &min, &max);
- HadamardRotation(&s[50], &s[61], true, &min, &max);
- HadamardRotation(&s[51], &s[60], true, &min, &max);
- HadamardRotation(&s[52], &s[59], true, &min, &max);
- HadamardRotation(&s[53], &s[58], true, &min, &max);
- HadamardRotation(&s[54], &s[57], true, &min, &max);
- HadamardRotation(&s[55], &s[56], true, &min, &max);
+ HadamardRotation(&s[32], &s[47], false, min, max);
+ HadamardRotation(&s[33], &s[46], false, min, max);
+ HadamardRotation(&s[34], &s[45], false, min, max);
+ HadamardRotation(&s[35], &s[44], false, min, max);
+ HadamardRotation(&s[36], &s[43], false, min, max);
+ HadamardRotation(&s[37], &s[42], false, min, max);
+ HadamardRotation(&s[38], &s[41], false, min, max);
+ HadamardRotation(&s[39], &s[40], false, min, max);
+ HadamardRotation(&s[48], &s[63], true, min, max);
+ HadamardRotation(&s[49], &s[62], true, min, max);
+ HadamardRotation(&s[50], &s[61], true, min, max);
+ HadamardRotation(&s[51], &s[60], true, min, max);
+ HadamardRotation(&s[52], &s[59], true, min, max);
+ HadamardRotation(&s[53], &s[58], true, min, max);
+ HadamardRotation(&s[54], &s[57], true, min, max);
+ HadamardRotation(&s[55], &s[56], true, min, max);
// stage 30.
ButterflyRotation_4(&s[55], &s[40], 32, true);
@@ -905,10 +906,10 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
// stage 31.
for (int i = 0; i < 32; i += 4) {
- HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
- HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
- HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
- HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
+ HadamardRotation(&s[i], &s[63 - i], false, min, max);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false, min, max);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false, min, max);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false, min, max);
}
//-- end dct 64 stages
if (is_row) {
@@ -917,8 +918,8 @@ void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
int32x4_t output[8];
Transpose4x4(&s[idx], &output[0]);
Transpose4x4(&s[idx + 4], &output[4]);
- for (int i = 0; i < 8; ++i) {
- output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ for (auto& o : output) {
+ o = vmovl_s16(vqmovn_s32(vqrshlq_s32(o, v_row_shift)));
}
StoreDst<4>(dst, step, idx, &output[0]);
StoreDst<4>(dst, step, idx + 4, &output[4]);
@@ -1089,20 +1090,20 @@ LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
butterfly_rotation(&s[6], &s[7], 60 - 48, true);
// stage 3.
- HadamardRotation(&s[0], &s[4], false, &min, &max);
- HadamardRotation(&s[1], &s[5], false, &min, &max);
- HadamardRotation(&s[2], &s[6], false, &min, &max);
- HadamardRotation(&s[3], &s[7], false, &min, &max);
+ HadamardRotation(&s[0], &s[4], false, min, max);
+ HadamardRotation(&s[1], &s[5], false, min, max);
+ HadamardRotation(&s[2], &s[6], false, min, max);
+ HadamardRotation(&s[3], &s[7], false, min, max);
// stage 4.
butterfly_rotation(&s[4], &s[5], 48 - 0, true);
butterfly_rotation(&s[7], &s[6], 48 - 32, true);
// stage 5.
- HadamardRotation(&s[0], &s[2], false, &min, &max);
- HadamardRotation(&s[4], &s[6], false, &min, &max);
- HadamardRotation(&s[1], &s[3], false, &min, &max);
- HadamardRotation(&s[5], &s[7], false, &min, &max);
+ HadamardRotation(&s[0], &s[2], false, min, max);
+ HadamardRotation(&s[4], &s[6], false, min, max);
+ HadamardRotation(&s[1], &s[3], false, min, max);
+ HadamardRotation(&s[5], &s[7], false, min, max);
// stage 6.
butterfly_rotation(&s[2], &s[3], 32, true);
@@ -1120,8 +1121,8 @@ LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
if (is_row) {
const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
- for (int i = 0; i < 8; ++i) {
- x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ for (auto& i : x) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
Transpose4x4(&x[0], &x[0]);
Transpose4x4(&x[4], &x[4]);
@@ -1289,14 +1290,14 @@ LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
butterfly_rotation(&s[14], &s[15], 62 - 56, true);
// stage 3.
- HadamardRotation(&s[0], &s[8], false, &min, &max);
- HadamardRotation(&s[1], &s[9], false, &min, &max);
- HadamardRotation(&s[2], &s[10], false, &min, &max);
- HadamardRotation(&s[3], &s[11], false, &min, &max);
- HadamardRotation(&s[4], &s[12], false, &min, &max);
- HadamardRotation(&s[5], &s[13], false, &min, &max);
- HadamardRotation(&s[6], &s[14], false, &min, &max);
- HadamardRotation(&s[7], &s[15], false, &min, &max);
+ HadamardRotation(&s[0], &s[8], false, min, max);
+ HadamardRotation(&s[1], &s[9], false, min, max);
+ HadamardRotation(&s[2], &s[10], false, min, max);
+ HadamardRotation(&s[3], &s[11], false, min, max);
+ HadamardRotation(&s[4], &s[12], false, min, max);
+ HadamardRotation(&s[5], &s[13], false, min, max);
+ HadamardRotation(&s[6], &s[14], false, min, max);
+ HadamardRotation(&s[7], &s[15], false, min, max);
// stage 4.
butterfly_rotation(&s[8], &s[9], 56 - 0, true);
@@ -1305,14 +1306,14 @@ LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
butterfly_rotation(&s[15], &s[14], 8 + 32, true);
// stage 5.
- HadamardRotation(&s[0], &s[4], false, &min, &max);
- HadamardRotation(&s[8], &s[12], false, &min, &max);
- HadamardRotation(&s[1], &s[5], false, &min, &max);
- HadamardRotation(&s[9], &s[13], false, &min, &max);
- HadamardRotation(&s[2], &s[6], false, &min, &max);
- HadamardRotation(&s[10], &s[14], false, &min, &max);
- HadamardRotation(&s[3], &s[7], false, &min, &max);
- HadamardRotation(&s[11], &s[15], false, &min, &max);
+ HadamardRotation(&s[0], &s[4], false, min, max);
+ HadamardRotation(&s[8], &s[12], false, min, max);
+ HadamardRotation(&s[1], &s[5], false, min, max);
+ HadamardRotation(&s[9], &s[13], false, min, max);
+ HadamardRotation(&s[2], &s[6], false, min, max);
+ HadamardRotation(&s[10], &s[14], false, min, max);
+ HadamardRotation(&s[3], &s[7], false, min, max);
+ HadamardRotation(&s[11], &s[15], false, min, max);
// stage 6.
butterfly_rotation(&s[4], &s[5], 48 - 0, true);
@@ -1321,14 +1322,14 @@ LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
butterfly_rotation(&s[15], &s[14], 48 - 32, true);
// stage 7.
- HadamardRotation(&s[0], &s[2], false, &min, &max);
- HadamardRotation(&s[4], &s[6], false, &min, &max);
- HadamardRotation(&s[8], &s[10], false, &min, &max);
- HadamardRotation(&s[12], &s[14], false, &min, &max);
- HadamardRotation(&s[1], &s[3], false, &min, &max);
- HadamardRotation(&s[5], &s[7], false, &min, &max);
- HadamardRotation(&s[9], &s[11], false, &min, &max);
- HadamardRotation(&s[13], &s[15], false, &min, &max);
+ HadamardRotation(&s[0], &s[2], false, min, max);
+ HadamardRotation(&s[4], &s[6], false, min, max);
+ HadamardRotation(&s[8], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[14], false, min, max);
+ HadamardRotation(&s[1], &s[3], false, min, max);
+ HadamardRotation(&s[5], &s[7], false, min, max);
+ HadamardRotation(&s[9], &s[11], false, min, max);
+ HadamardRotation(&s[13], &s[15], false, min, max);
// stage 8.
butterfly_rotation(&s[2], &s[3], 32, true);
@@ -1356,8 +1357,8 @@ LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
if (is_row) {
const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
- for (int i = 0; i < 16; ++i) {
- x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ for (auto& i : x) {
+ i = vmovl_s16(vqmovn_s32(vqrshlq_s32(i, v_row_shift)));
}
for (int idx = 0; idx < 16; idx += 8) {
Transpose4x4(&x[idx], &x[idx]);
@@ -1517,59 +1518,23 @@ LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
template <int identity_size>
LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
Array2DView<uint16_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int32_t* source) {
- static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+ const int tx_width, const int tx_height,
+ const int32_t* LIBGAV1_RESTRICT source) {
+ static_assert(identity_size == 4 || identity_size == 8 ||
+ identity_size == 16 || identity_size == 32,
"Invalid identity_size.");
const int stride = frame.columns();
- uint16_t* dst = frame[start_y] + start_x;
+ uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
- if (tx_width == 4) {
- int i = 0;
- do {
- int32x4x2_t v_src, v_dst_i, a, b;
- v_src.val[0] = vld1q_s32(&source[i * 4]);
- v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
- if (identity_size == 4) {
- v_dst_i.val[0] =
- vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
- v_dst_i.val[1] =
- vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
- a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
- a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
- } else if (identity_size == 8) {
- v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
- v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
- a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
- a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
- } else { // identity_size == 16
- v_dst_i.val[0] =
- vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
- v_dst_i.val[1] =
- vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
- a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
- a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
- }
- uint16x4x2_t frame_data;
- frame_data.val[0] = vld1_u16(dst);
- frame_data.val[1] = vld1_u16(dst + stride);
- b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
- b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
- vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
- vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
- dst += stride << 1;
- i += 2;
- } while (i < tx_height);
- } else {
- int i = 0;
- do {
- const int row = i * tx_width;
- int j = 0;
+ if (identity_size < 32) {
+ if (tx_width == 4) {
+ int i = 0;
do {
int32x4x2_t v_src, v_dst_i, a, b;
- v_src.val[0] = vld1q_s32(&source[row + j]);
- v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ v_src.val[0] = vld1q_s32(&source[i * 4]);
+ v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
if (identity_size == 4) {
v_dst_i.val[0] =
vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
@@ -1591,13 +1556,72 @@ LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
}
uint16x4x2_t frame_data;
- frame_data.val[0] = vld1_u16(dst + j);
- frame_data.val[1] = vld1_u16(dst + j + 4);
+ frame_data.val[0] = vld1_u16(dst);
+ frame_data.val[1] = vld1_u16(dst + stride);
b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
- vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
- vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
- j += 8;
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ dst += stride << 1;
+ i += 2;
+ } while (i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ b.val[0] =
+ vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] =
+ vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4,
+ vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ const int32x4_t v_dst_i = vld1q_s32(&source[row + j]);
+ const uint16x4_t frame_data = vld1_u16(dst + j);
+ const int32x4_t a = vrshrq_n_s32(v_dst_i, 2);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ const uint16x4_t d = vmin_u16(vqmovun_s32(b), v_max_bitdepth);
+ vst1_u16(dst + j, d);
+ j += 4;
} while (j < tx_width);
dst += stride;
} while (++i < tx_height);
@@ -1606,9 +1630,10 @@ LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
Array2DView<uint16_t> frame, const int start_x, const int start_y,
- const int tx_width, const int tx_height, const int32_t* source) {
+ const int tx_width, const int tx_height,
+ const int32_t* LIBGAV1_RESTRICT source) {
const int stride = frame.columns();
- uint16_t* dst = frame[start_y] + start_x;
+ uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
@@ -1747,6 +1772,119 @@ LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
return true;
}
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
+ const int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // ((A * 4) + 1) >> 1) to (A * 2).
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 32; j += 4) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step + j]);
+ const int32x4_t v_dst_i = vqaddq_s32(v_src, v_src);
+ vst1q_s32(&dst[i * step + j], v_dst_i);
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x2_t v_src0 = vdup_n_s32(dst[0]);
+ const int32x2_t v_src =
+ vqrdmulh_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ // When combining the identity32 multiplier with the row shift, the
+ // calculation for tx_height equal to 16 can be simplified from
+ // ((A * 4) + 1) >> 1) to (A * 2).
+ const int32x2_t v_dst_0 = vqadd_s32(v_src, v_src);
+ vst1_lane_s32(dst, v_dst_0, 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Walsh Hadamard Transform.
+
+// Process 4 wht4 rows and columns.
+LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint16_t* LIBGAV1_RESTRICT dst,
+ const int dst_stride,
+ const void* LIBGAV1_RESTRICT source,
+ const int adjusted_tx_height) {
+ const auto* const src = static_cast<const int32_t*>(source);
+ int32x4_t s[4];
+
+ if (adjusted_tx_height == 1) {
+ // Special case: only src[0] is nonzero.
+ // src[0] 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ // 0 0 0 0
+ //
+ // After the row and column transforms are applied, we have:
+ // f h h h
+ // g i i i
+ // g i i i
+ // g i i i
+ // where f, g, h, i are computed as follows.
+ int32_t f = (src[0] >> 2) - (src[0] >> 3);
+ const int32_t g = f >> 1;
+ f = f - (f >> 1);
+ const int32_t h = (src[0] >> 3) - (src[0] >> 4);
+ const int32_t i = (src[0] >> 4);
+ s[0] = vdupq_n_s32(h);
+ s[0] = vsetq_lane_s32(f, s[0], 0);
+ s[1] = vdupq_n_s32(i);
+ s[1] = vsetq_lane_s32(g, s[1], 0);
+ s[2] = s[3] = s[1];
+ } else {
+ // Load the 4x4 source in transposed form.
+ int32x4x4_t columns = vld4q_s32(src);
+
+ // Shift right and permute the columns for the WHT.
+ s[0] = vshrq_n_s32(columns.val[0], 2);
+ s[2] = vshrq_n_s32(columns.val[1], 2);
+ s[3] = vshrq_n_s32(columns.val[2], 2);
+ s[1] = vshrq_n_s32(columns.val[3], 2);
+
+ // Row transforms.
+ s[0] = vaddq_s32(s[0], s[2]);
+ s[3] = vsubq_s32(s[3], s[1]);
+ int32x4_t e = vhsubq_s32(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsubq_s32(e, s[1]);
+ s[2] = vsubq_s32(e, s[2]);
+ s[0] = vsubq_s32(s[0], s[1]);
+ s[3] = vaddq_s32(s[3], s[2]);
+
+ int32x4_t x[4];
+ Transpose4x4(s, x);
+
+ s[0] = x[0];
+ s[2] = x[1];
+ s[3] = x[2];
+ s[1] = x[3];
+
+ // Column transforms.
+ s[0] = vaddq_s32(s[0], s[2]);
+ s[3] = vsubq_s32(s[3], s[1]);
+ e = vhsubq_s32(s[0], s[3]); // e = (s[0] - s[3]) >> 1
+ s[1] = vsubq_s32(e, s[1]);
+ s[2] = vsubq_s32(e, s[2]);
+ s[0] = vsubq_s32(s[0], s[1]);
+ s[3] = vaddq_s32(s[3], s[2]);
+ }
+
+ // Store to frame.
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+ for (int row = 0; row < 4; row += 1) {
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t b = vaddw_s16(s[row], vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += dst_stride;
+ }
+}
+
//------------------------------------------------------------------------------
// row/column transform loops
@@ -1837,11 +1975,12 @@ LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
template <int tx_height, bool enable_flip_rows = false>
LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
Array2DView<uint16_t> frame, const int start_x, const int start_y,
- const int tx_width, const int32_t* source, TransformType tx_type) {
+ const int tx_width, const int32_t* LIBGAV1_RESTRICT source,
+ TransformType tx_type) {
const bool flip_rows =
enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
const int stride = frame.columns();
- uint16_t* dst = frame[start_y] + start_x;
+ uint16_t* LIBGAV1_RESTRICT dst = frame[start_y] + start_x;
if (tx_width == 4) {
for (int i = 0; i < tx_height; ++i) {
@@ -1887,7 +2026,7 @@ void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_height = kTransformHeight[tx_size];
const bool should_round = (tx_height == 8);
- const int row_shift = (tx_height == 16);
+ const int row_shift = static_cast<int>(tx_height == 16);
if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
return;
@@ -1909,8 +2048,10 @@ void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
}
void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -1962,8 +2103,10 @@ void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
}
void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2014,8 +2157,10 @@ void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
}
void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2066,8 +2211,10 @@ void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
}
void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2117,8 +2264,10 @@ void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
}
void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2168,8 +2317,10 @@ void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
}
void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2222,8 +2373,10 @@ void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
}
void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2275,8 +2428,10 @@ void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
void Adst16TransformLoopColumn_NEON(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
- int start_x, int start_y, void* dst_frame) {
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2335,9 +2490,10 @@ void Identity4TransformLoopRow_NEON(TransformType tx_type,
void Identity4TransformLoopColumn_NEON(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2416,9 +2572,10 @@ void Identity8TransformLoopRow_NEON(TransformType tx_type,
void Identity8TransformLoopColumn_NEON(TransformType tx_type,
TransformSize tx_size,
- int adjusted_tx_height, void* src_buffer,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
int start_x, int start_y,
- void* dst_frame) {
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2457,8 +2614,9 @@ void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
void Identity16TransformLoopColumn_NEON(TransformType tx_type,
TransformSize tx_size,
int adjusted_tx_height,
- void* src_buffer, int start_x,
- int start_y, void* dst_frame) {
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
auto* src = static_cast<int32_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
@@ -2470,60 +2628,144 @@ void Identity16TransformLoopColumn_NEON(TransformType tx_type,
adjusted_tx_height, src);
}
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ const int tx_height = kTransformHeight[tx_size];
+
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from ((A * 4) + 2) >> 2) to A.
+ if ((tx_height & 0x28) != 0) {
+ return;
+ }
+
+ // Process kTransformSize32x16. The src is always rounded before the identity
+ // transform and shifted by 1 afterwards.
+ auto* src = static_cast<int32_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = adjusted_tx_height;
+ do {
+ Identity32Row16_NEON(src, /*step=*/32);
+ src += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/, void* /*src_buffer*/,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+ // Do both row and column transforms in the column-transform pass.
+}
+
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height,
+ void* LIBGAV1_RESTRICT src_buffer,
+ int start_x, int start_y,
+ void* LIBGAV1_RESTRICT dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Process 4 1d wht4 rows and columns in parallel.
+ const auto* src = static_cast<int32_t*>(src_buffer);
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ uint16_t* dst = frame[start_y] + start_x;
+ const int dst_stride = frame.columns();
+ Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
+}
+
//------------------------------------------------------------------------------
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
// Maximum transform size for Dct is 64.
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kRow] =
Dct4TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize4][kColumn] =
Dct4TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kRow] =
Dct8TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize8][kColumn] =
Dct8TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kRow] =
Dct16TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize16][kColumn] =
Dct16TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kRow] =
Dct32TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize32][kColumn] =
Dct32TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kRow] =
Dct64TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ dsp->inverse_transforms[kTransform1dDct][kTransform1dSize64][kColumn] =
Dct64TransformLoopColumn_NEON;
// Maximum transform size for Adst is 16.
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kRow] =
Adst4TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize4][kColumn] =
Adst4TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kRow] =
Adst8TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize8][kColumn] =
Adst8TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kRow] =
Adst16TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ dsp->inverse_transforms[kTransform1dAdst][kTransform1dSize16][kColumn] =
Adst16TransformLoopColumn_NEON;
// Maximum transform size for Identity transform is 32.
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kRow] =
Identity4TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize4][kColumn] =
Identity4TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kRow] =
Identity8TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize8][kColumn] =
Identity8TransformLoopColumn_NEON;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kRow] =
Identity16TransformLoopRow_NEON;
- dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize16][kColumn] =
Identity16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kRow] =
+ Identity32TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dIdentity][kTransform1dSize32][kColumn] =
+ Identity32TransformLoopColumn_NEON;
+
+ // Maximum transform size for Wht is 4.
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kRow] =
+ Wht4TransformLoopRow_NEON;
+ dsp->inverse_transforms[kTransform1dWht][kTransform1dSize4][kColumn] =
+ Wht4TransformLoopColumn_NEON;
}
} // namespace