aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/intrapred_neon.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/intrapred_neon.cc')
-rw-r--r--src/dsp/arm/intrapred_neon.cc247
1 files changed, 246 insertions, 1 deletions
diff --git a/src/dsp/arm/intrapred_neon.cc b/src/dsp/arm/intrapred_neon.cc
index c967d82..c143648 100644
--- a/src/dsp/arm/intrapred_neon.cc
+++ b/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -964,6 +965,200 @@ struct DcDefs {
using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
};
+// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
+
+template <int block_height>
+void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t row = vld1_dup_u16(left + y);
+ vst1_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t row = vld1q_dup_u16(left + y);
+ vst1q_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ vst1q_u16(dst16 + 16, row0);
+ vst1q_u16(dst16 + 24, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ vst1q_u16(dst16 + 16, row1);
+ vst1q_u16(dst16 + 24, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
+
+template <int block_height>
+void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x8_t row = vld1_u8(top);
+ int y = block_height;
+ do {
+ vst1_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row = vld1q_u8(top);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ const uint8x16_t row4 = vld1q_u8(top + 64);
+ const uint8x16_t row5 = vld1q_u8(top + 80);
+ const uint8x16_t row6 = vld1q_u8(top + 96);
+ const uint8x16_t row7 = vld1q_u8(top + 112);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -973,6 +1168,8 @@ void Init10bpp() {
DcDefs::_4x4::DcLeft;
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Vertical4xH_NEON<4>;
// 4x8
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -981,6 +1178,10 @@ void Init10bpp() {
DcDefs::_4x8::DcLeft;
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Vertical4xH_NEON<8>;
// 4x16
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -989,6 +1190,10 @@ void Init10bpp() {
DcDefs::_4x16::DcLeft;
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Vertical4xH_NEON<16>;
// 8x4
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -997,6 +1202,8 @@ void Init10bpp() {
DcDefs::_8x4::DcLeft;
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Vertical8xH_NEON<4>;
// 8x8
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1005,6 +1212,10 @@ void Init10bpp() {
DcDefs::_8x8::DcLeft;
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Vertical8xH_NEON<8>;
// 8x16
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1013,6 +1224,8 @@ void Init10bpp() {
DcDefs::_8x16::DcLeft;
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Vertical8xH_NEON<16>;
// 8x32
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1021,6 +1234,10 @@ void Init10bpp() {
DcDefs::_8x32::DcLeft;
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Vertical8xH_NEON<32>;
// 16x4
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1029,6 +1246,8 @@ void Init10bpp() {
DcDefs::_16x4::DcLeft;
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Vertical16xH_NEON<4>;
// 16x8
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1037,6 +1256,10 @@ void Init10bpp() {
DcDefs::_16x8::DcLeft;
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Horizontal16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Vertical16xH_NEON<8>;
// 16x16
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1045,6 +1268,8 @@ void Init10bpp() {
DcDefs::_16x16::DcLeft;
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Vertical16xH_NEON<16>;
// 16x32
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1053,6 +1278,8 @@ void Init10bpp() {
DcDefs::_16x32::DcLeft;
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Vertical16xH_NEON<32>;
// 16x64
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1061,6 +1288,8 @@ void Init10bpp() {
DcDefs::_16x64::DcLeft;
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Vertical16xH_NEON<64>;
// 32x8
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1069,6 +1298,8 @@ void Init10bpp() {
DcDefs::_32x8::DcLeft;
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Vertical32xH_NEON<8>;
// 32x16
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1077,6 +1308,8 @@ void Init10bpp() {
DcDefs::_32x16::DcLeft;
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Vertical32xH_NEON<16>;
// 32x32
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1085,6 +1318,8 @@ void Init10bpp() {
DcDefs::_32x32::DcLeft;
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Vertical32xH_NEON<32>;
// 32x64
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1093,6 +1328,10 @@ void Init10bpp() {
DcDefs::_32x64::DcLeft;
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Horizontal32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Vertical32xH_NEON<64>;
// 64x16
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1101,6 +1340,8 @@ void Init10bpp() {
DcDefs::_64x16::DcLeft;
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Vertical64xH_NEON<16>;
// 64x32
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1109,6 +1350,8 @@ void Init10bpp() {
DcDefs::_64x32::DcLeft;
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Vertical64xH_NEON<32>;
// 64x64
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1117,6 +1360,8 @@ void Init10bpp() {
DcDefs::_64x64::DcLeft;
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Vertical64xH_NEON<64>;
}
} // namespace
@@ -1133,7 +1378,7 @@ void IntraPredInit_NEON() {
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {