aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/arm/common_neon.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/arm/common_neon.h')
-rw-r--r--src/dsp/arm/common_neon.h52
1 files changed, 34 insertions, 18 deletions
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index 9c46525..c0af2c1 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
return dst;
}
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+ const ptrdiff_t over_read_in_bytes) {
+ return vreinterpretq_u16_u8(
+ MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
inline uint8x8_t Load1MsanU8(const uint8_t* const source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(vld1_u8(source), over_read_in_bytes);
@@ -325,20 +331,6 @@ inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
}
-inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source,
- const ptrdiff_t over_read_in_bytes) {
- // Relative source index of elements (2 bytes each):
- // dst.val[0]: 00 02 04 06 08 10 12 14
- // dst.val[1]: 01 03 05 07 09 11 13 15
- uint16x8x2_t dst = vld2q_u16(source);
- dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ(
- vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1));
- dst.val[1] = vreinterpretq_u16_u8(
- MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]),
- (over_read_in_bytes >> 1) + (over_read_in_bytes % 4)));
- return dst;
-}
-
inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
const ptrdiff_t over_read_in_bytes) {
return vreinterpretq_u32_u8(MaskOverreadsQ(
@@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) {
vst1q_u16(static_cast<uint16_t*>(buf), val);
}
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+ // The memory shadow is incorrect for vst4q_u16, only marking the first 16
+ // bytes of the destination as initialized. To avoid missing truly
+ // uninitialized memory, check the input vectors first, before marking the
+ // whole 64 bytes initialized. If any input vector contains unused values, it
+ // should pass through MaskOverreadsQ first.
+ __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+ __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+ __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+ __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+ __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+ vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif // LIBGAV1_MSAN
+}
+
//------------------------------------------------------------------------------
// Pointer helpers.
@@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
//------------------------------------------------------------------------------
// Saturation helpers.
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+ const int16x4_t high) {
return vmin_s16(vmax_s16(val, low), high);
}
@@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
return vminq_s16(vmaxq_s16(val, low), high);
}
-inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
const int16x8_t low = vdupq_n_s16(0);
const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
@@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
// Output:
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
-inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
int16x8x2_t b0;
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
return b0;
}
-inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
uint16x8x2_t b0;
b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
vreinterpret_u16_u32(vget_low_u32(a1)));
@@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
inline void Transpose4x4(uint16x4_t a[4]) {
// b:
// 00 10 02 12