Diffstat (limited to 'src/dsp/arm/common_neon.h')
-rw-r--r-- | src/dsp/arm/common_neon.h | 52
1 file changed, 34 insertions, 18 deletions
diff --git a/src/dsp/arm/common_neon.h b/src/dsp/arm/common_neon.h
index 9c46525..c0af2c1 100644
--- a/src/dsp/arm/common_neon.h
+++ b/src/dsp/arm/common_neon.h
@@ -309,6 +309,12 @@ inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
   return dst;
 }
 
+inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return vreinterpretq_u16_u8(
+      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
+}
+
 inline uint8x8_t Load1MsanU8(const uint8_t* const source,
                              const ptrdiff_t over_read_in_bytes) {
   return MaskOverreads(vld1_u8(source), over_read_in_bytes);
@@ -325,20 +331,6 @@ inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
       vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
 }
 
-inline uint16x8x2_t Load2QMsanU16(const uint16_t* const source,
-                                  const ptrdiff_t over_read_in_bytes) {
-  // Relative source index of elements (2 bytes each):
-  // dst.val[0]: 00 02 04 06 08 10 12 14
-  // dst.val[1]: 01 03 05 07 09 11 13 15
-  uint16x8x2_t dst = vld2q_u16(source);
-  dst.val[0] = vreinterpretq_u16_u8(MaskOverreadsQ(
-      vreinterpretq_u8_u16(dst.val[0]), over_read_in_bytes >> 1));
-  dst.val[1] = vreinterpretq_u16_u8(
-      MaskOverreadsQ(vreinterpretq_u8_u16(dst.val[1]),
-                     (over_read_in_bytes >> 1) + (over_read_in_bytes % 4)));
-  return dst;
-}
-
 inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
                                 const ptrdiff_t over_read_in_bytes) {
   return vreinterpretq_u32_u8(MaskOverreadsQ(
@@ -402,6 +394,24 @@ inline void Store8(void* const buf, const uint16x8_t val) {
   vst1q_u16(static_cast<uint16_t*>(buf), val);
 }
 
+inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
+#if LIBGAV1_MSAN
+  // The memory shadow is incorrect for vst4q_u16, only marking the first 16
+  // bytes of the destination as initialized. To avoid missing truly
+  // uninitialized memory, check the input vectors first, before marking the
+  // whole 64 bytes initialized. If any input vector contains unused values, it
+  // should pass through MaskOverreadsQ first.
+  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
+  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
+  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
+  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
+  vst4q_s16(static_cast<int16_t*>(buf), src);
+  __msan_unpoison(buf, sizeof(int16x8x4_t));
+#else
+  vst4q_s16(static_cast<int16_t*>(buf), src);
+#endif  // LIBGAV1_MSAN
+}
+
 //------------------------------------------------------------------------------
 // Pointer helpers.
 
@@ -587,7 +597,8 @@ inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
 //------------------------------------------------------------------------------
 // Saturation helpers.
 
-inline int16x4_t Clip3S16(int16x4_t val, int16x4_t low, int16x4_t high) {
+inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
+                          const int16x4_t high) {
   return vmin_s16(vmax_s16(val, low), high);
 }
 
@@ -596,7 +607,7 @@ inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
   return vminq_s16(vmaxq_s16(val, low), high);
 }
 
-inline uint16x8_t ConvertToUnsignedPixelU16(int16x8_t val, int bitdepth) {
+inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
   const int16x8_t low = vdupq_n_s16(0);
   const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
 
@@ -727,7 +738,7 @@ inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }
 // Output:
 // b0.val[0]: 00 01 02 03 16 17 18 19
 // b0.val[1]: 04 05 06 07 20 21 22 23
-inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
+inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
   int16x8x2_t b0;
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
@@ -736,7 +747,7 @@ inline int16x8x2_t VtrnqS64(int32x4_t a0, int32x4_t a1) {
   return b0;
 }
 
-inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
+inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
   uint16x8x2_t b0;
   b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                            vreinterpret_u16_u32(vget_low_u32(a1)));
@@ -750,6 +761,11 @@ inline uint16x8x2_t VtrnqU64(uint32x4_t a0, uint32x4_t a1) {
 // 10 11 12 13
 // 20 21 22 23
 // 30 31 32 33
+// Output:
+// 00 10 20 30
+// 01 11 21 31
+// 02 12 22 32
+// 03 13 23 33
 inline void Transpose4x4(uint16x4_t a[4]) {
   // b:
   // 00 10 02 12
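The most substantial addition in this patch is Store4QMsanS16, which works around MemorySanitizer's incomplete shadow propagation for vst4q: the interleaving store writes 64 bytes, but MSAN only marks the first 16 as initialized. Below is a minimal standalone sketch, not part of the patch, that mirrors the same pattern outside of libgav1: validate every input vector, perform the store, then unpoison the full destination. The helper name StoreInterleaved4 and the SKETCH_MSAN macro are hypothetical; it assumes an AArch64/ARM NEON target and, for the MSAN branch, a clang build with -fsanitize=memory.

#include <arm_neon.h>
#include <cstdint>

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define SKETCH_MSAN 1
#endif
#endif
#ifndef SKETCH_MSAN
#define SKETCH_MSAN 0
#endif

// Hypothetical helper mirroring Store4QMsanS16 from the diff above.
inline void StoreInterleaved4(int16_t* dst, const int16x8x4_t src) {
#if SKETCH_MSAN
  // Report immediately if a caller passes vectors with uninitialized lanes;
  // as the patch notes, such vectors should go through MaskOverreadsQ first.
  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
  vst4q_s16(dst, src);
  // vst4q_s16 wrote 4 * 16 = 64 bytes; mark the whole range initialized so
  // the incomplete shadow from the intrinsic does not cause false positives.
  __msan_unpoison(dst, sizeof(int16x8x4_t));
#else
  vst4q_s16(dst, src);
#endif
}

The ordering matters: the explicit checks must run before the blanket __msan_unpoison, otherwise genuinely uninitialized input lanes would be silently laundered into "initialized" destination bytes.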