/*
* Copyright 2021 The libgav1 Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
 */

#include <smmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Note: LIBGAV1_MSAN is assumed to be defined by the project's compiler
// attribute macros, as elsewhere in libgav1.

//------------------------------------------------------------------------------
// Load functions.
inline __m128i Load2(const void* src) {
int16_t val;
memcpy(&val, src, sizeof(val));
return _mm_cvtsi32_si128(val);
}

inline __m128i Load2x2(const void* src1, const void* src2) {
uint16_t val1;
uint16_t val2;
memcpy(&val1, src1, sizeof(val1));
memcpy(&val2, src2, sizeof(val2));
return _mm_cvtsi32_si128(val1 | (val2 << 16));
}

// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline __m128i Load2(const void* const buf, __m128i val) {
int16_t temp;
memcpy(&temp, buf, 2);
return _mm_insert_epi16(val, temp, lane);
}
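
// Illustrative sketch (not part of the library): Load2<lane> is typically
// chained to gather 2-byte pieces, e.g. a 2-pixel-wide column, from several
// rows into one register. The function name and parameters are hypothetical.
inline __m128i Gather2x4Example(const uint8_t* src, ptrdiff_t stride) {
  __m128i v = _mm_setzero_si128();
  v = Load2<0>(src, v);               // Bytes 0-1: row 0.
  v = Load2<1>(src + stride, v);      // Bytes 2-3: row 1.
  v = Load2<2>(src + 2 * stride, v);  // Bytes 4-5: row 2.
  v = Load2<3>(src + 3 * stride, v);  // Bytes 6-7: row 3.
  return v;
}
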
inline __m128i Load4(const void* src) {
// With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
// intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
// movss instruction.
//
// Until compiler support of _mm_loadu_si32 is widespread, use of
// _mm_loadu_si32 is banned.
int val;
memcpy(&val, src, sizeof(val));
return _mm_cvtsi32_si128(val);
}

inline __m128i Load4x2(const void* src1, const void* src2) {
// See the comment in Load4 regarding the _mm_loadu_si32 intrinsic.
int val1, val2;
memcpy(&val1, src1, sizeof(val1));
memcpy(&val2, src2, sizeof(val2));
return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
}

inline __m128i LoadLo8(const void* a) {
return _mm_loadl_epi64(static_cast<const __m128i*>(a));
}

inline __m128i LoadHi8(const __m128i v, const void* a) {
const __m128 x =
_mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
return _mm_castps_si128(x);
}

inline __m128i LoadUnaligned16(const void* a) {
return _mm_loadu_si128(static_cast<const __m128i*>(a));
}

inline __m128i LoadAligned16(const void* a) {
assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
return _mm_load_si128(static_cast<const __m128i*>(a));
}
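
// Illustrative sketch (not part of the library): LoadLo8 and LoadHi8 are
// commonly paired to pack two 8-byte rows into one 16-byte register. The
// function name and parameters below are hypothetical.
inline __m128i LoadTwoRowsExample(const uint8_t* src, ptrdiff_t stride) {
  const __m128i row0 = LoadLo8(src);   // Bytes 0-7.
  return LoadHi8(row0, src + stride);  // Bytes 8-15.
}
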
//------------------------------------------------------------------------------
// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
inline __m128i MaskOverreads(const __m128i source,
const ptrdiff_t over_read_in_bytes) {
__m128i dst = source;
#if LIBGAV1_MSAN
if (over_read_in_bytes > 0) {
__m128i mask = _mm_set1_epi8(-1);
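    // Shift zeros in from the top of the all-ones mask, one byte per
    // over-read byte, so the AND below clears the uninitialized high lanes.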
for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
mask = _mm_srli_si128(mask, 1);
}
dst = _mm_and_si128(dst, mask);
}
#else
static_cast<void>(over_read_in_bytes);
#endif
return dst;
}

inline __m128i LoadLo8Msan(const void* const source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
}

inline __m128i LoadHi8Msan(const __m128i v, const void* source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
}

inline __m128i LoadAligned16Msan(const void* const source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
}

inline __m128i LoadUnaligned16Msan(const void* const source,
const ptrdiff_t over_read_in_bytes) {
return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
}
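
// Illustrative sketch (not part of the library): a typical caller derives
// |over_read_in_bytes| from how far a full-width load runs past the valid
// data. The function name and parameters below are hypothetical.
inline __m128i LoadRow16MsanExample(const uint8_t* row, int valid_width) {
  // Reads 16 bytes; lanes beyond |valid_width| are masked to zero so that
  // MemorySanitizer does not report a use of uninitialized values.
  return LoadUnaligned16Msan(row, 16 - valid_width);
}
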
//------------------------------------------------------------------------------
// Store functions.
inline void Store2(void* dst, const __m128i x) {
const int val = _mm_cvtsi128_si32(x);
memcpy(dst, &val, 2);
}

inline void Store4(void* dst, const __m128i x) {
const int val = _mm_cvtsi128_si32(x);
memcpy(dst, &val, sizeof(val));
}

inline void StoreLo8(void* a, const __m128i v) {
_mm_storel_epi64(static_cast<__m128i*>(a), v);
}

inline void StoreHi8(void* a, const __m128i v) {
_mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
}

inline void StoreAligned16(void* a, const __m128i v) {
assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
_mm_store_si128(static_cast<__m128i*>(a), v);
}

inline void StoreUnaligned16(void* a, const __m128i v) {
_mm_storeu_si128(static_cast<__m128i*>(a), v);
}
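
// Illustrative sketch (not part of the library): the narrow stores combine
// with a byte shift to write odd widths, e.g. the low 6 bytes of a register.
// The function name below is hypothetical.
inline void Store6Example(void* dst, const __m128i v) {
  uint8_t* const dst8 = static_cast<uint8_t*>(dst);
  Store4(dst8, v);                         // Bytes 0-3 of |v|.
  Store2(dst8 + 4, _mm_srli_si128(v, 4));  // Bytes 4-5 of |v|.
}
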
//------------------------------------------------------------------------------
// Arithmetic utilities.
inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
assert(bits <= 16);
// Shift out all but the last bit.
const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
// Avg with zero will shift by 1 and round.
return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
}
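
// Illustrative sketch (not part of the library): the scalar equivalent of the
// shift-then-average sequence above, for a single lane. It matches
// (val + (1 << (bits - 1))) >> bits, but splitting the shift keeps the
// intermediate within 16 bits, which a direct bias add on a full-range
// uint16_t would not. The function name is hypothetical.
inline uint16_t RightShiftRoundingU16Example(uint16_t val, int bits) {
  const uint16_t partial = val >> (bits - 1);        // All but the last bit.
  return static_cast<uint16_t>((partial + 1) >> 1);  // Avg with 0 rounds.
}
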
inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
assert(bits < 16);
const __m128i v_bias_d =
_mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
return _mm_srai_epi16(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
return _mm_srli_epi32(v_tmp_d, bits);
}

inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
return _mm_srai_epi32(v_tmp_d, bits);
}

// Use this when |bits| is not an immediate value.
inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
int bits) {
const __m128i v_bias_d =
_mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
}
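
// Illustrative sketch (not part of the library): per the comment on
// VariableRightShiftWithRounding_S32, it is the form to use when the shift
// amount is only known at run time. The names below are hypothetical.
inline __m128i AverageAccumulatorExample(const __m128i sum, int log2_count) {
  // |log2_count| is not an immediate value.
  return VariableRightShiftWithRounding_S32(sum, log2_count);
}
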
//------------------------------------------------------------------------------
// Masking utilities
inline __m128i MaskHighNBytes(int n) {
static constexpr uint8_t kMask[32] = {
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      255, 255, 255, 255, 255, 255, 255, 255,
      255, 255, 255, 255, 255, 255, 255, 255,
};
return LoadUnaligned16(kMask + n);
}
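
// Illustrative sketch (not part of the library): MaskHighNBytes(n) has 0xff in
// its upper |n| byte lanes and 0 in the rest (valid for 0 <= n <= 16), so it
// can isolate the top |n| bytes of a vector. The function name is
// hypothetical.
inline __m128i KeepHighBytesExample(const __m128i v, int n) {
  return _mm_and_si128(v, MaskHighNBytes(n));
}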