aboutsummaryrefslogtreecommitdiff
path: root/src/dsp/x86/intrapred_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/intrapred_sse4.cc')
-rw-r--r--src/dsp/x86/intrapred_sse4.cc202
1 files changed, 108 insertions, 94 deletions
diff --git a/src/dsp/x86/intrapred_sse4.cc b/src/dsp/x86/intrapred_sse4.cc
index 063929d..556afed 100644
--- a/src/dsp/x86/intrapred_sse4.cc
+++ b/src/dsp/x86/intrapred_sse4.cc
@@ -90,11 +90,11 @@ struct DirectionalPredFuncs_SSE4_1 {
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
- shiftk, dc_mult>::DcTop(void* const dest,
- ptrdiff_t stride,
- const void* const top_row,
- const void* /*left_column*/) {
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* /*left_column*/) {
const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
const __m128i sum = top_sumfn(top_row);
const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
@@ -103,11 +103,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
- shiftk,
- dc_mult>::DcLeft(void* const dest, ptrdiff_t stride,
- const void* /*top_row*/,
- const void* const left_column) {
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
const __m128i sum = left_sumfn(left_column);
const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
@@ -116,10 +116,11 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
-void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
- shiftk, dc_mult>::Dc(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void DcPredFuncs_SSE4_1<
+ width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
+ dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i rounder =
_mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
const __m128i sum_top = top_sumfn(top_row);
@@ -141,8 +142,8 @@ void DcPredFuncs_SSE4_1<width_log2, height_log2, top_sumfn, left_sumfn, storefn,
template <ColumnStoreFunc col_storefn>
void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
- void* const dest, ptrdiff_t stride, const void* /*top_row*/,
- const void* const left_column) {
+ void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
col_storefn(dest, stride, left_column);
}
@@ -384,8 +385,9 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
// ColStoreN<height> copies each of the |height| values in |column| across its
// corresponding row in dest.
template <WriteDuplicateFunc writefn>
-inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const __m128i col_data = Load4(column);
const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
@@ -393,8 +395,9 @@ inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
const __m128i col_data = LoadLo8(column);
const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
@@ -407,8 +410,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
@@ -428,8 +432,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 32; y += 16) {
@@ -457,8 +462,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 64; y += 16) {
@@ -574,7 +580,7 @@ struct DirDefs {
};
template <int y_mask>
-inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
const __m128i& left, const __m128i& top_lefts,
const __m128i& top_dists, const __m128i& left_dists,
const __m128i& top_left_diffs) {
@@ -614,7 +620,7 @@ inline void WritePaethLine4(uint8_t* dst, const __m128i& top,
// could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
// for the blends.
template <int y_mask>
-inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
const __m128i& left, const __m128i& top_lefts,
const __m128i& top_dists, const __m128i& left_dists,
const __m128i& top_left_diffs) {
@@ -658,7 +664,7 @@ inline void WritePaethLine8(uint8_t* dst, const __m128i& top,
// |left_dists| is provided alongside its spread out version because it doesn't
// change between calls and interacts with both kinds of packing.
template <int y_mask>
-inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
+inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
const __m128i& left, const __m128i& top_lefts,
const __m128i& top_dists,
const __m128i& left_dists,
@@ -712,8 +718,9 @@ inline void WritePaethLine16(uint8_t* dst, const __m128i& top,
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
}
-void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
@@ -742,8 +749,9 @@ void Paeth4x4_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadLo8(left_column);
const __m128i left_lo = _mm_cvtepu8_epi32(left);
const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
@@ -787,9 +795,9 @@ void Paeth4x8_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const __m128i left_0 = _mm_cvtepu8_epi32(left);
const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
@@ -862,8 +870,9 @@ void Paeth4x16_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -891,8 +900,9 @@ void Paeth8x4_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row, const void* const left_column) {
+void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -932,9 +942,9 @@ void Paeth8x8_SSE4_1(void* const dest, ptrdiff_t stride,
top_left_diff);
}
-void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const __m128i left_lo = _mm_cvtepu8_epi16(left);
const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
@@ -1001,18 +1011,18 @@ void Paeth8x16_SSE4_1(void* const dest, ptrdiff_t stride,
left_dists, top_left_diff);
}
-void Paeth8x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
auto* const dst = static_cast<uint8_t*>(dest);
Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
}
-void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = Load4(left_column);
const __m128i top = LoadUnaligned16(top_row);
const __m128i top_lo = _mm_cvtepu8_epi16(top);
@@ -1057,7 +1067,7 @@ void Paeth16x4_SSE4_1(void* const dest, ptrdiff_t stride,
// Inlined for calling with offsets in larger transform sizes, mainly to
// preserve top_left.
-inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
+inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
const uint8_t top_left, const __m128i top,
const __m128i left) {
const __m128i top_lo = _mm_cvtepu8_epi16(top);
@@ -1115,9 +1125,9 @@ inline void WritePaeth16x8(void* const dest, ptrdiff_t stride,
top_left_diff_lo, top_left_diff_hi);
}
-void Paeth16x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i top = LoadUnaligned16(top_row);
const __m128i left = LoadLo8(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1213,18 +1223,18 @@ void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
top_left_diff_lo, top_left_diff_hi);
}
-void Paeth16x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const __m128i top = LoadUnaligned16(top_row);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
}
-void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left_0 = LoadUnaligned16(left_column);
const __m128i top = LoadUnaligned16(top_row);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1236,9 +1246,9 @@ void Paeth16x32_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
}
-void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const ptrdiff_t stride16 = stride << 4;
const __m128i left_0 = LoadUnaligned16(left_column);
const __m128i top = LoadUnaligned16(top_row);
@@ -1258,9 +1268,9 @@ void Paeth16x64_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst, stride, top_left, top, left_3);
}
-void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadLo8(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_0 = LoadUnaligned16(top_row);
@@ -1271,9 +1281,9 @@ void Paeth32x8_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
}
-void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_0 = LoadUnaligned16(top_row);
@@ -1284,9 +1294,9 @@ void Paeth32x16_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
}
-void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1302,9 +1312,9 @@ void Paeth32x32_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
}
-void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
@@ -1328,9 +1338,9 @@ void Paeth32x64_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
}
-void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const __m128i left = LoadUnaligned16(left_column);
const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
const __m128i top_0 = LoadUnaligned16(top_ptr);
@@ -1345,9 +1355,9 @@ void Paeth64x16_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
}
-void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
@@ -1369,9 +1379,9 @@ void Paeth64x32_SSE4_1(void* const dest, ptrdiff_t stride,
WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
}
-void Paeth64x64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column) {
+void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const top_row,
+ const void* LIBGAV1_RESTRICT const left_column) {
const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
const __m128i left_0 = LoadUnaligned16(left_ptr);
const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
@@ -1793,7 +1803,6 @@ void Init8bpp() {
DirDefs::_64x64::Horizontal;
#endif
} // NOLINT(readability/fn_size)
-// TODO(petersonab): Split Init8bpp function into family-specific files.
} // namespace
} // namespace low_bitdepth
@@ -1937,16 +1946,18 @@ inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
// ColStoreN<height> copies each of the |height| values in |column| across its
// corresponding row in dest.
template <WriteDuplicateFunc writefn>
-inline void ColStore4_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const __m128i col_data = LoadLo8(column);
const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
writefn(dest, stride, col_dup32);
}
template <WriteDuplicateFunc writefn>
-inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const __m128i col_data = LoadUnaligned16(column);
const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
@@ -1958,8 +1969,9 @@ inline void ColStore8_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 32; y += 16) {
@@ -1975,8 +1987,9 @@ inline void ColStore16_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 64; y += 16) {
@@ -1992,8 +2005,9 @@ inline void ColStore32_SSE4_1(void* const dest, ptrdiff_t stride,
}
template <WriteDuplicateFunc writefn>
-inline void ColStore64_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const column) {
+inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
+ ptrdiff_t stride,
+ const void* LIBGAV1_RESTRICT const column) {
const ptrdiff_t stride4 = stride << 2;
auto* dst = static_cast<uint8_t*>(dest);
for (int y = 0; y < 128; y += 16) {