about summary refs log tree commit diff
path: root/src/dsp/x86/convolve_sse4.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/dsp/x86/convolve_sse4.cc')
-rw-r--r-- src/dsp/x86/convolve_sse4.cc 284
1 files changed, 146 insertions, 138 deletions
diff --git a/src/dsp/x86/convolve_sse4.cc b/src/dsp/x86/convolve_sse4.cc
index 9b72fe4..f7e5a71 100644
--- a/src/dsp/x86/convolve_sse4.cc
+++ b/src/dsp/x86/convolve_sse4.cc
@@ -37,7 +37,7 @@ namespace {
#include "src/dsp/x86/convolve_sse4.inc"
template <int filter_index>
-__m128i SumHorizontalTaps(const uint8_t* const src,
+__m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i v_src[4];
const __m128i src_long = LoadUnaligned16(src);
@@ -68,7 +68,7 @@ __m128i SumHorizontalTaps(const uint8_t* const src,
}
template <int filter_index>
-__m128i SimpleHorizontalTaps(const uint8_t* const src,
+__m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
__m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
@@ -84,7 +84,7 @@ __m128i SimpleHorizontalTaps(const uint8_t* const src,
}
template <int filter_index>
-__m128i HorizontalTaps8To16(const uint8_t* const src,
+__m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
const __m128i* const v_tap) {
const __m128i sum = SumHorizontalTaps<filter_index>(src, v_tap);
@@ -93,10 +93,11 @@ __m128i HorizontalTaps8To16(const uint8_t* const src,
template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const __m128i* const v_tap) {
+void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dest,
+ const ptrdiff_t pred_stride, const int width,
+ const int height, const __m128i* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
@@ -206,9 +207,10 @@ void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
- const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
- const ptrdiff_t dst_stride, const int width, const int height,
- const int filter_id, const int filter_index) {
+ const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
+ const int width, const int height, const int filter_id,
+ const int filter_index) {
assert(filter_id != 0);
__m128i v_tap[4];
const __m128i v_horizontal_filter =
@@ -241,13 +243,13 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
}
}
-void Convolve2D_SSE4_1(const void* const reference,
+void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int horizontal_filter_id,
const int vertical_filter_id, const int width,
- const int height, void* prediction,
+ const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
@@ -328,10 +330,11 @@ void Convolve2D_SSE4_1(const void* const reference,
}
template <int filter_index, bool is_compound = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int width, const int height,
- const __m128i* const v_tap) {
+void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride,
+ void* LIBGAV1_RESTRICT const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const v_tap) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -400,14 +403,12 @@ void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
} while (x < width);
}
-void ConvolveVertical_SSE4_1(const void* const reference,
- const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/,
- const int vertical_filter_index,
- const int /*horizontal_filter_id*/,
- const int vertical_filter_id, const int width,
- const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+void ConvolveVertical_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
@@ -477,14 +478,12 @@ void ConvolveVertical_SSE4_1(const void* const reference,
}
}
-void ConvolveCompoundCopy_SSE4(const void* const reference,
- const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/,
- const int /*vertical_filter_index*/,
- const int /*horizontal_filter_id*/,
- const int /*vertical_filter_id*/,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundCopy_SSE4(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
@@ -539,11 +538,11 @@ void ConvolveCompoundCopy_SSE4(const void* const reference,
}
void ConvolveCompoundVertical_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int vertical_filter_index,
- const int /*horizontal_filter_id*/, const int vertical_filter_id,
- const int width, const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int vertical_filter_index, const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
@@ -608,14 +607,12 @@ void ConvolveCompoundVertical_SSE4_1(
}
}
-void ConvolveHorizontal_SSE4_1(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int /*vertical_filter_index*/,
- const int horizontal_filter_id,
- const int /*vertical_filter_id*/,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveHorizontal_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
@@ -626,11 +623,11 @@ void ConvolveHorizontal_SSE4_1(const void* const reference,
}
void ConvolveCompoundHorizontal_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int horizontal_filter_id, const int /*vertical_filter_id*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int /*vertical_filter_index*/, const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
@@ -640,14 +637,12 @@ void ConvolveCompoundHorizontal_SSE4_1(
filter_index);
}
-void ConvolveCompound2D_SSE4_1(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int vertical_filter_index,
- const int horizontal_filter_id,
- const int vertical_filter_id, const int width,
- const int height, void* prediction,
- const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_SSE4_1(
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int horizontal_filter_index,
+ const int vertical_filter_index, const int horizontal_filter_id,
+ const int vertical_filter_id, const int width, const int height,
+ void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
alignas(16) uint16_t
@@ -835,7 +830,8 @@ inline void GetHalfSubPixelFilter(__m128i* output) {
// exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
// |step_x|.
template <int num_taps, int grade_x>
-inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
+inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
+ const __m128i src_indices,
__m128i* const source /*[num_taps >> 1]*/) {
// |used_bytes| is only computed in msan builds. Mask away unused bytes for
// msan because it incorrectly models the outcome of the shuffles in some
@@ -900,10 +896,11 @@ inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
}
template <int grade_x, int filter_index, int num_taps>
-inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
- int width, int subpixel_x, int step_x,
+inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
+ ptrdiff_t src_stride, int width,
+ int subpixel_x, int step_x,
int intermediate_height,
- int16_t* intermediate) {
+ int16_t* LIBGAV1_RESTRICT intermediate) {
// Account for the 0-taps that precede the 2 nonzero taps.
const int kernel_offset = (8 - num_taps) >> 1;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -946,11 +943,11 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
}
// |width| >= 8
+ int16_t* intermediate_x = intermediate;
int x = 0;
do {
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
- int16_t* intermediate_x = intermediate + x;
// Only add steps to the 10-bit truncated p to avoid overflow.
const __m128i p_fraction = _mm_set1_epi16(p & 1023);
const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
@@ -976,7 +973,8 @@ inline void ConvolveHorizontalScale(const uint8_t* src, ptrdiff_t src_stride,
}
template <int num_taps>
-inline void PrepareVerticalTaps(const int8_t* taps, __m128i* output) {
+inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
+ __m128i* output) {
// Avoid overreading the filter due to starting at kernel_offset.
// The only danger of overread is in the final filter, which has 4 taps.
const __m128i filter =
@@ -1072,10 +1070,12 @@ __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
// |width_class| is 2, 4, or 8, according to the Store function that should be
// used.
template <int num_taps, int width_class, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
- const int subpixel_y, const int filter_index,
- const int step_y, const int height,
- void* dest, const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
+ const int intermediate_height,
+ const int width, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height, void* LIBGAV1_RESTRICT dest,
+ const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
constexpr int kernel_offset = (8 - num_taps) / 2;
const int16_t* src_y = src;
@@ -1138,15 +1138,19 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
// |width_class| >= 8
__m128i filter_taps[num_taps >> 1];
- do { // y > 0
- src_y = src + (p >> kScaleSubPixelBits) * src_stride;
- const int filter_id = (p >> 6) & kSubPixelMask;
- const int8_t* filter =
- kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
- PrepareVerticalTaps<num_taps>(filter, filter_taps);
-
- int x = 0;
- do { // x < width
+ int x = 0;
+ do { // x < width
+ auto* dest_y = static_cast<uint8_t*>(dest) + x;
+ auto* dest16_y = static_cast<uint16_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int y = height;
+ do { // y > 0
+ const int filter_id = (p >> 6) & kSubPixelMask;
+ const int8_t* filter =
+ kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
+ PrepareVerticalTaps<num_taps>(filter, filter_taps);
+
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
for (int i = 0; i < num_taps; ++i) {
s[i] = LoadUnaligned16(src_y + i * src_stride);
}
@@ -1154,38 +1158,36 @@ inline void ConvolveVerticalScale(const int16_t* src, const int width,
const __m128i sums =
Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
if (is_compound) {
- StoreUnaligned16(dest16_y + x, sums);
+ StoreUnaligned16(dest16_y, sums);
} else {
- StoreLo8(dest_y + x, _mm_packus_epi16(sums, sums));
+ StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
}
- x += 8;
- src_y += 8;
- } while (x < width);
- p += step_y;
- dest_y += dest_stride;
- dest16_y += dest_stride;
- } while (--y != 0);
+ p += step_y;
+ dest_y += dest_stride;
+ dest16_y += dest_stride;
+ } while (--y != 0);
+ src += kIntermediateStride * intermediate_height;
+ x += 8;
+ } while (x < width);
}
template <bool is_compound>
-void ConvolveScale2D_SSE4_1(const void* const reference,
+void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int vertical_filter_index,
const int subpixel_x, const int subpixel_y,
const int step_x, const int step_y, const int width,
- const int height, void* prediction,
+ const int height, void* LIBGAV1_RESTRICT prediction,
const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
assert(step_x <= 2048);
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
- // TODO(petersonab): Reduce intermediate block stride to width to make smaller
- // blocks faster.
alignas(16) int16_t
- intermediate_result[kMaxSuperBlockSizeInPixels *
- (2 * kMaxSuperBlockSizeInPixels + kSubPixelTaps)];
+ intermediate_result[kIntermediateAllocWidth *
+ (2 * kIntermediateAllocWidth + kSubPixelTaps)];
const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
@@ -1282,76 +1284,78 @@ void ConvolveScale2D_SSE4_1(const void* const reference,
case 1:
if (!is_compound && width == 2) {
ConvolveVerticalScale<6, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<6, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<6, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
break;
case 2:
if (!is_compound && width == 2) {
ConvolveVerticalScale<8, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<8, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<8, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
break;
case 3:
if (!is_compound && width == 2) {
ConvolveVerticalScale<2, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<2, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<2, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
break;
default:
assert(vert_filter_index == 4 || vert_filter_index == 5);
if (!is_compound && width == 2) {
ConvolveVerticalScale<4, 2, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else if (width == 4) {
ConvolveVerticalScale<4, 4, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
} else {
ConvolveVerticalScale<4, 8, is_compound>(
- intermediate, width, subpixel_y, vert_filter_index, step_y, height,
- prediction, pred_stride);
+ intermediate, intermediate_height, width, subpixel_y,
+ vert_filter_index, step_y, height, prediction, pred_stride);
}
}
}
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
+ uint8_t* LIBGAV1_RESTRICT dst) {
const __m128i left = LoadUnaligned16(src);
const __m128i right = LoadUnaligned16(src + 1);
StoreUnaligned16(dst, _mm_avg_epu8(left, right));
}
template <int width>
-inline void IntraBlockCopyHorizontal(const uint8_t* src,
+inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride,
- const int height, uint8_t* dst,
+ const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
@@ -1392,10 +1396,11 @@ inline void IntraBlockCopyHorizontal(const uint8_t* src,
}
void ConvolveIntraBlockCopyHorizontal_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*subpixel_x*/,
+ const int /*subpixel_y*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
@@ -1464,9 +1469,10 @@ void ConvolveIntraBlockCopyHorizontal_SSE4_1(
}
template <int width>
-inline void IntraBlockCopyVertical(const uint8_t* src,
+inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
const ptrdiff_t src_stride, const int height,
- uint8_t* dst, const ptrdiff_t dst_stride) {
+ uint8_t* LIBGAV1_RESTRICT dst,
+ const ptrdiff_t dst_stride) {
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
__m128i row[8], below[8];
@@ -1553,11 +1559,11 @@ inline void IntraBlockCopyVertical(const uint8_t* src,
}
void ConvolveIntraBlockCopyVertical_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* const prediction,
- const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
@@ -1622,7 +1628,8 @@ void ConvolveIntraBlockCopyVertical_SSE4_1(
}
// Load then add two uint8_t vectors. Return the uint16_t vector result.
-inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
+ const uint8_t* LIBGAV1_RESTRICT src1) {
const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
return _mm_add_epi16(a, b);
@@ -1637,8 +1644,9 @@ inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
}
template <int width>
-inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, uint8_t* dst,
+inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* LIBGAV1_RESTRICT dst,
const ptrdiff_t dst_stride) {
const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
@@ -1793,11 +1801,11 @@ inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
}
void ConvolveIntraBlockCopy2D_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
- const int width, const int height, void* const prediction,
- const ptrdiff_t pred_stride) {
+ const void* LIBGAV1_RESTRICT const reference,
+ const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width, const int height,
+ void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
// Note: allow vertical access to height + 1. Because this function is only