diff options
Diffstat (limited to 'src/dsp/x86/convolve_avx2.cc')
-rw-r--r-- | src/dsp/x86/convolve_avx2.cc | 534 |
1 files changed, 534 insertions, 0 deletions
diff --git a/src/dsp/x86/convolve_avx2.cc b/src/dsp/x86/convolve_avx2.cc new file mode 100644 index 0000000..3df2120 --- /dev/null +++ b/src/dsp/x86/convolve_avx2.cc @@ -0,0 +1,534 @@ +// Copyright 2020 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/dsp/convolve.h" +#include "src/utils/cpu.h" + +#if LIBGAV1_TARGETING_AVX2 +#include <immintrin.h> + +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> + +#include "src/dsp/constants.h" +#include "src/dsp/dsp.h" +#include "src/dsp/x86/common_avx2.h" +#include "src/dsp/x86/common_sse4.h" +#include "src/utils/common.h" +#include "src/utils/constants.h" + +namespace libgav1 { +namespace dsp { +namespace low_bitdepth { +namespace { + +constexpr int kHorizontalOffset = 3; + +// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and +// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final +// sum from outranging int16_t. +template <int filter_index> +__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) { + __m256i sum; + if (filter_index < 2) { + // 6 taps. + const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1 + const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3 + const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5 + sum = _mm256_add_epi16(v_madd_21, v_madd_43); + sum = _mm256_add_epi16(sum, v_madd_65); + } else if (filter_index == 2) { + // 8 taps. + const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0 + const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2 + const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4 + const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6 + const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32); + const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76); + sum = _mm256_add_epi16(v_sum_7654, v_sum_3210); + } else if (filter_index == 3) { + // 2 taps. + sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3 + } else { + // 4 taps. + const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2 + const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4 + sum = _mm256_add_epi16(v_madd_32, v_madd_54); + } + return sum; +} + +template <int filter_index> +__m256i SumHorizontalTaps(const __m256i* const src, + const __m256i* const v_tap) { + __m256i v_src[4]; + const __m256i src_long = *src; + const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long); + const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long); + + if (filter_index < 2) { + // 6 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21 + v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 + v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65 + } else if (filter_index == 2) { + // 8 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10 + v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 + v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 + v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76 + } else if (filter_index == 3) { + // 2 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43 + } else if (filter_index > 3) { + // 4 taps. + v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32 + v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54 + } + return SumOnePassTaps<filter_index>(v_src, v_tap); +} + +template <int filter_index> +__m256i SimpleHorizontalTaps(const __m256i* const src, + const __m256i* const v_tap) { + __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap); + + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them + // requires adding the rounding offset from the skipped shift. + constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); + + sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit)); + sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); + return _mm256_packus_epi16(sum, sum); +} + +template <int filter_index> +__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); + + if (filter_index == 3) { + // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 + const __m128i v_src_43 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); + const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 + return v_sum_43; + } + + // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 + const __m128i v_src_32 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302)); + // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx + const __m128i v_src_54 = _mm_shuffle_epi8( + v_src, _mm_set_epi32(0x800f0f0e, 0x0e0d0d0c, 0x80070706, 0x06050504)); + const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 + const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 + const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); + return v_sum_5432; +} + +template <int filter_index> +__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + // Normally the Horizontal pass does the downshift in two passes: + // kInterRoundBitsHorizontal - 1 and then (kFilterBits - + // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them + // requires adding the rounding offset from the skipped shift. + constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); + + sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); + sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); + return _mm_packus_epi16(sum, sum); +} + +template <int filter_index> +__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, + const __m128i* const v_tap) { + const __m128i sum = + SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + + return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); +} + +// Filter 2xh sizes. +template <int num_taps, int step, int filter_index, bool is_2d = false, + bool is_compound = false> +void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int /*width*/, const int height, + const __m128i* const v_tap) { + auto* dest8 = static_cast<uint8_t*>(dest); + auto* dest16 = static_cast<uint16_t*>(dest); + + // Horizontal passes only need to account for |num_taps| 2 and 4 when + // |width| <= 4. + assert(num_taps <= 4); + if (num_taps <= 4) { + if (!is_compound) { + int y = 0; + do { + if (is_2d) { + const __m128i sum = + HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap); + Store4(&dest16[0], sum); + dest16 += pred_stride; + Store4(&dest16[0], _mm_srli_si128(sum, 8)); + dest16 += pred_stride; + } else { + const __m128i sum = + SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap); + Store2(dest8, sum); + dest8 += pred_stride; + Store2(dest8, _mm_srli_si128(sum, 4)); + dest8 += pred_stride; + } + + src += src_stride << 1; + y += 2; + } while (y < height - 1); + + // The 2d filters have an odd |height| because the horizontal pass + // generates context for the vertical pass. + if (is_2d) { + assert(height % 2 == 1); + __m128i sum; + const __m128i input = LoadLo8(&src[2]); + if (filter_index == 3) { + // 03 04 04 05 05 06 06 07 .... + const __m128i v_src_43 = + _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3); + sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 + } else { + // 02 03 03 04 04 05 05 06 06 07 .... + const __m128i v_src_32 = + _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1); + // 04 05 05 06 06 07 07 08 ... + const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4); + const __m128i v_madd_32 = + _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 + const __m128i v_madd_54 = + _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 + sum = _mm_add_epi16(v_madd_54, v_madd_32); + } + sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); + Store4(dest16, sum); + } + } + } +} + +// Filter widths >= 4. +template <int num_taps, int step, int filter_index, bool is_2d = false, + bool is_compound = false> +void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride, + void* const dest, const ptrdiff_t pred_stride, + const int width, const int height, + const __m256i* const v_tap) { + auto* dest8 = static_cast<uint8_t*>(dest); + auto* dest16 = static_cast<uint16_t*>(dest); + + if (width >= 32) { + int y = height; + do { + int x = 0; + do { + if (is_2d || is_compound) { + // placeholder + } else { + // Load src used to calculate dest8[7:0] and dest8[23:16]. + const __m256i src_long = LoadUnaligned32(&src[x]); + const __m256i result = + SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + // Load src used to calculate dest8[15:8] and dest8[31:24]. + const __m256i src_long2 = LoadUnaligned32(&src[x + 8]); + const __m256i result2 = + SimpleHorizontalTaps<filter_index>(&src_long2, v_tap); + // Combine results and store. + StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2)); + } + x += step * 4; + } while (x < width); + src += src_stride; + dest8 += pred_stride; + dest16 += pred_stride; + } while (--y != 0); + } else if (width == 16) { + int y = height; + do { + if (is_2d || is_compound) { + // placeholder + } else { + // Load into 2 128 bit lanes. + const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]), + LoadUnaligned16(&src[src_stride])); + const __m256i result = + SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + const __m256i src_long2 = SetrM128i( + LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride])); + const __m256i result2 = + SimpleHorizontalTaps<filter_index>(&src_long2, v_tap); + const __m256i packed_result = _mm256_unpacklo_epi64(result, result2); + StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result)); + StoreUnaligned16(&dest8[pred_stride], + _mm256_extracti128_si256(packed_result, 1)); + } + src += src_stride * 2; + dest8 += pred_stride * 2; + dest16 += pred_stride * 2; + y -= 2; + } while (y != 0); + } else if (width == 8) { + int y = height; + do { + if (is_2d || is_compound) { + // placeholder + } else { + const __m128i this_row = LoadUnaligned16(&src[0]); + const __m128i next_row = LoadUnaligned16(&src[src_stride]); + // Load into 2 128 bit lanes. + const __m256i src_long = SetrM128i(this_row, next_row); + const __m256i result = + SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + StoreLo8(&dest8[0], _mm256_castsi256_si128(result)); + StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1)); + } + src += src_stride * 2; + dest8 += pred_stride * 2; + dest16 += pred_stride * 2; + y -= 2; + } while (y != 0); + } else { // width == 4 + int y = height; + do { + if (is_2d || is_compound) { + // placeholder + } else { + const __m128i this_row = LoadUnaligned16(&src[0]); + const __m128i next_row = LoadUnaligned16(&src[src_stride]); + // Load into 2 128 bit lanes. + const __m256i src_long = SetrM128i(this_row, next_row); + const __m256i result = + SimpleHorizontalTaps<filter_index>(&src_long, v_tap); + Store4(&dest8[0], _mm256_castsi256_si128(result)); + Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1)); + } + src += src_stride * 2; + dest8 += pred_stride * 2; + dest16 += pred_stride * 2; + y -= 2; + } while (y != 0); + } +} + +template <int num_taps, bool is_2d_vertical = false> +LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, + __m128i* v_tap) { + if (num_taps == 8) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); + } + } else if (num_taps == 6) { + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 + v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); + } + } else if (num_taps == 4) { + v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 + v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); + } + } else { // num_taps == 2 + const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); + v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 + if (is_2d_vertical) { + v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); + } else { + v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); + } + } +} + +template <int num_taps, bool is_2d_vertical = false> +LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, + __m256i* v_tap) { + if (num_taps == 8) { + v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 + v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 + v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6 + if (is_2d_vertical) { + // placeholder + } + } else if (num_taps == 6) { + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 + v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5 + if (is_2d_vertical) { + // placeholder + } + } else if (num_taps == 4) { + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2 + v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4 + if (is_2d_vertical) { + // placeholder + } + } else { // num_taps == 2 + v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3 + if (is_2d_vertical) { + // placeholder + } + } +} + +template <bool is_2d = false, bool is_compound = false> +LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH( + const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, + const ptrdiff_t dst_stride, const int width, const int height, + const int filter_id, const int filter_index) { + assert(filter_id != 0); + __m128i v_tap[4]; + const __m128i v_horizontal_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); + + if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_horizontal_filter, v_tap); + FilterHorizontal<4, 8, 4, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else if (filter_index == 5) { // 4 tap. + SetupTaps<4>(&v_horizontal_filter, v_tap); + FilterHorizontal<4, 8, 5, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else { // 2 tap. + SetupTaps<2>(&v_horizontal_filter, v_tap); + FilterHorizontal<2, 8, 3, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } +} + +template <bool is_2d = false, bool is_compound = false> +LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( + const uint8_t* const src, const ptrdiff_t src_stride, void* const dst, + const ptrdiff_t dst_stride, const int width, const int height, + const int filter_id, const int filter_index) { + assert(filter_id != 0); + __m256i v_tap[4]; + const __m128i v_horizontal_filter = + LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]); + + if (filter_index == 2) { // 8 tap. + SetupTaps<8>(&v_horizontal_filter, v_tap); + FilterHorizontal<8, 8, 2, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else if (filter_index == 1) { // 6 tap. + SetupTaps<6>(&v_horizontal_filter, v_tap); + FilterHorizontal<6, 8, 1, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else if (filter_index == 0) { // 6 tap. + SetupTaps<6>(&v_horizontal_filter, v_tap); + FilterHorizontal<6, 8, 0, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else if (filter_index == 4) { // 4 tap. + SetupTaps<4>(&v_horizontal_filter, v_tap); + FilterHorizontal<4, 8, 4, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else if (filter_index == 5) { // 4 tap. + SetupTaps<4>(&v_horizontal_filter, v_tap); + FilterHorizontal<4, 8, 5, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else { // 2 tap. + SetupTaps<2>(&v_horizontal_filter, v_tap); + FilterHorizontal<2, 8, 3, is_2d, is_compound>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } +} + +void ConvolveHorizontal_AVX2(const void* const reference, + const ptrdiff_t reference_stride, + const int horizontal_filter_index, + const int /*vertical_filter_index*/, + const int horizontal_filter_id, + const int /*vertical_filter_id*/, const int width, + const int height, void* prediction, + const ptrdiff_t pred_stride) { + const int filter_index = GetFilterIndex(horizontal_filter_index, width); + // Set |src| to the outermost tap. + const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset; + auto* dest = static_cast<uint8_t*>(prediction); + + if (width > 2) { + DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height, + horizontal_filter_id, filter_index); + } else { + // Use non avx2 version for smaller widths. + DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height, + horizontal_filter_id, filter_index); + } +} + +void Init8bpp() { + Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8); + assert(dsp != nullptr); + dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2; +} + +} // namespace +} // namespace low_bitdepth + +void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); } + +} // namespace dsp +} // namespace libgav1 + +#else // !LIBGAV1_TARGETING_AVX2 +namespace libgav1 { +namespace dsp { + +void ConvolveInit_AVX2() {} + +} // namespace dsp +} // namespace libgav1 +#endif // LIBGAV1_TARGETING_AVX2 |