diff options
Diffstat (limited to 'src/dsp/arm')
-rw-r--r-- | src/dsp/arm/convolve_10bit_neon.cc | 62 |
-rw-r--r-- | src/dsp/arm/convolve_neon.cc | 12 |
-rw-r--r-- | src/dsp/arm/film_grain_neon.cc | 18 |
-rw-r--r-- | src/dsp/arm/intrapred_directional_neon.cc | 2 |
-rw-r--r-- | src/dsp/arm/inverse_transform_neon.cc | 9 |
-rw-r--r-- | src/dsp/arm/loop_filter_10bit_neon.cc | 2 |
-rw-r--r-- | src/dsp/arm/loop_restoration_10bit_neon.cc | 14 |
-rw-r--r-- | src/dsp/arm/loop_restoration_neon.cc | 6 |
8 files changed, 59 insertions, 66 deletions
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc index 389f029..1aa0cc7 100644 --- a/src/dsp/arm/convolve_10bit_neon.cc +++ b/src/dsp/arm/convolve_10bit_neon.cc @@ -412,30 +412,21 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src, void* LIBGAV1_RESTRICT const dest, const ptrdiff_t pred_stride, const int width, const int height, const int16x4_t* const v_tap) { - assert(width < 8 || num_taps != 4); - // Don't simplify the redundant if conditions with the template parameters, - // which helps the compiler generate compact code. - if (width >= 8 && num_taps != 4) { - FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>( - src, src_stride, dest, pred_stride, width, height, v_tap); - return; - } - // Horizontal passes only needs to account for number of taps 2 and 4 when // |width| <= 4. assert(width <= 4); assert(num_taps == 2 || num_taps == 4); if (num_taps == 2 || num_taps == 4) { - if (width == 4) { - FilterHorizontalWidth4<num_taps, is_compound, is_2d>( - src, src_stride, dest, pred_stride, height, v_tap); - return; - } - assert(width == 2); - if (!is_compound) { + if (width == 2 && !is_compound) { FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest, pred_stride, height, v_tap); + return; } + assert(width == 4); + FilterHorizontalWidth4<num_taps, is_compound, is_2d>( + src, src_stride, dest, pred_stride, height, v_tap); + } else { + assert(false); } } @@ -454,19 +445,32 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass( v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]); } - if (filter_index == 2) { // 8 tap. - FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride, - width, height, v_tap); - } else if (filter_index < 2) { // 6 tap. - FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst, - dst_stride, width, height, v_tap); - } else if ((filter_index & 0x4) != 0) { // 4 tap. 
- // ((filter_index == 4) | (filter_index == 5)) - FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, - dst_stride, width, height, v_tap); - } else { // 2 tap. - FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst, - dst_stride, width, height, v_tap); + // Horizontal filter. + // Filter types used for width <= 4 are different from those for width > 4. + // When width > 4, the valid filter index range is always [0, 3]. + // When width <= 4, the valid filter index range is always [4, 5]. + if (width >= 8) { + if (filter_index == 2) { // 8 tap. + FilterHorizontalWidth8AndUp<8, is_compound, is_2d>( + src, src_stride, dst, dst_stride, width, height, v_tap); + } else if (filter_index < 2) { // 6 tap. + FilterHorizontalWidth8AndUp<6, is_compound, is_2d>( + src + 1, src_stride, dst, dst_stride, width, height, v_tap); + } else { // 2 tap. + assert(filter_index == 3); + FilterHorizontalWidth8AndUp<2, is_compound, is_2d>( + src + 3, src_stride, dst, dst_stride, width, height, v_tap); + } + } else { + if ((filter_index & 0x4) != 0) { // 4 tap. + // ((filter_index == 4) | (filter_index == 5)) + FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst, + dst_stride, width, height, v_tap); + } else { // 2 tap. 
+ assert(filter_index == 3); + FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst, + dst_stride, width, height, v_tap); + } } } diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc index 5b80da2..97b3f26 100644 --- a/src/dsp/arm/convolve_neon.cc +++ b/src/dsp/arm/convolve_neon.cc @@ -371,16 +371,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src, assert(width <= 4); assert(filter_index >= 3 && filter_index <= 5); if (filter_index >= 3 && filter_index <= 5) { - if (width == 4) { - FilterHorizontalWidth4<filter_index, is_2d, is_compound>( - src, src_stride, dest, pred_stride, height, v_tap); - return; - } - assert(width == 2); - if (!is_compound) { + if (width == 2 && !is_compound) { FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest, pred_stride, height, v_tap); + return; } + assert(width == 4); + FilterHorizontalWidth4<filter_index, is_2d, is_compound>( + src, src_stride, dest, pred_stride, height, v_tap); } } diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc index 76e1151..cde887c 100644 --- a/src/dsp/arm/film_grain_neon.cc +++ b/src/dsp/arm/film_grain_neon.cc @@ -682,26 +682,14 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low, template <int bitdepth, typename Pixel> inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], - const Pixel* source) { + const Pixel* source, + const int valid_range = 8) { int16_t start_vals[8]; static_assert(bitdepth <= kBitdepth10, "NEON Film Grain is not yet implemented for 12bpp."); #if LIBGAV1_MSAN - memset(start_vals, 0, sizeof(start_vals)); + if (valid_range < 8) memset(start_vals, 0, sizeof(start_vals)); #endif - for (int i = 0; i < 8; ++i) { - assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); - start_vals[i] = scaling_lut[source[i]]; - } - return vld1q_s16(start_vals); -} - -template <int bitdepth, typename Pixel> -inline int16x8_t GetScalingFactors(const int16_t scaling_lut[], - 
const Pixel* source, const int valid_range) { - int16_t start_vals[8]; - static_assert(bitdepth <= kBitdepth10, - "NEON Film Grain is not yet implemented for 12bpp."); for (int i = 0; i < valid_range; ++i) { assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8))); start_vals[i] = scaling_lut[source[i]]; diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc index e9bdcf0..d36ef5f 100644 --- a/src/dsp/arm/intrapred_directional_neon.cc +++ b/src/dsp/arm/intrapred_directional_neon.cc @@ -1752,7 +1752,7 @@ inline void DirectionalZone2FromLeftCol_8x8( const int index_scale_bits = 6; // The values in |offset_y| are negative, except for the first element, which // is zero. - int16x8_t offset_y = left_y; + int16x8_t offset_y; int16x8_t shift_upsampled = left_y; // The shift argument must be a constant, otherwise use upsample_shift // directly. diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc index 452f14a..cc4e4a4 100644 --- a/src/dsp/arm/inverse_transform_neon.cc +++ b/src/dsp/arm/inverse_transform_neon.cc @@ -345,11 +345,12 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a, int16x8_t* b, const int angle, const bool flip) { + // Clang < 14 targeting armv8.1-a+ optimizes vqrdmulhq_n_s16 and vqsubq_s16 + // (in HadamardRotation) into vqrdmlshq_s16 resulting in an "off by one" + // error. This behavior was fixed in 14.0.0: + // https://github.com/llvm/llvm-project/commit/82973edfb72a95b442fa6d2bb404e15a4031855e #if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \ - defined(__clang__) // ARM v8.1-A - // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into - // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use - // vqrdmulhq_n_s16(). 
+ defined(__clang__) && __clang_major__ < 14 const int16_t cos128 = Cos128(angle); const int16_t sin128 = Sin128(angle); const int32x4_t x0 = vmull_n_s16(vget_low_s16(*b), -sin128); diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc index a9dd98f..abdc074 100644 --- a/src/dsp/arm/loop_filter_10bit_neon.cc +++ b/src/dsp/arm/loop_filter_10bit_neon.cc @@ -444,7 +444,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride, const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); if (vget_lane_u64(need_filter6, 0) == 0) { // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { @@ -526,7 +525,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh, const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); if (vget_lane_u64(need_filter6, 0) == 0) { // Filter6() does not apply, but Filter4() applies to one or more values. - p0q0_output = p0q0; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { diff --git a/src/dsp/arm/loop_restoration_10bit_neon.cc b/src/dsp/arm/loop_restoration_10bit_neon.cc index 410bc20..9191080 100644 --- a/src/dsp/arm/loop_restoration_10bit_neon.cc +++ b/src/dsp/arm/loop_restoration_10bit_neon.cc @@ -1130,7 +1130,13 @@ inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index, const uint8x8_t idx = vqmovn_u16(index); uint8_t temp[8]; vst1_u8(temp, idx); - *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0); + // offset == 0 is assumed to be the first call to this function. The value is + // duplicated to avoid -Wuninitialized warnings under gcc. 
+ if (offset == 0) { + *ma = vdupq_n_u8(kSgrMaLookup[temp[0]]); + } else { + *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0); + } *ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1); *ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2); *ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3); @@ -1712,8 +1718,6 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3( s[0] = Load1QMsanU16(src + 0, overread_in_bytes + 0); s[1] = Load1QMsanU16(src + 8, overread_in_bytes + 16); Square(s[0], sq); - // Quiet "may be used uninitialized" warning. - mas[0] = mas[1] = vdupq_n_u8(0); BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); int x = 0; @@ -2067,8 +2071,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2( s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0); s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16); Square(s[0], sq); - // Quiet "may be used uninitialized" warning. - mas[0] = mas[1] = vdupq_n_u8(0); BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs); int x = 0; @@ -2255,8 +2257,6 @@ inline void BoxFilterLastRow( s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0); s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16); Square(s[0], sq); - // Quiet "may be used uninitialized" warning. - ma3[0] = ma3[1] = vdupq_n_u8(0); BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq, &ma3[0], &ma5[0], b3, b5); diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc index cd8552e..adb8f36 100644 --- a/src/dsp/arm/loop_restoration_neon.cc +++ b/src/dsp/arm/loop_restoration_neon.cc @@ -1125,7 +1125,11 @@ inline void CalculateIntermediate(const uint16x8_t sum, val = AdjustValue(val, idx, 101); // 101 is the last index which value is 3. val = AdjustValue(val, idx, 169); // 169 is the last index which value is 2. val = AdjustValue(val, idx, 254); // 254 is the last index which value is 1. - *ma = (offset == 0) ? 
vcombine_u8(val, vget_high_u8(*ma)) + // offset == 0 is assumed to be the first call to this function. Note + // vget_high_u8(*ma) is not used in this case to avoid a -Wuninitialized + // warning with some versions of gcc. vdup_n_u8(0) could work as well, but in + // most cases clang and gcc generated better code with this version. + *ma = (offset == 0) ? vcombine_u8(val, val) : vcombine_u8(vget_low_u8(*ma), val); // b = ma * b * one_over_n |