8 files changed, 59 insertions, 66 deletions
diff --git a/src/dsp/arm/convolve_10bit_neon.cc b/src/dsp/arm/convolve_10bit_neon.cc
index 389f029..1aa0cc7 100644
--- a/src/dsp/arm/convolve_10bit_neon.cc
+++ b/src/dsp/arm/convolve_10bit_neon.cc
@@ -412,30 +412,21 @@ void FilterHorizontal(const uint16_t* LIBGAV1_RESTRICT const src,
                       void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t pred_stride, const int width,
                       const int height, const int16x4_t* const v_tap) {
-  assert(width < 8 || num_taps != 4);
-  // Don't simplify the redundant if conditions with the template parameters,
-  // which helps the compiler generate compact code.
-  if (width >= 8 && num_taps != 4) {
-    FilterHorizontalWidth8AndUp<num_taps, is_compound, is_2d>(
-        src, src_stride, dest, pred_stride, width, height, v_tap);
-    return;
-  }
-
   // Horizontal passes only needs to account for number of taps 2 and 4 when
   // |width| <= 4.
   assert(width <= 4);
   assert(num_taps == 2 || num_taps == 4);
   if (num_taps == 2 || num_taps == 4) {
-    if (width == 4) {
-      FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
-          src, src_stride, dest, pred_stride, height, v_tap);
-      return;
-    }
-    assert(width == 2);
-    if (!is_compound) {
+    if (width == 2 && !is_compound) {
       FilterHorizontalWidth2<num_taps, is_2d>(src, src_stride, dest,
                                               pred_stride, height, v_tap);
+      return;
     }
+    assert(width == 4);
+    FilterHorizontalWidth4<num_taps, is_compound, is_2d>(
+        src, src_stride, dest, pred_stride, height, v_tap);
+  } else {
+    assert(false);
   }
 }
 
@@ -454,19 +445,32 @@ LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
     v_tap[k] = vdup_n_s16(kHalfSubPixelFilters[filter_index][filter_id][k]);
   }
 
-  if (filter_index == 2) {  // 8 tap.
-    FilterHorizontal<8, is_compound, is_2d>(src, src_stride, dst, dst_stride,
-                                            width, height, v_tap);
-  } else if (filter_index < 2) {  // 6 tap.
-    FilterHorizontal<6, is_compound, is_2d>(src + 1, src_stride, dst,
-                                            dst_stride, width, height, v_tap);
-  } else if ((filter_index & 0x4) != 0) {  // 4 tap.
-    // ((filter_index == 4) | (filter_index == 5))
-    FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
-                                            dst_stride, width, height, v_tap);
-  } else {  // 2 tap.
-    FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
-                                            dst_stride, width, height, v_tap);
+  // Horizontal filter.
+  // Filter types used for width <= 4 are different from those for width > 4.
+  // When width > 4, the valid filter index range is always [0, 3].
+  // When width <= 4, the valid filter index range is always [3, 5].
+  if (width >= 8) {
+    if (filter_index == 2) {  // 8 tap.
+      FilterHorizontalWidth8AndUp<8, is_compound, is_2d>(
+          src, src_stride, dst, dst_stride, width, height, v_tap);
+    } else if (filter_index < 2) {  // 6 tap.
+      FilterHorizontalWidth8AndUp<6, is_compound, is_2d>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
+    } else {  // 2 tap.
+      assert(filter_index == 3);
+      FilterHorizontalWidth8AndUp<2, is_compound, is_2d>(
+          src + 3, src_stride, dst, dst_stride, width, height, v_tap);
+    }
+  } else {
+    if ((filter_index & 0x4) != 0) {  // 4 tap.
+      // ((filter_index == 4) | (filter_index == 5))
+      FilterHorizontal<4, is_compound, is_2d>(src + 2, src_stride, dst,
+                                              dst_stride, width, height, v_tap);
+    } else {  // 2 tap.
+      assert(filter_index == 3);
+      FilterHorizontal<2, is_compound, is_2d>(src + 3, src_stride, dst,
+                                              dst_stride, width, height, v_tap);
+    }
   }
 }
 
diff --git a/src/dsp/arm/convolve_neon.cc b/src/dsp/arm/convolve_neon.cc
index 5b80da2..97b3f26 100644
--- a/src/dsp/arm/convolve_neon.cc
+++ b/src/dsp/arm/convolve_neon.cc
@@ -371,16 +371,14 @@ void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT const src,
   assert(width <= 4);
   assert(filter_index >= 3 && filter_index <= 5);
   if (filter_index >= 3 && filter_index <= 5) {
-    if (width == 4) {
-      FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
-          src, src_stride, dest, pred_stride, height, v_tap);
-      return;
-    }
-    assert(width == 2);
-    if (!is_compound) {
+    if (width == 2 && !is_compound) {
       FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
                                                   pred_stride, height, v_tap);
+      return;
     }
+    assert(width == 4);
+    FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+        src, src_stride, dest, pred_stride, height, v_tap);
   }
 }
 
diff --git a/src/dsp/arm/film_grain_neon.cc b/src/dsp/arm/film_grain_neon.cc
index 76e1151..cde887c 100644
--- a/src/dsp/arm/film_grain_neon.cc
+++ b/src/dsp/arm/film_grain_neon.cc
@@ -682,26 +682,14 @@ inline int16x8_t Clip3(const int16x8_t value, const int16x8_t low,
 
 template <int bitdepth, typename Pixel>
 inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
-                                   const Pixel* source) {
+                                   const Pixel* source,
+                                   const int valid_range = 8) {
   int16_t start_vals[8];
   static_assert(bitdepth <= kBitdepth10,
                 "NEON Film Grain is not yet implemented for 12bpp.");
 #if LIBGAV1_MSAN
-  memset(start_vals, 0, sizeof(start_vals));
+  if (valid_range < 8) memset(start_vals, 0, sizeof(start_vals));
 #endif
-  for (int i = 0; i < 8; ++i) {
-    assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
-    start_vals[i] = scaling_lut[source[i]];
-  }
-  return vld1q_s16(start_vals);
-}
-
-template <int bitdepth, typename Pixel>
-inline int16x8_t GetScalingFactors(const int16_t scaling_lut[],
-                                   const Pixel* source, const int valid_range) {
-  int16_t start_vals[8];
-  static_assert(bitdepth <= kBitdepth10,
-                "NEON Film Grain is not yet implemented for 12bpp.");
   for (int i = 0; i < valid_range; ++i) {
     assert(source[i] < (kScalingLookupTableSize << (bitdepth - kBitdepth8)));
     start_vals[i] = scaling_lut[source[i]];
diff --git a/src/dsp/arm/intrapred_directional_neon.cc b/src/dsp/arm/intrapred_directional_neon.cc
index e9bdcf0..d36ef5f 100644
--- a/src/dsp/arm/intrapred_directional_neon.cc
+++ b/src/dsp/arm/intrapred_directional_neon.cc
@@ -1752,7 +1752,7 @@ inline void DirectionalZone2FromLeftCol_8x8(
   const int index_scale_bits = 6;
   // The values in |offset_y| are negative, except for the first element, which
   // is zero.
-  int16x8_t offset_y = left_y;
+  int16x8_t offset_y;
   int16x8_t shift_upsampled = left_y;
   // The shift argument must be a constant, otherwise use upsample_shift
   // directly.
diff --git a/src/dsp/arm/inverse_transform_neon.cc b/src/dsp/arm/inverse_transform_neon.cc
index 452f14a..cc4e4a4 100644
--- a/src/dsp/arm/inverse_transform_neon.cc
+++ b/src/dsp/arm/inverse_transform_neon.cc
@@ -345,11 +345,12 @@ LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int16x8_t* a,
                                                          int16x8_t* b,
                                                          const int angle,
                                                          const bool flip) {
+  // Clang < 14 targeting armv8.1-a+ optimizes vqrdmulhq_n_s16 and vqsubq_s16
+  // (in HadamardRotation) into vqrdmlshq_s16 resulting in an "off by one"
+  // error. This behavior was fixed in 14.0.0:
+  // https://github.com/llvm/llvm-project/commit/82973edfb72a95b442fa6d2bb404e15a4031855e
 #if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) && \
-    defined(__clang__)  // ARM v8.1-A
-  // Clang optimizes vqrdmulhq_n_s16 and vqsubq_s16 (in HadamardRotation) into
-  // vqrdmlshq_s16 resulting in an "off by one" error. For now, do not use
-  // vqrdmulhq_n_s16().
+    defined(__clang__) && __clang_major__ < 14
   const int16_t cos128 = Cos128(angle);
   const int16_t sin128 = Sin128(angle);
   const int32x4_t x0 = vmull_n_s16(vget_low_s16(*b), -sin128);
diff --git a/src/dsp/arm/loop_filter_10bit_neon.cc b/src/dsp/arm/loop_filter_10bit_neon.cc
index a9dd98f..abdc074 100644
--- a/src/dsp/arm/loop_filter_10bit_neon.cc
+++ b/src/dsp/arm/loop_filter_10bit_neon.cc
@@ -444,7 +444,6 @@ void Horizontal6_NEON(void* const dest, const ptrdiff_t stride,
   const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
   if (vget_lane_u64(need_filter6, 0) == 0) {
     // Filter6() does not apply, but Filter4() applies to one or more values.
-    p0q0_output = p0q0;
     p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
     p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
   } else {
@@ -526,7 +525,6 @@ void Vertical6_NEON(void* const dest, const ptrdiff_t stride, int outer_thresh,
   const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
   if (vget_lane_u64(need_filter6, 0) == 0) {
     // Filter6() does not apply, but Filter4() applies to one or more values.
-    p0q0_output = p0q0;
     p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
     p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
   } else {
diff --git a/src/dsp/arm/loop_restoration_10bit_neon.cc b/src/dsp/arm/loop_restoration_10bit_neon.cc
index 410bc20..9191080 100644
--- a/src/dsp/arm/loop_restoration_10bit_neon.cc
+++ b/src/dsp/arm/loop_restoration_10bit_neon.cc
@@ -1130,7 +1130,13 @@ inline void LookupIntermediate(const uint16x8_t sum, const uint16x8_t index,
   const uint8x8_t idx = vqmovn_u16(index);
   uint8_t temp[8];
   vst1_u8(temp, idx);
-  *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+  // offset == 0 is assumed to be the first call to this function. The value is
+  // duplicated to avoid -Wuninitialized warnings under gcc.
+  if (offset == 0) {
+    *ma = vdupq_n_u8(kSgrMaLookup[temp[0]]);
+  } else {
+    *ma = vsetq_lane_u8(kSgrMaLookup[temp[0]], *ma, offset + 0);
+  }
   *ma = vsetq_lane_u8(kSgrMaLookup[temp[1]], *ma, offset + 1);
   *ma = vsetq_lane_u8(kSgrMaLookup[temp[2]], *ma, offset + 2);
   *ma = vsetq_lane_u8(kSgrMaLookup[temp[3]], *ma, offset + 3);
@@ -1712,8 +1718,6 @@ LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
   s[0] = Load1QMsanU16(src + 0, overread_in_bytes + 0);
   s[1] = Load1QMsanU16(src + 8, overread_in_bytes + 16);
   Square(s[0], sq);
-  // Quiet "may be used uninitialized" warning.
-  mas[0] = mas[1] = vdupq_n_u8(0);
   BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
 
   int x = 0;
@@ -2067,8 +2071,6 @@ LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
   s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
   s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
   Square(s[0], sq);
-  // Quiet "may be used uninitialized" warning.
-  mas[0] = mas[1] = vdupq_n_u8(0);
   BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
 
   int x = 0;
@@ -2255,8 +2257,6 @@ inline void BoxFilterLastRow(
   s[0] = Load1QMsanU16(src0 + 0, overread_in_bytes + 0);
   s[1] = Load1QMsanU16(src0 + 8, overread_in_bytes + 16);
   Square(s[0], sq);
-  // Quiet "may be used uninitialized" warning.
-  ma3[0] = ma3[1] = vdupq_n_u8(0);
   BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
                                sq, &ma3[0], &ma5[0], b3, b5);
 
diff --git a/src/dsp/arm/loop_restoration_neon.cc b/src/dsp/arm/loop_restoration_neon.cc
index cd8552e..adb8f36 100644
--- a/src/dsp/arm/loop_restoration_neon.cc
+++ b/src/dsp/arm/loop_restoration_neon.cc
@@ -1125,7 +1125,11 @@ inline void CalculateIntermediate(const uint16x8_t sum,
   val = AdjustValue(val, idx, 101);  // 101 is the last index which value is 3.
   val = AdjustValue(val, idx, 169);  // 169 is the last index which value is 2.
   val = AdjustValue(val, idx, 254);  // 254 is the last index which value is 1.
-  *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+  // offset == 0 is assumed to be the first call to this function. Note
+  // vget_high_u8(*ma) is not used in this case to avoid a -Wuninitialized
+  // warning with some versions of gcc. vdup_n_u8(0) could work as well, but in
+  // most cases clang and gcc generated better code with this version.
+  *ma = (offset == 0) ? vcombine_u8(val, val)
                       : vcombine_u8(vget_low_u8(*ma), val);
 
   // b = ma * b * one_over_n