1 files changed, 1117 insertions, 0 deletions
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644
index 0000000..bf21199
--- /dev/null
+++ b/src/utils/entropy_decoder.cc
@@ -0,0 +1,1117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. This function is monotonically
+// decreasing as the values of index increases (note that the |cdf| array is
+// sorted in decreasing order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+                  int index, int symbol_count) {
+  return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+         (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
+
+void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
+  const uint16_t count = cdf[symbol_count];
+  // rate is computed in the spec as:
+  //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+  // In this case cdf[N] is |count|.
+  // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+  // symbol_count > 3. So the equation becomes:
+  //  4 + (count > 15) + (count > 31) + (symbol_count > 3).
+  // Note that the largest value for count is 32 (it is not incremented beyond
+  // 32). So using that information:
+  //  count >> 4 is 0 for count from 0 to 15.
+  //  count >> 4 is 1 for count from 16 to 31.
+  //  count >> 4 is 2 for count == 31.
+  // Now, the equation becomes:
+  //  4 + (count >> 4) + (symbol_count > 3).
+  // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+  // with bitwise or:
+  //  (4 | (count >> 4)) + (symbol_count > 3).
+  // but using addition will allow the compiler to eliminate an operation when
+  // symbol_count is known and this function is inlined.
+  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+  // Hints for further optimizations:
+  //
+  // 1. clang can vectorize this for loop with width 4, even though the loop
+  // contains an if-else statement. Therefore, it may be advantageous to use
+  // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+  // (a multiple of 4 that's not too small).
+  //
+  // 2. The for loop can be rewritten in the following form, which would enable
+  // clang to vectorize the loop with width 8:
+  //
+  //   const int rounding = (1 << rate) - 1;
+  //   for (int i = 0; i < symbol_count - 1; ++i) {
+  //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+  //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+  //   }
+  //
+  // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+  // integer arithmetic. The result of the unsigned subtraction is cast to a
+  // signed integer and right-shifted. This requires the right shift of a
+  // signed integer be an arithmetic shift, which is true for clang, gcc, and
+  // Visual C++.
+  assert(symbol_count - 1 > 0);
+  int i = 0;
+  do {
+    if (i < symbol) {
+      cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+    } else {
+      cdf[i] -= cdf[i] >> rate;
+    }
+  } while (++i < symbol_count - 1);
+  cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     if (i < symbol) {
+//       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+//     } else {
+//       cdf[i] -= cdf[i] >> rate;
+//     }
+//   }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+//     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+//   }
+//
+// or:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+//     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+//   }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+//      cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1. When symbol_count is 8 or 16, the vectorized code updates the first
+//    symbol_count elements in the array.
+// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
+//    the cdf array. Since an invalid CDF value is written into cdf[7], the
+//    count in cdf[7] needs to be fixed up after the vectorized code.
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  uint16x4_t cdf_vec = vld1_u16(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+  const uint16x4_t index = vcreate_u16(0x0003000200010000);
+  const uint16x4_t symbol_vec = vdup_n_u16(symbol);
+  const uint16x4_t mask = vcge_u16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
+  const int16x4_t negative_rate = vdup_n_s16(-rate);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = vadd_u16(cdf_offset, delta);
+  vst1_u16(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                        vcreate_u16(0x0007000600050004));
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+  const uint16x8_t delta =
+      vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
+  const uint16_t count = cdf[11];
+  cdf[11] = count + static_cast<uint16_t>(count < 32);
+  const int rate = (count >> 4) + 5;
+  if (symbol > 1) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+    const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+    const int16x8_t negative_rate = vdupq_n_s16(-rate);
+    const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
+                                          vcreate_u16(0x0009000800070006));
+    const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+    const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+    const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+    const uint16x8_t delta =
+        vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+    cdf_vec = vaddq_u16(cdf_offset, delta);
+    vst1q_u16(cdf + 2, cdf_vec);
+  } else {
+    if (symbol != 0) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    }
+    const int16x8_t negative_rate = vdupq_n_s16(-rate);
+    const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
+    cdf_vec = vsubq_u16(cdf_vec, delta);
+    vst1q_u16(cdf + 2, cdf_vec);
+  }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+  uint16x8_t cdf_vec0 = vld1q_u16(cdf);
+  uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
+  const uint16_t count = cdf[13];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec0 = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec0);
+
+  index = vcombine_u16(vcreate_u16(0x0007000600050004),
+                       vcreate_u16(0x000b000a00090008));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+  cdf_offset = vsubq_u16(cdf_vec1, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec1 = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 4, cdf_vec1);
+
+  cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+
+  cdf_vec = vld1q_u16(cdf + 8);
+  index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+                       vcreate_u16(0x000f000e000d000c));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  cdf_offset = vsubq_u16(cdf_vec, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 8, cdf_vec);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec = LoadLo8(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+  // i >= symbol.
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  __m128i cdf_vec = LoadUnaligned16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i index =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreUnaligned16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec = LoadUnaligned16(cdf + 2);
+  const uint16_t count = cdf[11];
+  cdf[11] = count + static_cast<uint16_t>(count < 32);
+  const int rate = (count >> 4) + 5;
+  if (symbol > 1) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    const __m128i cdf_max_probability =
+        _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+    const __m128i index =
+        _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+    const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+    const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+    const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+    const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+    const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+    const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+    cdf_vec = _mm_add_epi16(cdf_offset, delta);
+    StoreUnaligned16(cdf + 2, cdf_vec);
+  } else {
+    if (symbol != 0) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    }
+    const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+    cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+    StoreUnaligned16(cdf + 2, cdf_vec);
+  }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec0 = LoadLo8(cdf);
+  __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
+  const uint16_t count = cdf[13];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec0);
+
+  const __m128i index1 =
+      _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+  const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+  StoreUnaligned16(cdf + 4, cdf_vec1);
+
+  cdf[13] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec0 = LoadUnaligned16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+
+  const __m128i index =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+  StoreUnaligned16(cdf, cdf_vec0);
+
+  __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
+  const __m128i index1 =
+      _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
+  const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
+  StoreUnaligned16(cdf + 8, cdf_vec1);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 5, symbol);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 7, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 8, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 9, symbol);
+}
+
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 11, symbol);
+}
+
+void UpdateCdf13(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 13, symbol);
+}
+
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 16, symbol);
+}
+
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+inline DaalaBitReader::WindowSize HostToBigEndian(
+    const DaalaBitReader::WindowSize x) {
+  static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+  return x;
+#endif
+#elif defined(_WIN32)
+  // Note Windows targets are assumed to be little endian.
+  return static_cast<DaalaBitReader::WindowSize>(
+      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+                       : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif  // defined(__GNUC__)
+}
+
+}  // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int DaalaBitReader::kWindowSize;  // static.
+#endif
+
+DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
+                               bool allow_update_cdf)
+    : data_(data),
+      data_end_(data + size),
+      data_memcpy_end_((size >= sizeof(WindowSize))
+                           ? data + size - sizeof(WindowSize) + 1
+                           : data),
+      allow_update_cdf_(allow_update_cdf),
+      values_in_range_(kCdfMaxProbability) {
+  if (data_ < data_memcpy_end_) {
+    // This is a simplified version of PopulateBits() which loads 8 extra bits
+    // and skips the unnecessary shifts of value and window_diff_.
+    WindowSize value;
+    memcpy(&value, data_, sizeof(value));
+    data_ += sizeof(value);
+    window_diff_ = HostToBigEndian(value) ^ -1;
+    // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+    // used to restore the most significant 0 bit that would be present after
+    // PopulateBits() when we extract the first symbol value.
+    // As shown in Section 8.2.2 Initialization process for symbol decoder,
+    // which uses a fixed offset to read the symbol values, the most
+    // significant bit is always 0:
+    //   The variable numBits is set equal to Min( sz * 8, 15).
+    //   The variable buf is read using the f(numBits) parsing process.
+    //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+    //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+    bits_ = kWindowSize - 15;
+    return;
+  }
+  window_diff_ = 0;
+  bits_ = -15;
+  PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized based
+// on the following facts:
+//   * The probability is fixed at half. So some multiplications can be replaced
+//     with bit operations.
+//   * Symbol count is fixed at 2.
+int DaalaBitReader::ReadBit() {
+  const uint32_t curr =
+      ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  int bit = 1;
+  if (symbol_value >= curr) {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+    bit = 0;
+  } else {
+    values_in_range_ = curr;
+  }
+  NormalizeRange();
+  return bit;
+}
+
+int64_t DaalaBitReader::ReadLiteral(int num_bits) {
+  assert(num_bits <= 32);
+  assert(num_bits > 0);
+  uint32_t literal = 0;
+  int bit = num_bits - 1;
+  do {
+    // ARM can combine a shift operation with a constant number of bits with
+    // some other operations, such as the OR operation.
+    // Here is an ARM disassembly example:
+    // orr w1, w0, w1, lsl #1
+    // which left shifts register w1 by 1 bit and OR the shift result with
+    // register w0.
+    // The next 2 lines are equivalent to:
+    // literal |= static_cast<uint32_t>(ReadBit()) << bit;
+    literal <<= 1;
+    literal |= static_cast<uint32_t>(ReadBit());
+  } while (--bit >= 0);
+  return literal;
+}
+
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) {
+  const int symbol = ReadSymbolImpl(cdf, symbol_count);
+  if (allow_update_cdf_) {
+    UpdateCdf(cdf, symbol_count, symbol);
+  }
+  return symbol;
+}
+
+bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
+  assert(cdf[1] == 0);
+  const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
+  if (allow_update_cdf_) {
+    const uint16_t count = cdf[2];
+    // rate is computed in the spec as:
+    //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+    // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
+    //  4 + (count > 15) + (count > 31)
+    // Note that the largest value for count is 32 (it is not incremented beyond
+    // 32). So using that information:
+    //  count >> 4 is 0 for count from 0 to 15.
+    //  count >> 4 is 1 for count from 16 to 31.
+    //  count >> 4 is 2 for count == 32.
+    // Now, the equation becomes:
+    //  4 + (count >> 4).
+    // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
+    // with bitwise or. So the final equation is:
+    //  4 | (count >> 4).
+    const int rate = 4 | (count >> 4);
+    if (symbol) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+    }
+    cdf[2] += static_cast<uint16_t>(count < 32);
+  }
+  return symbol;
+}
+
+bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+  return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
+  static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+  if (symbol_count == 3 || symbol_count == 4) {
+    return ReadSymbol3Or4(cdf, symbol_count);
+  }
+  int symbol;
+  if (symbol_count == 8) {
+    symbol = ReadSymbolImpl8(cdf);
+  } else if (symbol_count <= 13) {
+    symbol = ReadSymbolImpl(cdf, symbol_count);
+  } else {
+    symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+  }
+  if (allow_update_cdf_) {
+    if (symbol_count == 5) {
+      UpdateCdf5(cdf, symbol);
+    } else if (symbol_count == 7) {
+      UpdateCdf7(cdf, symbol);
+    } else if (symbol_count == 8) {
+      UpdateCdf8(cdf, symbol);
+    } else if (symbol_count == 9) {
+      UpdateCdf9(cdf, symbol);
+    } else if (symbol_count == 11) {
+      UpdateCdf11(cdf, symbol);
+    } else if (symbol_count == 13) {
+      UpdateCdf13(cdf, symbol);
+    } else if (symbol_count == 16) {
+      UpdateCdf16(cdf, symbol);
+    } else {
+      UpdateCdf(cdf, symbol_count, symbol);
+    }
+  }
+  return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf,
+                                   int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  --symbol_count;
+  uint32_t curr = values_in_range_;
+  int symbol = -1;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over.
+  do {
+    prev = curr;
+    curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    delta -= kMinimumProbabilityPerSymbol;
+  } while (symbol_value < curr);
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf,
+                                               int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  assert(symbol_count > 1 && symbol_count <= 16);
+  --symbol_count;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
+  // search to do this. Let |symbol| be the index of the first |cdf| array
+  // entry whose scaled cdf value is less than or equal to |symbol_value|. The
+  // binary search maintains the invariant:
+  //   low <= symbol <= high + 1
+  // and terminates when low == high + 1.
+  int low = 0;
+  int high = symbol_count - 1;
+  // The binary search maintains the invariants that |prev| is the scaled cdf
+  // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
+  // convention, the scaled cdf value for -1 is values_in_range_.) When the
+  // binary search terminates, |prev| is the scaled cdf value for symbol - 1
+  // and |curr| is the scaled cdf value for |symbol|.
+  uint32_t prev = values_in_range_;
+  uint32_t curr = 0;
+  const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+  do {
+    const int mid = DivideBy2(low + high);
+    const uint32_t scaled_cdf =
+        ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
+    if (symbol_value < scaled_cdf) {
+      low = mid + 1;
+      prev = scaled_cdf;
+    } else {
+      high = mid - 1;
+      curr = scaled_cdf;
+    }
+  } while (low <= high);
+  assert(low == high + 1);
+  // At this point, |low| is the symbol that has been decoded.
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return low;
+}
+
+int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) {
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  const uint32_t curr =
+      (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+      kMinimumProbabilityPerSymbol;
+  const int symbol = static_cast<int>(symbol_value < curr);
+  if (symbol == 1) {
+    values_in_range_ = curr;
+  } else {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  }
+  NormalizeRange();
+  return symbol;
+}
+
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
+// calls inlined.
+int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf,
+                                   const int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  uint32_t curr = values_in_range_;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
+  const uint32_t values_in_range_shifted = values_in_range_ >> 8;
+
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
+  // array.
+  //
+  // The original code is:
+  //
+  //  int symbol = -1;
+  //  do {
+  //    prev = curr;
+  //    curr =
+  //        ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+  //        + delta;
+  //    delta -= kMinimumProbabilityPerSymbol;
+  //  } while (symbol_value < curr);
+  //  if (allow_update_cdf_) {
+  //    UpdateCdf(cdf, [3,4], symbol);
+  //  }
+  //
+  // The do-while loop is unrolled with three or four iterations, and the
+  // UpdateCdf call is inlined and merged into the iterations.
+  int symbol = 0;
+  // Iteration 0.
+  prev = curr;
+  curr =
+      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) {
+    // symbol == 0.
+    if (allow_update_cdf_) {
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+        // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+        // NEON code is slower. Consider using the C version if __arm__ is
+        // defined.
+        // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+        // Samsung Galaxy S8+ (SM-G955FD).
+        uint16x4_t cdf_vec = vld1_u16(cdf);
+        const int16x4_t negative_rate = vdup_n_s16(-rate);
+        const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+        cdf_vec = vsub_u16(cdf_vec, delta);
+        vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        __m128i cdf_vec = LoadLo8(cdf);
+        const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+        cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+        StoreLo8(cdf, cdf_vec);
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+        cdf[2] -= cdf[2] >> rate;
+#endif
+      } else {  // symbol_count == 3.
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+      }
+    }
+    goto found;
+  }
+  ++symbol;
+  delta -= kMinimumProbabilityPerSymbol;
+  // Iteration 1.
+  prev = curr;
+  curr =
+      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) {
+    // symbol == 1.
+    if (allow_update_cdf_) {
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+      if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
+    }
+    goto found;
+  }
+  ++symbol;
+  if (symbol_count == 4) {
+    delta -= kMinimumProbabilityPerSymbol;
+    // Iteration 2.
+    prev = curr;
+    curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    if (symbol_value >= curr) {
+      // symbol == 2.
+      if (allow_update_cdf_) {
+        // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+        const uint16_t count = cdf[4];
+        cdf[4] += static_cast<uint16_t>(count < 32);
+        const int rate = (count >> 4) + 5;
+        cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+        cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+        cdf[2] -= cdf[2] >> rate;
+      }
+      goto found;
+    }
+    ++symbol;
+  }
+  // |delta| is 0 for the last iteration.
+  // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
+  prev = curr;
+  // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == [2,3].
+  if (allow_update_cdf_) {
+    // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+    const uint16_t count = cdf[symbol_count];
+    cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+    const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+    if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+      // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+      // code is a tiny bit slower. Consider using the C version if __arm__ is
+      // defined.
+      uint16x4_t cdf_vec = vld1_u16(cdf);
+      const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+      const int16x4_t diff =
+          vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+      const int16x4_t negative_rate = vdup_n_s16(-rate);
+      const uint16x4_t delta =
+          vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+      cdf_vec = vadd_u16(cdf_vec, delta);
+      vst1_u16(cdf, cdf_vec);
+      cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      __m128i cdf_vec = LoadLo8(cdf);
+      const __m128i cdf_max_probability =
+          _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+      const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+      const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+      cdf_vec = _mm_add_epi16(cdf_vec, delta);
+      StoreLo8(cdf, cdf_vec);
+      cdf[3] = 0;
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+      cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+#endif
+    } else {  // symbol_count == 3.
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    }
+  }
+found:
+  // End of unrolled do-while loop.
+
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) {
+  assert(cdf[7] == 0);
+  uint32_t curr = values_in_range_;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * 7;
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over.
+  //
+  // The original code is:
+  //
+  // int symbol = -1;
+  // do {
+  //   prev = curr;
+  //   curr =
+  //       (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
+  //       + delta;
+  //   delta -= kMinimumProbabilityPerSymbol;
+  // } while (symbol_value < curr);
+  //
+  // The do-while loop is unrolled with eight iterations.
+  int symbol = 0;
+
+#define READ_SYMBOL_ITERATION                                                \
+  prev = curr;                                                               \
+  curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
+         delta;                                                              \
+  if (symbol_value >= curr) goto found;                                      \
+  ++symbol;                                                                  \
+  delta -= kMinimumProbabilityPerSymbol
+
+  READ_SYMBOL_ITERATION;  // Iteration 0.
+  READ_SYMBOL_ITERATION;  // Iteration 1.
+  READ_SYMBOL_ITERATION;  // Iteration 2.
+  READ_SYMBOL_ITERATION;  // Iteration 3.
+  READ_SYMBOL_ITERATION;  // Iteration 4.
+  READ_SYMBOL_ITERATION;  // Iteration 5.
+
+  // The last two iterations can be simplified, so they don't use the
+  // READ_SYMBOL_ITERATION macro.
+#undef READ_SYMBOL_ITERATION
+
+  // Iteration 6.
+  prev = curr;
+  curr =
+      (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
+  if (symbol_value >= curr) goto found;  // symbol == 6.
+  ++symbol;
+  // |delta| is 0 for the last iteration.
+  // Iteration 7.
+  prev = curr;
+  // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == 7.
+found:
+  // End of unrolled do-while loop.
+
+  values_in_range_ = prev - curr;
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+  NormalizeRange();
+  return symbol;
+}
+
+void DaalaBitReader::PopulateBits() {
+  constexpr int kMaxCachedBits = kWindowSize - 16;
+#if defined(__aarch64__)
+  // Fast path: read eight bytes and add the first six bytes to window_diff_.
+  // This fast path makes the following assumptions.
+  // 1. We assume that unaligned load of uint64_t is fast.
+  // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
+  //    bytes depending on the value of bits_. This fast path always reads 6
+  //    bytes, which results in more calls to PopulateBits(). We assume that
+  //    making more calls to a faster PopulateBits() is overall a win.
+  // NOTE: Although this fast path could also be used on x86_64, it hurts
+  // performance (measured on Lenovo ThinkStation P920 running Linux). (The
+  // reason is still unknown.) Therefore this fast path is only used on arm64.
+  static_assert(kWindowSize == 64, "");
+  if (data_ < data_memcpy_end_) {
+    uint64_t value;
+    // arm64 supports unaligned loads, so this memcpy call is compiled to a
+    // single ldr instruction.
+    memcpy(&value, data_, sizeof(value));
+    data_ += kMaxCachedBits >> 3;
+    value = HostToBigEndian(value) ^ -1;
+    value >>= kWindowSize - kMaxCachedBits;
+    window_diff_ = value | (window_diff_ << kMaxCachedBits);
+    bits_ += kMaxCachedBits;
+    return;
+  }
+#endif
+
+  const uint8_t* data = data_;
+  int bits = bits_;
+  WindowSize window_diff = window_diff_;
+
+  int count = kWindowSize - 9 - (bits + 15);
+  // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
+  // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
+  // iterations when WindowSize is 64 bits. So it is not profitable to
+  // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
+  // the fast path above is not compiled.
+
+#ifdef __clang__
+#pragma clang loop vectorize(disable) interleave(disable)
+#endif
+  for (; count >= 0 && data < data_end_; count -= 8) {
+    const uint8_t value = *data++ ^ -1;
+    window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
+    bits += 8;
+  }
+  assert(bits <= kMaxCachedBits);
+  if (data == data_end_) {
+    // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+    window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+    bits = kMaxCachedBits;
+  }
+
+  data_ = data;
+  bits_ = bits;
+  window_diff_ = window_diff;
+}
+
+void DaalaBitReader::NormalizeRange() {
+  const int bits_used = 15 ^ FloorLog2(values_in_range_);
+  bits_ -= bits_used;
+  values_in_range_ <<= bits_used;
+  if (bits_ < 0) PopulateBits();
+}
+
+// Explicit instantiations.
+template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
+
+}  // namespace libgav1