From e8d277081293b6fb2a5d469616baaa7a06f52496 Mon Sep 17 00:00:00 2001
From: qinxialei
Date: Thu, 29 Oct 2020 11:26:59 +0800
Subject: Import Upstream version 0.16.0

---
 src/utils/array_2d.h                 |  131 ++++
 src/utils/bit_mask_set.h             |   79 +++
 src/utils/bit_reader.cc              |  117 ++++
 src/utils/bit_reader.h               |   49 ++
 src/utils/block_parameters_holder.cc |  107 ++++
 src/utils/block_parameters_holder.h  |   85 +++
 src/utils/blocking_counter.h         |   97 +++
 src/utils/common.h                   |  534 ++++++++++++++++
 src/utils/compiler_attributes.h      |  181 ++++++
 src/utils/constants.cc               |  874 ++++++++++++++++++++++++++
 src/utils/constants.h                |  744 ++++++++++++++++++++++++
 src/utils/cpu.cc                     |   84 +++
 src/utils/cpu.h                      |  107 ++++
 src/utils/dynamic_buffer.h           |   82 +++
 src/utils/entropy_decoder.cc         | 1117 ++++++++++++++++++++++++++++++++++
 src/utils/entropy_decoder.h          |  123 ++++
 src/utils/executor.cc                |   21 +
 src/utils/executor.h                 |   36 ++
 src/utils/libgav1_utils.cmake        |   72 +++
 src/utils/logging.cc                 |   65 ++
 src/utils/logging.h                  |   85 +++
 src/utils/memory.h                   |  237 ++++++++
 src/utils/parameter_tree.cc          |  133 ++++
 src/utils/parameter_tree.h           |  113 ++++
 src/utils/queue.h                    |  105 ++++
 src/utils/raw_bit_reader.cc          |  224 +++++++
 src/utils/raw_bit_reader.h           |   78 +++
 src/utils/reference_info.h           |   92 +++
 src/utils/segmentation.cc            |   31 +
 src/utils/segmentation.h             |   32 +
 src/utils/segmentation_map.cc        |   49 ++
 src/utils/segmentation_map.h         |   71 +++
 src/utils/stack.h                    |   59 ++
 src/utils/threadpool.cc              |  323 ++++++++++
 src/utils/threadpool.h               |  167 +++++
 src/utils/types.h                    |  525 ++++++++++++++++
 src/utils/unbounded_queue.h          |  245 ++++++++
 src/utils/vector.h                   |  352 +++++++++++
 38 files changed, 7626 insertions(+)
 create mode 100644 src/utils/array_2d.h
 create mode 100644 src/utils/bit_mask_set.h
 create mode 100644 src/utils/bit_reader.cc
 create mode 100644 src/utils/bit_reader.h
 create mode 100644 src/utils/block_parameters_holder.cc
 create mode 100644 src/utils/block_parameters_holder.h
 create mode 100644 src/utils/blocking_counter.h
 create mode 100644 src/utils/common.h
 create mode 100644 src/utils/compiler_attributes.h
 create mode 100644 src/utils/constants.cc
 create mode 100644 src/utils/constants.h
 create mode 100644 src/utils/cpu.cc
 create mode 100644 src/utils/cpu.h
 create mode 100644 src/utils/dynamic_buffer.h
 create mode 100644 src/utils/entropy_decoder.cc
 create mode 100644 src/utils/entropy_decoder.h
 create mode 100644 src/utils/executor.cc
 create mode 100644 src/utils/executor.h
 create mode 100644 src/utils/libgav1_utils.cmake
 create mode 100644 src/utils/logging.cc
 create mode 100644 src/utils/logging.h
 create mode 100644 src/utils/memory.h
 create mode 100644 src/utils/parameter_tree.cc
 create mode 100644 src/utils/parameter_tree.h
 create mode 100644 src/utils/queue.h
 create mode 100644 src/utils/raw_bit_reader.cc
 create mode 100644 src/utils/raw_bit_reader.h
 create mode 100644 src/utils/reference_info.h
 create mode 100644 src/utils/segmentation.cc
 create mode 100644 src/utils/segmentation.h
 create mode 100644 src/utils/segmentation_map.cc
 create mode 100644 src/utils/segmentation_map.h
 create mode 100644 src/utils/stack.h
 create mode 100644 src/utils/threadpool.cc
 create mode 100644 src/utils/threadpool.h
 create mode 100644 src/utils/types.h
 create mode 100644 src/utils/unbounded_queue.h
 create mode 100644 src/utils/vector.h

diff --git a/src/utils/array_2d.h b/src/utils/array_2d.h
new file mode 100644
index 0000000..2df6241
--- /dev/null
+++ b/src/utils/array_2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+#define LIBGAV1_SRC_UTILS_ARRAY_2D_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+// Exposes a 1D allocated memory buffer as a 2D array.
+template <typename T>
+class Array2DView {
+ public:
+  Array2DView() = default;
+  Array2DView(int rows, int columns, T* const data) {
+    Reset(rows, columns, data);
+  }
+
+  // Copyable and Movable.
+  Array2DView(const Array2DView& rhs) = default;
+  Array2DView& operator=(const Array2DView& rhs) = default;
+
+  void Reset(int rows, int columns, T* const data) {
+    rows_ = rows;
+    columns_ = columns;
+    data_ = data;
+  }
+
+  int rows() const { return rows_; }
+  int columns() const { return columns_; }
+
+  T* operator[](int row) { return const_cast<T*>(GetRow(row)); }
+
+  const T* operator[](int row) const { return GetRow(row); }
+
+ private:
+  const T* GetRow(int row) const {
+    assert(row < rows_);
+    const ptrdiff_t offset = static_cast<ptrdiff_t>(row) * columns_;
+    return data_ + offset;
+  }
+
+  int rows_ = 0;
+  int columns_ = 0;
+  T* data_ = nullptr;
+};
+
+// Allocates and owns the contiguous memory and exposes an Array2DView of
+// dimension |rows| x |columns|.
+template <typename T>
+class Array2D {
+ public:
+  Array2D() = default;
+
+  // Copyable and Movable.
+  Array2D(const Array2D& rhs) = default;
+  Array2D& operator=(const Array2D& rhs) = default;
+
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns,
+                                     bool zero_initialize = true) {
+    size_ = rows * columns;
+    // If T is not a trivial type, we should always reallocate the data_
+    // buffer, so that the destructors of any existing objects are invoked.
+    if (!std::is_trivial<T>::value || allocated_size_ < size_) {
+      // Note: This invokes the global operator new if T is a non-class type,
+      // such as integer or enum types, or a class type that is not derived
+      // from libgav1::Allocable, such as std::unique_ptr. If we enforce a
+      // maximum allocation size or keep track of our own heap memory
+      // consumption, we will need to handle the allocations here that use the
+      // global operator new.
+      if (zero_initialize) {
+        data_.reset(new (std::nothrow) T[size_]());
+      } else {
+        data_.reset(new (std::nothrow) T[size_]);
+      }
+      if (data_ == nullptr) {
+        allocated_size_ = 0;
+        return false;
+      }
+      allocated_size_ = size_;
+    } else if (zero_initialize) {
+      // Cast the data_ pointer to void* to avoid the GCC -Wclass-memaccess
+      // warning. The memset is safe because T is a trivial type.
+      void* dest = data_.get();
+      memset(dest, 0, sizeof(T) * size_);
+    }
+    data_view_.Reset(rows, columns, data_.get());
+    return true;
+  }
+
+  int rows() const { return data_view_.rows(); }
+  int columns() const { return data_view_.columns(); }
+  size_t size() const { return size_; }
+  T* data() { return data_.get(); }
+  const T* data() const { return data_.get(); }
+
+  T* operator[](int row) { return data_view_[row]; }
+
+  const T* operator[](int row) const { return data_view_[row]; }
+
+ private:
+  std::unique_ptr<T[]> data_ = nullptr;
+  size_t allocated_size_ = 0;
+  size_t size_ = 0;
+  Array2DView<T> data_view_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_ARRAY_2D_H_
diff --git a/src/utils/bit_mask_set.h b/src/utils/bit_mask_set.h
new file mode 100644
index 0000000..7371753
--- /dev/null
+++ b/src/utils/bit_mask_set.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+#define LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+// This class is used to check if a given value is equal to one of the several
+// predetermined values using a bit mask instead of a chain of comparisons and
+// ||s. This usually results in fewer instructions.
+//
+// Usage:
+//   constexpr BitMaskSet set(value1, value2);
+//   set.Contains(value1) => returns true.
+//   set.Contains(value3) => returns false.
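A small usage sketch of the equivalence described above (the kType* constants and IsExtendedType are illustrative, not part of this file):

  constexpr int kType1 = 1, kType3 = 3, kType7 = 7;  // illustrative values
  constexpr BitMaskSet kExtendedTypes(kType1, kType3, kType7);
  // Equivalent to: t == kType1 || t == kType3 || t == kType7, but compiles
  // to a single shift-and-test of the precomputed mask 0b10001010.
  bool IsExtendedType(int t) { return kExtendedTypes.Contains(t); }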
+class BitMaskSet { + public: + explicit constexpr BitMaskSet(uint32_t mask) : mask_(mask) {} + + constexpr BitMaskSet(int v1, int v2) : mask_((1U << v1) | (1U << v2)) {} + + constexpr BitMaskSet(int v1, int v2, int v3) + : mask_((1U << v1) | (1U << v2) | (1U << v3)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9)) {} + + constexpr BitMaskSet(int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10) + : mask_((1U << v1) | (1U << v2) | (1U << v3) | (1U << v4) | (1U << v5) | + (1U << v6) | (1U << v7) | (1U << v8) | (1U << v9) | (1U << v10)) { + } + + constexpr bool Contains(uint8_t value) const { + return MaskContainsValue(mask_, value); + } + + static constexpr bool MaskContainsValue(uint32_t mask, uint8_t value) { + return ((mask >> value) & 1) != 0; + } + + private: + const uint32_t mask_; +}; + +} // namespace libgav1 +#endif // LIBGAV1_SRC_UTILS_BIT_MASK_SET_H_ diff --git a/src/utils/bit_reader.cc b/src/utils/bit_reader.cc new file mode 100644 index 0000000..3234128 --- /dev/null +++ b/src/utils/bit_reader.cc @@ -0,0 +1,117 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/bit_reader.h" + +#include +#include + +#include "src/utils/common.h" + +namespace libgav1 { +namespace { + +bool Assign(int* const value, int assignment, bool return_value) { + *value = assignment; + return return_value; +} + +// 5.9.29. 
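As a worked trace of the inverse-recentering function that follows (the concrete values are chosen only for illustration):

  // InverseRecenter(5, 11) = 11                      (v > 2 * r: v is the value itself)
  // InverseRecenter(5, 3)  = 5 - ((3 + 1) >> 1) = 3  (odd v maps below the reference r)
  // InverseRecenter(5, 4)  = 5 + (4 >> 1) = 7        (even v maps above the reference r)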
+int InverseRecenter(int r, int v) { + if (v > (r << 1)) { + return v; + } + if ((v & 1) != 0) { + return r - ((v + 1) >> 1); + } + return r + (v >> 1); +} + +} // namespace + +bool BitReader::DecodeSignedSubexpWithReference(int low, int high, + int reference, int control, + int* const value) { + if (!DecodeUnsignedSubexpWithReference(high - low, reference - low, control, + value)) { + return false; + } + *value += low; + return true; +} + +bool BitReader::DecodeUniform(int n, int* const value) { + if (n <= 1) { + return Assign(value, 0, true); + } + const int w = FloorLog2(n) + 1; + const int m = (1 << w) - n; + assert(w - 1 < 32); + const int v = static_cast(ReadLiteral(w - 1)); + if (v == -1) { + return Assign(value, 0, false); + } + if (v < m) { + return Assign(value, v, true); + } + const int extra_bit = ReadBit(); + if (extra_bit == -1) { + return Assign(value, 0, false); + } + return Assign(value, (v << 1) - m + extra_bit, true); +} + +bool BitReader::DecodeUnsignedSubexpWithReference(int mx, int reference, + int control, + int* const value) { + int v; + if (!DecodeSubexp(mx, control, &v)) return false; + if ((reference << 1) <= mx) { + *value = InverseRecenter(reference, v); + } else { + *value = mx - 1 - InverseRecenter(mx - 1 - reference, v); + } + return true; +} + +bool BitReader::DecodeSubexp(int num_symbols, int control, int* const value) { + int i = 0; + int mk = 0; + while (true) { + const int b = (i != 0) ? control + i - 1 : control; + if (b >= 32) { + return Assign(value, 0, false); + } + const int a = 1 << b; + if (num_symbols <= mk + 3 * a) { + if (!DecodeUniform(num_symbols - mk, value)) return false; + *value += mk; + return true; + } + const int8_t subexp_more_bits = ReadBit(); + if (subexp_more_bits == -1) return false; + if (subexp_more_bits != 0) { + ++i; + mk += a; + } else { + const int subexp_bits = static_cast(ReadLiteral(b)); + if (subexp_bits == -1) { + return Assign(value, 0, false); + } + return Assign(value, subexp_bits + mk, true); + } + } +} + +} // namespace libgav1 diff --git a/src/utils/bit_reader.h b/src/utils/bit_reader.h new file mode 100644 index 0000000..5a10e12 --- /dev/null +++ b/src/utils/bit_reader.h @@ -0,0 +1,49 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BIT_READER_H_ +#define LIBGAV1_SRC_UTILS_BIT_READER_H_ + +#include + +namespace libgav1 { + +class BitReader { + public: + virtual ~BitReader() = default; + + virtual int ReadBit() = 0; + // |num_bits| has to be <= 32. The function returns a value in the range [0, + // 2^num_bits - 1] (inclusive) on success and -1 on failure. + virtual int64_t ReadLiteral(int num_bits) = 0; + + bool DecodeSignedSubexpWithReference(int low, int high, int reference, + int control, int* value); // 5.9.26. + // Decodes a nonnegative integer with maximum number of values |n| (i.e., + // output in range 0..n-1) by following the process specified in Section + // 4.10.7 ns(n) and Section 4.10.10 NS(n) of the spec. 
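A worked example of the ns(n) read that DecodeUniform() implements (the arithmetic follows from the function above; the concrete n is illustrative):

  // n = 5: w = FloorLog2(5) + 1 = 3 and m = (1 << 3) - 5 = 3. The decoder
  // first reads w - 1 = 2 bits as v; if v < 3 the result is v (2-bit codes
  // for 0..2). Otherwise it reads one extra bit e and returns
  // (v << 1) - m + e, giving 3-bit codes for the values 3 and 4, so shorter
  // codes go to the smaller values.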
+ bool DecodeUniform(int n, int* value); + + private: + // Helper functions for DecodeSignedSubexpWithReference. + bool DecodeUnsignedSubexpWithReference(int mx, int reference, int control, + int* value); // 5.9.27. + bool DecodeSubexp(int num_symbols, int control, int* value); // 5.9.28. +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BIT_READER_H_ diff --git a/src/utils/block_parameters_holder.cc b/src/utils/block_parameters_holder.cc new file mode 100644 index 0000000..3ccdb9b --- /dev/null +++ b/src/utils/block_parameters_holder.cc @@ -0,0 +1,107 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/block_parameters_holder.h" + +#include + +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/parameter_tree.h" +#include "src/utils/types.h" + +namespace libgav1 { + +namespace { + +// Returns the number of super block rows/columns for |value4x4| where value4x4 +// is either rows4x4 or columns4x4. +int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) { + return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127) + : DivideBy64(MultiplyBy4(value4x4) + 63); +} + +} // namespace + +bool BlockParametersHolder::Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + use_128x128_superblock_ = use_128x128_superblock; + if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) { + LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed."); + return false; + } + const int rows = + RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_); + const int columns = + RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_); + const BlockSize sb_size = + use_128x128_superblock_ ? kBlock128x128 : kBlock64x64; + const int multiplier = kNum4x4BlocksWide[sb_size]; + if (!trees_.Reset(rows, columns, /*zero_initialize=*/false)) { + LIBGAV1_DLOG(ERROR, "trees_.Reset() failed."); + return false; + } + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < columns; ++j) { + trees_[i][j] = + ParameterTree::Create(i * multiplier, j * multiplier, sb_size); + if (trees_[i][j] == nullptr) { + LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j); + return false; + } + } + } + return true; +} + +void BlockParametersHolder::FillCache(int row4x4, int column4x4, + BlockSize block_size, + BlockParameters* const bp) { + int rows = std::min(static_cast(kNum4x4BlocksHigh[block_size]), + rows4x4_ - row4x4); + const int columns = std::min(static_cast(kNum4x4BlocksWide[block_size]), + columns4x4_ - column4x4); + auto* bp_dst = &block_parameters_cache_[row4x4][column4x4]; + // Specialize columns cases (values in kNum4x4BlocksWide[]) for better + // performance. 
+ if (columns == 1) { + SetBlock(rows, 1, bp, bp_dst, columns4x4_); + } else if (columns == 2) { + SetBlock(rows, 2, bp, bp_dst, columns4x4_); + } else if (columns == 4) { + SetBlock(rows, 4, bp, bp_dst, columns4x4_); + } else if (columns == 8) { + SetBlock(rows, 8, bp, bp_dst, columns4x4_); + } else if (columns == 16) { + SetBlock(rows, 16, bp, bp_dst, columns4x4_); + } else if (columns == 32) { + SetBlock(rows, 32, bp, bp_dst, columns4x4_); + } else { + do { + // The following loop has better performance than using std::fill(). + // std::fill() has some overhead in checking zero loop count. + int x = columns; + auto* d = bp_dst; + do { + *d++ = bp; + } while (--x != 0); + bp_dst += columns4x4_; + } while (--rows != 0); + } +} + +} // namespace libgav1 diff --git a/src/utils/block_parameters_holder.h b/src/utils/block_parameters_holder.h new file mode 100644 index 0000000..35543c3 --- /dev/null +++ b/src/utils/block_parameters_holder.h @@ -0,0 +1,85 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ +#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ + +#include + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/parameter_tree.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters +// corresponding to a superblock. +class BlockParametersHolder { + public: + BlockParametersHolder() = default; + + // Not copyable or movable. + BlockParametersHolder(const BlockParametersHolder&) = delete; + BlockParametersHolder& operator=(const BlockParametersHolder&) = delete; + + // If |use_128x128_superblock| is true, 128x128 superblocks will be used, + // otherwise 64x64 superblocks will be used. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4, + bool use_128x128_superblock); + + // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This + // is done as a simple look up of the |block_parameters_cache_| matrix. + // Returns nullptr if the BlockParameters cannot be found. + BlockParameters* Find(int row4x4, int column4x4) const { + return block_parameters_cache_[row4x4][column4x4]; + } + + BlockParameters** Address(int row4x4, int column4x4) { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + + BlockParameters* const* Address(int row4x4, int column4x4) const { + return block_parameters_cache_.data() + row4x4 * columns4x4_ + column4x4; + } + + int columns4x4() const { return columns4x4_; } + + // Returns the ParameterTree corresponding to superblock starting at (|row|, + // |column|). + ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); } + + // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of + // size |block_size| with the pointer |bp|. 
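As a concrete illustration of the fill (unit counts taken from kNum4x4BlocksWide[]/kNum4x4BlocksHigh[] in constants.cc; the coordinates are illustrative):

  // FillCache(2, 4, kBlock8x16, bp) writes |bp| into the cache cells covering
  // rows [2, 6) and columns [4, 6) (kBlock8x16 spans 2x4 4x4 units), clipped
  // to the frame, so a later Find(3, 5) returns |bp| with a single lookup.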
+ void FillCache(int row4x4, int column4x4, BlockSize block_size, + BlockParameters* bp); + + private: + int rows4x4_ = 0; + int columns4x4_ = 0; + bool use_128x128_superblock_ = false; + Array2D> trees_; + + // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by + // FillCache() and used by Find() to perform look ups using exactly one look + // up (instead of traversing the entire tree). + Array2D block_parameters_cache_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_ diff --git a/src/utils/blocking_counter.h b/src/utils/blocking_counter.h new file mode 100644 index 0000000..6d664f8 --- /dev/null +++ b/src/utils/blocking_counter.h @@ -0,0 +1,97 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_ +#define LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_ + +#include +#include // NOLINT (unapproved c++11 header) +#include // NOLINT (unapproved c++11 header) + +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +// Implementation of a Blocking Counter that is used for the "fork-join" +// use case. Typical usage would be as follows: +// BlockingCounter counter(num_jobs); +// - spawn the jobs. +// - call counter.Wait() on the master thread. +// - worker threads will call counter.Decrement(). +// - master thread will return from counter.Wait() when all workers are +// complete. +template +class BlockingCounterImpl { + public: + explicit BlockingCounterImpl(int initial_count) + : count_(initial_count), job_failed_(false) {} + + // Increment the counter by |count|. This must be called before Wait() is + // called. This must be called from the same thread that will call Wait(). + void IncrementBy(int count) { + assert(count >= 0); + std::unique_lock lock(mutex_); + count_ += count; + } + + // Decrement the counter by 1. This function can be called only when + // |has_failure_status| is false (i.e.) when this class is being used with the + // |BlockingCounter| alias. + void Decrement() { + static_assert(!has_failure_status, ""); + std::unique_lock lock(mutex_); + if (--count_ == 0) { + condition_.notify_one(); + } + } + + // Decrement the counter by 1. This function can be called only when + // |has_failure_status| is true (i.e.) when this class is being used with the + // |BlockingCounterWithStatus| alias. |job_succeeded| is used to update the + // state of |job_failed_|. + void Decrement(bool job_succeeded) { + static_assert(has_failure_status, ""); + std::unique_lock lock(mutex_); + job_failed_ |= !job_succeeded; + if (--count_ == 0) { + condition_.notify_one(); + } + } + + // Block until the counter becomes 0. This function can be called only once + // per object. If |has_failure_status| is true, true is returned if all the + // jobs succeeded and false is returned if any of the jobs failed. If + // |has_failure_status| is false, this function always returns true. 
+ bool Wait() { + std::unique_lock lock(mutex_); + condition_.wait(lock, [this]() { return count_ == 0; }); + // If |has_failure_status| is false, we simply return true. + return has_failure_status ? !job_failed_ : true; + } + + private: + std::mutex mutex_; + std::condition_variable condition_; + int count_ LIBGAV1_GUARDED_BY(mutex_); + bool job_failed_ LIBGAV1_GUARDED_BY(mutex_); +}; + +using BlockingCounterWithStatus = BlockingCounterImpl; +using BlockingCounter = BlockingCounterImpl; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_BLOCKING_COUNTER_H_ diff --git a/src/utils/common.h b/src/utils/common.h new file mode 100644 index 0000000..ae43c2b --- /dev/null +++ b/src/utils/common.h @@ -0,0 +1,534 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_COMMON_H_ +#define LIBGAV1_SRC_UTILS_COMMON_H_ + +#if defined(_MSC_VER) +#include +#pragma intrinsic(_BitScanForward) +#pragma intrinsic(_BitScanReverse) +#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) +#pragma intrinsic(_BitScanReverse64) +#define HAVE_BITSCANREVERSE64 +#endif // defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) +#endif // defined(_MSC_VER) + +#include +#include +#include +#include +#include +#include + +#include "src/utils/bit_mask_set.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// Aligns |value| to the desired |alignment|. |alignment| must be a power of 2. +template +inline T Align(T value, T alignment) { + assert(alignment != 0); + const T alignment_mask = alignment - 1; + return (value + alignment_mask) & ~alignment_mask; +} + +// Aligns |addr| to the desired |alignment|. |alignment| must be a power of 2. +inline uint8_t* AlignAddr(uint8_t* const addr, const uintptr_t alignment) { + const auto value = reinterpret_cast(addr); + return reinterpret_cast(Align(value, alignment)); +} + +inline int32_t Clip3(int32_t value, int32_t low, int32_t high) { + return value < low ? low : (value > high ? high : value); +} + +template +void ExtendLine(void* const line_start, const int width, const int left, + const int right) { + auto* const start = static_cast(line_start); + const Pixel* src = start; + Pixel* dst = start - left; + // Copy to left and right borders. + Memset(dst, src[0], left); + Memset(dst + left + width, src[width - 1], right); +} + +// The following 2 templates set a block of data with uncontiguous memory to +// |value|. The compilers usually generate several branches to handle different +// cases of |columns| when inlining memset() and std::fill(), and these branches +// are unfortunately within the loop of |rows|. So calling these templates +// directly could be inefficient. It is recommended to specialize common cases +// of |columns|, such as 1, 2, 4, 8, 16 and 32, etc. in advance before +// processing the generic case of |columns|. The code size may be larger, but +// there would be big speed gains. 
+// Call template MemSetBlock<> when sizeof(|T|) is 1. +// Call template SetBlock<> when sizeof(|T|) is larger than 1. +template +void MemSetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + static_assert(sizeof(T) == 1, ""); + do { + memset(dst, value, columns); + dst += stride; + } while (--rows != 0); +} + +template +void SetBlock(int rows, int columns, T value, T* dst, ptrdiff_t stride) { + do { + std::fill(dst, dst + columns, value); + dst += stride; + } while (--rows != 0); +} + +#if defined(__GNUC__) + +inline int CountLeadingZeros(uint32_t n) { + assert(n != 0); + return __builtin_clz(n); +} + +inline int CountLeadingZeros(uint64_t n) { + assert(n != 0); + return __builtin_clzll(n); +} + +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + return __builtin_ctz(n); +} + +#elif defined(_MSC_VER) + +inline int CountLeadingZeros(uint32_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n); + assert(bit_set != 0); + static_cast(bit_set); + return 31 ^ static_cast(first_set_bit); +} + +inline int CountLeadingZeros(uint64_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) +#if defined(HAVE_BITSCANREVERSE64) + const unsigned char bit_set = + _BitScanReverse64(&first_set_bit, static_cast(n)); +#else // !defined(HAVE_BITSCANREVERSE64) + const auto n_hi = static_cast(n >> 32); // NOLINT(runtime/int) + if (n_hi != 0) { + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); + assert(bit_set != 0); + static_cast(bit_set); + return 31 ^ static_cast(first_set_bit); + } + const unsigned char bit_set = _BitScanReverse( + &first_set_bit, static_cast(n)); // NOLINT(runtime/int) +#endif // defined(HAVE_BITSCANREVERSE64) + assert(bit_set != 0); + static_cast(bit_set); + return 63 ^ static_cast(first_set_bit); +} + +#undef HAVE_BITSCANREVERSE64 + +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) + const unsigned char bit_set = _BitScanForward(&first_set_bit, n); + assert(bit_set != 0); + static_cast(bit_set); + return static_cast(first_set_bit); +} + +#else // !defined(__GNUC__) && !defined(_MSC_VER) + +template +inline int CountLeadingZeros(T n) { + assert(n != 0); + const T msb = T{1} << kMSB; + int count = 0; + while ((n & msb) == 0) { + ++count; + n <<= 1; + } + return count; +} + +inline int CountLeadingZeros(uint32_t n) { return CountLeadingZeros<31>(n); } + +inline int CountLeadingZeros(uint64_t n) { return CountLeadingZeros<63>(n); } + +// This is the algorithm on the left in Figure 5-23, Hacker's Delight, Second +// Edition, page 109. The book says: +// If the number of trailing 0's is expected to be small or large, then the +// simple loops shown in Figure 5-23 are quite fast. +inline int CountTrailingZeros(uint32_t n) { + assert(n != 0); + // Create a word with 1's at the positions of the trailing 0's in |n|, and + // 0's elsewhere (e.g., 01011000 => 00000111). 
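One step-by-step trace of that identity (8-bit value for readability; chosen for illustration):

  // n = 0b01011000 (88): n - 1 = 0b01010111 and ~n = 0b10100111, so
  // ~n & (n - 1) = 0b00000111. The loop then counts its 3 set bits, and
  // indeed 88 = 0b01011000 has 3 trailing zeros.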
+ n = ~n & (n - 1); + int count = 0; + while (n != 0) { + ++count; + n >>= 1; + } + return count; +} + +#endif // defined(__GNUC__) + +inline int FloorLog2(int32_t n) { + assert(n > 0); + return 31 ^ CountLeadingZeros(static_cast(n)); +} + +inline int FloorLog2(uint32_t n) { + assert(n > 0); + return 31 ^ CountLeadingZeros(n); +} + +inline int FloorLog2(int64_t n) { + assert(n > 0); + return 63 ^ CountLeadingZeros(static_cast(n)); +} + +inline int FloorLog2(uint64_t n) { + assert(n > 0); + return 63 ^ CountLeadingZeros(n); +} + +inline int CeilLog2(unsigned int n) { + // The expression FloorLog2(n - 1) + 1 is undefined not only for n == 0 but + // also for n == 1, so this expression must be guarded by the n < 2 test. An + // alternative implementation is: + // return (n == 0) ? 0 : FloorLog2(n) + static_cast((n & (n - 1)) != 0); + return (n < 2) ? 0 : FloorLog2(n - 1) + 1; +} + +inline int RightShiftWithCeiling(int value, int bits) { + assert(bits > 0); + return (value + (1 << bits) - 1) >> bits; +} + +inline int32_t RightShiftWithRounding(int32_t value, int bits) { + assert(bits >= 0); + return (value + ((1 << bits) >> 1)) >> bits; +} + +inline uint32_t RightShiftWithRounding(uint32_t value, int bits) { + assert(bits >= 0); + return (value + ((1 << bits) >> 1)) >> bits; +} + +// This variant is used when |value| can exceed 32 bits. Although the final +// result must always fit into int32_t. +inline int32_t RightShiftWithRounding(int64_t value, int bits) { + assert(bits >= 0); + return static_cast((value + ((int64_t{1} << bits) >> 1)) >> bits); +} + +inline int32_t RightShiftWithRoundingSigned(int32_t value, int bits) { + assert(bits > 0); + // The next line is equivalent to: + // return (value >= 0) ? RightShiftWithRounding(value, bits) + // : -RightShiftWithRounding(-value, bits); + return RightShiftWithRounding(value + (value >> 31), bits); +} + +// This variant is used when |value| can exceed 32 bits. Although the final +// result must always fit into int32_t. +inline int32_t RightShiftWithRoundingSigned(int64_t value, int bits) { + assert(bits > 0); + // The next line is equivalent to: + // return (value >= 0) ? RightShiftWithRounding(value, bits) + // : -RightShiftWithRounding(-value, bits); + return RightShiftWithRounding(value + (value >> 63), bits); +} + +constexpr int DivideBy2(int n) { return n >> 1; } +constexpr int DivideBy4(int n) { return n >> 2; } +constexpr int DivideBy8(int n) { return n >> 3; } +constexpr int DivideBy16(int n) { return n >> 4; } +constexpr int DivideBy32(int n) { return n >> 5; } +constexpr int DivideBy64(int n) { return n >> 6; } +constexpr int DivideBy128(int n) { return n >> 7; } + +// Convert |value| to unsigned before shifting to avoid undefined behavior with +// negative values. +inline int LeftShift(int value, int bits) { + assert(bits >= 0); + assert(value >= -(int64_t{1} << (31 - bits))); + assert(value <= (int64_t{1} << (31 - bits)) - ((bits == 0) ? 
1 : 0)); + return static_cast(static_cast(value) << bits); +} +inline int MultiplyBy2(int n) { return LeftShift(n, 1); } +inline int MultiplyBy4(int n) { return LeftShift(n, 2); } +inline int MultiplyBy8(int n) { return LeftShift(n, 3); } +inline int MultiplyBy16(int n) { return LeftShift(n, 4); } +inline int MultiplyBy32(int n) { return LeftShift(n, 5); } +inline int MultiplyBy64(int n) { return LeftShift(n, 6); } + +constexpr int Mod32(int n) { return n & 0x1f; } +constexpr int Mod64(int n) { return n & 0x3f; } + +//------------------------------------------------------------------------------ +// Bitstream functions + +constexpr bool IsIntraFrame(FrameType type) { + return type == kFrameKey || type == kFrameIntraOnly; +} + +inline TransformClass GetTransformClass(TransformType tx_type) { + constexpr BitMaskSet kTransformClassVerticalMask( + kTransformTypeIdentityDct, kTransformTypeIdentityAdst, + kTransformTypeIdentityFlipadst); + if (kTransformClassVerticalMask.Contains(tx_type)) { + return kTransformClassVertical; + } + constexpr BitMaskSet kTransformClassHorizontalMask( + kTransformTypeDctIdentity, kTransformTypeAdstIdentity, + kTransformTypeFlipadstIdentity); + if (kTransformClassHorizontalMask.Contains(tx_type)) { + return kTransformClassHorizontal; + } + return kTransformClass2D; +} + +inline int RowOrColumn4x4ToPixel(int row_or_column4x4, Plane plane, + int8_t subsampling) { + return MultiplyBy4(row_or_column4x4) >> (plane == kPlaneY ? 0 : subsampling); +} + +constexpr PlaneType GetPlaneType(Plane plane) { + return static_cast(plane != kPlaneY); +} + +// 5.11.44. +constexpr bool IsDirectionalMode(PredictionMode mode) { + return mode >= kPredictionModeVertical && mode <= kPredictionModeD67; +} + +// 5.9.3. +// +// |a| and |b| are order hints, treated as unsigned order_hint_bits-bit +// integers. |order_hint_shift_bits| equals (32 - order_hint_bits) % 32. +// order_hint_bits is at most 8, so |order_hint_shift_bits| is zero or a +// value between 24 and 31 (inclusive). +// +// If |order_hint_shift_bits| is zero, |a| and |b| are both zeros, and the +// result is zero. If |order_hint_shift_bits| is not zero, returns the +// signed difference |a| - |b| using "modular arithmetic". More precisely, the +// signed difference |a| - |b| is treated as a signed order_hint_bits-bit +// integer and cast to an int. The returned difference is between +// -(1 << (order_hint_bits - 1)) and (1 << (order_hint_bits - 1)) - 1 +// (inclusive). +// +// NOTE: |a| and |b| are the order_hint_bits least significant bits of the +// actual values. This function returns the signed difference between the +// actual values. The returned difference is correct as long as the actual +// values are not more than 1 << (order_hint_bits - 1) - 1 apart. +// +// Example: Suppose order_hint_bits is 4 and |order_hint_shift_bits| +// is 28. Then |a| and |b| are in the range [0, 15], and the actual values for +// |a| and |b| must not be more than 7 apart. (If the actual values for |a| and +// |b| are exactly 8 apart, this function cannot tell whether the actual value +// for |a| is before or after the actual value for |b|.) +// +// First, consider the order hints 2 and 6. For this simple case, we have +// GetRelativeDistance(2, 6, 28) = 2 - 6 = -4, and +// GetRelativeDistance(6, 2, 28) = 6 - 2 = 4. +// +// On the other hand, consider the order hints 2 and 14. The order hints are +// 12 (> 7) apart, so we need to use the actual values instead. The actual +// values may be 34 (= 2 mod 16) and 30 (= 14 mod 16), respectively. 
Therefore
+// we have
+//   GetRelativeDistance(2, 14, 28) = 34 - 30 = 4, and
+//   GetRelativeDistance(14, 2, 28) = 30 - 34 = -4.
+//
+// The following comments apply only to specific CPUs' SIMD implementations,
+// such as intrinsics code.
+// For the 2 shift operations in this function, if the SIMD packed data is
+// 16-bit wide, try to use |order_hint_shift_bits| - 16 as the number of bits
+// to shift; if the SIMD packed data is 8-bit wide, try to use
+// |order_hint_shift_bits| - 24 as the number of bits to shift.
+// |order_hint_shift_bits| - 16 and |order_hint_shift_bits| - 24 could be -16
+// or -24. In these cases diff is 0, and the behavior of left or right
+// shifting -16 or -24 bits is defined for x86 SIMD instructions and ARM NEON
+// instructions, and the result of shifting 0 is still 0. There is no
+// guarantee that this behavior and result apply to other CPUs' SIMD
+// instructions.
+inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
+                               const unsigned int order_hint_shift_bits) {
+  const int diff = a - b;
+  assert(order_hint_shift_bits <= 31);
+  if (order_hint_shift_bits == 0) {
+    assert(a == 0);
+    assert(b == 0);
+  } else {
+    assert(order_hint_shift_bits >= 24);  // i.e., order_hint_bits <= 8
+    assert(a < (1u << (32 - order_hint_shift_bits)));
+    assert(b < (1u << (32 - order_hint_shift_bits)));
+    assert(diff < (1 << (32 - order_hint_shift_bits)));
+    assert(diff >= -(1 << (32 - order_hint_shift_bits)));
+  }
+  // Sign extend the result of subtracting the values.
+  // Cast to unsigned int and then left shift to avoid undefined behavior with
+  // negative values. Cast to int to do the sign extension through right
+  // shift. This requires the right shift of a signed integer be an arithmetic
+  // shift, which is true for clang, gcc, and Visual C++.
+  // These two casts do not generate extra instructions.
+  // Don't use LeftShift(diff) since a valid diff may fail its assertions.
+  // For example, GetRelativeDistance(2, 14, 28), diff equals -12 and is less
+  // than the minimum allowed value of LeftShift() which is -8.
+  // The next 3 lines are equivalent to:
+  // const int order_hint_bits = Mod32(32 - order_hint_shift_bits);
+  // const int m = (1 << order_hint_bits) >> 1;
+  // return (diff & (m - 1)) - (diff & m);
+  return static_cast<int>(static_cast<unsigned int>(diff)
+                          << order_hint_shift_bits) >>
+         order_hint_shift_bits;
+}
+
+// Applies |sign| (must be 0 or -1) to |value|, i.e.,
+//   return (sign == 0) ? value : -value;
+// and does so without a branch.
+constexpr int ApplySign(int value, int sign) { return (value ^ sign) - sign; }
+
+// 7.9.3. (without the clamp for numerator and denominator).
+inline void GetMvProjection(const MotionVector& mv, int numerator,
+                            int division_multiplier,
+                            MotionVector* projection_mv) {
+  // Allow |numerator| to be 0 so that this function can be called
+  // unconditionally. When |numerator| is 0, |projection_mv| will be 0, and
+  // this is what we want.
+  assert(std::abs(numerator) <= kMaxFrameDistance);
+  for (int i = 0; i < 2; ++i) {
+    projection_mv->mv[i] =
+        Clip3(RightShiftWithRoundingSigned(
+                  mv.mv[i] * numerator * division_multiplier, 14),
+              -kProjectionMvClamp, kProjectionMvClamp);
+  }
+}
+
+// 7.9.4.
+constexpr int Project(int value, int delta, int dst_sign) {
+  return value + ApplySign(delta / 64, dst_sign);
+}
+
+inline bool IsBlockSmallerThan8x8(BlockSize size) {
+  return size < kBlock8x8 && size != kBlock4x16;
+}
+
+// Returns true if either the width or the height of the block is equal to
+// four.
+inline bool IsBlockDimension4(BlockSize size) { + return size < kBlock8x8 || size == kBlock16x4; +} + +// Converts bitdepth 8, 10, and 12 to array index 0, 1, and 2, respectively. +constexpr int BitdepthToArrayIndex(int bitdepth) { return (bitdepth - 8) >> 1; } + +// Maps a square transform to an index between [0, 4]. kTransformSize4x4 maps +// to 0, kTransformSize8x8 maps to 1 and so on. +inline int TransformSizeToSquareTransformIndex(TransformSize tx_size) { + assert(kTransformWidth[tx_size] == kTransformHeight[tx_size]); + + // The values of the square transform sizes happen to be in the right + // ranges, so we can just divide them by 4 to get the indexes. + static_assert( + std::is_unsigned::type>::value, ""); + static_assert(kTransformSize4x4 < 4, ""); + static_assert(4 <= kTransformSize8x8 && kTransformSize8x8 < 8, ""); + static_assert(8 <= kTransformSize16x16 && kTransformSize16x16 < 12, ""); + static_assert(12 <= kTransformSize32x32 && kTransformSize32x32 < 16, ""); + static_assert(16 <= kTransformSize64x64 && kTransformSize64x64 < 20, ""); + return DivideBy4(tx_size); +} + +// Gets the corresponding Y/U/V position, to set and get filter masks +// in deblock filtering. +// Returns luma_position if it's Y plane, whose subsampling must be 0. +// Returns the odd position for U/V plane, if there is subsampling. +constexpr int GetDeblockPosition(const int luma_position, + const int subsampling) { + return luma_position | subsampling; +} + +// Returns the size of the residual buffer required to hold the residual values +// for a block or frame of size |rows| by |columns| (taking into account +// |subsampling_x|, |subsampling_y| and |residual_size|). |residual_size| is the +// number of bytes required to represent one residual value. +inline size_t GetResidualBufferSize(const int rows, const int columns, + const int subsampling_x, + const int subsampling_y, + const size_t residual_size) { + // The subsampling multipliers are: + // Both x and y are subsampled: 3 / 2. + // Only x or y is subsampled: 2 / 1 (which is equivalent to 4 / 2). + // Both x and y are not subsampled: 3 / 1 (which is equivalent to 6 / 2). + // So we compute the final subsampling multiplier as follows: + // multiplier = (2 + (4 >> subsampling_x >> subsampling_y)) / 2. + // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary checks + // when parsing quantized coefficients. + const int subsampling_multiplier_num = + 2 + (4 >> subsampling_x >> subsampling_y); + const int number_elements = + (rows * columns * subsampling_multiplier_num) >> 1; + const int tx_padding = 32 * kResidualPaddingVertical; + return residual_size * (number_elements + tx_padding); +} + +// This function is equivalent to: +// std::min({kTransformWidthLog2[tx_size] - 2, +// kTransformWidthLog2[left_tx_size] - 2, +// 2}); +constexpr LoopFilterTransformSizeId GetTransformSizeIdWidth( + TransformSize tx_size, TransformSize left_tx_size) { + return static_cast( + static_cast(tx_size > kTransformSize4x16 && + left_tx_size > kTransformSize4x16) + + static_cast(tx_size > kTransformSize8x32 && + left_tx_size > kTransformSize8x32)); +} + +// This is used for 7.11.3.4 Block Inter Prediction Process, to select convolve +// filters. 
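The effect, spelled out with the interpolation-filter enum values from constants.h (a descriptive sketch of the function below, not additional behavior):

  // For a block dimension of 4 or less, the regular and sharp 8-tap filters
  // are remapped to index 4 and the smooth 8-tap filter to index 5 (the 4-tap
  // variants); for larger dimensions |filter_index| is returned unchanged:
  //   GetFilterIndex(kInterpolationFilterEightTap, 4)       == 4
  //   GetFilterIndex(kInterpolationFilterEightTapSharp, 4)  == 4
  //   GetFilterIndex(kInterpolationFilterEightTapSmooth, 4) == 5
  //   GetFilterIndex(kInterpolationFilterEightTap, 8)       == kInterpolationFilterEightTap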
+inline int GetFilterIndex(const int filter_index, const int length) { + if (length <= 4) { + if (filter_index == kInterpolationFilterEightTap || + filter_index == kInterpolationFilterEightTapSharp) { + return 4; + } + if (filter_index == kInterpolationFilterEightTapSmooth) { + return 5; + } + } + return filter_index; +} + +// This has identical results as RightShiftWithRounding since |subsampling| can +// only be 0 or 1. +constexpr int SubsampledValue(int value, int subsampling) { + return (value + subsampling) >> subsampling; +} + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_COMMON_H_ diff --git a/src/utils/compiler_attributes.h b/src/utils/compiler_attributes.h new file mode 100644 index 0000000..e122426 --- /dev/null +++ b/src/utils/compiler_attributes.h @@ -0,0 +1,181 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_ +#define LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_ + +// A collection of compiler attribute checks and defines to control for +// compatibility across toolchains. + +//------------------------------------------------------------------------------ +// Language version, attribute and feature helpers. + +// Detect c++17 support. Visual Studio sets __cplusplus to 199711L by default +// unless compiled with /Zc:__cplusplus, use the value controlled by /std +// instead. +// https://docs.microsoft.com/en-us/cpp/build/reference/zc-cplusplus +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define LIBGAV1_CXX17 1 +#else +#define LIBGAV1_CXX17 0 +#endif + +#if defined(__has_attribute) +#define LIBGAV1_HAS_ATTRIBUTE __has_attribute +#else +#define LIBGAV1_HAS_ATTRIBUTE(x) 0 +#endif + +#if defined(__has_feature) +#define LIBGAV1_HAS_FEATURE __has_feature +#else +#define LIBGAV1_HAS_FEATURE(x) 0 +#endif + +//------------------------------------------------------------------------------ +// Sanitizer attributes. + +#if LIBGAV1_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define LIBGAV1_ASAN 1 +#else +#define LIBGAV1_ASAN 0 +#endif + +#if LIBGAV1_HAS_FEATURE(memory_sanitizer) +#define LIBGAV1_MSAN 1 +#else +#define LIBGAV1_MSAN 0 +#endif + +#if LIBGAV1_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__) +#define LIBGAV1_TSAN 1 +#else +#define LIBGAV1_TSAN 0 +#endif + +//------------------------------------------------------------------------------ +// AddressSanitizer support. + +// Define the macros for AddressSanitizer manual memory poisoning. See +// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning. +#if LIBGAV1_ASAN +#include +#else +#define ASAN_POISON_MEMORY_REGION(addr, size) \ + (static_cast(addr), static_cast(size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + (static_cast(addr), static_cast(size)) +#endif + +//------------------------------------------------------------------------------ +// Function attributes. 
+// GCC: https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html +// Clang: https://clang.llvm.org/docs/AttributeReference.html + +#if defined(__GNUC__) +#define LIBGAV1_ALWAYS_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define LIBGAV1_ALWAYS_INLINE __forceinline +#else +#define LIBGAV1_ALWAYS_INLINE inline +#endif + +// LIBGAV1_MUST_USE_RESULT +// +// Tells the compiler to warn about unused results. +// +// When annotating a function, it must appear as the first part of the +// declaration or definition. The compiler will warn if the return value from +// such a function is unused: +// +// LIBGAV1_MUST_USE_RESULT Sprocket* AllocateSprocket(); +// AllocateSprocket(); // Triggers a warning. +// +// When annotating a class, it is equivalent to annotating every function which +// returns an instance. +// +// class LIBGAV1_MUST_USE_RESULT Sprocket {}; +// Sprocket(); // Triggers a warning. +// +// Sprocket MakeSprocket(); +// MakeSprocket(); // Triggers a warning. +// +// Note that references and pointers are not instances: +// +// Sprocket* SprocketPointer(); +// SprocketPointer(); // Does *not* trigger a warning. +// +// LIBGAV1_MUST_USE_RESULT allows using cast-to-void to suppress the unused +// result warning. For that, warn_unused_result is used only for clang but not +// for gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425 +#if LIBGAV1_HAS_ATTRIBUTE(nodiscard) +#define LIBGAV1_MUST_USE_RESULT [[nodiscard]] +#elif defined(__clang__) && LIBGAV1_HAS_ATTRIBUTE(warn_unused_result) +#define LIBGAV1_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define LIBGAV1_MUST_USE_RESULT +#endif + +// LIBGAV1_PRINTF_ATTRIBUTE +// +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +// +// Note: As the GCC manual states, "[s]ince non-static C++ methods +// have an implicit 'this' argument, the arguments of such methods +// should be counted from two, not one." +#if LIBGAV1_HAS_ATTRIBUTE(format) || (defined(__GNUC__) && !defined(__clang__)) +#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#else +#define LIBGAV1_PRINTF_ATTRIBUTE(string_index, first_to_check) +#endif + +//------------------------------------------------------------------------------ +// Thread annotations. + +// LIBGAV1_GUARDED_BY() +// +// Documents if a shared field or global variable needs to be protected by a +// mutex. LIBGAV1_GUARDED_BY() allows the user to specify a particular mutex +// that should be held when accessing the annotated variable. +// +// Although this annotation cannot be applied to local variables, a local +// variable and its associated mutex can often be combined into a small class +// or struct, thereby allowing the annotation. +// +// Example: +// +// class Foo { +// Mutex mu_; +// int p1_ LIBGAV1_GUARDED_BY(mu_); +// ... +// }; +// TODO(b/132506370): this can be reenabled after a local MutexLock +// implementation is added with proper thread annotations. 
+#if 0 // LIBGAV1_HAS_ATTRIBUTE(guarded_by) +#define LIBGAV1_GUARDED_BY(x) __attribute__((guarded_by(x))) +#else +#define LIBGAV1_GUARDED_BY(x) +#endif + +//------------------------------------------------------------------------------ + +#undef LIBGAV1_HAS_ATTRIBUTE +#undef LIBGAV1_HAS_FEATURE + +#endif // LIBGAV1_SRC_UTILS_COMPILER_ATTRIBUTES_H_ diff --git a/src/utils/constants.cc b/src/utils/constants.cc new file mode 100644 index 0000000..80d7acb --- /dev/null +++ b/src/utils/constants.cc @@ -0,0 +1,874 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/constants.h" + +namespace libgav1 { + +const uint8_t k4x4WidthLog2[kMaxBlockSizes] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5}; + +const uint8_t k4x4HeightLog2[kMaxBlockSizes] = { + 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 1, 2, 3, 4, 2, 3, 4, 5, 4, 5}; + +const uint8_t kNum4x4BlocksWide[kMaxBlockSizes] = { + 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 32, 32}; + +const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes] = { + 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16, 32, 16, 32}; + +const uint8_t kBlockWidthPixels[kMaxBlockSizes] = { + 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, + 16, 32, 32, 32, 32, 64, 64, 64, 64, 128, 128}; + +const uint8_t kBlockHeightPixels[kMaxBlockSizes] = { + 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, + 64, 8, 16, 32, 64, 16, 32, 64, 128, 64, 128}; + +// 9.3 -- Partition_Subsize[] +const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes] = { + // kPartitionNone + {kBlock4x4, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x64, kBlockInvalid, + kBlockInvalid, kBlock128x128}, + // kPartitionHorizontal + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionVertical + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x32, kBlockInvalid, + kBlockInvalid, kBlock64x64}, + // kPartitionHorizontalWithTopSplit + {kBlockInvalid, kBlockInvalid, 
kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionHorizontalWithBottomSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x32, kBlockInvalid, + kBlockInvalid, kBlock128x64}, + // kPartitionVerticalWithLeftSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionVerticalWithRightSplit + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x64, kBlockInvalid, + kBlockInvalid, kBlock64x128}, + // kPartitionHorizontal4 + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x4, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock32x8, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock64x16, kBlockInvalid, + kBlockInvalid, kBlockInvalid}, + // kPartitionVertical4 + {kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock4x16, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock8x32, + kBlockInvalid, kBlockInvalid, kBlockInvalid, kBlock16x64, kBlockInvalid, + kBlockInvalid, kBlockInvalid}}; + +// 5.11.38 (implemented as a simple look up. first dimension is block size, +// second and third are subsampling_x and subsampling_y). 
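A worked lookup against the table that follows (4:2:0 chroma means subsampling_x = subsampling_y = 1):

  // kPlaneResidualSize[kBlock8x8][1][1]  == kBlock4x4  (8x8 luma -> 4x4 chroma)
  // kPlaneResidualSize[kBlock8x16][1][1] == kBlock4x8
  // Entries are kBlockInvalid for combinations that cannot occur.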
+const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2] = { + {{kBlock4x4, kBlock4x4}, {kBlock4x4, kBlock4x4}}, + {{kBlock4x8, kBlock4x4}, {kBlockInvalid, kBlock4x4}}, + {{kBlock4x16, kBlock4x8}, {kBlockInvalid, kBlock4x8}}, + {{kBlock8x4, kBlockInvalid}, {kBlock4x4, kBlock4x4}}, + {{kBlock8x8, kBlock8x4}, {kBlock4x8, kBlock4x4}}, + {{kBlock8x16, kBlock8x8}, {kBlockInvalid, kBlock4x8}}, + {{kBlock8x32, kBlock8x16}, {kBlockInvalid, kBlock4x16}}, + {{kBlock16x4, kBlockInvalid}, {kBlock8x4, kBlock8x4}}, + {{kBlock16x8, kBlockInvalid}, {kBlock8x8, kBlock8x4}}, + {{kBlock16x16, kBlock16x8}, {kBlock8x16, kBlock8x8}}, + {{kBlock16x32, kBlock16x16}, {kBlockInvalid, kBlock8x16}}, + {{kBlock16x64, kBlock16x32}, {kBlockInvalid, kBlock8x32}}, + {{kBlock32x8, kBlockInvalid}, {kBlock16x8, kBlock16x4}}, + {{kBlock32x16, kBlockInvalid}, {kBlock16x16, kBlock16x8}}, + {{kBlock32x32, kBlock32x16}, {kBlock16x32, kBlock16x16}}, + {{kBlock32x64, kBlock32x32}, {kBlockInvalid, kBlock16x32}}, + {{kBlock64x16, kBlockInvalid}, {kBlock32x16, kBlock32x8}}, + {{kBlock64x32, kBlockInvalid}, {kBlock32x32, kBlock32x16}}, + {{kBlock64x64, kBlock64x32}, {kBlock32x64, kBlock32x32}}, + {{kBlock64x128, kBlock64x64}, {kBlockInvalid, kBlock32x64}}, + {{kBlock128x64, kBlockInvalid}, {kBlock64x64, kBlock64x32}}, + {{kBlock128x128, kBlock128x64}, {kBlock64x128, kBlock64x64}}}; + +const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1] = { + 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, + 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, + 744, 712, 682, 655, 630, 606, 585, 564, 546, 528}; + +const uint8_t kTransformWidth[kNumTransformSizes] = { + 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 32, 32, 32, 32, 64, 64, 64}; + +const uint8_t kTransformHeight[kNumTransformSizes] = { + 4, 8, 16, 4, 8, 16, 32, 4, 8, 16, 32, 64, 8, 16, 32, 64, 16, 32, 64}; + +const uint8_t kTransformWidth4x4[kNumTransformSizes] = { + 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16}; + +const uint8_t kTransformHeight4x4[kNumTransformSizes] = { + 1, 2, 4, 1, 2, 4, 8, 1, 2, 4, 8, 16, 2, 4, 8, 16, 4, 8, 16}; + +const uint8_t kTransformWidthLog2[kNumTransformSizes] = { + 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6}; + +const uint8_t kTransformHeightLog2[kNumTransformSizes] = { + 2, 3, 4, 2, 3, 4, 5, 2, 3, 4, 5, 6, 3, 4, 5, 6, 4, 5, 6}; + +// 9.3 -- Split_Tx_Size[] +const TransformSize kSplitTransformSize[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize4x4, kTransformSize4x8, + kTransformSize4x4, kTransformSize4x4, kTransformSize8x8, + kTransformSize8x16, kTransformSize8x4, kTransformSize8x8, + kTransformSize8x8, kTransformSize16x16, kTransformSize16x32, + kTransformSize16x8, kTransformSize16x16, kTransformSize16x16, + kTransformSize32x32, kTransformSize32x16, kTransformSize32x32, + kTransformSize32x32}; + +// Square transform of size min(w,h). +const TransformSize kTransformSizeSquareMin[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize4x4, kTransformSize4x4, + kTransformSize4x4, kTransformSize8x8, kTransformSize8x8, + kTransformSize8x8, kTransformSize4x4, kTransformSize8x8, + kTransformSize16x16, kTransformSize16x16, kTransformSize16x16, + kTransformSize8x8, kTransformSize16x16, kTransformSize32x32, + kTransformSize32x32, kTransformSize16x16, kTransformSize32x32, + kTransformSize64x64}; + +// Square transform of size max(w,h). 
+const TransformSize kTransformSizeSquareMax[kNumTransformSizes] = { + kTransformSize4x4, kTransformSize8x8, kTransformSize16x16, + kTransformSize8x8, kTransformSize8x8, kTransformSize16x16, + kTransformSize32x32, kTransformSize16x16, kTransformSize16x16, + kTransformSize16x16, kTransformSize32x32, kTransformSize64x64, + kTransformSize32x32, kTransformSize32x32, kTransformSize32x32, + kTransformSize64x64, kTransformSize64x64, kTransformSize64x64, + kTransformSize64x64}; + +const uint8_t kNumTransformTypesInSet[kNumTransformSets] = {1, 7, 5, 16, 12, 2}; + +const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4] = { + {2, 12, 1, 4}, {2, 15, 1, 6}, {2, 18, 1, 8}, {2, 21, 1, 9}, + {2, 24, 1, 10}, {2, 29, 1, 11}, {2, 36, 1, 12}, {2, 45, 1, 13}, + {2, 56, 1, 14}, {2, 68, 1, 15}, {0, 0, 1, 5}, {0, 0, 1, 8}, + {0, 0, 1, 11}, {0, 0, 1, 14}, {2, 30, 0, 0}, {2, 75, 0, 0}}; + +const int8_t kSgrProjMultiplierMin[2] = {-96, -32}; + +const int8_t kSgrProjMultiplierMax[2] = {31, 95}; + +const int8_t kWienerTapsMin[3] = {-5, -23, -17}; + +const int8_t kWienerTapsMax[3] = {10, 8, 46}; + +// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in +// order to support 16-bit packed NEON operations. +// The sign of each tap is: - + - + + - + - +alignas(16) const uint8_t + kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = { + {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, 1, 128, 2, 1, 0, 0}, + {0, 1, 3, 127, 4, 2, 1, 0}, {0, 1, 4, 127, 6, 3, 1, 0}, + {0, 2, 6, 126, 8, 3, 1, 0}, {0, 2, 7, 125, 11, 4, 1, 0}, + {1, 2, 8, 125, 13, 5, 2, 0}, {1, 3, 9, 124, 15, 6, 2, 0}, + {1, 3, 10, 123, 18, 6, 2, 1}, {1, 3, 11, 122, 20, 7, 3, 1}, + {1, 4, 12, 121, 22, 8, 3, 1}, {1, 4, 13, 120, 25, 9, 3, 1}, + {1, 4, 14, 118, 28, 9, 3, 1}, {1, 4, 15, 117, 30, 10, 4, 1}, + {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1}, + {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1}, + {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1}, + {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1}, + {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1}, + {1, 6, 20, 97, 58, 17, 6, 1}, {1, 6, 20, 95, 61, 18, 6, 1}, + {2, 7, 20, 93, 64, 18, 6, 2}, {2, 7, 20, 91, 66, 19, 6, 1}, + {2, 7, 20, 88, 69, 19, 6, 1}, {2, 7, 20, 86, 71, 19, 6, 1}, + {2, 7, 20, 84, 74, 20, 7, 2}, {2, 7, 20, 81, 76, 20, 7, 1}, + {2, 7, 20, 79, 79, 20, 7, 2}, {1, 7, 20, 76, 81, 20, 7, 2}, + {2, 7, 20, 74, 84, 20, 7, 2}, {1, 6, 19, 71, 86, 20, 7, 2}, + {1, 6, 19, 69, 88, 20, 7, 2}, {1, 6, 19, 66, 91, 20, 7, 2}, + {2, 6, 18, 64, 93, 20, 7, 2}, {1, 6, 18, 61, 95, 20, 6, 1}, + {1, 6, 17, 58, 97, 20, 6, 1}, {1, 6, 17, 56, 99, 20, 6, 1}, + {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1}, + {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1}, + {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1}, + {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1}, + {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1}, + {1, 3, 9, 28, 118, 14, 4, 1}, {1, 3, 9, 25, 120, 13, 4, 1}, + {1, 3, 8, 22, 121, 12, 4, 1}, {1, 3, 7, 20, 122, 11, 3, 1}, + {1, 2, 6, 18, 123, 10, 3, 1}, {0, 2, 6, 15, 124, 9, 3, 1}, + {0, 2, 5, 13, 125, 8, 2, 1}, {0, 1, 4, 11, 125, 7, 2, 0}, + {0, 1, 3, 8, 126, 6, 2, 0}, {0, 1, 3, 6, 127, 4, 1, 0}, + {0, 1, 2, 4, 127, 3, 1, 0}, {0, 0, 1, 2, 128, 1, 0, 0}, +}; + +alignas(8) const int8_t + kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8] = { + // [-1, 0). 
+ {0, 0, 127, 1, 0, 0, 0, 0}, + {0, -1, 127, 2, 0, 0, 0, 0}, + {1, -3, 127, 4, -1, 0, 0, 0}, + {1, -4, 126, 6, -2, 1, 0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, + {1, -6, 125, 11, -4, 1, 0, 0}, + {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -8, 123, 15, -5, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, + {2, -10, 121, 20, -6, 1, 0, 0}, + {2, -11, 120, 22, -7, 2, 0, 0}, + {2, -12, 119, 25, -8, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, + {3, -13, 116, 29, -9, 2, 0, 0}, + {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 113, 35, -10, 2, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, + {3, -16, 109, 40, -11, 3, 0, 0}, + {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 106, 45, -13, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, + {4, -17, 102, 50, -14, 3, 0, 0}, + {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 98, 55, -15, 4, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, + {4, -18, 94, 60, -16, 4, 0, 0}, + {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 89, 65, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, + {4, -18, 85, 70, -17, 4, 0, 0}, + {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 80, 75, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, + {4, -17, 75, 80, -18, 4, 0, 0}, + {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 70, 85, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, + {4, -16, 65, 89, -18, 4, 0, 0}, + {4, -16, 63, 91, -18, 4, 0, 0}, + {4, -16, 60, 94, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, + {4, -15, 55, 98, -18, 4, 0, 0}, + {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -14, 50, 102, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, + {3, -13, 45, 106, -17, 4, 0, 0}, + {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 40, 109, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, + {2, -10, 35, 113, -15, 3, 0, 0}, + {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -9, 29, 116, -13, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, + {2, -8, 25, 119, -12, 2, 0, 0}, + {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 20, 121, -10, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, + {1, -5, 15, 123, -8, 2, 0, 0}, + {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -4, 11, 125, -6, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, + {1, -2, 6, 126, -4, 1, 0, 0}, + {0, -1, 4, 127, -3, 1, 0, 0}, + {0, 0, 2, 127, -1, 0, 0, 0}, + // [0, 1). 
+ {0, 0, 0, 127, 1, 0, 0, 0}, + {0, 0, -1, 127, 2, 0, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, + {0, 1, -5, 127, 6, -2, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, + {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, + {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, + {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, + {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, + {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, + {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, + {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, + {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, + {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, + {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, + {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, + {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, + {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, + {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, + {-1, 2, -4, 11, 126, -7, 2, -1}, + {0, 1, -3, 8, 126, -6, 2, 0}, + {0, 1, -2, 6, 127, -5, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, + {0, 0, 0, 2, 127, -1, 0, 0}, + // [1, 2). 
+ {0, 0, 0, 1, 127, 0, 0, 0}, + {0, 0, 0, -1, 127, 2, 0, 0}, + {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -4, 126, 6, -2, 1}, + {0, 0, 1, -5, 126, 8, -3, 1}, + {0, 0, 1, -6, 125, 11, -4, 1}, + {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -8, 123, 15, -5, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, + {0, 0, 2, -10, 121, 20, -6, 1}, + {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 2, -12, 119, 25, -8, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, + {0, 0, 3, -13, 116, 29, -9, 2}, + {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 113, 35, -10, 2}, + {0, 0, 3, -15, 111, 37, -11, 3}, + {0, 0, 3, -16, 109, 40, -11, 3}, + {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 106, 45, -13, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, + {0, 0, 4, -17, 102, 50, -14, 3}, + {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 98, 55, -15, 4}, + {0, 0, 4, -18, 96, 58, -15, 3}, + {0, 0, 4, -18, 94, 60, -16, 4}, + {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 89, 65, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, + {0, 0, 4, -18, 85, 70, -17, 4}, + {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 80, 75, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, + {0, 0, 4, -17, 75, 80, -18, 4}, + {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 70, 85, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, + {0, 0, 4, -16, 65, 89, -18, 4}, + {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 4, -16, 60, 94, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, + {0, 0, 4, -15, 55, 98, -18, 4}, + {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -14, 50, 102, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, + {0, 0, 3, -13, 45, 106, -17, 4}, + {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 40, 109, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, + {0, 0, 2, -10, 35, 113, -15, 3}, + {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -9, 29, 116, -13, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, + {0, 0, 2, -8, 25, 119, -12, 2}, + {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 20, 121, -10, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, + {0, 0, 1, -5, 15, 123, -8, 2}, + {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -4, 11, 125, -6, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, + {0, 0, 1, -2, 6, 126, -4, 1}, + {0, 0, 0, -1, 4, 127, -3, 1}, + {0, 0, 0, 0, 2, 127, -1, 0}, + // dummy, replicate row index 191. + {0, 0, 0, 0, 2, 127, -1, 0}}; + +alignas(16) const int16_t + kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8] = { + // [-1, 0). 
+ {0, 0, 127, 1, 0, 0, 0, 0}, + {0, -1, 127, 2, 0, 0, 0, 0}, + {1, -3, 127, 4, -1, 0, 0, 0}, + {1, -4, 126, 6, -2, 1, 0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, + {1, -6, 125, 11, -4, 1, 0, 0}, + {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -8, 123, 15, -5, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, + {2, -10, 121, 20, -6, 1, 0, 0}, + {2, -11, 120, 22, -7, 2, 0, 0}, + {2, -12, 119, 25, -8, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, + {3, -13, 116, 29, -9, 2, 0, 0}, + {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 113, 35, -10, 2, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, + {3, -16, 109, 40, -11, 3, 0, 0}, + {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 106, 45, -13, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, + {4, -17, 102, 50, -14, 3, 0, 0}, + {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 98, 55, -15, 4, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, + {4, -18, 94, 60, -16, 4, 0, 0}, + {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 89, 65, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, + {4, -18, 85, 70, -17, 4, 0, 0}, + {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 80, 75, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, + {4, -17, 75, 80, -18, 4, 0, 0}, + {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 70, 85, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, + {4, -16, 65, 89, -18, 4, 0, 0}, + {4, -16, 63, 91, -18, 4, 0, 0}, + {4, -16, 60, 94, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, + {4, -15, 55, 98, -18, 4, 0, 0}, + {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -14, 50, 102, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, + {3, -13, 45, 106, -17, 4, 0, 0}, + {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 40, 109, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, + {2, -10, 35, 113, -15, 3, 0, 0}, + {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -9, 29, 116, -13, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, + {2, -8, 25, 119, -12, 2, 0, 0}, + {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 20, 121, -10, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, + {1, -5, 15, 123, -8, 2, 0, 0}, + {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -4, 11, 125, -6, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, + {1, -2, 6, 126, -4, 1, 0, 0}, + {0, -1, 4, 127, -3, 1, 0, 0}, + {0, 0, 2, 127, -1, 0, 0, 0}, + // [0, 1). 
+ {0, 0, 0, 127, 1, 0, 0, 0}, + {0, 0, -1, 127, 2, 0, 0, 0}, + {0, 1, -3, 127, 4, -2, 1, 0}, + {0, 1, -5, 127, 6, -2, 1, 0}, + {0, 2, -6, 126, 8, -3, 1, 0}, + {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, + {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, + {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, + {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, + {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, + {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, + {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, + {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, + {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, + {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, + {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, + {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, + {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, + {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, + {-1, 2, -4, 11, 126, -7, 2, -1}, + {0, 1, -3, 8, 126, -6, 2, 0}, + {0, 1, -2, 6, 127, -5, 1, 0}, + {0, 1, -2, 4, 127, -3, 1, 0}, + {0, 0, 0, 2, 127, -1, 0, 0}, + // [1, 2). 
+ {0, 0, 0, 1, 127, 0, 0, 0}, + {0, 0, 0, -1, 127, 2, 0, 0}, + {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -4, 126, 6, -2, 1}, + {0, 0, 1, -5, 126, 8, -3, 1}, + {0, 0, 1, -6, 125, 11, -4, 1}, + {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -8, 123, 15, -5, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, + {0, 0, 2, -10, 121, 20, -6, 1}, + {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 2, -12, 119, 25, -8, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, + {0, 0, 3, -13, 116, 29, -9, 2}, + {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 113, 35, -10, 2}, + {0, 0, 3, -15, 111, 37, -11, 3}, + {0, 0, 3, -16, 109, 40, -11, 3}, + {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 106, 45, -13, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, + {0, 0, 4, -17, 102, 50, -14, 3}, + {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 98, 55, -15, 4}, + {0, 0, 4, -18, 96, 58, -15, 3}, + {0, 0, 4, -18, 94, 60, -16, 4}, + {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 89, 65, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, + {0, 0, 4, -18, 85, 70, -17, 4}, + {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 80, 75, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, + {0, 0, 4, -17, 75, 80, -18, 4}, + {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 70, 85, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, + {0, 0, 4, -16, 65, 89, -18, 4}, + {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 4, -16, 60, 94, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, + {0, 0, 4, -15, 55, 98, -18, 4}, + {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -14, 50, 102, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, + {0, 0, 3, -13, 45, 106, -17, 4}, + {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 40, 109, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, + {0, 0, 2, -10, 35, 113, -15, 3}, + {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -9, 29, 116, -13, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, + {0, 0, 2, -8, 25, 119, -12, 2}, + {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 20, 121, -10, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, + {0, 0, 1, -5, 15, 123, -8, 2}, + {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -4, 11, 125, -6, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, + {0, 0, 1, -2, 6, 126, -4, 1}, + {0, 0, 0, -1, 4, 127, -3, 1}, + {0, 0, 0, 0, 2, 127, -1, 0}, + // dummy, replicate row index 191. + {0, 0, 0, 0, 2, 127, -1, 0}}; + +// Every value in |kSubPixelFilters| is even. Divide by 2 to simplify +// calculations by reducing the range by 1 bit. 
+alignas(8) const int8_t kHalfSubPixelFilters[6][16][8] = { + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, -3, 63, 4, -1, 0, 0}, + {0, 1, -5, 61, 9, -2, 0, 0}, + {0, 1, -6, 58, 14, -4, 1, 0}, + {0, 1, -7, 55, 19, -5, 1, 0}, + {0, 1, -7, 51, 24, -6, 1, 0}, + {0, 1, -8, 47, 29, -6, 1, 0}, + {0, 1, -7, 42, 33, -6, 1, 0}, + {0, 1, -7, 38, 38, -7, 1, 0}, + {0, 1, -6, 33, 42, -7, 1, 0}, + {0, 1, -6, 29, 47, -8, 1, 0}, + {0, 1, -6, 24, 51, -7, 1, 0}, + {0, 1, -5, 19, 55, -7, 1, 0}, + {0, 1, -4, 14, 58, -6, 1, 0}, + {0, 0, -2, 9, 61, -5, 1, 0}, + {0, 0, -1, 4, 63, -3, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 14, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, -1, 8, 27, 24, 6, 0, 0}, + {0, -1, 7, 26, 26, 7, -1, 0}, + {0, 0, 6, 24, 27, 8, -1, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 14, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {-1, 1, -3, 63, 4, -1, 1, 0}, + {-1, 3, -6, 62, 8, -3, 2, -1}, + {-1, 4, -9, 60, 13, -5, 3, -1}, + {-2, 5, -11, 58, 19, -7, 3, -1}, + {-2, 5, -11, 54, 24, -9, 4, -1}, + {-2, 5, -12, 50, 30, -10, 4, -1}, + {-2, 5, -12, 45, 35, -11, 5, -1}, + {-2, 6, -12, 40, 40, -12, 6, -2}, + {-1, 5, -11, 35, 45, -12, 5, -2}, + {-1, 4, -10, 30, 50, -12, 5, -2}, + {-1, 4, -9, 24, 54, -11, 5, -2}, + {-1, 3, -7, 19, 58, -11, 5, -2}, + {-1, 3, -5, 13, 60, -9, 4, -1}, + {-1, 2, -3, 8, 62, -6, 3, -1}, + {0, 1, -1, 4, 63, -3, 1, -1}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 0, 60, 4, 0, 0, 0}, + {0, 0, 0, 56, 8, 0, 0, 0}, + {0, 0, 0, 52, 12, 0, 0, 0}, + {0, 0, 0, 48, 16, 0, 0, 0}, + {0, 0, 0, 44, 20, 0, 0, 0}, + {0, 0, 0, 40, 24, 0, 0, 0}, + {0, 0, 0, 36, 28, 0, 0, 0}, + {0, 0, 0, 32, 32, 0, 0, 0}, + {0, 0, 0, 28, 36, 0, 0, 0}, + {0, 0, 0, 24, 40, 0, 0, 0}, + {0, 0, 0, 20, 44, 0, 0, 0}, + {0, 0, 0, 16, 48, 0, 0, 0}, + {0, 0, 0, 12, 52, 0, 0, 0}, + {0, 0, 0, 8, 56, 0, 0, 0}, + {0, 0, 0, 4, 60, 0, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, -2, 63, 4, -1, 0, 0}, + {0, 0, -4, 61, 9, -2, 0, 0}, + {0, 0, -5, 58, 14, -3, 0, 0}, + {0, 0, -6, 55, 19, -4, 0, 0}, + {0, 0, -6, 51, 24, -5, 0, 0}, + {0, 0, -7, 47, 29, -5, 0, 0}, + {0, 0, -6, 42, 33, -5, 0, 0}, + {0, 0, -6, 38, 38, -6, 0, 0}, + {0, 0, -5, 33, 42, -6, 0, 0}, + {0, 0, -5, 29, 47, -7, 0, 0}, + {0, 0, -5, 24, 51, -6, 0, 0}, + {0, 0, -4, 19, 55, -6, 0, 0}, + {0, 0, -3, 14, 58, -5, 0, 0}, + {0, 0, -2, 9, 61, -4, 0, 0}, + {0, 0, -1, 4, 63, -2, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 15, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 0, 7, 27, 24, 6, 0, 0}, + {0, 0, 6, 26, 26, 6, 0, 0}, + {0, 0, 6, 24, 27, 7, 0, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 15, 0, 0}}}; + +// Absolute values of |kHalfSubPixelFilters|. Used in situations where we know +// the pattern of the signs and account for it in other ways. 
+const uint8_t kAbsHalfSubPixelFilters[6][16][8] = { + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 3, 63, 4, 1, 0, 0}, + {0, 1, 5, 61, 9, 2, 0, 0}, + {0, 1, 6, 58, 14, 4, 1, 0}, + {0, 1, 7, 55, 19, 5, 1, 0}, + {0, 1, 7, 51, 24, 6, 1, 0}, + {0, 1, 8, 47, 29, 6, 1, 0}, + {0, 1, 7, 42, 33, 6, 1, 0}, + {0, 1, 7, 38, 38, 7, 1, 0}, + {0, 1, 6, 33, 42, 7, 1, 0}, + {0, 1, 6, 29, 47, 8, 1, 0}, + {0, 1, 6, 24, 51, 7, 1, 0}, + {0, 1, 5, 19, 55, 7, 1, 0}, + {0, 1, 4, 14, 58, 6, 1, 0}, + {0, 0, 2, 9, 61, 5, 1, 0}, + {0, 0, 1, 4, 63, 3, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 1, 14, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 1, 8, 27, 24, 6, 0, 0}, + {0, 1, 7, 26, 26, 7, 1, 0}, + {0, 0, 6, 24, 27, 8, 1, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 14, 1, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {1, 1, 3, 63, 4, 1, 1, 0}, + {1, 3, 6, 62, 8, 3, 2, 1}, + {1, 4, 9, 60, 13, 5, 3, 1}, + {2, 5, 11, 58, 19, 7, 3, 1}, + {2, 5, 11, 54, 24, 9, 4, 1}, + {2, 5, 12, 50, 30, 10, 4, 1}, + {2, 5, 12, 45, 35, 11, 5, 1}, + {2, 6, 12, 40, 40, 12, 6, 2}, + {1, 5, 11, 35, 45, 12, 5, 2}, + {1, 4, 10, 30, 50, 12, 5, 2}, + {1, 4, 9, 24, 54, 11, 5, 2}, + {1, 3, 7, 19, 58, 11, 5, 2}, + {1, 3, 5, 13, 60, 9, 4, 1}, + {1, 2, 3, 8, 62, 6, 3, 1}, + {0, 1, 1, 4, 63, 3, 1, 1}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 0, 60, 4, 0, 0, 0}, + {0, 0, 0, 56, 8, 0, 0, 0}, + {0, 0, 0, 52, 12, 0, 0, 0}, + {0, 0, 0, 48, 16, 0, 0, 0}, + {0, 0, 0, 44, 20, 0, 0, 0}, + {0, 0, 0, 40, 24, 0, 0, 0}, + {0, 0, 0, 36, 28, 0, 0, 0}, + {0, 0, 0, 32, 32, 0, 0, 0}, + {0, 0, 0, 28, 36, 0, 0, 0}, + {0, 0, 0, 24, 40, 0, 0, 0}, + {0, 0, 0, 20, 44, 0, 0, 0}, + {0, 0, 0, 16, 48, 0, 0, 0}, + {0, 0, 0, 12, 52, 0, 0, 0}, + {0, 0, 0, 8, 56, 0, 0, 0}, + {0, 0, 0, 4, 60, 0, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 2, 63, 4, 1, 0, 0}, + {0, 0, 4, 61, 9, 2, 0, 0}, + {0, 0, 5, 58, 14, 3, 0, 0}, + {0, 0, 6, 55, 19, 4, 0, 0}, + {0, 0, 6, 51, 24, 5, 0, 0}, + {0, 0, 7, 47, 29, 5, 0, 0}, + {0, 0, 6, 42, 33, 5, 0, 0}, + {0, 0, 6, 38, 38, 6, 0, 0}, + {0, 0, 5, 33, 42, 6, 0, 0}, + {0, 0, 5, 29, 47, 7, 0, 0}, + {0, 0, 5, 24, 51, 6, 0, 0}, + {0, 0, 4, 19, 55, 6, 0, 0}, + {0, 0, 3, 14, 58, 5, 0, 0}, + {0, 0, 2, 9, 61, 4, 0, 0}, + {0, 0, 1, 4, 63, 2, 0, 0}}, + {{0, 0, 0, 64, 0, 0, 0, 0}, + {0, 0, 15, 31, 17, 1, 0, 0}, + {0, 0, 13, 31, 18, 2, 0, 0}, + {0, 0, 11, 31, 20, 2, 0, 0}, + {0, 0, 10, 30, 21, 3, 0, 0}, + {0, 0, 9, 29, 22, 4, 0, 0}, + {0, 0, 8, 28, 23, 5, 0, 0}, + {0, 0, 7, 27, 24, 6, 0, 0}, + {0, 0, 6, 26, 26, 6, 0, 0}, + {0, 0, 6, 24, 27, 7, 0, 0}, + {0, 0, 5, 23, 28, 8, 0, 0}, + {0, 0, 4, 22, 29, 9, 0, 0}, + {0, 0, 3, 21, 30, 10, 0, 0}, + {0, 0, 2, 20, 31, 11, 0, 0}, + {0, 0, 2, 18, 31, 13, 0, 0}, + {0, 0, 1, 17, 31, 15, 0, 0}}}; + +// 9.3 -- Dr_Intra_Derivative[] +// This is a more compact version of the table from the spec. angle / 2 - 1 is +// used as the lookup. Note angle / 3 - 1 would work too, but the calculation +// becomes more costly. +const int16_t kDirectionalIntraPredictorDerivative[44] = { + // Approx angle + 1023, 0, // 3, ... + 547, // 6, ... + 372, 0, 0, // 9, ... + 273, // 14, ... + 215, 0, // 17, ... + 178, // 20, ... + 151, 0, // 23, ... (113 & 203 are base angles) + 132, // 26, ... + 116, 0, // 29, ... + 102, 0, // 32, ... + 90, // 36, ... + 80, 0, // 39, ... + 71, // 42, ... + 64, 0, // 45, ... 
(45 & 135 are base angles)
+    57,            // 48, ...
+    51, 0,         // 51, ...
+    45, 0,         // 54, ...
+    40,            // 58, ...
+    35, 0,         // 61, ...
+    31,            // 64, ...
+    27, 0,         // 67, ... (67 & 157 are base angles)
+    23,            // 70, ...
+    19, 0,         // 73, ...
+    15, 0,         // 76, ...
+    11, 0,         // 81, ...
+    7,             // 84, ...
+    3,             // 87, ...
+};
+
+const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
+    {0, 1}, {2, 2}, {3, 3}};
+
+}  // namespace libgav1
diff --git a/src/utils/constants.h b/src/utils/constants.h
new file mode 100644
index 0000000..34cf56d
--- /dev/null
+++ b/src/utils/constants.h
@@ -0,0 +1,744 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CONSTANTS_H_
+#define LIBGAV1_SRC_UTILS_CONSTANTS_H_
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/utils/bit_mask_set.h"
+
+namespace libgav1 {
+
+// Returns the number of elements between begin (inclusive) and end
+// (inclusive).
+constexpr int EnumRangeLength(int begin, int end) { return end - begin + 1; }
+
+enum {
+// Maximum number of threads that the library will ever create.
+#if defined(LIBGAV1_MAX_THREADS) && LIBGAV1_MAX_THREADS > 0
+  kMaxThreads = LIBGAV1_MAX_THREADS
+#else
+  kMaxThreads = 128
+#endif
+};  // anonymous enum
+
+enum {
+  kInvalidMvValue = -32768,
+  kCdfMaxProbability = 32768,
+  kBlockWidthCount = 5,
+  kMaxSegments = 8,
+  kMinQuantizer = 0,
+  kMinLossyQuantizer = 1,
+  kMaxQuantizer = 255,
+  // Quantizer matrix is used only when level < 15.
+  kNumQuantizerLevelsForQuantizerMatrix = 15,
+  kFrameLfCount = 4,
+  kMaxLoopFilterValue = 63,
+  kNum4x4In64x64 = 256,
+  kMaxAngleDelta = 3,
+  kDirectionalIntraModes = 8,
+  kMaxSuperBlockSizeLog2 = 7,
+  kMinSuperBlockSizeLog2 = 6,
+  kGlobalMotionReadControl = 3,
+  kSuperResScaleNumerator = 8,
+  kBooleanSymbolCount = 2,
+  kRestorationTypeSymbolCount = 3,
+  kSgrProjParamsBits = 4,
+  kSgrProjPrecisionBits = 7,
+  // Padding on left and right side of a restoration block.
+  // 3 is enough, but padding to 4 is more efficient, and makes the temporary
+  // source buffer 8-pixel aligned.
+  kRestorationHorizontalBorder = 4,
+  // Padding on top and bottom side of a restoration block.
+  kRestorationVerticalBorder = 2,
+  kCdefBorder = 2,             // Padding on each side of a cdef block.
+  kConvolveBorderLeftTop = 3,  // Left/top padding of a convolve block.
+  // Right/bottom padding of a convolve block. This needs to be 4 at minimum,
+  // but was increased to simplify the SIMD loads in
+  // ConvolveCompoundScale2D_NEON() and ConvolveScale2D_NEON().
+  kConvolveBorderRight = 8,
+  kConvolveBorderBottom = 4,
+  kSubPixelTaps = 8,
+  kWienerFilterBits = 7,
+  kWienerFilterTaps = 7,
+  kMaxPaletteSize = 8,
+  kMinPaletteSize = 2,
+  kMaxPaletteSquare = 64,
+  kBorderPixels = 64,
+  // The final blending process for film grain needs room to overwrite and
+  // read with SIMD instructions. The maximum overwrite is 7 pixels, but the
+  // border is required to be a multiple of 32 by YuvBuffer::Realloc, so that
+  // subsampled chroma borders are 16-aligned.
+  kBorderPixelsFilmGrain = 32,
+  // These constants are the minimum left, right, top, and bottom border sizes
+  // in pixels as an extension of the frame boundary. The minimum border sizes
+  // are derived from the following requirements:
+  // - Warp_C() may read up to 13 pixels before or after a row.
+  // - Warp_NEON() may read up to 13 pixels before a row. It may read up to 14
+  //   pixels after a row, but the value of the last read pixel is not used.
+  // - Warp_C() and Warp_NEON() may read up to 13 pixels above the top row and
+  //   13 pixels below the bottom row.
+  kMinLeftBorderPixels = 13,
+  kMinRightBorderPixels = 13,
+  kMinTopBorderPixels = 13,
+  kMinBottomBorderPixels = 13,
+  kWarpedModelPrecisionBits = 16,
+  kMaxRefMvStackSize = 8,
+  kMaxLeastSquaresSamples = 8,
+  kMaxTemporalMvCandidates = 19,
+  // The SIMD implementations of motion vector projection functions always
+  // process 2 or 4 elements together, so we pad the corresponding buffers to
+  // size 20.
+  kMaxTemporalMvCandidatesWithPadding = 20,
+  kMaxSuperBlockSizeInPixels = 128,
+  kMaxScaledSuperBlockSizeInPixels = 128 * 2,
+  kMaxSuperBlockSizeSquareInPixels = 128 * 128,
+  kNum4x4InLoopFilterUnit = 16,
+  kNum4x4InLoopRestorationUnit = 16,
+  kProjectionMvClamp = (1 << 14) - 1,  // == 16383
+  kProjectionMvMaxHorizontalOffset = 8,
+  kCdefUnitSize = 64,
+  kCdefUnitSizeWithBorders = kCdefUnitSize + 2 * kCdefBorder,
+  kRestorationUnitOffset = 8,
+  // Loop restoration's processing unit size is fixed as 64x64.
+  kRestorationUnitHeight = 64,
+  kRestorationUnitWidth = 256,
+  kRestorationUnitHeightWithBorders =
+      kRestorationUnitHeight + 2 * kRestorationVerticalBorder,
+  kRestorationUnitWidthWithBorders =
+      kRestorationUnitWidth + 2 * kRestorationHorizontalBorder,
+  kSuperResFilterBits = 6,
+  kSuperResFilterShifts = 1 << kSuperResFilterBits,
+  kSuperResFilterTaps = 8,
+  kSuperResScaleBits = 14,
+  kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
+  kSuperResScaleMask = (1 << 14) - 1,
+  kSuperResHorizontalBorder = 4,
+  kSuperResVerticalBorder = 1,
+  // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels which will over-read up to 15 downscaled pixels at the end of each
+  // row. Set the padding to 16 for alignment purposes.
+  kSuperResHorizontalPadding = 16,
+  // TODO(chengchen): consider merging these constants:
+  // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all
+  // 7. They are designed to match AV1 convolution, which increases coeff
+  // values by up to 7 bits. We could consider combining them and using
+  // kFilterBits only.
+  kFilterBits = 7,
+  // Sub pixel is used in AV1 to represent a pixel location that is not at an
+  // integer position. Sub pixel is in 1/16 (1 << kSubPixelBits) unit of
+  // integer pixel. Sub pixel values are interpolated using adjacent integer
+  // pixel values. The interpolation is a filtering process.
+  kSubPixelBits = 4,
+  kSubPixelMask = (1 << kSubPixelBits) - 1,
+  // Precision bits when computing inter prediction locations.
+  kScaleSubPixelBits = 10,
+  kWarpParamRoundingBits = 6,
+  // Number of fractional bits of lookup in divisor lookup table.
+  kDivisorLookupBits = 8,
+  // Number of fractional bits of entries in divisor lookup table.
+  kDivisorLookupPrecisionBits = 14,
+  // Number of phases used in warped filtering.
+ kWarpedPixelPrecisionShifts = 1 << 6, + kResidualPaddingVertical = 4, + kWedgeMaskMasterSize = 64, + kMaxFrameDistance = 31, + kReferenceFrameScalePrecision = 14, + kNumWienerCoefficients = 3, + kLoopFilterMaxModeDeltas = 2, + kMaxCdefStrengths = 8, + kCdefLargeValue = 0x4000, // Used to indicate where CDEF is not available. + kMaxTileColumns = 64, + kMaxTileRows = 64, + kMaxOperatingPoints = 32, + // There can be a maximum of 4 spatial layers and 8 temporal layers. + kMaxLayers = 32, + // The cache line size should ideally be queried at run time. 64 is a common + // cache line size of x86 CPUs. Web searches showed the cache line size of ARM + // CPUs is 32 or 64 bytes. So aligning to 64-byte boundary will work for all + // CPUs that we care about, even though it is excessive for some ARM + // CPUs. + // + // On Linux, the cache line size can be looked up with the command: + // getconf LEVEL1_DCACHE_LINESIZE + kCacheLineSize = 64, +}; // anonymous enum + +enum FrameType : uint8_t { + kFrameKey, + kFrameInter, + kFrameIntraOnly, + kFrameSwitch +}; + +enum Plane : uint8_t { kPlaneY, kPlaneU, kPlaneV }; +enum : uint8_t { kMaxPlanesMonochrome = kPlaneY + 1, kMaxPlanes = kPlaneV + 1 }; + +// The plane types, called luma and chroma in the spec. +enum PlaneType : uint8_t { kPlaneTypeY, kPlaneTypeUV, kNumPlaneTypes }; + +enum ReferenceFrameType : int8_t { + kReferenceFrameNone = -1, + kReferenceFrameIntra, + kReferenceFrameLast, + kReferenceFrameLast2, + kReferenceFrameLast3, + kReferenceFrameGolden, + kReferenceFrameBackward, + kReferenceFrameAlternate2, + kReferenceFrameAlternate, + kNumReferenceFrameTypes, + kNumInterReferenceFrameTypes = + EnumRangeLength(kReferenceFrameLast, kReferenceFrameAlternate), + kNumForwardReferenceTypes = + EnumRangeLength(kReferenceFrameLast, kReferenceFrameGolden), + kNumBackwardReferenceTypes = + EnumRangeLength(kReferenceFrameBackward, kReferenceFrameAlternate) +}; + +enum { + // Unidirectional compound reference pairs that are signaled explicitly: + // {kReferenceFrameLast, kReferenceFrameLast2}, + // {kReferenceFrameLast, kReferenceFrameLast3}, + // {kReferenceFrameLast, kReferenceFrameGolden}, + // {kReferenceFrameBackward, kReferenceFrameAlternate} + kExplicitUnidirectionalCompoundReferences = 4, + // Other unidirectional compound reference pairs: + // {kReferenceFrameLast2, kReferenceFrameLast3}, + // {kReferenceFrameLast2, kReferenceFrameGolden}, + // {kReferenceFrameLast3, kReferenceFrameGolden}, + // {kReferenceFrameBackward, kReferenceFrameAlternate2}, + // {kReferenceFrameAlternate2, kReferenceFrameAlternate} + kUnidirectionalCompoundReferences = + kExplicitUnidirectionalCompoundReferences + 5, +}; // anonymous enum + +enum BlockSize : uint8_t { + kBlock4x4, + kBlock4x8, + kBlock4x16, + kBlock8x4, + kBlock8x8, + kBlock8x16, + kBlock8x32, + kBlock16x4, + kBlock16x8, + kBlock16x16, + kBlock16x32, + kBlock16x64, + kBlock32x8, + kBlock32x16, + kBlock32x32, + kBlock32x64, + kBlock64x16, + kBlock64x32, + kBlock64x64, + kBlock64x128, + kBlock128x64, + kBlock128x128, + kMaxBlockSizes, + kBlockInvalid +}; + +// Partition types. 
R: Recursive +// +// None Horizontal Vertical Split +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// Horizontal Horizontal Vertical Vertical +// with top with bottom with left with right +// split split split split +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// Horizontal4 Vertical4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum Partition : uint8_t { + kPartitionNone, + kPartitionHorizontal, + kPartitionVertical, + kPartitionSplit, + kPartitionHorizontalWithTopSplit, + kPartitionHorizontalWithBottomSplit, + kPartitionVerticalWithLeftSplit, + kPartitionVerticalWithRightSplit, + kPartitionHorizontal4, + kPartitionVertical4 +}; +enum : uint8_t { kMaxPartitionTypes = kPartitionVertical4 + 1 }; + +enum PredictionMode : uint8_t { + // Intra prediction modes. + kPredictionModeDc, + kPredictionModeVertical, + kPredictionModeHorizontal, + kPredictionModeD45, + kPredictionModeD135, + kPredictionModeD113, + kPredictionModeD157, + kPredictionModeD203, + kPredictionModeD67, + kPredictionModeSmooth, + kPredictionModeSmoothVertical, + kPredictionModeSmoothHorizontal, + kPredictionModePaeth, + kPredictionModeChromaFromLuma, + // Single inter prediction modes. + kPredictionModeNearestMv, + kPredictionModeNearMv, + kPredictionModeGlobalMv, + kPredictionModeNewMv, + // Compound inter prediction modes. + kPredictionModeNearestNearestMv, + kPredictionModeNearNearMv, + kPredictionModeNearestNewMv, + kPredictionModeNewNearestMv, + kPredictionModeNearNewMv, + kPredictionModeNewNearMv, + kPredictionModeGlobalGlobalMv, + kPredictionModeNewNewMv, + kNumPredictionModes, + kNumCompoundInterPredictionModes = + EnumRangeLength(kPredictionModeNearestNearestMv, kPredictionModeNewNewMv), + kIntraPredictionModesY = + EnumRangeLength(kPredictionModeDc, kPredictionModePaeth), + kIntraPredictionModesUV = + EnumRangeLength(kPredictionModeDc, kPredictionModeChromaFromLuma), + kPredictionModeInvalid = 255 +}; + +enum InterIntraMode : uint8_t { + kInterIntraModeDc, + kInterIntraModeVertical, + kInterIntraModeHorizontal, + kInterIntraModeSmooth, + kNumInterIntraModes +}; + +enum MotionMode : uint8_t { + kMotionModeSimple, + kMotionModeObmc, // Overlapped block motion compensation. + kMotionModeLocalWarp, + kNumMotionModes +}; + +enum TxMode : uint8_t { + kTxModeOnly4x4, + kTxModeLargest, + kTxModeSelect, + kNumTxModes +}; + +// These enums are named as kType1Type2 where Type1 is the transform type for +// the rows and Type2 is the transform type for the columns. 
+enum TransformType : uint8_t { + kTransformTypeDctDct, + kTransformTypeAdstDct, + kTransformTypeDctAdst, + kTransformTypeAdstAdst, + kTransformTypeFlipadstDct, + kTransformTypeDctFlipadst, + kTransformTypeFlipadstFlipadst, + kTransformTypeAdstFlipadst, + kTransformTypeFlipadstAdst, + kTransformTypeIdentityIdentity, + kTransformTypeIdentityDct, + kTransformTypeDctIdentity, + kTransformTypeIdentityAdst, + kTransformTypeAdstIdentity, + kTransformTypeIdentityFlipadst, + kTransformTypeFlipadstIdentity, + kNumTransformTypes +}; + +constexpr BitMaskSet kTransformFlipColumnsMask(kTransformTypeFlipadstDct, + kTransformTypeFlipadstAdst, + kTransformTypeFlipadstIdentity, + kTransformTypeFlipadstFlipadst); +constexpr BitMaskSet kTransformFlipRowsMask(kTransformTypeDctFlipadst, + kTransformTypeAdstFlipadst, + kTransformTypeIdentityFlipadst, + kTransformTypeFlipadstFlipadst); + +enum TransformSize : uint8_t { + kTransformSize4x4, + kTransformSize4x8, + kTransformSize4x16, + kTransformSize8x4, + kTransformSize8x8, + kTransformSize8x16, + kTransformSize8x32, + kTransformSize16x4, + kTransformSize16x8, + kTransformSize16x16, + kTransformSize16x32, + kTransformSize16x64, + kTransformSize32x8, + kTransformSize32x16, + kTransformSize32x32, + kTransformSize32x64, + kTransformSize64x16, + kTransformSize64x32, + kTransformSize64x64, + kNumTransformSizes +}; + +enum TransformSet : uint8_t { + // DCT Only (1). + kTransformSetDctOnly, + // 2D-DCT and 2D-ADST without flip (4) + Identity (1) + 1D Horizontal/Vertical + // DCT (2) = Total (7). + kTransformSetIntra1, + // 2D-DCT and 2D-ADST without flip (4) + Identity (1) = Total (5). + kTransformSetIntra2, + // All transforms = Total (16). + kTransformSetInter1, + // 2D-DCT and 2D-ADST with flip (9) + Identity (1) + 1D Horizontal/Vertical + // DCT (2) = Total (12). + kTransformSetInter2, + // DCT (1) + Identity (1) = Total (2). + kTransformSetInter3, + kNumTransformSets +}; + +enum TransformClass : uint8_t { + kTransformClass2D, + kTransformClassHorizontal, + kTransformClassVertical, + kNumTransformClasses +}; + +enum FilterIntraPredictor : uint8_t { + kFilterIntraPredictorDc, + kFilterIntraPredictorVertical, + kFilterIntraPredictorHorizontal, + kFilterIntraPredictorD157, + kFilterIntraPredictorPaeth, + kNumFilterIntraPredictors +}; + +enum ObmcDirection : uint8_t { + kObmcDirectionVertical, + kObmcDirectionHorizontal, + kNumObmcDirections +}; + +// In AV1 the name of the filter refers to the direction of filter application. +// Horizontal refers to the column edge and vertical the row edge. +enum LoopFilterType : uint8_t { + kLoopFilterTypeVertical, + kLoopFilterTypeHorizontal, + kNumLoopFilterTypes +}; + +enum LoopFilterTransformSizeId : uint8_t { + kLoopFilterTransformSizeId4x4, + kLoopFilterTransformSizeId8x8, + kLoopFilterTransformSizeId16x16, + kNumLoopFilterTransformSizeIds +}; + +enum LoopRestorationType : uint8_t { + kLoopRestorationTypeNone, + kLoopRestorationTypeSwitchable, + kLoopRestorationTypeWiener, + kLoopRestorationTypeSgrProj, // self guided projection filter. 
+ kNumLoopRestorationTypes +}; + +enum CompoundReferenceType : uint8_t { + kCompoundReferenceUnidirectional, + kCompoundReferenceBidirectional, + kNumCompoundReferenceTypes +}; + +enum CompoundPredictionType : uint8_t { + kCompoundPredictionTypeWedge, + kCompoundPredictionTypeDiffWeighted, + kCompoundPredictionTypeAverage, + kCompoundPredictionTypeIntra, + kCompoundPredictionTypeDistance, + kNumCompoundPredictionTypes, + // Number of compound prediction types that are explicitly signaled in the + // bitstream (in the compound_type syntax element). + kNumExplicitCompoundPredictionTypes = 2 +}; + +enum InterpolationFilter : uint8_t { + kInterpolationFilterEightTap, + kInterpolationFilterEightTapSmooth, + kInterpolationFilterEightTapSharp, + kInterpolationFilterBilinear, + kInterpolationFilterSwitchable, + kNumInterpolationFilters, + // Number of interpolation filters that can be explicitly signaled in the + // compressed headers (when the uncompressed headers allow switchable + // interpolation filters) of the bitstream. + kNumExplicitInterpolationFilters = EnumRangeLength( + kInterpolationFilterEightTap, kInterpolationFilterEightTapSharp) +}; + +enum MvJointType : uint8_t { + kMvJointTypeZero, + kMvJointTypeHorizontalNonZeroVerticalZero, + kMvJointTypeHorizontalZeroVerticalNonZero, + kMvJointTypeNonZero, + kNumMvJointTypes +}; + +enum ObuType : int8_t { + kObuInvalid = -1, + kObuSequenceHeader = 1, + kObuTemporalDelimiter = 2, + kObuFrameHeader = 3, + kObuTileGroup = 4, + kObuMetadata = 5, + kObuFrame = 6, + kObuRedundantFrameHeader = 7, + kObuTileList = 8, + kObuPadding = 15, +}; + +//------------------------------------------------------------------------------ +// ToString() +// +// These functions are meant to be used only in debug logging and within tests. +// They are defined inline to avoid including the strings in the release +// library when logging is disabled; unreferenced functions will not be added to +// any object file in that case. 
+ +inline const char* ToString(const BlockSize size) { + switch (size) { + case kBlock4x4: + return "kBlock4x4"; + case kBlock4x8: + return "kBlock4x8"; + case kBlock4x16: + return "kBlock4x16"; + case kBlock8x4: + return "kBlock8x4"; + case kBlock8x8: + return "kBlock8x8"; + case kBlock8x16: + return "kBlock8x16"; + case kBlock8x32: + return "kBlock8x32"; + case kBlock16x4: + return "kBlock16x4"; + case kBlock16x8: + return "kBlock16x8"; + case kBlock16x16: + return "kBlock16x16"; + case kBlock16x32: + return "kBlock16x32"; + case kBlock16x64: + return "kBlock16x64"; + case kBlock32x8: + return "kBlock32x8"; + case kBlock32x16: + return "kBlock32x16"; + case kBlock32x32: + return "kBlock32x32"; + case kBlock32x64: + return "kBlock32x64"; + case kBlock64x16: + return "kBlock64x16"; + case kBlock64x32: + return "kBlock64x32"; + case kBlock64x64: + return "kBlock64x64"; + case kBlock64x128: + return "kBlock64x128"; + case kBlock128x64: + return "kBlock128x64"; + case kBlock128x128: + return "kBlock128x128"; + case kMaxBlockSizes: + return "kMaxBlockSizes"; + case kBlockInvalid: + return "kBlockInvalid"; + } + abort(); +} + +inline const char* ToString(const InterIntraMode mode) { + switch (mode) { + case kInterIntraModeDc: + return "kInterIntraModeDc"; + case kInterIntraModeVertical: + return "kInterIntraModeVertical"; + case kInterIntraModeHorizontal: + return "kInterIntraModeHorizontal"; + case kInterIntraModeSmooth: + return "kInterIntraModeSmooth"; + case kNumInterIntraModes: + return "kNumInterIntraModes"; + } + abort(); +} + +inline const char* ToString(const ObmcDirection direction) { + switch (direction) { + case kObmcDirectionVertical: + return "kObmcDirectionVertical"; + case kObmcDirectionHorizontal: + return "kObmcDirectionHorizontal"; + case kNumObmcDirections: + return "kNumObmcDirections"; + } + abort(); +} + +inline const char* ToString(const LoopRestorationType type) { + switch (type) { + case kLoopRestorationTypeNone: + return "kLoopRestorationTypeNone"; + case kLoopRestorationTypeSwitchable: + return "kLoopRestorationTypeSwitchable"; + case kLoopRestorationTypeWiener: + return "kLoopRestorationTypeWiener"; + case kLoopRestorationTypeSgrProj: + return "kLoopRestorationTypeSgrProj"; + case kNumLoopRestorationTypes: + return "kNumLoopRestorationTypes"; + } + abort(); +} + +inline const char* ToString(const TransformType type) { + switch (type) { + case kTransformTypeDctDct: + return "kTransformTypeDctDct"; + case kTransformTypeAdstDct: + return "kTransformTypeAdstDct"; + case kTransformTypeDctAdst: + return "kTransformTypeDctAdst"; + case kTransformTypeAdstAdst: + return "kTransformTypeAdstAdst"; + case kTransformTypeFlipadstDct: + return "kTransformTypeFlipadstDct"; + case kTransformTypeDctFlipadst: + return "kTransformTypeDctFlipadst"; + case kTransformTypeFlipadstFlipadst: + return "kTransformTypeFlipadstFlipadst"; + case kTransformTypeAdstFlipadst: + return "kTransformTypeAdstFlipadst"; + case kTransformTypeFlipadstAdst: + return "kTransformTypeFlipadstAdst"; + case kTransformTypeIdentityIdentity: + return "kTransformTypeIdentityIdentity"; + case kTransformTypeIdentityDct: + return "kTransformTypeIdentityDct"; + case kTransformTypeDctIdentity: + return "kTransformTypeDctIdentity"; + case kTransformTypeIdentityAdst: + return "kTransformTypeIdentityAdst"; + case kTransformTypeAdstIdentity: + return "kTransformTypeAdstIdentity"; + case kTransformTypeIdentityFlipadst: + return "kTransformTypeIdentityFlipadst"; + case kTransformTypeFlipadstIdentity: + return 
"kTransformTypeFlipadstIdentity"; + // case to quiet compiler + case kNumTransformTypes: + return "kNumTransformTypes"; + } + abort(); +} + +//------------------------------------------------------------------------------ + +extern const uint8_t k4x4WidthLog2[kMaxBlockSizes]; + +extern const uint8_t k4x4HeightLog2[kMaxBlockSizes]; + +extern const uint8_t kNum4x4BlocksWide[kMaxBlockSizes]; + +extern const uint8_t kNum4x4BlocksHigh[kMaxBlockSizes]; + +extern const uint8_t kBlockWidthPixels[kMaxBlockSizes]; + +extern const uint8_t kBlockHeightPixels[kMaxBlockSizes]; + +extern const BlockSize kSubSize[kMaxPartitionTypes][kMaxBlockSizes]; + +extern const BlockSize kPlaneResidualSize[kMaxBlockSizes][2][2]; + +extern const int16_t kProjectionMvDivisionLookup[kMaxFrameDistance + 1]; + +extern const uint8_t kTransformWidth[kNumTransformSizes]; + +extern const uint8_t kTransformHeight[kNumTransformSizes]; + +extern const uint8_t kTransformWidth4x4[kNumTransformSizes]; + +extern const uint8_t kTransformHeight4x4[kNumTransformSizes]; + +extern const uint8_t kTransformWidthLog2[kNumTransformSizes]; + +extern const uint8_t kTransformHeightLog2[kNumTransformSizes]; + +extern const TransformSize kSplitTransformSize[kNumTransformSizes]; + +// Square transform of size min(w,h). +extern const TransformSize kTransformSizeSquareMin[kNumTransformSizes]; + +// Square transform of size max(w,h). +extern const TransformSize kTransformSizeSquareMax[kNumTransformSizes]; + +extern const uint8_t kNumTransformTypesInSet[kNumTransformSets]; + +extern const uint8_t kSgrProjParams[1 << kSgrProjParamsBits][4]; + +extern const int8_t kSgrProjMultiplierMin[2]; + +extern const int8_t kSgrProjMultiplierMax[2]; + +extern const int8_t kWienerTapsMin[3]; + +extern const int8_t kWienerTapsMax[3]; + +extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts] + [kSuperResFilterTaps]; + +// An int8_t version of the kWarpedFilters array. +// Note: The array could be removed with a performance penalty. +extern const int8_t kWarpedFilters8[3 * kWarpedPixelPrecisionShifts + 1][8]; + +extern const int16_t kWarpedFilters[3 * kWarpedPixelPrecisionShifts + 1][8]; + +extern const int8_t kHalfSubPixelFilters[6][16][8]; + +extern const uint8_t kAbsHalfSubPixelFilters[6][16][8]; + +extern const int16_t kDirectionalIntraPredictorDerivative[44]; + +extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes]; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_CONSTANTS_H_ diff --git a/src/utils/cpu.cc b/src/utils/cpu.cc new file mode 100644 index 0000000..a6b7057 --- /dev/null +++ b/src/utils/cpu.cc @@ -0,0 +1,84 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "src/utils/cpu.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#include <cpuid.h>
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#include <immintrin.h>  // _xgetbv
+#include <intrin.h>
+#endif
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+    defined(_M_X64)
+namespace {
+
+#if defined(__GNUC__)
+void CpuId(int leaf, uint32_t info[4]) {
+  __cpuid_count(leaf, 0 /*ecx=subleaf*/, info[0], info[1], info[2], info[3]);
+}
+
+uint64_t Xgetbv() {
+  const uint32_t ecx = 0;  // ecx specifies the extended control register
+  uint32_t eax;
+  uint32_t edx;
+  __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
+  return (static_cast<uint64_t>(edx) << 32) | eax;
+}
+#else  // _MSC_VER
+void CpuId(int leaf, uint32_t info[4]) {
+  __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
+}
+
+uint64_t Xgetbv() { return _xgetbv(0); }
+#endif  // __GNUC__
+
+}  // namespace
+
+uint32_t GetCpuInfo() {
+  uint32_t info[4];
+
+  // Get the highest feature value cpuid supports
+  CpuId(0, info);
+  const int max_cpuid_value = info[0];
+  if (max_cpuid_value < 1) return 0;
+
+  CpuId(1, info);
+  uint32_t features = 0;
+  if ((info[3] & (1 << 26)) != 0) features |= kSSE2;
+  if ((info[2] & (1 << 9)) != 0) features |= kSSSE3;
+  if ((info[2] & (1 << 19)) != 0) features |= kSSE4_1;
+
+  // Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((info[2] & (3 << 27)) == (3 << 27)) {
+    // XMM state and YMM state enabled by the OS
+    if ((Xgetbv() & 0x6) == 0x6) {
+      features |= kAVX;
+      if (max_cpuid_value >= 7) {
+        CpuId(7, info);
+        if ((info[1] & (1 << 5)) != 0) features |= kAVX2;
+      }
+    }
+  }
+
+  return features;
+}
+#else
+uint32_t GetCpuInfo() { return 0; }
+#endif  // x86 || x86_64
+
+}  // namespace libgav1
diff --git a/src/utils/cpu.h b/src/utils/cpu.h
new file mode 100644
index 0000000..630b251
--- /dev/null
+++ b/src/utils/cpu.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_CPU_H_
+#define LIBGAV1_SRC_UTILS_CPU_H_
+
+#include <cstdint>
+
+namespace libgav1 {
+
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
+#define LIBGAV1_X86_MSVC
+#endif
+
+#if defined(LIBGAV1_X86)
+
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif  // !defined(LIBGAV1_ENABLE_AVX2)
+#else  // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+#else  // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif  // LIBGAV1_X86
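+
+// Illustrative note (editorial, not part of the upstream header): the
+// LIBGAV1_ENABLE_* switches above are compile-time controls. A build may
+// force one off, e.g. with -DLIBGAV1_ENABLE_AVX2=0, and a dsp source file
+// would then guard its SIMD variants along these lines (SomeFunction_AVX2 is
+// a hypothetical name):
+//
+//   #if LIBGAV1_ENABLE_AVX2
+//   void SomeFunction_AVX2();  // compiled only when AVX2 support is enabled
+//   #endif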
+
+// For x86 LIBGAV1_TARGETING_* indicate the source being built is targeting
+// (at least) that instruction set. This prevents disabling other instruction
+// sets if the current instruction set isn't a global target, e.g., building
+// *_avx2.cc w/-mavx2, but the remaining files without the flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but
+// there is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will
+// be enabled in dsp.h to compensate for this.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
+
+#if !defined(LIBGAV1_ENABLE_NEON)
+// TODO(jzern): add support for _M_ARM64.
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENABLE_NEON 0
+#endif
+#endif  // !defined(LIBGAV1_ENABLE_NEON)
+
+enum CpuFeatures : uint8_t {
+  kSSE2 = 1 << 0,
+#define LIBGAV1_CPU_SSE2 (1 << 0)
+  kSSSE3 = 1 << 1,
+#define LIBGAV1_CPU_SSSE3 (1 << 1)
+  kSSE4_1 = 1 << 2,
+#define LIBGAV1_CPU_SSE4_1 (1 << 2)
+  kAVX = 1 << 3,
+#define LIBGAV1_CPU_AVX (1 << 3)
+  kAVX2 = 1 << 4,
+#define LIBGAV1_CPU_AVX2 (1 << 4)
+  kNEON = 1 << 5,
+#define LIBGAV1_CPU_NEON (1 << 5)
+};
+
+// Returns a bit-wise OR of CpuFeatures supported by this platform.
+uint32_t GetCpuInfo();
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_CPU_H_
diff --git a/src/utils/dynamic_buffer.h b/src/utils/dynamic_buffer.h
new file mode 100644
index 0000000..b51345a
--- /dev/null
+++ b/src/utils/dynamic_buffer.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+#define LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
+
+#include <cstddef>
+#include <memory>
+
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+template <typename T>
+class DynamicBuffer {
+ public:
+  T* get() { return buffer_.get(); }
+  const T* get() const { return buffer_.get(); }
+
+  // Resizes the buffer so that it can hold at least |size| elements. Existing
+  // contents will be destroyed when resizing to a larger size.
+  //
+  // Returns true on success. If Resize() returns false, then subsequent calls
+  // to get() will return nullptr.
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_.reset(new (std::nothrow) T[size]);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+ private:
+  std::unique_ptr<T[]> buffer_;
+  size_t size_ = 0;
+};
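+
+// Usage sketch (editorial, not part of the upstream header): Resize()
+// signals allocation failure by returning false, so callers should check the
+// result before dereferencing get(). Note that growing the buffer discards
+// the old contents.
+//
+//   DynamicBuffer<int16_t> residuals;
+//   if (!residuals.Resize(4096)) return false;  // allocation failed
+//   int16_t* data = residuals.get();  // valid for at least 4096 elements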
+
+template <typename T, int alignment>
+class AlignedDynamicBuffer {
+ public:
+  T* get() { return buffer_.get(); }
+
+  // Resizes the buffer so that it can hold at least |size| elements. Existing
+  // contents will be destroyed when resizing to a larger size.
+  //
+  // Returns true on success. If Resize() returns false, then subsequent calls
+  // to get() will return nullptr.
+  bool Resize(size_t size) {
+    if (size <= size_) return true;
+    buffer_ = MakeAlignedUniquePtr<T>(alignment, size);
+    if (buffer_ == nullptr) {
+      size_ = 0;
+      return false;
+    }
+    size_ = size;
+    return true;
+  }
+
+ private:
+  AlignedUniquePtr<T> buffer_;
+  size_t size_ = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_DYNAMIC_BUFFER_H_
diff --git a/src/utils/entropy_decoder.cc b/src/utils/entropy_decoder.cc
new file mode 100644
index 0000000..bf21199
--- /dev/null
+++ b/src/utils/entropy_decoder.cc
@@ -0,0 +1,1117 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/entropy_decoder.h"
+
+#include <cassert>
+#include <cstring>
+
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
+
+#if defined(__ARM_NEON__) || defined(__aarch64__) || \
+    (defined(_MSC_VER) && defined(_M_ARM))
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
+#else
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
+#endif
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
+#endif
+
+namespace libgav1 {
+namespace {
+
+constexpr uint32_t kReadBitMask = ~255;
+constexpr int kCdfPrecision = 6;
+constexpr int kMinimumProbabilityPerSymbol = 4;
+
+// This function computes the "cur" variable as specified inside the do-while
+// loop in Section 8.2.6 of the spec. This function is monotonically
+// decreasing as the value of |index| increases (note that the |cdf| array is
+// sorted in decreasing order).
+uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
+                  int index, int symbol_count) {
+  return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
+         (kMinimumProbabilityPerSymbol * (symbol_count - index));
+}
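[Editorial aside, not part of the imported sources: a small numeric check of the expression ScaleCdf() evaluates, restating kCdfPrecision = 6 and kMinimumProbabilityPerSymbol = 4 from above. It verifies the monotonic-decrease property that the decoding loops later in this file rely on. The toy CDF values are illustrative only.]

// Illustrative only: same expression as ScaleCdf() for a 3-symbol CDF.
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t cdf[4] = {16384, 8192, 0, 0};  // 15-bit, sorted decreasing.
  const uint32_t values_in_range = 65535;
  const uint32_t shifted = values_in_range >> 8;
  uint32_t prev = ~0u;
  for (int index = 0; index < 3; ++index) {
    const uint32_t cur =
        ((shifted * (cdf[index] >> 6)) >> 1) + 4 * (3 - index);
    assert(cur < prev);  // scaled values strictly decrease with the index
    prev = cur;
  }
  return 0;
}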
+
+void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
+  const uint16_t count = cdf[symbol_count];
+  // rate is computed in the spec as:
+  //   3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+  // In this case cdf[N] is |count|.
+  // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
+  // symbol_count > 3. So the equation becomes:
+  //   4 + (count > 15) + (count > 31) + (symbol_count > 3).
+  // Note that the largest value for count is 32 (it is not incremented beyond
+  // 32). So using that information:
+  //   count >> 4 is 0 for count from 0 to 15.
+  //   count >> 4 is 1 for count from 16 to 31.
+  //   count >> 4 is 2 for count == 32.
+  // Now, the equation becomes:
+  //   4 + (count >> 4) + (symbol_count > 3).
+  // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+  // with bitwise or:
+  //   (4 | (count >> 4)) + (symbol_count > 3),
+  // but using addition will allow the compiler to eliminate an operation when
+  // symbol_count is known and this function is inlined.
+  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
+  // Hints for further optimizations:
+  //
+  // 1. clang can vectorize this for loop with width 4, even though the loop
+  // contains an if-else statement. Therefore, it may be advantageous to use
+  // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
+  // (a multiple of 4 that's not too small).
+  //
+  // 2. The for loop can be rewritten in the following form, which would enable
+  // clang to vectorize the loop with width 8:
+  //
+  //   const int rounding = (1 << rate) - 1;
+  //   for (int i = 0; i < symbol_count - 1; ++i) {
+  //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+  //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+  //   }
+  //
+  // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
+  // integer arithmetic. The result of the unsigned subtraction is cast to a
+  // signed integer and right-shifted. This requires the right shift of a
+  // signed integer to be an arithmetic shift, which is true for clang, gcc,
+  // and Visual C++.
+  assert(symbol_count - 1 > 0);
+  int i = 0;
+  do {
+    if (i < symbol) {
+      cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+    } else {
+      cdf[i] -= cdf[i] >> rate;
+    }
+  } while (++i < symbol_count - 1);
+  cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+}
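[Editorial aside, not part of the imported sources: a self-contained cross-check of the branchless rewrite described in the comment above against the branchy update in UpdateCdf(). The CDF values and rate are illustrative; kCdfMaxProbability is 1 << 15 = 32768, per the initialization comment later in this file. Like the rewrite itself, the check assumes arithmetic right shift of negative signed values.]

// Illustrative only: both update forms must agree for the SIMD code below
// to be valid.
#include <cassert>
#include <cstdint>

int main() {
  for (int symbol = 0; symbol < 4; ++symbol) {
    uint16_t a_cdf[4] = {26214, 19661, 13107, 0};
    uint16_t b_cdf[4] = {26214, 19661, 13107, 0};
    const int rate = 5;
    const int rounding = (1 << rate) - 1;
    for (int i = 0; i < 3; ++i) {
      // Branchy form, as in UpdateCdf above.
      if (i < symbol) {
        a_cdf[i] += (32768 - a_cdf[i]) >> rate;
      } else {
        a_cdf[i] -= a_cdf[i] >> rate;
      }
      // Branchless form from the comment; relies on unsigned rollover
      // followed by an arithmetic shift.
      const uint16_t a = (i < symbol) ? 32768 : rounding;
      b_cdf[i] += static_cast<int16_t>(a - b_cdf[i]) >> rate;
    }
    for (int i = 0; i < 3; ++i) assert(a_cdf[i] == b_cdf[i]);
  }
  return 0;
}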
+
+// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
+// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
+// SIMD instruction sets if available.
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+// The UpdateCdf() method contains the following for loop:
+//
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     if (i < symbol) {
+//       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
+//     } else {
+//       cdf[i] -= cdf[i] >> rate;
+//     }
+//   }
+//
+// It can be rewritten in the following two forms, which are amenable to SIMD
+// implementations:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
+//     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
+//   }
+//
+// or:
+//
+//   const int rounding = (1 << rate) - 1;
+//   for (int i = 0; i < symbol_count - 1; ++i) {
+//     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
+//     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
+//   }
+//
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
+//
+// The cdf array has symbol_count + 1 elements. The first symbol_count elements
+// are the CDF. The last element is a count that is initialized to 0 and may
+// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
+// cdf[symbol_count - 1] is always 0, the for loop does not update
+// cdf[symbol_count - 1]. However, it would be correct to have the for loop
+// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
+// for loop would take the else branch when i is symbol_count - 1:
+//   cdf[i] -= cdf[i] >> rate;
+// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
+// after the update. The ARM NEON implementations take advantage of this in the
+// following two cases:
+// 1. When symbol_count is 8 or 16, the vectorized code updates the first
+//    symbol_count elements in the array.
+// 2. When symbol_count is 7, the vectorized code updates all 8 elements in
+//    the cdf array. Since an invalid CDF value is written into cdf[7], the
+//    count in cdf[7] needs to be fixed up after the vectorized code.
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  uint16x4_t cdf_vec = vld1_u16(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+  const uint16x4_t index = vcreate_u16(0x0003000200010000);
+  const uint16x4_t symbol_vec = vdup_n_u16(symbol);
+  const uint16x4_t mask = vcge_u16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
+  const int16x4_t negative_rate = vdup_n_s16(-rate);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = vadd_u16(cdf_offset, delta);
+  vst1_u16(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                        vcreate_u16(0x0007000600050004));
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+  const uint16x8_t delta =
+      vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) { + uint16x8_t cdf_vec = vld1q_u16(cdf + 2); + const uint16_t count = cdf[11]; + cdf[11] = count + static_cast(count < 32); + const int rate = (count >> 4) + 5; + if (symbol > 1) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability); + const uint16x8_t symbol_vec = vdupq_n_u16(symbol); + const int16x8_t negative_rate = vdupq_n_s16(-rate); + const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002), + vcreate_u16(0x0009000800070006)); + const uint16x8_t mask = vcgeq_u16(index, symbol_vec); + const uint16x8_t a = vorrq_u16(mask, cdf_max_probability); + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec)); + const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask); + const uint16x8_t delta = + vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf + 2, cdf_vec); + } else { + if (symbol != 0) { + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] -= cdf[1] >> rate; + } else { + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + } + const int16x8_t negative_rate = vdupq_n_s16(-rate); + const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate); + cdf_vec = vsubq_u16(cdf_vec, delta); + vst1q_u16(cdf + 2, cdf_vec); + } +} + +// See UpdateCdf5 for implementation details. +void UpdateCdf13(uint16_t* const cdf, const int symbol) { + uint16x8_t cdf_vec0 = vld1q_u16(cdf); + uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4); + const uint16_t count = cdf[13]; + const int rate = (count >> 4) + 5; + const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability); + const uint16x8_t symbol_vec = vdupq_n_u16(symbol); + const int16x8_t negative_rate = vdupq_n_s16(-rate); + + uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)); + uint16x8_t mask = vcgeq_u16(index, symbol_vec); + uint16x8_t a = vorrq_u16(mask, cdf_max_probability); + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0)); + uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask); + uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec0 = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf, cdf_vec0); + + index = vcombine_u16(vcreate_u16(0x0007000600050004), + vcreate_u16(0x000b000a00090008)); + mask = vcgeq_u16(index, symbol_vec); + a = vorrq_u16(mask, cdf_max_probability); + diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1)); + cdf_offset = vsubq_u16(cdf_vec1, mask); + delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate)); + cdf_vec1 = vaddq_u16(cdf_offset, delta); + vst1q_u16(cdf + 4, cdf_vec1); + + cdf[13] = count + static_cast(count < 32); +} + +// See UpdateCdf5 for implementation details. 
+void UpdateCdf16(uint16_t* const cdf, const int symbol) {
+  uint16x8_t cdf_vec = vld1q_u16(cdf);
+  const uint16_t count = cdf[16];
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
+  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
+  const int16x8_t negative_rate = vdupq_n_s16(-rate);
+
+  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
+                                  vcreate_u16(0x0007000600050004));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
+  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf, cdf_vec);
+
+  cdf_vec = vld1q_u16(cdf + 8);
+  index = vcombine_u16(vcreate_u16(0x000b000a00090008),
+                       vcreate_u16(0x000f000e000d000c));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  cdf_offset = vsubq_u16(cdf_vec, mask);
+  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
+  cdf_vec = vaddq_u16(cdf_offset, delta);
+  vst1q_u16(cdf + 8, cdf_vec);
+
+  cdf[16] = count + static_cast<uint16_t>(count < 32);
+}
+
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+void UpdateCdf5(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec = LoadLo8(cdf);
+  const uint16_t count = cdf[5];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
+  // i >= symbol.
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec);
+  cdf[5] = count + static_cast<uint16_t>(count < 32);
+}
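[Editorial aside, not part of the imported sources: a tiny check of the SSE2 shift idiom used throughout these updates. _mm_sra_epi16 with a count loaded via _mm_cvtsi32_si128 performs a per-lane arithmetic shift by a runtime amount, which is what makes the mask trick work on the negative differences. x86-with-SSE2 only; the constants are illustrative.]

// Illustrative only.
#include <cassert>
#include <cstdint>
#include <emmintrin.h>

int main() {
  const __m128i v = _mm_set1_epi16(-13076);
  const __m128i shifted = _mm_sra_epi16(v, _mm_cvtsi32_si128(5));
  alignas(16) int16_t out[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), shifted);
  assert(out[0] == -409);  // arithmetic shift: rounds toward -infinity
  return 0;
}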
+
+// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
+template <int symbol_count>
+void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
+  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
+  __m128i cdf_vec = LoadUnaligned16(cdf);
+  const uint16_t count = cdf[symbol_count];
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i index =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
+  StoreUnaligned16(cdf, cdf_vec);
+  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
+}
+
+void UpdateCdf7(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<7>(cdf, symbol);
+}
+
+void UpdateCdf8(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<8>(cdf, symbol);
+}
+
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf11(uint16_t* const cdf, const int symbol) {
+  __m128i cdf_vec = LoadUnaligned16(cdf + 2);
+  const uint16_t count = cdf[11];
+  cdf[11] = count + static_cast<uint16_t>(count < 32);
+  const int rate = (count >> 4) + 5;
+  if (symbol > 1) {
+    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    const __m128i cdf_max_probability =
+        _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+    const __m128i index =
+        _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+    const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+    const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+    const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+    const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+    const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+    const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+    cdf_vec = _mm_add_epi16(cdf_offset, delta);
+    StoreUnaligned16(cdf + 2, cdf_vec);
+  } else {
+    if (symbol != 0) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+      cdf[1] -= cdf[1] >> rate;
+    }
+    const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+    cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+    StoreUnaligned16(cdf + 2, cdf_vec);
+  }
+}
+
+// See UpdateCdf5 for implementation details.
+void UpdateCdf13(uint16_t* const cdf, const int symbol) { + __m128i cdf_vec0 = LoadLo8(cdf); + __m128i cdf_vec1 = LoadUnaligned16(cdf + 4); + const uint16_t count = cdf[13]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + + const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec0); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec0 = _mm_add_epi16(cdf_offset, delta); + StoreLo8(cdf, cdf_vec0); + + const __m128i index1 = + _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005); + const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec); + const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability); + const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1); + const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1); + const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); + cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1); + StoreUnaligned16(cdf + 4, cdf_vec1); + + cdf[13] = count + static_cast(count < 32); +} + +void UpdateCdf16(uint16_t* const cdf, const int symbol) { + __m128i cdf_vec0 = LoadUnaligned16(cdf); + const uint16_t count = cdf[16]; + const int rate = (count >> 4) + 5; + const __m128i cdf_max_probability = + _mm_set1_epi16(static_cast(kCdfMaxProbability)); + const __m128i symbol_vec = _mm_set1_epi16(static_cast(symbol)); + + const __m128i index = + _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001); + const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec); + const __m128i a = _mm_or_si128(mask, cdf_max_probability); + const __m128i diff = _mm_sub_epi16(a, cdf_vec0); + const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec0 = _mm_add_epi16(cdf_offset, delta); + StoreUnaligned16(cdf, cdf_vec0); + + __m128i cdf_vec1 = LoadUnaligned16(cdf + 8); + const __m128i index1 = + _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009); + const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec); + const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability); + const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1); + const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1); + const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate)); + cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1); + StoreUnaligned16(cdf + 8, cdf_vec1); + + cdf[16] = count + static_cast(count < 32); +} + +#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + +void UpdateCdf5(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 5, symbol); +} + +void UpdateCdf7(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 7, symbol); +} + +void UpdateCdf8(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 8, symbol); +} + +void UpdateCdf9(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 9, symbol); +} + +void UpdateCdf11(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 11, symbol); +} + +void UpdateCdf13(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 13, symbol); +} + +void UpdateCdf16(uint16_t* const cdf, const int symbol) { + UpdateCdf(cdf, 16, symbol); +} + +#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 +#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON + +inline 
DaalaBitReader::WindowSize HostToBigEndian(
+    const DaalaBitReader::WindowSize x) {
+  static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+  return x;
+#endif
+#elif defined(_WIN32)
+  // Note Windows targets are assumed to be little endian.
+  return static_cast<DaalaBitReader::WindowSize>(
+      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<uint64_t>(x))
+                       : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif  // defined(__GNUC__)
+}
+
+}  // namespace
+
+#if !LIBGAV1_CXX17
+constexpr int DaalaBitReader::kWindowSize;  // static.
+#endif
+
+DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
+                               bool allow_update_cdf)
+    : data_(data),
+      data_end_(data + size),
+      data_memcpy_end_((size >= sizeof(WindowSize))
+                           ? data + size - sizeof(WindowSize) + 1
+                           : data),
+      allow_update_cdf_(allow_update_cdf),
+      values_in_range_(kCdfMaxProbability) {
+  if (data_ < data_memcpy_end_) {
+    // This is a simplified version of PopulateBits() which loads 8 extra bits
+    // and skips the unnecessary shifts of value and window_diff_.
+    WindowSize value;
+    memcpy(&value, data_, sizeof(value));
+    data_ += sizeof(value);
+    window_diff_ = HostToBigEndian(value) ^ -1;
+    // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+    // used to restore the most significant 0 bit that would be present after
+    // PopulateBits() when we extract the first symbol value.
+    // As shown in Section 8.2.2 Initialization process for symbol decoder,
+    // which uses a fixed offset to read the symbol values, the most
+    // significant bit is always 0:
+    //   The variable numBits is set equal to Min( sz * 8, 15).
+    //   The variable buf is read using the f(numBits) parsing process.
+    //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+    //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+    bits_ = kWindowSize - 15;
+    return;
+  }
+  window_diff_ = 0;
+  bits_ = -15;
+  PopulateBits();
+}
+
+// This is similar to the ReadSymbol() implementation but it is optimized based
+// on the following facts:
+// * The probability is fixed at half. So some multiplications can be replaced
+//   with bit operations.
+// * Symbol count is fixed at 2.
+int DaalaBitReader::ReadBit() {
+  const uint32_t curr =
+      ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  int bit = 1;
+  if (symbol_value >= curr) {
+    values_in_range_ -= curr;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
+    bit = 0;
+  } else {
+    values_in_range_ = curr;
+  }
+  NormalizeRange();
+  return bit;
+}
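[Editorial aside, not part of the imported sources: ReadLiteral() below assembles |num_bits| bits MSB-first using a shift-then-OR loop. The sketch checks, with a stand-in bit source instead of a real DaalaBitReader, that this is equivalent to OR-ing each bit at its target position, which is the form mentioned in the function's comment.]

// Illustrative only.
#include <cassert>
#include <cstdint>

int main() {
  const int bits[4] = {1, 0, 1, 1};  // pretend ReadBit() returned these
  uint32_t shift_or = 0;
  uint32_t direct = 0;
  int bit = 3;  // num_bits - 1
  for (int i = 0; i < 4; ++i, --bit) {
    shift_or = (shift_or << 1) | static_cast<uint32_t>(bits[i]);
    direct |= static_cast<uint32_t>(bits[i]) << bit;
  }
  assert(shift_or == direct && shift_or == 0xB);  // 1011 binary
  return 0;
}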
+
+int64_t DaalaBitReader::ReadLiteral(int num_bits) {
+  assert(num_bits <= 32);
+  assert(num_bits > 0);
+  uint32_t literal = 0;
+  int bit = num_bits - 1;
+  do {
+    // ARM can combine a shift operation with a constant number of bits with
+    // some other operations, such as the OR operation.
+    // Here is an ARM disassembly example:
+    //   orr w1, w0, w1, lsl #1
+    // which left shifts register w1 by 1 bit and ORs the shift result with
+    // register w0.
+    // The next 2 lines are equivalent to:
+    //   literal |= static_cast<uint32_t>(ReadBit()) << bit;
+    literal <<= 1;
+    literal |= static_cast<uint32_t>(ReadBit());
+  } while (--bit >= 0);
+  return literal;
+}
+
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) {
+  const int symbol = ReadSymbolImpl(cdf, symbol_count);
+  if (allow_update_cdf_) {
+    UpdateCdf(cdf, symbol_count, symbol);
+  }
+  return symbol;
+}
+
+bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
+  assert(cdf[1] == 0);
+  const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
+  if (allow_update_cdf_) {
+    const uint16_t count = cdf[2];
+    // rate is computed in the spec as:
+    //   3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+    // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
+    //   4 + (count > 15) + (count > 31)
+    // Note that the largest value for count is 32 (it is not incremented
+    // beyond 32). So using that information:
+    //   count >> 4 is 0 for count from 0 to 15.
+    //   count >> 4 is 1 for count from 16 to 31.
+    //   count >> 4 is 2 for count == 32.
+    // Now, the equation becomes:
+    //   4 + (count >> 4).
+    // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
+    // with bitwise or. So the final equation is:
+    //   4 | (count >> 4).
+    const int rate = 4 | (count >> 4);
+    if (symbol) {
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+    } else {
+      cdf[0] -= cdf[0] >> rate;
+    }
+    cdf[2] += static_cast<uint16_t>(count < 32);
+  }
+  return symbol;
+}
+
+bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
+  return ReadSymbolImpl(cdf) != 0;
+}
+
+template <int symbol_count>
+int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
+  static_assert(symbol_count >= 3 && symbol_count <= 16, "");
+  if (symbol_count == 3 || symbol_count == 4) {
+    return ReadSymbol3Or4(cdf, symbol_count);
+  }
+  int symbol;
+  if (symbol_count == 8) {
+    symbol = ReadSymbolImpl8(cdf);
+  } else if (symbol_count <= 13) {
+    symbol = ReadSymbolImpl(cdf, symbol_count);
+  } else {
+    symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
+  }
+  if (allow_update_cdf_) {
+    if (symbol_count == 5) {
+      UpdateCdf5(cdf, symbol);
+    } else if (symbol_count == 7) {
+      UpdateCdf7(cdf, symbol);
+    } else if (symbol_count == 8) {
+      UpdateCdf8(cdf, symbol);
+    } else if (symbol_count == 9) {
+      UpdateCdf9(cdf, symbol);
+    } else if (symbol_count == 11) {
+      UpdateCdf11(cdf, symbol);
+    } else if (symbol_count == 13) {
+      UpdateCdf13(cdf, symbol);
+    } else if (symbol_count == 16) {
+      UpdateCdf16(cdf, symbol);
+    } else {
+      UpdateCdf(cdf, symbol_count, symbol);
+    }
+  }
+  return symbol;
+}
+
+int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf,
+                                   int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
+  --symbol_count;
+  uint32_t curr = values_in_range_;
+  int symbol = -1;
+  uint32_t prev;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
+  // Search through the |cdf| array to determine where the scaled cdf value and
+  // |symbol_value| cross over.
+ do { + prev = curr; + curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) + + delta; + delta -= kMinimumProbabilityPerSymbol; + } while (symbol_value < curr); + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return symbol; +} + +int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf, + int symbol_count) { + assert(cdf[symbol_count - 1] == 0); + assert(symbol_count > 1 && symbol_count <= 16); + --symbol_count; + const auto symbol_value = static_cast(window_diff_ >> bits_); + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. Since the CDFs are sorted, we can use binary + // search to do this. Let |symbol| be the index of the first |cdf| array + // entry whose scaled cdf value is less than or equal to |symbol_value|. The + // binary search maintains the invariant: + // low <= symbol <= high + 1 + // and terminates when low == high + 1. + int low = 0; + int high = symbol_count - 1; + // The binary search maintains the invariants that |prev| is the scaled cdf + // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By + // convention, the scaled cdf value for -1 is values_in_range_.) When the + // binary search terminates, |prev| is the scaled cdf value for symbol - 1 + // and |curr| is the scaled cdf value for |symbol|. + uint32_t prev = values_in_range_; + uint32_t curr = 0; + const uint32_t values_in_range_shifted = values_in_range_ >> 8; + do { + const int mid = DivideBy2(low + high); + const uint32_t scaled_cdf = + ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count); + if (symbol_value < scaled_cdf) { + low = mid + 1; + prev = scaled_cdf; + } else { + high = mid - 1; + curr = scaled_cdf; + } + } while (low <= high); + assert(low == high + 1); + // At this point, |low| is the symbol that has been decoded. + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return low; +} + +int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) { + const auto symbol_value = static_cast(window_diff_ >> bits_); + const uint32_t curr = + (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) + + kMinimumProbabilityPerSymbol; + const int symbol = static_cast(symbol_value < curr); + if (symbol == 1) { + values_in_range_ = curr; + } else { + values_in_range_ -= curr; + window_diff_ -= static_cast(curr) << bits_; + } + NormalizeRange(); + return symbol; +} + +// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf +// calls inlined. +int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf, + const int symbol_count) { + assert(cdf[symbol_count - 1] == 0); + uint32_t curr = values_in_range_; + uint32_t prev; + const auto symbol_value = static_cast(window_diff_ >> bits_); + uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1); + const uint32_t values_in_range_shifted = values_in_range_ >> 8; + + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf| + // array. 
+ // + // The original code is: + // + // int symbol = -1; + // do { + // prev = curr; + // curr = + // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1) + // + delta; + // delta -= kMinimumProbabilityPerSymbol; + // } while (symbol_value < curr); + // if (allow_update_cdf_) { + // UpdateCdf(cdf, [3,4], symbol); + // } + // + // The do-while loop is unrolled with three or four iterations, and the + // UpdateCdf call is inlined and merged into the iterations. + int symbol = 0; + // Iteration 0. + prev = curr; + curr = + ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; + if (symbol_value >= curr) { + // symbol == 0. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0). + const uint16_t count = cdf[symbol_count]; + cdf[symbol_count] += static_cast(count < 32); + const int rate = (count >> 4) + 4 + static_cast(symbol_count == 4); + if (symbol_count == 4) { +#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON + // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM + // NEON code is slower. Consider using the C version if __arm__ is + // defined. + // 2. The ARM NEON code (compiled for arm64) is slightly slower on + // Samsung Galaxy S8+ (SM-G955FD). + uint16x4_t cdf_vec = vld1_u16(cdf); + const int16x4_t negative_rate = vdup_n_s16(-rate); + const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate); + cdf_vec = vsub_u16(cdf_vec, delta); + vst1_u16(cdf, cdf_vec); +#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + __m128i cdf_vec = LoadLo8(cdf); + const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_sub_epi16(cdf_vec, delta); + StoreLo8(cdf, cdf_vec); +#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + cdf[2] -= cdf[2] >> rate; +#endif + } else { // symbol_count == 3. + cdf[0] -= cdf[0] >> rate; + cdf[1] -= cdf[1] >> rate; + } + } + goto found; + } + ++symbol; + delta -= kMinimumProbabilityPerSymbol; + // Iteration 1. + prev = curr; + curr = + ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; + if (symbol_value >= curr) { + // symbol == 1. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1). + const uint16_t count = cdf[symbol_count]; + cdf[symbol_count] += static_cast(count < 32); + const int rate = (count >> 4) + 4 + static_cast(symbol_count == 4); + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] -= cdf[1] >> rate; + if (symbol_count == 4) cdf[2] -= cdf[2] >> rate; + } + goto found; + } + ++symbol; + if (symbol_count == 4) { + delta -= kMinimumProbabilityPerSymbol; + // Iteration 2. + prev = curr; + curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + + delta; + if (symbol_value >= curr) { + // symbol == 2. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2). + const uint16_t count = cdf[4]; + cdf[4] += static_cast(count < 32); + const int rate = (count >> 4) + 5; + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + cdf[2] -= cdf[2] >> rate; + } + goto found; + } + ++symbol; + } + // |delta| is 0 for the last iteration. + // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4). + prev = curr; + // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0. + curr = 0; + // symbol == [2,3]. + if (allow_update_cdf_) { + // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]). 
+ const uint16_t count = cdf[symbol_count]; + cdf[symbol_count] += static_cast(count < 32); + const int rate = (4 | (count >> 4)) + static_cast(symbol_count == 4); + if (symbol_count == 4) { +#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON + // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON + // code is a tiny bit slower. Consider using the C version if __arm__ is + // defined. + uint16x4_t cdf_vec = vld1_u16(cdf); + const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability); + const int16x4_t diff = + vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec)); + const int16x4_t negative_rate = vdup_n_s16(-rate); + const uint16x4_t delta = + vreinterpret_u16_s16(vshl_s16(diff, negative_rate)); + cdf_vec = vadd_u16(cdf_vec, delta); + vst1_u16(cdf, cdf_vec); + cdf[3] = 0; +#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + __m128i cdf_vec = LoadLo8(cdf); + const __m128i cdf_max_probability = + _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0); + const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec); + const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate)); + cdf_vec = _mm_add_epi16(cdf_vec, delta); + StoreLo8(cdf, cdf_vec); + cdf[3] = 0; +#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate; +#endif + } else { // symbol_count == 3. + cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate; + cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate; + } + } +found: + // End of unrolled do-while loop. + + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return symbol; +} + +int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) { + assert(cdf[7] == 0); + uint32_t curr = values_in_range_; + uint32_t prev; + const auto symbol_value = static_cast(window_diff_ >> bits_); + uint32_t delta = kMinimumProbabilityPerSymbol * 7; + // Search through the |cdf| array to determine where the scaled cdf value and + // |symbol_value| cross over. + // + // The original code is: + // + // int symbol = -1; + // do { + // prev = curr; + // curr = + // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) + // + delta; + // delta -= kMinimumProbabilityPerSymbol; + // } while (symbol_value < curr); + // + // The do-while loop is unrolled with eight iterations. + int symbol = 0; + +#define READ_SYMBOL_ITERATION \ + prev = curr; \ + curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \ + delta; \ + if (symbol_value >= curr) goto found; \ + ++symbol; \ + delta -= kMinimumProbabilityPerSymbol + + READ_SYMBOL_ITERATION; // Iteration 0. + READ_SYMBOL_ITERATION; // Iteration 1. + READ_SYMBOL_ITERATION; // Iteration 2. + READ_SYMBOL_ITERATION; // Iteration 3. + READ_SYMBOL_ITERATION; // Iteration 4. + READ_SYMBOL_ITERATION; // Iteration 5. + + // The last two iterations can be simplified, so they don't use the + // READ_SYMBOL_ITERATION macro. +#undef READ_SYMBOL_ITERATION + + // Iteration 6. + prev = curr; + curr = + (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta; + if (symbol_value >= curr) goto found; // symbol == 6. + ++symbol; + // |delta| is 0 for the last iteration. + // Iteration 7. + prev = curr; + // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0. + curr = 0; + // symbol == 7. +found: + // End of unrolled do-while loop. 
+ + values_in_range_ = prev - curr; + window_diff_ -= static_cast(curr) << bits_; + NormalizeRange(); + return symbol; +} + +void DaalaBitReader::PopulateBits() { + constexpr int kMaxCachedBits = kWindowSize - 16; +#if defined(__aarch64__) + // Fast path: read eight bytes and add the first six bytes to window_diff_. + // This fast path makes the following assumptions. + // 1. We assume that unaligned load of uint64_t is fast. + // 2. When there are enough bytes in data_, the for loop below reads 6 or 7 + // bytes depending on the value of bits_. This fast path always reads 6 + // bytes, which results in more calls to PopulateBits(). We assume that + // making more calls to a faster PopulateBits() is overall a win. + // NOTE: Although this fast path could also be used on x86_64, it hurts + // performance (measured on Lenovo ThinkStation P920 running Linux). (The + // reason is still unknown.) Therefore this fast path is only used on arm64. + static_assert(kWindowSize == 64, ""); + if (data_ < data_memcpy_end_) { + uint64_t value; + // arm64 supports unaligned loads, so this memcpy call is compiled to a + // single ldr instruction. + memcpy(&value, data_, sizeof(value)); + data_ += kMaxCachedBits >> 3; + value = HostToBigEndian(value) ^ -1; + value >>= kWindowSize - kMaxCachedBits; + window_diff_ = value | (window_diff_ << kMaxCachedBits); + bits_ += kMaxCachedBits; + return; + } +#endif + + const uint8_t* data = data_; + int bits = bits_; + WindowSize window_diff = window_diff_; + + int count = kWindowSize - 9 - (bits + 15); + // The fast path above, if compiled, would cause clang 8.0.7 to vectorize + // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7 + // iterations when WindowSize is 64 bits. So it is not profitable to + // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if + // the fast path above is not compiled. + +#ifdef __clang__ +#pragma clang loop vectorize(disable) interleave(disable) +#endif + for (; count >= 0 && data < data_end_; count -= 8) { + const uint8_t value = *data++ ^ -1; + window_diff = static_cast(value) | (window_diff << 8); + bits += 8; + } + assert(bits <= kMaxCachedBits); + if (data == data_end_) { + // Shift in some 1s. This is equivalent to providing fake 0 data bits. + window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1; + bits = kMaxCachedBits; + } + + data_ = data; + bits_ = bits; + window_diff_ = window_diff; +} + +void DaalaBitReader::NormalizeRange() { + const int bits_used = 15 ^ FloorLog2(values_in_range_); + bits_ -= bits_used; + values_in_range_ <<= bits_used; + if (bits_ < 0) PopulateBits(); +} + +// Explicit instantiations. 
+template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
+
+}  // namespace libgav1
diff --git a/src/utils/entropy_decoder.h b/src/utils/entropy_decoder.h
new file mode 100644
index 0000000..c066b98
--- /dev/null
+++ b/src/utils/entropy_decoder.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+#define LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "src/utils/bit_reader.h"
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+
+class DaalaBitReader : public BitReader {
+ public:
+  // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+  // largest type with fast arithmetic. size_t should meet these requirements.
+  using WindowSize = size_t;
+
+  DaalaBitReader(const uint8_t* data, size_t size, bool allow_update_cdf);
+  ~DaalaBitReader() override = default;
+
+  // Move only.
+  DaalaBitReader(DaalaBitReader&& rhs) noexcept;
+  DaalaBitReader& operator=(DaalaBitReader&& rhs) noexcept;
+
+  int ReadBit() final;
+  int64_t ReadLiteral(int num_bits) override;
+  // ReadSymbol() calls for which the |symbol_count| is only known at runtime
+  // will use this variant.
+  int ReadSymbol(uint16_t* cdf, int symbol_count);
+  // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
+  // symbols) will use this variant.
+  bool ReadSymbol(uint16_t* cdf);
+  bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
+  // Uses either linear search or binary search for decoding the symbol,
+  // depending on |symbol_count|. ReadSymbol() calls for which the
+  // |symbol_count| is known at compile time will use this variant.
+  template <int symbol_count>
+  int ReadSymbol(uint16_t* cdf);
+
+ private:
+  static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
+  static_assert(kWindowSize >= 32, "");
+
+  // Reads a symbol using the |cdf| table which contains the probabilities of
+  // each symbol. On a high level, this function does the following:
+  // 1) Scale the |cdf| values.
+  // 2) Find the index in the |cdf| array where the scaled CDF value crosses
+  //    the modified |window_diff_| threshold.
+  // 3) That index is the symbol that has been decoded.
+ // 4) Update |window_diff_| and |values_in_range_| based on the symbol that + // has been decoded. + inline int ReadSymbolImpl(const uint16_t* cdf, int symbol_count); + // Similar to ReadSymbolImpl but it uses binary search to perform step 2 in + // the comment above. As of now, this function is called when |symbol_count| + // is greater than or equal to 14. + inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count); + // Specialized implementation of ReadSymbolImpl based on the fact that + // symbol_count == 2. + inline int ReadSymbolImpl(uint16_t cdf); + // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N. + LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count); + // ReadSymbolImplN is a specialization of ReadSymbolImpl for + // symbol_count == N. + LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf); + inline void PopulateBits(); + // Normalizes the range so that 32768 <= |values_in_range_| < 65536. Also + // calls PopulateBits() if necessary. + inline void NormalizeRange(); + + const uint8_t* data_; + const uint8_t* const data_end_; + // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes + // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the + // constructor, not PopulateBits(). + const uint8_t* const data_memcpy_end_; + const bool allow_update_cdf_; + // Number of cached bits of data in the current value. + int bits_; + // Number of values in the current range. Declared as uint32_t for better + // performance but only the lower 16 bits are used. + uint32_t values_in_range_; + // The difference between the high end of the current range and the coded + // value minus 1. The 16 bits above |bits_| of this variable are used to + // decode the next symbol. It is filled in whenever |bits_| is less than 0. + // Note this implementation differs from the spec as it trades the need to + // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(), + // which occurs less frequently. + WindowSize window_diff_; +}; + +extern template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf); +extern template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf); + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_ENTROPY_DECODER_H_ diff --git a/src/utils/executor.cc b/src/utils/executor.cc new file mode 100644 index 0000000..6934057 --- /dev/null +++ b/src/utils/executor.cc @@ -0,0 +1,21 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/executor.h"
+
+namespace libgav1 {
+
+Executor::~Executor() = default;
+
+}  // namespace libgav1
diff --git a/src/utils/executor.h b/src/utils/executor.h
new file mode 100644
index 0000000..21abdf8
--- /dev/null
+++ b/src/utils/executor.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_EXECUTOR_H_
+#define LIBGAV1_SRC_UTILS_EXECUTOR_H_
+
+#include <functional>
+
+namespace libgav1 {
+
+class Executor {
+ public:
+  virtual ~Executor();
+
+  // Schedules the specified "callback" for execution in this executor.
+  // Depending on the subclass implementation, this may block in some
+  // situations.
+  virtual void Schedule(std::function<void()> callback) = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_EXECUTOR_H_
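[Editorial aside, not part of the imported sources: the smallest possible Executor implementation, shown to make the abstract interface concrete. It runs each callback synchronously on the calling thread and never blocks. The class name is illustrative; the library's production implementation is the thread pool in threadpool.h.]

// Illustrative only.
#include <functional>

#include "src/utils/executor.h"

class InlineExecutor : public libgav1::Executor {
 public:
  void Schedule(std::function<void()> callback) override {
    callback();  // runs inline; a thread pool would enqueue instead
  }
};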
diff --git a/src/utils/libgav1_utils.cmake b/src/utils/libgav1_utils.cmake
new file mode 100644
index 0000000..8b6ec4b
--- /dev/null
+++ b/src/utils/libgav1_utils.cmake
@@ -0,0 +1,72 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_)
+  return()
+endif() # LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_
+set(LIBGAV1_UTILS_LIBGAV1_UTILS_CMAKE_ 1)
+
+list(APPEND libgav1_utils_sources
+            "${libgav1_source}/utils/array_2d.h"
+            "${libgav1_source}/utils/bit_mask_set.h"
+            "${libgav1_source}/utils/bit_reader.cc"
+            "${libgav1_source}/utils/bit_reader.h"
+            "${libgav1_source}/utils/block_parameters_holder.cc"
+            "${libgav1_source}/utils/block_parameters_holder.h"
+            "${libgav1_source}/utils/blocking_counter.h"
+            "${libgav1_source}/utils/common.h"
+            "${libgav1_source}/utils/compiler_attributes.h"
+            "${libgav1_source}/utils/constants.cc"
+            "${libgav1_source}/utils/constants.h"
+            "${libgav1_source}/utils/cpu.cc"
+            "${libgav1_source}/utils/cpu.h"
+            "${libgav1_source}/utils/dynamic_buffer.h"
+            "${libgav1_source}/utils/entropy_decoder.cc"
+            "${libgav1_source}/utils/entropy_decoder.h"
+            "${libgav1_source}/utils/executor.cc"
+            "${libgav1_source}/utils/executor.h"
+            "${libgav1_source}/utils/logging.cc"
+            "${libgav1_source}/utils/logging.h"
+            "${libgav1_source}/utils/memory.h"
+            "${libgav1_source}/utils/parameter_tree.cc"
+            "${libgav1_source}/utils/parameter_tree.h"
+            "${libgav1_source}/utils/queue.h"
+            "${libgav1_source}/utils/raw_bit_reader.cc"
+            "${libgav1_source}/utils/raw_bit_reader.h"
+            "${libgav1_source}/utils/reference_info.h"
+            "${libgav1_source}/utils/segmentation.cc"
+            "${libgav1_source}/utils/segmentation.h"
+            "${libgav1_source}/utils/segmentation_map.cc"
+            "${libgav1_source}/utils/segmentation_map.h"
+            "${libgav1_source}/utils/stack.h"
+            "${libgav1_source}/utils/threadpool.cc"
+            "${libgav1_source}/utils/threadpool.h"
+            "${libgav1_source}/utils/types.h"
+            "${libgav1_source}/utils/unbounded_queue.h"
+            "${libgav1_source}/utils/vector.h")
+
+macro(libgav1_add_utils_targets)
+  libgav1_add_library(NAME
+                      libgav1_utils
+                      TYPE
+                      OBJECT
+                      SOURCES
+                      ${libgav1_utils_sources}
+                      DEFINES
+                      ${libgav1_defines}
+                      INCLUDES
+                      ${libgav1_include_paths}
+                      ${libgav1_gtest_include_paths})
+
+endmacro()
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
new file mode 100644
index 0000000..9a43c22
--- /dev/null
+++ b/src/utils/logging.cc
@@ -0,0 +1,65 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/utils/logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <sstream>
+#include <thread>  // NOLINT (unapproved c++11 header)
+
+#if !defined(LIBGAV1_LOG_LEVEL)
+#define LIBGAV1_LOG_LEVEL (1 << 30)
+#endif
+
+namespace libgav1 {
+namespace internal {
+#if LIBGAV1_ENABLE_LOGGING
+namespace {
+
+const char* LogSeverityName(LogSeverity severity) {
+  switch (severity) {
+    case LogSeverity::kInfo:
+      return "INFO";
+    case LogSeverity::kError:
+      return "ERROR";
+    case LogSeverity::kWarning:
+      return "WARNING";
+  }
+  return "UNKNOWN";
+}
+
+}  // namespace
+
+void Log(LogSeverity severity, const char* file, int line, const char* format,
+         ...) {
+  if (LIBGAV1_LOG_LEVEL < static_cast<int>(severity)) return;
+  std::ostringstream ss;
+  ss << std::hex << std::this_thread::get_id();
+  fprintf(stderr, "%s %s %s:%d] ", LogSeverityName(severity), ss.str().c_str(),
+          file, line);
+
+  va_list ap;
+  va_start(ap, format);
+  vfprintf(stderr, format, ap);
+  va_end(ap);
+  fprintf(stderr, "\n");
+}
+#else   // !LIBGAV1_ENABLE_LOGGING
+void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
+         const char* /*format*/, ...) {}
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+}  // namespace internal
+}  // namespace libgav1
diff --git a/src/utils/logging.h b/src/utils/logging.h
new file mode 100644
index 0000000..48928db
--- /dev/null
+++ b/src/utils/logging.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_LOGGING_H_
+#define LIBGAV1_SRC_UTILS_LOGGING_H_
+
+#include <cstddef>
+
+#include "src/utils/compiler_attributes.h"
+
+#if !defined(LIBGAV1_ENABLE_LOGGING)
+#if defined(NDEBUG) || defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_ENABLE_LOGGING
+// LIBGAV1_DLOG(severity, printf-format-string)
+// Debug logging that can optionally be enabled in release builds by explicitly
+// setting LIBGAV1_ENABLE_LOGGING.
+// Severity is given as an all-caps version of enum LogSeverity with the
+// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
+#define LIBGAV1_DLOG(severity, ...)                                     \
+  do {                                                                  \
+    constexpr const char* libgav1_logging_internal_basename =           \
+        ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1);  \
+    ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity,       \
+                             libgav1_logging_internal_basename,         \
+                             __LINE__, __VA_ARGS__);                    \
+  } while (0)
+#else
+#define LIBGAV1_DLOG(severity, ...) \
+  do {                              \
+  } while (0)
+#endif  // LIBGAV1_ENABLE_LOGGING
+
+#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_WARNING \
+  ::libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+
+namespace libgav1 {
+namespace internal {
+
+enum class LogSeverity : int {
+  kError,
+  kWarning,
+  kInfo,
+};
+
+// Helper function to implement LIBGAV1_DLOG.
+// Logs |format, ...| at |severity| level, reporting it as called from
+// |file|:|line|.
+void Log(libgav1::internal::LogSeverity severity, const char* file, int line,
+         const char* format, ...) LIBGAV1_PRINTF_ATTRIBUTE(4, 5);
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
file_name + offset
+                 : Basename(file_name, offset - 1);
+}
+
+}  // namespace internal
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_LOGGING_H_
diff --git a/src/utils/memory.h b/src/utils/memory.h
new file mode 100644
index 0000000..219a83f
--- /dev/null
+++ b/src/utils/memory.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_MEMORY_H_
+#define LIBGAV1_SRC_UTILS_MEMORY_H_
+
+#if defined(__ANDROID__) || defined(_MSC_VER)
+#include <malloc.h>
+#endif
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <new>
+
+namespace libgav1 {
+
+enum {
+// The byte alignment required for buffers used with SIMD code to be read or
+// written with aligned operations.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+  kMaxAlignment = 32,  // extended alignment is safe on x86.
+#else
+  kMaxAlignment = alignof(max_align_t),
+#endif
+};
+
+// AlignedAlloc, AlignedFree
+//
+// void* AlignedAlloc(size_t alignment, size_t size);
+//   Allocate aligned memory.
+//   |alignment| must be a power of 2.
+//   Unlike posix_memalign(), |alignment| may be smaller than sizeof(void*).
+//   Unlike aligned_alloc(), |size| does not need to be a multiple of
+//   |alignment|.
+//   The returned pointer should be freed by AlignedFree().
+//
+// void AlignedFree(void* aligned_memory);
+//   Free aligned memory.
+
+#if defined(_MSC_VER)  // MSVC
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+  return _aligned_malloc(size, alignment);
+}
+
+inline void AlignedFree(void* aligned_memory) { _aligned_free(aligned_memory); }
+
+#else  // !defined(_MSC_VER)
+
+inline void* AlignedAlloc(size_t alignment, size_t size) {
+#if defined(__ANDROID__)
+  // Although posix_memalign() was introduced in Android API level 17, it is
+  // more convenient to use memalign(). Unlike glibc, Android does not consider
+  // memalign() an obsolete function.
+  return memalign(alignment, size);
+#else  // !defined(__ANDROID__)
+  void* ptr = nullptr;
+  // posix_memalign requires that the requested alignment be at least
+  // sizeof(void*). In this case, fall back on malloc which should return
+  // memory aligned to at least the size of a pointer.
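+  // Editorial example: AlignedAlloc(4, n) on a typical 64-bit target
+  // (sizeof(void*) == 8) takes this malloc() path; malloc() already returns
+  // storage suitably aligned for any fundamental type, so 4-byte alignment
+  // holds.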
+ const size_t required_alignment = sizeof(void*); + if (alignment < required_alignment) return malloc(size); + const int error = posix_memalign(&ptr, alignment, size); + if (error != 0) { + errno = error; + return nullptr; + } + return ptr; +#endif // defined(__ANDROID__) +} + +inline void AlignedFree(void* aligned_memory) { free(aligned_memory); } + +#endif // defined(_MSC_VER) + +inline void Memset(uint8_t* const dst, int value, size_t count) { + memset(dst, value, count); +} + +inline void Memset(uint16_t* const dst, int value, size_t count) { + for (size_t i = 0; i < count; ++i) { + dst[i] = static_cast(value); + } +} + +struct MallocDeleter { + void operator()(void* ptr) const { free(ptr); } +}; + +struct AlignedDeleter { + void operator()(void* ptr) const { AlignedFree(ptr); } +}; + +template +using AlignedUniquePtr = std::unique_ptr; + +// Allocates aligned memory for an array of |count| elements of type T. +template +inline AlignedUniquePtr MakeAlignedUniquePtr(size_t alignment, + size_t count) { + return AlignedUniquePtr( + static_cast(AlignedAlloc(alignment, count * sizeof(T)))); +} + +// A base class with custom new and delete operators. The exception-throwing +// new operators are deleted. The "new (std::nothrow)" form must be used. +// +// The new operators return nullptr if the requested size is greater than +// 0x40000000 bytes (1 GB). TODO(wtc): Make the maximum allocable memory size +// a compile-time configuration macro. +// +// See https://en.cppreference.com/w/cpp/memory/new/operator_new and +// https://en.cppreference.com/w/cpp/memory/new/operator_delete. +// +// NOTE: The allocation and deallocation functions are static member functions +// whether the keyword 'static' is used or not. +struct Allocable { + // Class-specific allocation functions. + static void* operator new(size_t size) = delete; + static void* operator new[](size_t size) = delete; + + // Class-specific non-throwing allocation functions + static void* operator new(size_t size, const std::nothrow_t& tag) noexcept { + if (size > 0x40000000) return nullptr; + return ::operator new(size, tag); + } + static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept { + if (size > 0x40000000) return nullptr; + return ::operator new[](size, tag); + } + + // Class-specific deallocation functions. + static void operator delete(void* ptr) noexcept { ::operator delete(ptr); } + static void operator delete[](void* ptr) noexcept { + ::operator delete[](ptr); + } + + // Only called if new (std::nothrow) is used and the constructor throws an + // exception. + static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept { + ::operator delete(ptr, tag); + } + // Only called if new[] (std::nothrow) is used and the constructor throws an + // exception. + static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept { + ::operator delete[](ptr, tag); + } +}; + +// A variant of Allocable that forces allocations to be aligned to +// kMaxAlignment bytes. This is intended for use with classes that use +// alignas() with this value. C++17 aligned new/delete are used if available, +// otherwise we use AlignedAlloc/Free. +struct MaxAlignedAllocable { + // Class-specific allocation functions. 
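+  // Editorial note: as in Allocable above, the throwing forms are deleted;
+  // callers must use the "new (std::nothrow)" forms and check for nullptr.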
+  static void* operator new(size_t size) = delete;
+  static void* operator new[](size_t size) = delete;
+
+  // Class-specific non-throwing allocation functions
+  static void* operator new(size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new(size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+  static void* operator new[](size_t size, const std::nothrow_t& tag) noexcept {
+    if (size > 0x40000000) return nullptr;
+#ifdef __cpp_aligned_new
+    return ::operator new[](size, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    return AlignedAlloc(kMaxAlignment, size);
+#endif
+  }
+
+  // Class-specific deallocation functions.
+  static void operator delete(void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+  static void operator delete[](void* ptr) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment));
+#else
+    AlignedFree(ptr);
+#endif
+  }
+
+  // Only called if new (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete(ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+  // Only called if new[] (std::nothrow) is used and the constructor throws an
+  // exception.
+  static void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+#ifdef __cpp_aligned_new
+    ::operator delete[](ptr, std::align_val_t(kMaxAlignment), tag);
+#else
+    static_cast<void>(tag);
+    AlignedFree(ptr);
+#endif
+  }
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_MEMORY_H_
diff --git a/src/utils/parameter_tree.cc b/src/utils/parameter_tree.cc
new file mode 100644
index 0000000..9426ce6
--- /dev/null
+++ b/src/utils/parameter_tree.cc
@@ -0,0 +1,133 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
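+// Editorial aside (illustrative sketch, not part of the upstream file): how
+// this tree is typically driven, assuming a 128x128 superblock that is split
+// into quadrants.
+//
+//   std::unique_ptr<ParameterTree> root =
+//       ParameterTree::Create(0, 0, kBlock128x128, /*is_leaf=*/false);
+//   if (root != nullptr && root->SetPartitionType(kPartitionSplit)) {
+//     // root->children(0..3) now hold the four 64x64 quadrants; each child
+//     // is "hanging" and still needs its own SetPartitionType() call.
+//   }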
+ +#include "src/utils/parameter_tree.h" + +#include +#include +#include + +#include "src/utils/common.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// static +std::unique_ptr ParameterTree::Create(int row4x4, int column4x4, + BlockSize block_size, + bool is_leaf) { + std::unique_ptr tree( + new (std::nothrow) ParameterTree(row4x4, column4x4, block_size)); + if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) { + tree = nullptr; + } + return tree; +} + +bool ParameterTree::SetPartitionType(Partition partition) { + assert(!partition_type_set_); + partition_ = partition; + partition_type_set_ = true; + const int block_width4x4 = kNum4x4BlocksWide[block_size_]; + const int half_block4x4 = block_width4x4 >> 1; + const int quarter_block4x4 = half_block4x4 >> 1; + const BlockSize sub_size = kSubSize[partition][block_size_]; + const BlockSize split_size = kSubSize[kPartitionSplit][block_size_]; + assert(partition == kPartitionNone || sub_size != kBlockInvalid); + switch (partition) { + case kPartitionNone: + parameters_.reset(new (std::nothrow) BlockParameters()); + return parameters_ != nullptr; + case kPartitionHorizontal: + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + sub_size, true); + return children_[0] != nullptr && children_[1] != nullptr; + case kPartitionVertical: + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + sub_size, true); + return children_[0] != nullptr && children_[1] != nullptr; + case kPartitionSplit: + children_[0] = + ParameterTree::Create(row4x4_, column4x4_, sub_size, false); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + sub_size, false); + children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + sub_size, false); + children_[3] = ParameterTree::Create( + row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr && children_[3] != nullptr; + case kPartitionHorizontalWithTopSplit: + assert(split_size != kBlockInvalid); + children_[0] = + ParameterTree::Create(row4x4_, column4x4_, split_size, true); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + split_size, true); + children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + sub_size, true); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr; + case kPartitionHorizontalWithBottomSplit: + assert(split_size != kBlockInvalid); + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + split_size, true); + children_[2] = + ParameterTree::Create(row4x4_ + half_block4x4, + column4x4_ + half_block4x4, split_size, true); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr; + case kPartitionVerticalWithLeftSplit: + assert(split_size != kBlockInvalid); + children_[0] = + ParameterTree::Create(row4x4_, column4x4_, split_size, true); + children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_, + split_size, true); + children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + sub_size, true); + return children_[0] != nullptr && 
children_[1] != nullptr && + children_[2] != nullptr; + case kPartitionVerticalWithRightSplit: + assert(split_size != kBlockInvalid); + children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true); + children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4, + split_size, true); + children_[2] = + ParameterTree::Create(row4x4_ + half_block4x4, + column4x4_ + half_block4x4, split_size, true); + return children_[0] != nullptr && children_[1] != nullptr && + children_[2] != nullptr; + case kPartitionHorizontal4: + for (int i = 0; i < 4; ++i) { + children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4, + column4x4_, sub_size, true); + if (children_[i] == nullptr) return false; + } + return true; + default: + assert(partition == kPartitionVertical4); + for (int i = 0; i < 4; ++i) { + children_[i] = ParameterTree::Create( + row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true); + if (children_[i] == nullptr) return false; + } + return true; + } +} + +} // namespace libgav1 diff --git a/src/utils/parameter_tree.h b/src/utils/parameter_tree.h new file mode 100644 index 0000000..935f3eb --- /dev/null +++ b/src/utils/parameter_tree.h @@ -0,0 +1,113 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ +#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ + +#include +#include + +#include "src/utils/common.h" +#include "src/utils/compiler_attributes.h" +#include "src/utils/constants.h" +#include "src/utils/memory.h" +#include "src/utils/types.h" + +namespace libgav1 { + +class ParameterTree : public Allocable { + public: + // Creates a parameter tree to store the parameters of a block of size + // |block_size| starting at coordinates |row4x4| and |column4x4|. If |is_leaf| + // is set to true, the memory will be allocated for the BlockParameters for + // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to + // false, |block_size| must be a square block, i.e., + // kBlockWidthPixels[block_size] must be equal to + // kBlockHeightPixels[block_size]. + static std::unique_ptr Create(int row4x4, int column4x4, + BlockSize block_size, + bool is_leaf = false); + + // Move only (not Copyable). + ParameterTree(ParameterTree&& other) = default; + ParameterTree& operator=(ParameterTree&& other) = default; + ParameterTree(const ParameterTree&) = delete; + ParameterTree& operator=(const ParameterTree&) = delete; + + // Set the partition type of the current node to |partition|. + // if (partition == kPartitionNone) { + // Memory will be allocated for the BlockParameters for this node. + // } else if (partition != kPartitionSplit) { + // The appropriate child nodes will be populated and memory will be + // allocated for the BlockParameters of the children. 
+ // } else { + // The appropriate child nodes will be populated but they are considered to + // be hanging, i.e., future calls to SetPartitionType() on the child nodes + // will have to set them or their descendants to a terminal type. + // } + // This function must be called only once per node. + LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition); + + // Basic getters. + int row4x4() const { return row4x4_; } + int column4x4() const { return column4x4_; } + BlockSize block_size() const { return block_size_; } + Partition partition() const { return partition_; } + ParameterTree* children(int index) const { + assert(index < 4); + return children_[index].get(); + } + // Returns the BlockParameters object of the current node if one exists. + // Otherwise returns nullptr. This function will return a valid + // BlockParameters object only for leaf nodes. + BlockParameters* parameters() const { return parameters_.get(); } + + private: + ParameterTree(int row4x4, int column4x4, BlockSize block_size) + : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {} + + Partition partition_ = kPartitionNone; + std::unique_ptr parameters_ = nullptr; + int row4x4_ = -1; + int column4x4_ = -1; + BlockSize block_size_ = kBlockInvalid; + bool partition_type_set_ = false; + + // Child values are defined as follows for various partition types: + // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr; + // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr; + // * Split: 0 top-left partition; 1 top-right partition; 2; bottom-left + // partition; 3 bottom-right partition; + // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2 + // bottom partition; 3 nullptr; + // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2 + // bottom-right partition; 3 nullptr; + // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2 + // right partition; 3 nullptr; + // * VerticalWithRightSplit: 0 left-partition; 1 top-right partition; 2 + // bottom-right partition; 3 nullptr; + // * Horizontal4: 0 top partition; 1 second top partition; 2 third top + // partition; 3 bottom partition; + // * Vertical4: 0 left partition; 1 second left partition; 2 third left + // partition; 3 right partition; + std::unique_ptr children_[4] = {}; + + friend class ParameterTreeTest; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_ diff --git a/src/utils/queue.h b/src/utils/queue.h new file mode 100644 index 0000000..cffb9ca --- /dev/null +++ b/src/utils/queue.h @@ -0,0 +1,105 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_QUEUE_H_ +#define LIBGAV1_SRC_UTILS_QUEUE_H_ + +#include +#include +#include +#include + +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +// A FIFO queue of a fixed capacity. +// +// WARNING: No error checking is performed. 
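+// Editorial aside (illustrative only, not upstream documentation):
+//
+//   Queue<int> q;
+//   if (q.Init(/*capacity=*/8)) {
+//     q.Push(42);               // Size() == 1
+//     int& front = q.Front();   // front == 42
+//     q.Pop();                  // Size() == 0 again
+//   }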
+template +class Queue { + public: + LIBGAV1_MUST_USE_RESULT bool Init(size_t capacity) { + elements_.reset(new (std::nothrow) T[capacity]); + if (elements_ == nullptr) return false; + capacity_ = capacity; + return true; + } + + // Pushes the element |value| to the end of the queue. It is an error to call + // Push() when the queue is full. + void Push(T&& value) { + assert(size_ < capacity_); + elements_[end_++] = std::move(value); + if (end_ == capacity_) end_ = 0; + ++size_; + } + + // Removes the element at the front of the queue. It is an error to call Pop() + // when the queue is empty. + void Pop() { + assert(size_ != 0); + const T element = std::move(elements_[begin_++]); + static_cast(element); + if (begin_ == capacity_) begin_ = 0; + --size_; + } + + // Returns a reference to the element at the front of the queue. It is an + // error to call Front() when the queue is empty. + T& Front() { + assert(size_ != 0); + return elements_[begin_]; + } + + // Returns a reference to the element at the back of the queue. It is an error + // to call Back() when the queue is empty. + T& Back() { + assert(size_ != 0); + const size_t back = ((end_ == 0) ? capacity_ : end_) - 1; + return elements_[back]; + } + + // Clears the queue. + void Clear() { + while (!Empty()) { + Pop(); + } + } + + // Returns true if the queue is empty. + bool Empty() const { return size_ == 0; } + + // Returns true if the queue is full. + bool Full() const { return size_ >= capacity_; } + + // Returns the number of elements in the queue. + size_t Size() const { return size_; } + + private: + // An array of |capacity| elements. Used as a circular array. + std::unique_ptr elements_; + size_t capacity_ = 0; + // The index of the element to be removed by Pop(). + size_t begin_ = 0; + // The index where the new element is inserted by Push(). + size_t end_ = 0; + size_t size_ = 0; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_QUEUE_H_ diff --git a/src/utils/raw_bit_reader.cc b/src/utils/raw_bit_reader.cc new file mode 100644 index 0000000..15e980d --- /dev/null +++ b/src/utils/raw_bit_reader.cc @@ -0,0 +1,224 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/raw_bit_reader.h" + +#include +#include + +#include "src/utils/common.h" +#include "src/utils/logging.h" + +// Note is only needed when logging is enabled (for the PRI* +// macros). It depends on the definition of LIBGAV1_ENABLE_LOGGING from +// logging.h, thus the non-standard header ordering. +#if LIBGAV1_ENABLE_LOGGING +#include +#endif + +namespace libgav1 { +namespace { + +constexpr int kMaximumLeb128Size = 8; +constexpr uint8_t kLeb128ValueByteMask = 0x7f; +constexpr uint8_t kLeb128TerminationByteMask = 0x80; + +uint8_t Mod8(size_t n) { + // Last 3 bits are the value of mod 8. + return n & 0x07; +} + +size_t DivideBy8(size_t n, bool ceil) { return (n + (ceil ? 
7 : 0)) >> 3; } + +} // namespace + +RawBitReader::RawBitReader(const uint8_t* data, size_t size) + : data_(data), bit_offset_(0), size_(size) { + assert(data_ != nullptr || size_ == 0); +} + +int RawBitReader::ReadBitImpl() { + const size_t byte_offset = DivideBy8(bit_offset_, false); + const uint8_t byte = data_[byte_offset]; + const uint8_t shift = 7 - Mod8(bit_offset_); + ++bit_offset_; + return static_cast((byte >> shift) & 0x01); +} + +int RawBitReader::ReadBit() { + if (Finished()) return -1; + return ReadBitImpl(); +} + +int64_t RawBitReader::ReadLiteral(int num_bits) { + assert(num_bits <= 32); + if (!CanReadLiteral(num_bits)) return -1; + assert(num_bits > 0); + uint32_t literal = 0; + int bit = num_bits - 1; + do { + // ARM can combine a shift operation with a constant number of bits with + // some other operations, such as the OR operation. + // Here is an ARM disassembly example: + // orr w1, w0, w1, lsl #1 + // which left shifts register w1 by 1 bit and OR the shift result with + // register w0. + // The next 2 lines are equivalent to: + // literal |= static_cast(ReadBitImpl()) << bit; + literal <<= 1; + literal |= static_cast(ReadBitImpl()); + } while (--bit >= 0); + return literal; +} + +bool RawBitReader::ReadInverseSignedLiteral(int num_bits, int* const value) { + assert(num_bits + 1 < 32); + *value = static_cast(ReadLiteral(num_bits + 1)); + if (*value == -1) return false; + const int sign_bit = 1 << num_bits; + if ((*value & sign_bit) != 0) { + *value -= 2 * sign_bit; + } + return true; +} + +bool RawBitReader::ReadLittleEndian(int num_bytes, size_t* const value) { + // We must be at a byte boundary. + assert(Mod8(bit_offset_) == 0); + assert(num_bytes <= 4); + static_assert(sizeof(size_t) >= 4, ""); + if (value == nullptr) return false; + size_t byte_offset = DivideBy8(bit_offset_, false); + if (Finished() || byte_offset + num_bytes > size_) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read Little Endian value."); + return false; + } + *value = 0; + for (int i = 0; i < num_bytes; ++i) { + const size_t byte = data_[byte_offset]; + *value |= (byte << (i * 8)); + ++byte_offset; + } + bit_offset_ = byte_offset * 8; + return true; +} + +bool RawBitReader::ReadUnsignedLeb128(size_t* const value) { + // We must be at a byte boundary. 
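+  // Editorial note: in LEB128 each byte contributes its low 7 bits, least-
+  // significant group first, and the high bit (0x80) marks continuation. For
+  // example, the two bytes {0x96, 0x01} decode to (0x96 & 0x7f) | (0x01 << 7),
+  // i.e. 150.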
+ assert(Mod8(bit_offset_) == 0); + if (value == nullptr) return false; + uint64_t value64 = 0; + for (int i = 0; i < kMaximumLeb128Size; ++i) { + if (Finished()) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read LEB128 value."); + return false; + } + const size_t byte_offset = DivideBy8(bit_offset_, false); + const uint8_t byte = data_[byte_offset]; + bit_offset_ += 8; + value64 |= static_cast(byte & kLeb128ValueByteMask) << (i * 7); + if ((byte & kLeb128TerminationByteMask) == 0) { + if (value64 != static_cast(value64) || + value64 > std::numeric_limits::max()) { + LIBGAV1_DLOG( + ERROR, "LEB128 value (%" PRIu64 ") exceeded uint32_t maximum (%u).", + value64, std::numeric_limits::max()); + return false; + } + *value = static_cast(value64); + return true; + } + } + LIBGAV1_DLOG( + ERROR, + "Exceeded kMaximumLeb128Size (%d) when trying to read LEB128 value", + kMaximumLeb128Size); + return false; +} + +bool RawBitReader::ReadUvlc(uint32_t* const value) { + if (value == nullptr) return false; + int leading_zeros = 0; + while (true) { + const int bit = ReadBit(); + if (bit == -1) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value."); + return false; + } + if (bit == 1) break; + ++leading_zeros; + if (leading_zeros == 32) { + LIBGAV1_DLOG(ERROR, + "Exceeded maximum size (32) when trying to read uvlc value"); + return false; + } + } + int literal; + if (leading_zeros != 0) { + literal = static_cast(ReadLiteral(leading_zeros)); + if (literal == -1) { + LIBGAV1_DLOG(ERROR, "Not enough bits to read uvlc value."); + return false; + } + literal += (1U << leading_zeros) - 1; + } else { + literal = 0; + } + *value = literal; + return true; +} + +bool RawBitReader::AlignToNextByte() { + while ((bit_offset_ & 7) != 0) { + if (ReadBit() != 0) { + return false; + } + } + return true; +} + +bool RawBitReader::VerifyAndSkipTrailingBits(size_t num_bits) { + if (ReadBit() != 1) return false; + for (size_t i = 0; i < num_bits - 1; ++i) { + if (ReadBit() != 0) return false; + } + return true; +} + +bool RawBitReader::SkipBytes(size_t num_bytes) { + // If we are not at a byte boundary, return false. + return ((bit_offset_ & 7) != 0) ? false : SkipBits(num_bytes * 8); +} + +bool RawBitReader::SkipBits(size_t num_bits) { + // If the reader is already finished, return false. + if (Finished()) return false; + // If skipping |num_bits| runs out of buffer, return false. + const size_t bit_offset = bit_offset_ + num_bits - 1; + if (DivideBy8(bit_offset, false) >= size_) return false; + bit_offset_ += num_bits; + return true; +} + +bool RawBitReader::CanReadLiteral(size_t num_bits) const { + if (Finished()) return false; + const size_t bit_offset = bit_offset_ + num_bits - 1; + return DivideBy8(bit_offset, false) < size_; +} + +bool RawBitReader::Finished() const { + return DivideBy8(bit_offset_, false) >= size_; +} + +} // namespace libgav1 diff --git a/src/utils/raw_bit_reader.h b/src/utils/raw_bit_reader.h new file mode 100644 index 0000000..76e7bfa --- /dev/null +++ b/src/utils/raw_bit_reader.h @@ -0,0 +1,78 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_ +#define LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_ + +#include +#include + +#include "src/utils/bit_reader.h" +#include "src/utils/memory.h" + +namespace libgav1 { + +class RawBitReader : public BitReader, public Allocable { + public: + RawBitReader(const uint8_t* data, size_t size); + ~RawBitReader() override = default; + + int ReadBit() override; + int64_t ReadLiteral(int num_bits) override; // f(n) in the spec. + bool ReadInverseSignedLiteral(int num_bits, + int* value); // su(1+num_bits) in the spec. + bool ReadLittleEndian(int num_bytes, + size_t* value); // le(n) in the spec. + bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec. + // Reads a variable length unsigned number and stores it in |*value|. On a + // successful return, |*value| is in the range of 0 to UINT32_MAX − 1, + // inclusive. + bool ReadUvlc(uint32_t* value); // uvlc() in the spec. + bool Finished() const; + size_t bit_offset() const { return bit_offset_; } + // Return the bytes consumed so far (rounded up). + size_t byte_offset() const { return (bit_offset() + 7) >> 3; } + size_t size() const { return size_; } + // Move to the next byte boundary if not already at one. Return false if any + // of the bits being skipped over is non-zero. Return true otherwise. If this + // function returns false, the reader is left in an undefined state and must + // not be used further. section 5.3.5. + bool AlignToNextByte(); + // Make sure that the trailing bits structure is as expected and skip over it. + // section 5.3.4. + bool VerifyAndSkipTrailingBits(size_t num_bits); + // Skip |num_bytes| bytes. This only works if the current position is at a + // byte boundary. The function returns false if the current position is not at + // a byte boundary or if skipping |num_bytes| causes the reader to run out of + // buffer. Returns true otherwise. + bool SkipBytes(size_t num_bytes); + // Skip |num_bits| bits. The function returns false if skipping |num_bits| + // causes the reader to run out of buffer. Returns true otherwise. + bool SkipBits(size_t num_bits); + + private: + // Returns true if it is safe to read a literal of size |num_bits|. + bool CanReadLiteral(size_t num_bits) const; + int ReadBitImpl(); + + const uint8_t* const data_; + size_t bit_offset_; + const size_t size_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_RAW_BIT_READER_H_ diff --git a/src/utils/reference_info.h b/src/utils/reference_info.h new file mode 100644 index 0000000..a660791 --- /dev/null +++ b/src/utils/reference_info.h @@ -0,0 +1,92 @@ +/* + * Copyright 2020 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ +#define LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ + +#include +#include + +#include "src/utils/array_2d.h" +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { + +// This struct collects some members related to reference frames in one place to +// make it easier to pass them as parameters to some dsp functions. +struct ReferenceInfo { + // Initialize |motion_field_reference_frame| so that + // Tile::StoreMotionFieldMvsIntoCurrentFrame() can skip some updates when + // the updates are the same as the initialized value. + // Set to kReferenceFrameIntra instead of kReferenceFrameNone to simplify + // branch conditions in motion field projection. + // The following memory initialization of contiguous memory is very fast. It + // is not recommended to make the initialization multi-threaded, unless the + // memory which needs to be initialized in each thread is still contiguous. + LIBGAV1_MUST_USE_RESULT bool Reset(int rows, int columns) { + return motion_field_reference_frame.Reset(rows, columns, + /*zero_initialize=*/true) && + motion_field_mv.Reset( + rows, columns, +#if LIBGAV1_MSAN + // It is set in Tile::StoreMotionFieldMvsIntoCurrentFrame() only + // for qualified blocks. In MotionFieldProjectionKernel() dsp + // optimizations, it is read no matter it was set or not. + /*zero_initialize=*/true +#else + /*zero_initialize=*/false +#endif + ); + } + + // All members are used by inter frames only. + // For intra frames, they are not initialized. + + std::array order_hint; + + // An example when |relative_distance_from| does not equal + // -|relative_distance_to|: + // |relative_distance_from| = GetRelativeDistance(7, 71, 25) = -64 + // -|relative_distance_to| = -GetRelativeDistance(71, 7, 25) = 64 + // This is why we need both |relative_distance_from| and + // |relative_distance_to|. + // |relative_distance_from|: Relative distances from reference frames to this + // frame. + std::array relative_distance_from; + // |relative_distance_to|: Relative distances to reference frames. + std::array relative_distance_to; + + // Skip motion field projection of specific types of frames if their + // |relative_distance_to| is negative or too large. + std::array skip_references; + // Lookup table to get motion field projection division multiplier of specific + // types of frames. Derived from kProjectionMvDivisionLookup. + std::array projection_divisions; + + // The current frame's |motion_field_reference_frame| and |motion_field_mv_| + // are guaranteed to be allocated only when refresh_frame_flags is not 0. + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec. + Array2D motion_field_reference_frame; + // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds + // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec. + Array2D motion_field_mv; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_REFERENCE_INFO_H_ diff --git a/src/utils/segmentation.cc b/src/utils/segmentation.cc new file mode 100644 index 0000000..75fa776 --- /dev/null +++ b/src/utils/segmentation.cc @@ -0,0 +1,31 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/segmentation.h" + +namespace libgav1 { + +const int8_t kSegmentationFeatureBits[kSegmentFeatureMax] = {8, 6, 6, 6, + 6, 3, 0, 0}; +const int kSegmentationFeatureMaxValues[kSegmentFeatureMax] = { + 255, + kMaxLoopFilterValue, + kMaxLoopFilterValue, + kMaxLoopFilterValue, + kMaxLoopFilterValue, + 7, + 0, + 0}; + +} // namespace libgav1 diff --git a/src/utils/segmentation.h b/src/utils/segmentation.h new file mode 100644 index 0000000..67ff74c --- /dev/null +++ b/src/utils/segmentation.h @@ -0,0 +1,32 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_H_ +#define LIBGAV1_SRC_UTILS_SEGMENTATION_H_ + +#include + +#include "src/utils/constants.h" +#include "src/utils/types.h" + +namespace libgav1 { + +extern const int8_t kSegmentationFeatureBits[kSegmentFeatureMax]; +extern const int kSegmentationFeatureMaxValues[kSegmentFeatureMax]; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_H_ diff --git a/src/utils/segmentation_map.cc b/src/utils/segmentation_map.cc new file mode 100644 index 0000000..4284ca2 --- /dev/null +++ b/src/utils/segmentation_map.cc @@ -0,0 +1,49 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
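+// Editorial aside (illustrative sketch, not part of the upstream file;
+// rows4x4 and columns4x4 stand for the caller's frame dimensions):
+//
+//   SegmentationMap map;
+//   if (map.Allocate(rows4x4, columns4x4)) {
+//     map.Clear();  // every 4x4 block gets segment id 0
+//     map.FillBlock(/*row4x4=*/0, /*column4x4=*/0, /*block_width4x4=*/4,
+//                   /*block_height4x4=*/4, /*segment_id=*/2);
+//   }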
+ +#include "src/utils/segmentation_map.h" + +#include +#include +#include + +namespace libgav1 { + +bool SegmentationMap::Allocate(int32_t rows4x4, int32_t columns4x4) { + rows4x4_ = rows4x4; + columns4x4_ = columns4x4; + segment_id_buffer_.reset(new (std::nothrow) int8_t[rows4x4_ * columns4x4_]); + if (segment_id_buffer_ == nullptr) return false; + segment_id_.Reset(rows4x4_, columns4x4_, segment_id_buffer_.get()); + return true; +} + +void SegmentationMap::Clear() { + memset(segment_id_buffer_.get(), 0, rows4x4_ * columns4x4_); +} + +void SegmentationMap::CopyFrom(const SegmentationMap& from) { + assert(rows4x4_ == from.rows4x4_ && columns4x4_ == from.columns4x4_); + memcpy(segment_id_buffer_.get(), from.segment_id_buffer_.get(), + rows4x4_ * columns4x4_); +} + +void SegmentationMap::FillBlock(int row4x4, int column4x4, int block_width4x4, + int block_height4x4, int8_t segment_id) { + for (int y = 0; y < block_height4x4; ++y) { + memset(&segment_id_[row4x4 + y][column4x4], segment_id, block_width4x4); + } +} + +} // namespace libgav1 diff --git a/src/utils/segmentation_map.h b/src/utils/segmentation_map.h new file mode 100644 index 0000000..499be24 --- /dev/null +++ b/src/utils/segmentation_map.h @@ -0,0 +1,71 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ +#define LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ + +#include +#include + +#include "src/utils/array_2d.h" +#include "src/utils/compiler_attributes.h" + +namespace libgav1 { + +// SegmentationMap stores the segment id associated with each 4x4 block in the +// frame. +class SegmentationMap { + public: + SegmentationMap() = default; + + // Not copyable or movable + SegmentationMap(const SegmentationMap&) = delete; + SegmentationMap& operator=(const SegmentationMap&) = delete; + + // Allocates an internal buffer of the given dimensions to hold the + // segmentation map. The memory in the buffer is not initialized. Returns + // true on success, false on failure (for example, out of memory). + LIBGAV1_MUST_USE_RESULT bool Allocate(int32_t rows4x4, int32_t columns4x4); + + int8_t segment_id(int row4x4, int column4x4) const { + return segment_id_[row4x4][column4x4]; + } + + // Sets every element in the segmentation map to 0. + void Clear(); + + // Copies the entire segmentation map. |from| must be of the same dimensions. + void CopyFrom(const SegmentationMap& from); + + // Sets the region of segmentation map covered by the block to |segment_id|. + // The block is located at |row4x4|, |column4x4| and has dimensions + // |block_width4x4| and |block_height4x4|. + void FillBlock(int row4x4, int column4x4, int block_width4x4, + int block_height4x4, int8_t segment_id); + + private: + int32_t rows4x4_ = 0; + int32_t columns4x4_ = 0; + + // segment_id_ is a rows4x4_ by columns4x4_ 2D array. The underlying data + // buffer is dynamically allocated and owned by segment_id_buffer_. 
+ std::unique_ptr segment_id_buffer_; + Array2DView segment_id_; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_SEGMENTATION_MAP_H_ diff --git a/src/utils/stack.h b/src/utils/stack.h new file mode 100644 index 0000000..39133b9 --- /dev/null +++ b/src/utils/stack.h @@ -0,0 +1,59 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_STACK_H_ +#define LIBGAV1_SRC_UTILS_STACK_H_ + +#include +#include + +namespace libgav1 { + +// A LIFO stack of a fixed capacity. The elements are moved using std::move, so +// the element type T has to be movable. +// +// WARNING: No error checking is performed. +template +class Stack { + public: + // Pushes the element |value| to the top of the stack. It is an error to call + // Push() when the stack is full. + void Push(T value) { + ++top_; + assert(top_ < capacity); + elements_[top_] = std::move(value); + } + + // Returns the element at the top of the stack and removes it from the stack. + // It is an error to call Pop() when the stack is empty. + T Pop() { + assert(top_ >= 0); + return std::move(elements_[top_--]); + } + + // Returns true if the stack is empty. + bool Empty() const { return top_ < 0; } + + private: + static_assert(capacity > 0, ""); + T elements_[capacity]; + // The array index of the top of the stack. The stack is empty if top_ is -1. + int top_ = -1; +}; + +} // namespace libgav1 + +#endif // LIBGAV1_SRC_UTILS_STACK_H_ diff --git a/src/utils/threadpool.cc b/src/utils/threadpool.cc new file mode 100644 index 0000000..8c8f4fe --- /dev/null +++ b/src/utils/threadpool.cc @@ -0,0 +1,323 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/utils/threadpool.h" + +#if defined(_MSC_VER) +#include +#include +#else // defined(_MSC_VER) +#include +#endif // defined(_MSC_VER) +#if defined(__ANDROID__) || defined(__GLIBC__) +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__ANDROID__) +#include // NOLINT (unapproved c++11 header) +#endif + +// The glibc wrapper for the gettid() system call was added in glibc 2.30. +// Emulate it for older versions of glibc. 
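+// Editorial note: syscall(SYS_gettid) returns the kernel's thread id; the
+// shim below is only compiled for glibc versions older than 2.30, where the
+// gettid() wrapper is missing.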
+#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 30) + +#include + +static pid_t gettid() { return static_cast(syscall(SYS_gettid)); } + +#endif +#endif // defined(__GLIBC_PREREQ) + +namespace libgav1 { + +#if defined(__ANDROID__) +namespace { + +using Clock = std::chrono::steady_clock; +using Duration = Clock::duration; +constexpr Duration kBusyWaitDuration = + std::chrono::duration_cast(std::chrono::duration(2e-3)); + +} // namespace +#endif // defined(__ANDROID__) + +// static +std::unique_ptr ThreadPool::Create(int num_threads) { + return Create(/*name_prefix=*/"", num_threads); +} + +// static +std::unique_ptr ThreadPool::Create(const char name_prefix[], + int num_threads) { + if (name_prefix == nullptr || num_threads <= 0) return nullptr; + std::unique_ptr threads(new (std::nothrow) + WorkerThread*[num_threads]); + if (threads == nullptr) return nullptr; + std::unique_ptr pool(new (std::nothrow) ThreadPool( + name_prefix, std::move(threads), num_threads)); + if (pool != nullptr && !pool->StartWorkers()) { + pool = nullptr; + } + return pool; +} + +ThreadPool::ThreadPool(const char name_prefix[], + std::unique_ptr threads, + int num_threads) + : threads_(std::move(threads)), num_threads_(num_threads) { + threads_[0] = nullptr; + assert(name_prefix != nullptr); + const size_t name_prefix_len = + std::min(strlen(name_prefix), sizeof(name_prefix_) - 1); + memcpy(name_prefix_, name_prefix, name_prefix_len); + name_prefix_[name_prefix_len] = '\0'; +} + +ThreadPool::~ThreadPool() { Shutdown(); } + +void ThreadPool::Schedule(std::function closure) { + LockMutex(); + if (!queue_.GrowIfNeeded()) { + // queue_ is full and we can't grow it. Run |closure| directly. + UnlockMutex(); + closure(); + return; + } + queue_.Push(std::move(closure)); + UnlockMutex(); + SignalOne(); +} + +int ThreadPool::num_threads() const { return num_threads_; } + +// A simple implementation that mirrors the non-portable Thread. We may +// choose to expand this in the future as a portable implementation of +// Thread, or replace it at such a time as one is implemented. +class ThreadPool::WorkerThread : public Allocable { + public: + // Creates and starts a thread that runs pool->WorkerFunction(). + explicit WorkerThread(ThreadPool* pool); + + // Not copyable or movable. + WorkerThread(const WorkerThread&) = delete; + WorkerThread& operator=(const WorkerThread&) = delete; + + // REQUIRES: Join() must have been called if Start() was called and + // succeeded. + ~WorkerThread() = default; + + LIBGAV1_MUST_USE_RESULT bool Start(); + + // Joins with the running thread. + void Join(); + + private: +#if defined(_MSC_VER) + static unsigned int __stdcall ThreadBody(void* arg); +#else + static void* ThreadBody(void* arg); +#endif + + void SetupName(); + void Run(); + + ThreadPool* pool_; +#if defined(_MSC_VER) + HANDLE handle_; +#else + pthread_t thread_; +#endif +}; + +ThreadPool::WorkerThread::WorkerThread(ThreadPool* pool) : pool_(pool) {} + +#if defined(_MSC_VER) + +bool ThreadPool::WorkerThread::Start() { + // Since our code calls the C run-time library (CRT), use _beginthreadex + // rather than CreateThread. Microsoft documentation says "If a thread + // created using CreateThread calls the CRT, the CRT may terminate the + // process in low-memory conditions." 
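+  // Editorial note: _beginthreadex returns 0 (not nullptr) on failure, and
+  // the thread is created suspended so that |handle_| is assigned before
+  // ThreadBody can run; ResumeThread() below starts it.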
+ uintptr_t handle = _beginthreadex( + /*security=*/nullptr, /*stack_size=*/0, ThreadBody, this, + /*initflag=*/CREATE_SUSPENDED, /*thrdaddr=*/nullptr); + if (handle == 0) return false; + handle_ = reinterpret_cast(handle); + ResumeThread(handle_); + return true; +} + +void ThreadPool::WorkerThread::Join() { + WaitForSingleObject(handle_, INFINITE); + CloseHandle(handle_); +} + +unsigned int ThreadPool::WorkerThread::ThreadBody(void* arg) { + auto* thread = static_cast(arg); + thread->Run(); + return 0; +} + +void ThreadPool::WorkerThread::SetupName() { + // Not currently supported on Windows. +} + +#else // defined(_MSC_VER) + +bool ThreadPool::WorkerThread::Start() { + return pthread_create(&thread_, nullptr, ThreadBody, this) == 0; +} + +void ThreadPool::WorkerThread::Join() { pthread_join(thread_, nullptr); } + +void* ThreadPool::WorkerThread::ThreadBody(void* arg) { + auto* thread = static_cast(arg); + thread->Run(); + return nullptr; +} + +void ThreadPool::WorkerThread::SetupName() { + if (pool_->name_prefix_[0] != '\0') { +#if defined(__APPLE__) + // Apple's version of pthread_setname_np takes one argument and operates on + // the current thread only. Also, pthread_mach_thread_np is Apple-specific. + // The maximum size of the |name| buffer was noted in the Chromium source + // code and was confirmed by experiments. + char name[64]; + mach_port_t id = pthread_mach_thread_np(pthread_self()); + int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_, + static_cast(id)); + assert(rv >= 0); + rv = pthread_setname_np(name); + assert(rv == 0); + static_cast(rv); +#elif defined(__ANDROID__) || defined(__GLIBC__) + // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails + // with error 34 (ERANGE) on Android. + char name[16]; + pid_t id = gettid(); + int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_, + static_cast(id)); + assert(rv >= 0); + rv = pthread_setname_np(pthread_self(), name); + assert(rv == 0); + static_cast(rv); +#endif + } +} + +#endif // defined(_MSC_VER) + +void ThreadPool::WorkerThread::Run() { + SetupName(); + pool_->WorkerFunction(); +} + +bool ThreadPool::StartWorkers() { + if (!queue_.Init()) return false; + for (int i = 0; i < num_threads_; ++i) { + threads_[i] = new (std::nothrow) WorkerThread(this); + if (threads_[i] == nullptr) return false; + if (!threads_[i]->Start()) { + delete threads_[i]; + threads_[i] = nullptr; + return false; + } + } + return true; +} + +void ThreadPool::WorkerFunction() { + LockMutex(); + while (true) { + if (queue_.Empty()) { + if (exit_threads_) { + break; // Queue is empty and exit was requested. + } +#if defined(__ANDROID__) + // On android, if we go to a conditional wait right away, the CPU governor + // kicks in and starts shutting the cores down. So we do a very small busy + // wait to see if we get our next job within that period. This + // significantly improves the performance of common cases of tile parallel + // decoding. If we don't receive a job in the busy wait time, we then go + // to an actual conditional wait as usual. + UnlockMutex(); + bool found_job = false; + const auto wait_start = Clock::now(); + while (Clock::now() - wait_start < kBusyWaitDuration) { + LockMutex(); + if (!queue_.Empty()) { + found_job = true; + break; + } + UnlockMutex(); + } + // If |found_job| is true, we simply continue since we already hold the + // mutex and we know for sure that the |queue_| is not empty. 
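+      // Editorial note: kBusyWaitDuration above is
+      // std::chrono::duration<double>(2e-3) converted to the steady_clock
+      // duration, i.e. roughly two milliseconds of polling before falling
+      // back to a real Wait().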
+ if (found_job) continue; + // Since |found_job_| was false, the mutex is not being held at this + // point. + LockMutex(); + // Ensure that the queue is still empty. + if (!queue_.Empty()) continue; + if (exit_threads_) { + break; // Queue is empty and exit was requested. + } +#endif // defined(__ANDROID__) + // Queue is still empty, wait for signal or broadcast. + Wait(); + } else { + // Take a job from the queue. + std::function job = std::move(queue_.Front()); + queue_.Pop(); + + UnlockMutex(); + // Note that it is good practice to surround this with a try/catch so + // the thread pool doesn't go to hell if the job throws an exception. + // This is omitted here because Google3 doesn't like exceptions. + std::move(job)(); + job = nullptr; + + LockMutex(); + } + } + UnlockMutex(); +} + +void ThreadPool::Shutdown() { + // Tell worker threads how to exit. + LockMutex(); + exit_threads_ = true; + UnlockMutex(); + SignalAll(); + + // Join all workers. This will block. + for (int i = 0; i < num_threads_; ++i) { + if (threads_[i] == nullptr) break; + threads_[i]->Join(); + delete threads_[i]; + } +} + +} // namespace libgav1 diff --git a/src/utils/threadpool.h b/src/utils/threadpool.h new file mode 100644 index 0000000..fac875e --- /dev/null +++ b/src/utils/threadpool.h @@ -0,0 +1,167 @@ +/* + * Copyright 2019 The libgav1 Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBGAV1_SRC_UTILS_THREADPOOL_H_ +#define LIBGAV1_SRC_UTILS_THREADPOOL_H_ + +#include +#include + +#if defined(__APPLE__) +#include +#endif + +#if !defined(LIBGAV1_THREADPOOL_USE_STD_MUTEX) +#if defined(__ANDROID__) || (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) +#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 1 +#else +#define LIBGAV1_THREADPOOL_USE_STD_MUTEX 0 +#endif +#endif + +#if LIBGAV1_THREADPOOL_USE_STD_MUTEX +#include // NOLINT (unapproved c++11 header) +#include // NOLINT (unapproved c++11 header) +#else +// absl::Mutex & absl::CondVar are significantly faster than the pthread +// variants on platforms other than Android. iOS may deadlock on Shutdown() +// using absl, see b/142251739. +#include "absl/base/thread_annotations.h" +#include "absl/synchronization/mutex.h" +#endif + +#include "src/utils/compiler_attributes.h" +#include "src/utils/executor.h" +#include "src/utils/memory.h" +#include "src/utils/unbounded_queue.h" + +namespace libgav1 { + +// An implementation of ThreadPool using POSIX threads (pthreads) or Windows +// threads. +// +// - The pool allocates a fixed number of worker threads on instantiation. +// - The worker threads will pick up work jobs as they arrive. +// - If all workers are busy, work jobs are queued for later execution. +// +// The thread pool is shut down when the pool is destroyed. +// +// Example usage of the thread pool: +// { +// std::unique_ptr pool = ThreadPool::Create(4); +// for (int i = 0; i < 100; ++i) { // Dispatch 100 jobs. 
+// pool->Schedule([&my_data]() { MyFunction(&my_data); }); +// } +// } // ThreadPool gets destroyed only when all jobs are done. +class ThreadPool : public Executor, public Allocable { + public: + // Creates the thread pool with the specified number of worker threads. + // If num_threads is 1, the closures are run in FIFO order. + static std::unique_ptr Create(int num_threads); + + // Like the above factory method, but also sets the name prefix for threads. + static std::unique_ptr Create(const char name_prefix[], + int num_threads); + + // The destructor will shut down the thread pool and all jobs are executed. + // Note that after shutdown, the thread pool does not accept further jobs. + ~ThreadPool() override; + + // Adds the specified "closure" to the queue for processing. If worker threads + // are available, "closure" will run immediately. Otherwise "closure" is + // queued for later execution. + // + // NOTE: If the internal queue is full and cannot be resized because of an + // out-of-memory error, the current thread runs "closure" before returning + // from Schedule(). For our use cases, this seems better than the + // alternatives: + // 1. Return a failure status. + // 2. Have the current thread wait until the queue is not full. + void Schedule(std::function closure) override; + + int num_threads() const; + + private: + class WorkerThread; + + // Creates the thread pool with the specified number of worker threads. + // If num_threads is 1, the closures are run in FIFO order. + ThreadPool(const char name_prefix[], std::unique_ptr threads, + int num_threads); + + // Starts the worker pool. + LIBGAV1_MUST_USE_RESULT bool StartWorkers(); + + void WorkerFunction(); + + // Shuts down the thread pool, i.e. worker threads finish their work and + // pick up new jobs until the queue is empty. This call will block until + // the shutdown is complete. + // + // Note: If a worker encounters an empty queue after this call, it will exit. + // Other workers might still be running, and if the queue fills up again, the + // thread pool will continue to operate with a decreased number of workers. + // It is up to the caller to prevent adding new jobs. + void Shutdown(); + +#if LIBGAV1_THREADPOOL_USE_STD_MUTEX + + void LockMutex() { queue_mutex_.lock(); } + void UnlockMutex() { queue_mutex_.unlock(); } + + void Wait() { + std::unique_lock queue_lock(queue_mutex_, std::adopt_lock); + condition_.wait(queue_lock); + queue_lock.release(); + } + + void SignalOne() { condition_.notify_one(); } + void SignalAll() { condition_.notify_all(); } + + std::condition_variable condition_; + std::mutex queue_mutex_; + +#else // !LIBGAV1_THREADPOOL_USE_STD_MUTEX + + void LockMutex() ABSL_EXCLUSIVE_LOCK_FUNCTION() { queue_mutex_.Lock(); } + void UnlockMutex() ABSL_UNLOCK_FUNCTION() { queue_mutex_.Unlock(); } + void Wait() { condition_.Wait(&queue_mutex_); } + void SignalOne() { condition_.Signal(); } + void SignalAll() { condition_.SignalAll(); } + + absl::CondVar condition_; + absl::Mutex queue_mutex_; + +#endif // LIBGAV1_THREADPOOL_USE_STD_MUTEX + + UnboundedQueue> queue_ LIBGAV1_GUARDED_BY(queue_mutex_); + // If not all the worker threads are created, the first entry after the + // created worker threads is a null pointer. + const std::unique_ptr threads_; + + bool exit_threads_ LIBGAV1_GUARDED_BY(queue_mutex_) = false; + const int num_threads_ = 0; + // name_prefix_ is a C string, whose length is restricted to 16 characters, + // including the terminating null byte ('\0'). 
diff --git a/src/utils/types.h b/src/utils/types.h
new file mode 100644
index 0000000..374f06b
--- /dev/null
+++ b/src/utils/types.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_TYPES_H_
+#define LIBGAV1_SRC_UTILS_TYPES_H_
+
+#include <array>
+#include <cstdint>
+#include <memory>
+
+#include "src/utils/array_2d.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+struct MotionVector : public Allocable {
+  static constexpr int kRow = 0;
+  static constexpr int kColumn = 1;
+
+  MotionVector() = default;
+  MotionVector(const MotionVector& mv) = default;
+
+  MotionVector& operator=(const MotionVector& rhs) {
+    mv32 = rhs.mv32;
+    return *this;
+  }
+
+  bool operator==(const MotionVector& rhs) const { return mv32 == rhs.mv32; }
+
+  union {
+    // Motion vectors will always fit in int16_t and using int16_t here
+    // instead of int saves significant memory since some of the frame sized
+    // structures store motion vectors.
+    int16_t mv[2];
+    // A uint32_t view into the |mv| array. Useful for cases where both of the
+    // motion vector components have to be copied or compared with a single
+    // 32 bit instruction.
+    uint32_t mv32;
+  };
+};
+
+union CompoundMotionVector {
+  CompoundMotionVector() = default;
+  CompoundMotionVector(const CompoundMotionVector& mv) = default;
+
+  CompoundMotionVector& operator=(const CompoundMotionVector& rhs) {
+    mv64 = rhs.mv64;
+    return *this;
+  }
+
+  bool operator==(const CompoundMotionVector& rhs) const {
+    return mv64 == rhs.mv64;
+  }
+
+  MotionVector mv[2];
+  // A uint64_t view into the |mv| array. Useful for cases where all the
+  // motion vectors have to be copied or compared with a single 64 bit
+  // instruction.
+  uint64_t mv64;
+};
+
+// Stores the motion information used for motion field estimation.
+struct TemporalMotionField : public Allocable {
+  Array2D<MotionVector> mv;
+  Array2D<int8_t> reference_offset;
+};
+
+// MvContexts contains the contexts used to decode portions of an inter block
+// mode info to set the y_mode field in BlockParameters.
+//
+// The contexts in the struct correspond to the ZeroMvContext, RefMvContext,
+// and NewMvContext variables in the spec.
+struct MvContexts {
+  int zero_mv;
+  int reference_mv;
+  int new_mv;
+};
+
+struct PaletteModeInfo {
+  uint8_t size[kNumPlaneTypes];
+  uint16_t color[kMaxPlanes][kMaxPaletteSize];
+};
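The mv32 and mv64 overlays above exist so that a motion vector pair (or a
compound pair) can be copied or compared in a single instruction. A small
sketch of the intended access pattern (illustrative only; SameMotion and
ClearMotion are placeholder names):

    // Compares the row and column components with one 32-bit comparison.
    bool SameMotion(const libgav1::MotionVector& a,
                    const libgav1::MotionVector& b) {
      return a.mv32 == b.mv32;  // Same result as comparing mv[0] and mv[1].
    }

    // Zeroes both components with one 32-bit store.
    void ClearMotion(libgav1::MotionVector* mv) { mv->mv32 = 0; }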
+// Stores the parameters used by the prediction process. The members of the
+// struct are filled in when parsing the bitstream and used when the
+// prediction is computed. The information in this struct is associated with
+// a single block.
+// Both BlockParameters and PredictionParameters store information pertaining
+// to a block, but BlockParameters outlives the block itself: for example,
+// some of the variables in BlockParameters are used to compute the context
+// for reading elements in the subsequent blocks.
+struct PredictionParameters : public Allocable {
+  // Restores the index in the unsorted mv stack from the least significant 3
+  // bits of the sorted |weight_index_stack|.
+  const MotionVector& reference_mv(int stack_index) const {
+    return ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)];
+  }
+  const MotionVector& reference_mv(int stack_index, int mv_index) const {
+    return compound_ref_mv_stack[7 - (weight_index_stack[stack_index] & 7)]
+        .mv[mv_index];
+  }
+
+  void IncreaseWeight(ptrdiff_t index, int weight) {
+    weight_index_stack[index] += weight << 3;
+  }
+
+  void SetWeightIndexStackEntry(int index, int weight) {
+    weight_index_stack[index] = (weight << 3) + 7 - index;
+  }
+
+  bool use_filter_intra;
+  FilterIntraPredictor filter_intra_mode;
+  int angle_delta[kNumPlaneTypes];
+  int8_t cfl_alpha_u;
+  int8_t cfl_alpha_v;
+  int max_luma_width;
+  int max_luma_height;
+  Array2D<uint8_t> color_index_map[kNumPlaneTypes];
+  bool use_intra_block_copy;
+  InterIntraMode inter_intra_mode;
+  bool is_wedge_inter_intra;
+  int wedge_index;
+  int wedge_sign;
+  bool mask_is_inverse;
+  MotionMode motion_mode;
+  CompoundPredictionType compound_prediction_type;
+  union {
+    // |ref_mv_stack| and |compound_ref_mv_stack| are not sorted after
+    // construction. reference_mv() must be called to get the correct element.
+    MotionVector ref_mv_stack[kMaxRefMvStackSize];
+    CompoundMotionVector compound_ref_mv_stack[kMaxRefMvStackSize];
+  };
+  // The least significant 3 bits of |weight_index_stack| store the index
+  // information, and the other bits store the weight. The index information
+  // is actually 7 - index to make the descending order sort stable (it
+  // preserves the original order for elements with the same weight). Sorting
+  // an int16_t array is much faster than sorting a struct array with weight
+  // and index stored separately.
+  int16_t weight_index_stack[kMaxRefMvStackSize];
+  // In the spec, the weights of all the nearest mvs are incremented by a
+  // bonus weight which is larger than any natural weight, and later the
+  // weights of the mvs are compared with this bonus weight to determine
+  // their contexts. We replace this procedure by introducing
+  // |nearest_mv_count|, which records the count of the nearest mvs. Since
+  // all the nearest mvs are at the beginning of the mv stack, the index of a
+  // mv in the mv stack can be compared with |nearest_mv_count| to get that
+  // mv's context.
+  int nearest_mv_count;
+  int ref_mv_count;
+  int ref_mv_index;
+  MotionVector global_mv[2];
+  int num_warp_samples;
+  int warp_estimate_candidates[kMaxLeastSquaresSamples][4];
+};
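The packing used by reference_mv(), IncreaseWeight(), and
SetWeightIndexStackEntry() can be verified with a short worked example
(illustrative only): for stack slot 2 with weight 5, the packed entry is
(5 << 3) + (7 - 2) = 45.

    #include <cstdint>

    constexpr int16_t entry = (5 << 3) + 7 - 2;  // == 45, as stored by
                                                 // SetWeightIndexStackEntry().
    static_assert(entry >> 3 == 5, "weight lives in the high bits");
    static_assert(7 - (entry & 7) == 2, "unsorted index lives in the low bits");
    // IncreaseWeight() adds weight << 3, which leaves the low 3 bits intact.
    static_assert(((entry + (4 << 3)) & 7) == (entry & 7), "");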
+// A lot of BlockParameters objects are created, so the smallest type is used
+// for each field. The ranges of some fields are documented to justify why
+// their types are large enough.
+struct BlockParameters : public Allocable {
+  BlockSize size;
+  bool skip;
+  // True means that this block will use some default settings (that
+  // correspond to compound prediction) and so most of the mode info is
+  // skipped. False means that the mode info is not skipped.
+  bool skip_mode;
+  bool is_inter;
+  bool is_explicit_compound_type;  // comp_group_idx in the spec.
+  bool is_compound_type_average;   // compound_idx in the spec.
+  bool is_global_mv_block;
+  bool use_predicted_segment_id;  // only valid with temporal update enabled.
+  int8_t segment_id;              // segment_id is in the range [0, 7].
+  PredictionMode y_mode;
+  PredictionMode uv_mode;
+  TransformSize transform_size;
+  TransformSize uv_transform_size;
+  InterpolationFilter interpolation_filter[2];
+  ReferenceFrameType reference_frame[2];
+  // The index of this array is as follows:
+  //  0 - Y plane vertical filtering.
+  //  1 - Y plane horizontal filtering.
+  //  2 - U plane (both directions).
+  //  3 - V plane (both directions).
+  uint8_t deblock_filter_level[kFrameLfCount];
+  CompoundMotionVector mv;
+  PaletteModeInfo palette_mode_info;
+  // When |Tile::split_parse_and_decode_| is true, each block gets its own
+  // instance of |prediction_parameters|. When it is false, all the blocks
+  // point to |Tile::prediction_parameters_|. This field is valid only as
+  // long as the block is *being* decoded. The lifetime and usage of this
+  // field can be better understood by following its flow in tile.cc.
+  std::unique_ptr<PredictionParameters> prediction_parameters;
+};
+
+// A five dimensional array used to store the wedge masks. The dimensions are:
+//  - block_size_index (returned by GetWedgeBlockSizeIndex() in prediction.cc).
+//  - flip_sign (0 or 1).
+//  - wedge_index (0 to 15).
+//  - and, for each combination of the above, a 2d array of block_width by
+//    block_height mask samples.
+using WedgeMaskArray =
+    std::array<std::array<std::array<Array2D<uint8_t>, 16>, 2>, 9>;
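A sketch of how the five dimensions compose when reading one mask sample
(illustrative only; GetWedgeMaskSample is a placeholder name, and the code
assumes Array2D's row-pointer operator[] from array_2d.h):

    #include <cstdint>

    uint8_t GetWedgeMaskSample(const libgav1::WedgeMaskArray& masks,
                               int block_size_index, int flip_sign,
                               int wedge_index, int y, int x) {
      // std::array -> std::array -> std::array -> Array2D -> row -> sample.
      return masks[block_size_index][flip_sign][wedge_index][y][x];
    }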
+enum GlobalMotionTransformationType : uint8_t {
+  kGlobalMotionTransformationTypeIdentity,
+  kGlobalMotionTransformationTypeTranslation,
+  kGlobalMotionTransformationTypeRotZoom,
+  kGlobalMotionTransformationTypeAffine,
+  kNumGlobalMotionTransformationTypes
+};
+
+// Global motion and warped motion parameters. See the paper for more info:
+// S. Parker, Y. Chen, D. Barker, P. de Rivaz, D. Mukherjee, "Global and
+// locally adaptive warped motion compensation in video compression", Proc.
+// IEEE International Conference on Image Processing (ICIP), pp. 275-279,
+// Sep. 2017.
+struct GlobalMotion {
+  GlobalMotionTransformationType type;
+  int32_t params[6];
+
+  // Represent two shearing operations. Computed from |params| by
+  // SetupShear().
+  //
+  // The least significant six (= kWarpParamRoundingBits) bits are all zeros.
+  // (This means alpha, beta, gamma, and delta could be represented by a
+  // 10-bit signed integer.) The minimum value is INT16_MIN (= -32768) and
+  // the maximum value is 32704 = 0x7fc0, the largest int16_t value whose
+  // least significant six bits are all zeros.
+  //
+  // Valid warp parameters (as validated by SetupShear()) have smaller
+  // ranges. Their absolute values are less than 2^14 (= 16384). (This
+  // follows from the warpValid check at the end of Section 7.11.3.6.)
+  //
+  // NOTE: Section 7.11.3.6 of the spec allows a maximum value of 32768,
+  // which is outside the range of int16_t. When cast to int16_t, 32768
+  // becomes -32768. This potential int16_t overflow does not matter because
+  // either 32768 or -32768 causes SetupShear() to return false.
+  int16_t alpha;
+  int16_t beta;
+  int16_t gamma;
+  int16_t delta;
+};
+
+// Loop filter parameters:
+//
+// If level[0] and level[1] are both equal to 0, the loop filter process is
+// not invoked.
+//
+// |sharpness| and |delta_enabled| are only used by the loop filter process.
+//
+// The |ref_deltas| and |mode_deltas| arrays are used not only by the loop
+// filter process but also by the reference frame update and loading
+// processes. The loop filter process uses |ref_deltas| and |mode_deltas|
+// only when |delta_enabled| is true.
+struct LoopFilter {
+  // Contains loop filter strength values in the range of [0, 63].
+  std::array<int8_t, kFrameLfCount> level;
+  // Indicates the sharpness level in the range of [0, 7].
+  int8_t sharpness;
+  // Whether the filter level depends on the mode and reference frame used to
+  // predict a block.
+  bool delta_enabled;
+  // Whether additional syntax elements were read that specify which mode and
+  // reference frame deltas are to be updated. loop_filter_delta_update field
+  // in Section 5.9.11 of the spec.
+  bool delta_update;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // reference frame, in the range of [-64, 63].
+  std::array<int8_t, kNumReferenceFrameTypes> ref_deltas;
+  // Contains the adjustment needed for the filter level based on the chosen
+  // mode, in the range of [-64, 63].
+  std::array<int8_t, kLoopFilterMaxModeDeltas> mode_deltas;
+};
+
+struct Delta {
+  bool present;
+  uint8_t scale;
+  bool multi;
+};
+
+struct Cdef {
+  uint8_t damping;  // damping value from the spec + (bitdepth - 8).
+  uint8_t bits;
+  // All the strength values are the values from the spec and left shifted by
+  // (bitdepth - 8).
+  uint8_t y_primary_strength[kMaxCdefStrengths];
+  uint8_t y_secondary_strength[kMaxCdefStrengths];
+  uint8_t uv_primary_strength[kMaxCdefStrengths];
+  uint8_t uv_secondary_strength[kMaxCdefStrengths];
+};
+
+struct TileInfo {
+  bool uniform_spacing;
+  int sb_rows;
+  int sb_columns;
+  int tile_count;
+  int tile_columns_log2;
+  int tile_columns;
+  int tile_column_start[kMaxTileColumns + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_column_width_in_superblocks[kMaxTileColumns + 1];
+  int tile_rows_log2;
+  int tile_rows;
+  int tile_row_start[kMaxTileRows + 1];
+  // This field is not used by libgav1, but is populated for use by some
+  // hardware decoders. So it must not be removed.
+  int tile_row_height_in_superblocks[kMaxTileRows + 1];
+  int16_t context_update_id;
+  uint8_t tile_size_bytes;
+};
+
+struct LoopRestoration {
+  LoopRestorationType type[kMaxPlanes];
+  int unit_size_log2[kMaxPlanes];
+};
+
+// Stores the quantization parameters of Section 5.9.12.
+struct QuantizerParameters {
+  // base_index is in the range [0, 255].
+  uint8_t base_index;
+  int8_t delta_dc[kMaxPlanes];
+  // delta_ac[kPlaneY] is always 0.
+  int8_t delta_ac[kMaxPlanes];
+  bool use_matrix;
+  // The |matrix_level| array is used only when |use_matrix| is true.
+  // matrix_level[plane] specifies the level in the quantizer matrix that
+  // should be used for decoding |plane|. The quantizer matrix has 15 levels,
+  // from 0 to 14. The range of matrix_level[plane] is [0, 15]. If
+  // matrix_level[plane] is 15, the quantizer matrix is not used.
+  int8_t matrix_level[kMaxPlanes];
+};
+
+// The corresponding segment feature constants in the AV1 spec are named
+// SEG_LVL_xxx.
+enum SegmentFeature : uint8_t {
+  kSegmentFeatureQuantizer,
+  kSegmentFeatureLoopFilterYVertical,
+  kSegmentFeatureLoopFilterYHorizontal,
+  kSegmentFeatureLoopFilterU,
+  kSegmentFeatureLoopFilterV,
+  kSegmentFeatureReferenceFrame,
+  kSegmentFeatureSkip,
+  kSegmentFeatureGlobalMv,
+  kSegmentFeatureMax
+};
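Segmentation::FeatureSigned() (just below) relies on the ordering of this
enum: the quantizer and the four loop filter deltas, which are signed, occupy
the first five slots. Restated as compile-time checks (illustrative only):

    static_assert(libgav1::kSegmentFeatureQuantizer == 0, "");
    static_assert(libgav1::kSegmentFeatureLoopFilterV == 4,
                  "the five signed features occupy indices 0 through 4");
    static_assert(libgav1::kSegmentFeatureReferenceFrame == 5,
                  "the unsigned features start here");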
+struct Segmentation {
+  // 5.11.14.
+  // Returns true if the feature is enabled in the segment.
+  bool FeatureActive(int segment_id, SegmentFeature feature) const {
+    return enabled && segment_id < kMaxSegments &&
+           feature_enabled[segment_id][feature];
+  }
+
+  // Returns true if the feature is signed.
+  static bool FeatureSigned(SegmentFeature feature) {
+    // Only the first five segment features are signed, so this comparison
+    // suffices.
+    return feature <= kSegmentFeatureLoopFilterV;
+  }
+
+  bool enabled;
+  bool update_map;
+  bool update_data;
+  bool temporal_update;
+  // True if the segment id will be read before the skip syntax element.
+  // False if the skip syntax element will be read first.
+  bool segment_id_pre_skip;
+  // The highest numbered segment id that has some enabled feature. Used as
+  // the upper bound for decoding segment ids.
+  int8_t last_active_segment_id;
+
+  bool feature_enabled[kMaxSegments][kSegmentFeatureMax];
+  int16_t feature_data[kMaxSegments][kSegmentFeatureMax];
+  bool lossless[kMaxSegments];
+  // Cached values of get_qindex(1, segmentId), to be consumed by
+  // Tile::ReadTransformType(). The values are in the range [0, 255].
+  uint8_t qindex[kMaxSegments];
+};
+
+// Section 6.8.20.
+// Note: In the spec, the film grain section uses YCbCr to denote variable
+// names, such as num_cb_points and num_cr_points. To keep it consistent with
+// other parts of the code, we use YUV, i.e., num_u_points, num_v_points, etc.
+struct FilmGrainParams {
+  bool apply_grain;
+  bool update_grain;
+  bool chroma_scaling_from_luma;
+  bool overlap_flag;
+  bool clip_to_restricted_range;
+
+  uint8_t num_y_points;  // [0, 14].
+  uint8_t num_u_points;  // [0, 10].
+  uint8_t num_v_points;  // [0, 10].
+  // The values must be in [0, 255] (for 10/12 bit, the source values are
+  // divided by 4 or 16) and in increasing order.
+  uint8_t point_y_value[14];
+  uint8_t point_y_scaling[14];
+  uint8_t point_u_value[10];
+  uint8_t point_u_scaling[10];
+  uint8_t point_v_value[10];
+  uint8_t point_v_scaling[10];
+
+  uint8_t chroma_scaling;              // [8, 11].
+  uint8_t auto_regression_coeff_lag;   // [0, 3].
+  int8_t auto_regression_coeff_y[24];  // [-128, 127]
+  int8_t auto_regression_coeff_u[25];  // [-128, 127]
+  int8_t auto_regression_coeff_v[25];  // [-128, 127]
+  // Shift value: the auto regression coeffs cover the range
+  //  6: [-2, 2)
+  //  7: [-1, 1)
+  //  8: [-0.5, 0.5)
+  //  9: [-0.25, 0.25)
+  uint8_t auto_regression_shift;
+
+  uint16_t grain_seed;
+  int reference_index;
+  int grain_scale_shift;
+  // These multipliers are encoded as nonnegative values by adding 128 first.
+  // The 128 is subtracted during parsing.
+  int8_t u_multiplier;       // [-128, 127]
+  int8_t u_luma_multiplier;  // [-128, 127]
+  // These offsets are encoded as nonnegative values by adding 256 first. The
+  // 256 is subtracted during parsing.
+  int16_t u_offset;          // [-256, 255]
+  int8_t v_multiplier;       // [-128, 127]
+  int8_t v_luma_multiplier;  // [-128, 127]
+  int16_t v_offset;          // [-256, 255]
+};
+
+struct ObuFrameHeader {
+  uint16_t display_frame_id;
+  uint16_t current_frame_id;
+  int64_t frame_offset;
+  uint16_t expected_frame_id[kNumInterReferenceFrameTypes];
+  int32_t width;
+  int32_t height;
+  int32_t columns4x4;
+  int32_t rows4x4;
+  // The render size (render_width and render_height) is a hint to the
+  // application about the desired display size. It has no effect on the
+  // decoding process.
+  int32_t render_width;
+  int32_t render_height;
+  int32_t upscaled_width;
+  LoopRestoration loop_restoration;
+  uint32_t buffer_removal_time[kMaxOperatingPoints];
+  uint32_t frame_presentation_time;
+  // Note: global_motion[0] (for kReferenceFrameIntra) is not used.
+  std::array<GlobalMotion, kNumReferenceFrameTypes> global_motion;
+  TileInfo tile_info;
+  QuantizerParameters quantizer;
+  Segmentation segmentation;
+  bool show_existing_frame;
+  // frame_to_show is in the range [0, 7]. Only used if show_existing_frame
+  // is true.
+  int8_t frame_to_show;
+  FrameType frame_type;
+  bool show_frame;
+  bool showable_frame;
+  bool error_resilient_mode;
+  bool enable_cdf_update;
+  bool frame_size_override_flag;
+  // The order_hint syntax element in the uncompressed header. If
+  // show_existing_frame is false, the OrderHint variable in the spec is
+  // equal to this field, and so this field can be used in place of OrderHint
+  // when show_existing_frame is known to be false, such as during tile
+  // decoding.
+  uint8_t order_hint;
+  int8_t primary_reference_frame;
+  bool render_and_frame_size_different;
+  bool use_superres;
+  uint8_t superres_scale_denominator;
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool frame_refs_short_signaling;
+  // A bitmask that specifies which reference frame slots will be updated
+  // with the current frame after it is decoded.
+  uint8_t refresh_frame_flags;
+  static_assert(sizeof(ObuFrameHeader::refresh_frame_flags) * 8 ==
+                    kNumReferenceFrameTypes,
+                "");
+  bool found_reference;
+  int8_t force_integer_mv;
+  bool allow_high_precision_mv;
+  InterpolationFilter interpolation_filter;
+  bool is_motion_mode_switchable;
+  bool use_ref_frame_mvs;
+  bool enable_frame_end_update_cdf;
+  // True if all segments are losslessly encoded at the coded resolution.
+  bool coded_lossless;
+  // True if all segments are losslessly encoded at the upscaled resolution.
+  bool upscaled_lossless;
+  TxMode tx_mode;
+  // True means that the mode info for inter blocks contains the syntax
+  // element comp_mode that indicates whether to use single or compound
+  // prediction. False means that all inter blocks will use single prediction.
+  bool reference_mode_select;
+  // The frames to use for compound prediction when skip_mode is true.
+  ReferenceFrameType skip_mode_frame[2];
+  bool skip_mode_present;
+  bool reduced_tx_set;
+  bool allow_warped_motion;
+  Delta delta_q;
+  Delta delta_lf;
+  // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
+  // indicates an invalid value.
+  int8_t reference_frame_index[kNumInterReferenceFrameTypes];
+  // The ref_order_hint[ i ] syntax element in the uncompressed header.
+  // Specifies the expected output order hint for each reference frame.
+  uint8_t reference_order_hint[kNumReferenceFrameTypes];
+  LoopFilter loop_filter;
+  Cdef cdef;
+  FilmGrainParams film_grain_params;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_TYPES_H_
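Bit i of |refresh_frame_flags| corresponds to reference frame slot i, and the
static_assert above pins the bitmask width to the number of slots. A sketch of
how a caller might walk the mask (illustrative only; ForEachRefreshedSlot is a
placeholder name):

    void ForEachRefreshedSlot(const libgav1::ObuFrameHeader& frame_header) {
      for (int i = 0; i < libgav1::kNumReferenceFrameTypes; ++i) {
        if ((frame_header.refresh_frame_flags & (1u << i)) != 0) {
          // Reference frame slot |i| will be overwritten with the current
          // frame after it is decoded.
        }
      }
    }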
diff --git a/src/utils/unbounded_queue.h b/src/utils/unbounded_queue.h
new file mode 100644
index 0000000..fa0d303
--- /dev/null
+++ b/src/utils/unbounded_queue.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+#define LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+
+// A FIFO queue with unbounded capacity.
+//
+// This implementation uses the general approach used in std::deque
+// implementations. See, for example,
+// https://stackoverflow.com/questions/6292332/what-really-is-a-deque-in-stl
+//
+// It is much simpler because it just needs to support the queue interface.
+// The blocks are chained into a circular list, not managed by a "map". It
+// does not shrink the internal buffer.
+//
+// An alternative implementation approach is a resizable circular array. See,
+// for example, ResizingArrayQueue.java in https://algs4.cs.princeton.edu/code/
+// and base::circular_deque in Chromium's base/containers library.
+template <typename T>
+class UnboundedQueue {
+ public:
+  UnboundedQueue() = default;
+
+  // Move only.
+  UnboundedQueue(UnboundedQueue&& other)
+      : first_block_(other.first_block_),
+        front_(other.front_),
+        last_block_(other.last_block_),
+        back_(other.back_) {
+    other.first_block_ = nullptr;
+    other.front_ = 0;
+    other.last_block_ = nullptr;
+    other.back_ = 0;
+  }
+  UnboundedQueue& operator=(UnboundedQueue&& other) {
+    if (this != &other) {
+      Destroy();
+      first_block_ = other.first_block_;
+      front_ = other.front_;
+      last_block_ = other.last_block_;
+      back_ = other.back_;
+      other.first_block_ = nullptr;
+      other.front_ = 0;
+      other.last_block_ = nullptr;
+      other.back_ = 0;
+    }
+    return *this;
+  }
+
+  ~UnboundedQueue() { Destroy(); }
+
+  // Allocates two Blocks upfront because most access patterns require at
+  // least two Blocks. Returns false if the allocation of the Blocks failed.
+  LIBGAV1_MUST_USE_RESULT bool Init() {
+    std::unique_ptr<Block> new_block0(new (std::nothrow) Block);
+    std::unique_ptr<Block> new_block1(new (std::nothrow) Block);
+    if (new_block0 == nullptr || new_block1 == nullptr) return false;
+    first_block_ = last_block_ = new_block0.release();
+    new_block1->next = first_block_;
+    last_block_->next = new_block1.release();
+    return true;
+  }
+
+  // Checks if the queue has room for a new element. If the queue is full,
+  // tries to grow it. Returns false if the queue is full and the attempt to
+  // grow it failed.
+  //
+  // NOTE: GrowIfNeeded() must be called before each call to Push(). This
+  // inconvenient design is necessary to guarantee a successful Push() call.
+  //
+  // Push(T&& value) is often called with the argument std::move(value). The
+  // moved-from object |value| won't be usable afterwards, so it would be
+  // problematic if Push(T&& value) failed and we lost access to the original
+  // |value| object.
+  LIBGAV1_MUST_USE_RESULT bool GrowIfNeeded() {
+    assert(last_block_ != nullptr);
+    if (back_ == kBlockCapacity) {
+      if (last_block_->next == first_block_) {
+        // All Blocks are in use.
+        std::unique_ptr<Block> new_block(new (std::nothrow) Block);
+        if (new_block == nullptr) return false;
+        new_block->next = first_block_;
+        last_block_->next = new_block.release();
+      }
+      last_block_ = last_block_->next;
+      back_ = 0;
+    }
+    return true;
+  }
+
+  // Pushes the element |value| to the end of the queue. It is an error to
+  // call Push() when the queue is full.
+  void Push(const T& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* const elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(value);
+  }
+
+  void Push(T&& value) {
+    assert(last_block_ != nullptr);
+    assert(back_ < kBlockCapacity);
+    T* const elements = reinterpret_cast<T*>(last_block_->buffer);
+    new (&elements[back_++]) T(std::move(value));
+  }
+
+  // Returns the element at the front of the queue. It is an error to call
+  // Front() when the queue is empty.
+  T& Front() {
+    assert(!Empty());
+    T* const elements = reinterpret_cast<T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  const T& Front() const {
+    assert(!Empty());
+    const T* const elements = reinterpret_cast<const T*>(first_block_->buffer);
+    return elements[front_];
+  }
+
+  // Removes the element at the front of the queue from the queue. It is an
+  // error to call Pop() when the queue is empty.
+  void Pop() {
+    assert(!Empty());
+    T* const elements = reinterpret_cast<T*>(first_block_->buffer);
+    elements[front_++].~T();
+    if (front_ == kBlockCapacity) {
+      // The first block has become empty.
+      front_ = 0;
+      if (first_block_ == last_block_) {
+        // Only one Block is in use. Simply reset back_.
+        back_ = 0;
+      } else {
+        first_block_ = first_block_->next;
+      }
+    }
+  }
+
+  // Returns true if the queue is empty.
+  bool Empty() const { return first_block_ == last_block_ && front_ == back_; }
+
+ private:
+  // kBlockCapacity is the maximum number of elements each Block can hold.
+  // sizeof(void*) is subtracted from 2048 to account for the |next| pointer
+  // in the Block struct.
+  //
+  // On Linux x86_64, sizeof(std::function<void()>) is 32, so each Block can
+  // hold 63 std::function<void()> objects.
+  //
+  // NOTE: The corresponding value in libc++ revision
+  // 245b5ba3448b9d3f6de5962066557e253a6bc9a4 is:
+  //   template <class _ValueType, class _DiffType>
+  //   struct __deque_block_size {
+  //     static const _DiffType value =
+  //         sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16;
+  //   };
+  //
+  // Note that 4096 / 256 = 16, so apparently this expression is intended to
+  // ensure the block size is at least 4096 bytes and each block can hold at
+  // least 16 elements.
+  static constexpr size_t kBlockCapacity =
+      (sizeof(T) < 128) ? (2048 - sizeof(void*)) / sizeof(T) : 16;
+
+  struct Block : public Allocable {
+    alignas(T) char buffer[kBlockCapacity * sizeof(T)];
+    Block* next;
+  };
+
+  void Destroy() {
+    if (first_block_ == nullptr) return;  // An uninitialized queue.
+
+    // First free the unused blocks, which are located after |last_block_|
+    // and before |first_block_|.
+    Block* block = last_block_->next;
+    // Cut the circular list open after last_block_.
+    last_block_->next = nullptr;
+    while (block != first_block_) {
+      Block* const next = block->next;
+      delete block;
+      block = next;
+    }
+
+    // Then free the used blocks, destructing the elements they hold.
+    while (block != nullptr) {
+      const size_t begin = (block == first_block_) ? front_ : 0;
+      const size_t end = (block == last_block_) ? back_ : kBlockCapacity;
+      T* const elements = reinterpret_cast<T*>(block->buffer);
+      for (size_t i = begin; i < end; ++i) {
+        elements[i].~T();
+      }
+      Block* const next = block->next;
+      delete block;
+      block = next;
+    }
+  }
+
+  // Blocks are chained in a circular singly-linked list. If the list of
+  // Blocks is empty, both first_block_ and last_block_ are null pointers. If
+  // the list is nonempty, first_block_ points to the first used Block and
+  // last_block_ points to the last used Block.
+  //
+  // Invariant: If Init() is called and succeeds, the list of Blocks is
+  // always nonempty. This allows all methods (except the destructor) to
+  // avoid null pointer checks for first_block_ and last_block_.
+  Block* first_block_ = nullptr;
+  // The index of the element in first_block_ to be removed by Pop().
+  size_t front_ = 0;
+  Block* last_block_ = nullptr;
+  // The index in last_block_ where the new element is inserted by Push().
+  size_t back_ = 0;
+};
+
+#if !LIBGAV1_CXX17
+template <typename T>
+constexpr size_t UnboundedQueue<T>::kBlockCapacity;
+#endif
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_UNBOUNDED_QUEUE_H_
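As the comments above spell out, GrowIfNeeded() must succeed before every
Push(). A minimal sketch of that calling pattern (illustrative only;
FillAndDrain and the int payload are placeholders):

    #include "src/utils/unbounded_queue.h"

    bool FillAndDrain() {
      libgav1::UnboundedQueue<int> q;
      if (!q.Init()) return false;  // Allocates the first two Blocks.
      for (int i = 0; i < 1000; ++i) {
        if (!q.GrowIfNeeded()) return false;  // Must precede every Push().
        q.Push(i);                            // Now guaranteed to succeed.
      }
      while (!q.Empty()) {
        const int value = q.Front();  // Front() then Pop(), as with std::queue.
        static_cast<void>(value);
        q.Pop();
      }
      return true;
    }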
diff --git a/src/utils/vector.h b/src/utils/vector.h
new file mode 100644
index 0000000..e211240
--- /dev/null
+++ b/src/utils/vector.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// libgav1::Vector implementation
+
+#ifndef LIBGAV1_SRC_UTILS_VECTOR_H_
+#define LIBGAV1_SRC_UTILS_VECTOR_H_
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#include "src/utils/compiler_attributes.h"
+
+namespace libgav1 {
+namespace internal {
+
+static constexpr size_t kMinVectorAllocation = 16;
+
+// Returns the smallest power of two greater or equal to 'value'.
+inline size_t NextPow2(size_t value) {
+  if (value == 0) return 0;
+  --value;
+  // Smear the highest set bit into all lower bit positions, then add 1.
+  for (size_t i = 1; i < sizeof(size_t) * 8; i *= 2) value |= value >> i;
+  return value + 1;
+}
+
+// Returns the smallest capacity greater or equal to 'value'.
+inline size_t NextCapacity(size_t value) {
+  if (value == 0) return 0;
+  if (value <= kMinVectorAllocation) return kMinVectorAllocation;
+  return NextPow2(value);
+}
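A worked example of the bit-smearing loop in NextPow2() (illustrative only):
for value = 300, the decrement gives 299, the OR/shift loop propagates the
top set bit into every lower position (yielding 511), and adding 1 produces
512.

    #include <cassert>

    void NextPow2Examples() {
      assert(libgav1::internal::NextPow2(300) == 512);
      assert(libgav1::internal::NextPow2(512) == 512);   // Already a power of 2.
      assert(libgav1::internal::NextCapacity(5) == 16);  // kMinVectorAllocation.
      assert(libgav1::internal::NextCapacity(17) == 32);
    }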
+//------------------------------------------------------------------------------
+// Data structure equivalent to std::vector, but which returns false (and
+// reverts to its last valid state) on memory allocation failure.
+// std::vector with a custom allocator does not fill this need without
+// exceptions.
+
+template <typename T>
+class VectorBase {
+ public:
+  using iterator = T*;
+  using const_iterator = const T*;
+
+  VectorBase() noexcept = default;
+  // Move only.
+  VectorBase(const VectorBase&) = delete;
+  VectorBase& operator=(const VectorBase&) = delete;
+  VectorBase(VectorBase&& other) noexcept
+      : items_(other.items_),
+        capacity_(other.capacity_),
+        num_items_(other.num_items_) {
+    other.items_ = nullptr;
+    other.capacity_ = 0;
+    other.num_items_ = 0;
+  }
+  VectorBase& operator=(VectorBase&& other) noexcept {
+    if (this != &other) {
+      clear();
+      free(items_);
+      items_ = other.items_;
+      capacity_ = other.capacity_;
+      num_items_ = other.num_items_;
+      other.items_ = nullptr;
+      other.capacity_ = 0;
+      other.num_items_ = 0;
+    }
+    return *this;
+  }
+  ~VectorBase() {
+    clear();
+    free(items_);
+  }
+
+  // Reallocates just enough memory if needed so that 'new_cap' items can fit.
+  LIBGAV1_MUST_USE_RESULT bool reserve(size_t new_cap) {
+    if (capacity_ < new_cap) {
+      T* const new_items = static_cast<T*>(malloc(new_cap * sizeof(T)));
+      if (new_items == nullptr) return false;
+      if (num_items_ > 0) {
+        if (std::is_trivial<T>::value) {
+          // Cast |new_items| and |items_| to void* to avoid the GCC
+          // -Wclass-memaccess warning and additionally the
+          // bugprone-undefined-memory-manipulation clang-tidy warning. The
+          // memcpy is safe because T is a trivial type.
+          memcpy(static_cast<void*>(new_items),
+                 static_cast<void*>(items_), num_items_ * sizeof(T));
+        } else {
+          for (size_t i = 0; i < num_items_; ++i) {
+            new (&new_items[i]) T(std::move(items_[i]));
+            items_[i].~T();
+          }
+        }
+      }
+      free(items_);
+      items_ = new_items;
+      capacity_ = new_cap;
+    }
+    return true;
+  }
+
+  // Reallocates less memory so that only the existing items can fit.
+  bool shrink_to_fit() {
+    if (capacity_ == num_items_) return true;
+    if (num_items_ == 0) {
+      free(items_);
+      items_ = nullptr;
+      capacity_ = 0;
+      return true;
+    }
+    const size_t previous_capacity = capacity_;
+    capacity_ = 0;  // Force reserve() to allocate and copy.
+    if (reserve(num_items_)) return true;
+    capacity_ = previous_capacity;
+    return false;
+  }
+
+  // Constructs a new item by copy constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(const T& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by copy constructor. reserve() must have been
+  // called with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(const T& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(value);
+    ++num_items_;
+  }
+
+  // Constructs a new item by move constructor. May reallocate if
+  // 'resize_if_needed'.
+  LIBGAV1_MUST_USE_RESULT bool push_back(T&& value,
+                                         bool resize_if_needed = true) {
+    if (num_items_ >= capacity_ &&
+        (!resize_if_needed ||
+         !reserve(internal::NextCapacity(num_items_ + 1)))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+    return true;
+  }
+
+  // Constructs a new item by move constructor. reserve() must have been
+  // called with a sufficient capacity.
+  //
+  // WARNING: No error checking is performed.
+  void push_back_unchecked(T&& value) {
+    assert(num_items_ < capacity_);
+    new (&items_[num_items_]) T(std::move(value));
+    ++num_items_;
+  }
+
+  // Constructs a new item in place by forwarding the arguments args... to
+  // the constructor. May reallocate.
+  template <typename... Args>
+  LIBGAV1_MUST_USE_RESULT bool emplace_back(Args&&... args) {
+    if (num_items_ >= capacity_ &&
+        !reserve(internal::NextCapacity(num_items_ + 1))) {
+      return false;
+    }
+    new (&items_[num_items_]) T(std::forward<Args>(args)...);
+    ++num_items_;
+    return true;
+  }
+
+  // Destructs the last item.
+  void pop_back() {
+    --num_items_;
+    items_[num_items_].~T();
+  }
+
+  // Destructs the item at 'pos'.
+  void erase(iterator pos) { erase(pos, pos + 1); }
+  // Destructs the items in [first,last).
+  void erase(iterator first, iterator last) {
+    for (iterator it = first; it != last; ++it) it->~T();
+    if (last != end()) {
+      if (std::is_trivial<T>::value) {
+        // Cast |first| and |last| to void* to avoid the GCC
+        // -Wclass-memaccess warning and additionally the
+        // bugprone-undefined-memory-manipulation clang-tidy warning. The
+        // memmove is safe because T is a trivial type.
+        memmove(static_cast<void*>(first), static_cast<void*>(last),
+                (end() - last) * sizeof(T));
+      } else {
+        for (iterator it_src = last, it_dst = first; it_src != end();
+             ++it_src, ++it_dst) {
+          new (it_dst) T(std::move(*it_src));
+          it_src->~T();
+        }
+      }
+    }
+    num_items_ -= std::distance(first, last);
+  }
+
+  // Destructs all the items.
+  void clear() { erase(begin(), end()); }
+
+  // Destroys (including deallocating) all the items.
+  void reset() {
+    clear();
+    if (!shrink_to_fit()) assert(false);
+  }
+
+  // Accessors
+  bool empty() const { return (num_items_ == 0); }
+  size_t size() const { return num_items_; }
+  size_t capacity() const { return capacity_; }
+
+  T* data() { return items_; }
+  T& front() { return items_[0]; }
+  T& back() { return items_[num_items_ - 1]; }
+  T& operator[](size_t i) { return items_[i]; }
+  T& at(size_t i) { return items_[i]; }
+  const T* data() const { return items_; }
+  const T& front() const { return items_[0]; }
+  const T& back() const { return items_[num_items_ - 1]; }
+  const T& operator[](size_t i) const { return items_[i]; }
+  const T& at(size_t i) const { return items_[i]; }
+
+  iterator begin() { return &items_[0]; }
+  const_iterator begin() const { return &items_[0]; }
+  iterator end() { return &items_[num_items_]; }
+  const_iterator end() const { return &items_[num_items_]; }
+
+  void swap(VectorBase& b) {
+    // Although not necessary here, adding "using std::swap;" and then
+    // calling swap() without namespace qualification is recommended. See
+    // Effective C++, Item 25.
+    using std::swap;
+    swap(items_, b.items_);
+    swap(capacity_, b.capacity_);
+    swap(num_items_, b.num_items_);
+  }
+
+ protected:
+  T* items_ = nullptr;
+  size_t capacity_ = 0;
+  size_t num_items_ = 0;
+};
+
+}  // namespace internal
+
+//------------------------------------------------------------------------------
+
+// Vector class that does *NOT* construct the content on resize().
+// Should be reserved for plain-old-data types.
+template <typename T>
+class VectorNoCtor : public internal::VectorBase<T> {
+ public:
+  // Creates or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows every power-of-two items.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      super::num_items_ = new_num_items;
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
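Because every allocating method is marked LIBGAV1_MUST_USE_RESULT, callers
branch on the return value instead of relying on exceptions. A small usage
sketch (illustrative only; CollectSquares and |values| are placeholder names):

    #include "src/utils/vector.h"

    bool CollectSquares(int n, libgav1::Vector<int>* values) {
      if (!values->reserve(static_cast<size_t>(n))) return false;  // OOM.
      for (int i = 0; i < n; ++i) {
        // Capacity was reserved above, so the unchecked form is safe here.
        values->push_back_unchecked(i * i);
      }
      return true;
    }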
+// This generic vector class will call the constructors.
+template <typename T>
+class Vector : public internal::VectorBase<T> {
+ public:
+  // Constructs or destructs items so that 'new_num_items' exist.
+  // Allocated memory grows every power-of-two items.
+  LIBGAV1_MUST_USE_RESULT bool resize(size_t new_num_items) {
+    using super = internal::VectorBase<T>;
+    if (super::num_items_ < new_num_items) {
+      if (super::capacity_ < new_num_items) {
+        if (!super::reserve(internal::NextCapacity(new_num_items))) {
+          return false;
+        }
+      }
+      while (super::num_items_ < new_num_items) {
+        new (&super::items_[super::num_items_]) T();
+        ++super::num_items_;
+      }
+    } else {
+      while (super::num_items_ > new_num_items) {
+        --super::num_items_;
+        super::items_[super::num_items_].~T();
+      }
+    }
+    return true;
+  }
+};
+
+//------------------------------------------------------------------------------
+
+// Define non-member swap() functions in the namespace in which VectorNoCtor
+// and Vector are implemented. See Effective C++, Item 25.
+
+template <typename T>
+void swap(VectorNoCtor<T>& a, VectorNoCtor<T>& b) {
+  a.swap(b);
+}
+
+template <typename T>
+void swap(Vector<T>& a, Vector<T>& b) {
+  a.swap(b);
+}
+
+//------------------------------------------------------------------------------
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_UTILS_VECTOR_H_
-- 
cgit v1.2.3