diff options
Diffstat (limited to 'src/threading_strategy.cc')
-rw-r--r-- | src/threading_strategy.cc | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/src/threading_strategy.cc b/src/threading_strategy.cc new file mode 100644 index 0000000..cd4d576 --- /dev/null +++ b/src/threading_strategy.cc @@ -0,0 +1,222 @@ +// Copyright 2019 The libgav1 Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "src/threading_strategy.h" + +#include <algorithm> +#include <cassert> +#include <memory> + +#include "src/frame_scratch_buffer.h" +#include "src/utils/constants.h" +#include "src/utils/logging.h" +#include "src/utils/vector.h" + +namespace libgav1 { +namespace { + +#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER) +constexpr int kFrameParallelThresholdMultiplier = 3; +#else +constexpr int kFrameParallelThresholdMultiplier = + LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER; +#endif + +// Computes the number of frame threads to be used based on the following +// heuristic: +// * If |thread_count| == 1, return 0. +// * If |thread_count| <= |tile_count| * 4, return 0. +// * Otherwise, return the largest value of i which satisfies the following +// condition: i + i * tile_columns <= thread_count. This ensures that there +// are at least |tile_columns| worker threads for each frame thread. +// * This function will never return 1 or a value > |thread_count|. +// +// This heuristic is based empirical performance data. The in-frame threading +// model (combination of tile multithreading, superblock row multithreading and +// post filter multithreading) performs better than the frame parallel model +// until we reach the threshold of |thread_count| > |tile_count| * +// kFrameParallelThresholdMultiplier. +// +// It is a function of |tile_count| since tile threading and superblock row +// multithreading will scale only as a factor of |tile_count|. The threshold 4 +// is arrived at based on empirical data. The general idea is that superblock +// row multithreading plateaus at 4 * |tile_count| because in most practical +// cases there aren't more than that many superblock rows and columns available +// to work on in parallel. +int ComputeFrameThreadCount(int thread_count, int tile_count, + int tile_columns) { + assert(thread_count > 0); + if (thread_count == 1) return 0; + return (thread_count <= tile_count * kFrameParallelThresholdMultiplier) + ? 0 + : std::max(2, thread_count / (1 + tile_columns)); +} + +} // namespace + +bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header, + int thread_count) { + assert(thread_count > 0); + frame_parallel_ = false; + + if (thread_count == 1) { + thread_pool_.reset(nullptr); + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + return true; + } + + // We do work in the current thread, so it is sufficient to create + // |thread_count|-1 threads in the threadpool. + thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1; + + if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) { + thread_pool_ = ThreadPool::Create("libgav1", thread_count); + if (thread_pool_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.", + thread_count); + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + return false; + } + } + + // Prefer tile threads first (but only if there is more than one tile). + const int tile_count = frame_header.tile_info.tile_count; + if (tile_count > 1) { + // We want 1 + tile_thread_count_ <= tile_count because the current thread + // is also used to decode tiles. This is equivalent to + // tile_thread_count_ <= tile_count - 1. + tile_thread_count_ = std::min(thread_count, tile_count - 1); + thread_count -= tile_thread_count_; + if (thread_count == 0) { + max_tile_index_for_row_threads_ = 0; + return true; + } + } else { + tile_thread_count_ = 0; + } + +#if defined(__ANDROID__) + // Assign the remaining threads for each Tile. The heuristic used here is that + // we will assign two threads for each Tile. So for example, if |thread_count| + // is 2, for a stream with 2 tiles the first tile would get both the threads + // and the second tile would have row multi-threading turned off. This + // heuristic is based on the fact that row multi-threading is fast enough only + // when there are at least two threads to do the decoding (since one thread + // always does the parsing). + // + // This heuristic might stop working when SIMD optimizations make the decoding + // much faster and the parsing thread is only as fast as the decoding threads. + // So we will have to revisit this later to make sure that this is still + // optimal. + // + // Note that while this heuristic significantly improves performance on high + // end devices (like the Pixel 3), there are some performance regressions in + // some lower end devices (in some cases) and that needs to be revisited as we + // bring in more optimizations. Overall, the gains because of this heuristic + // seems to be much larger than the regressions. + for (int i = 0; i < tile_count; ++i) { + max_tile_index_for_row_threads_ = i + 1; + thread_count -= 2; + if (thread_count <= 0) break; + } +#else // !defined(__ANDROID__) + // Assign the remaining threads to each Tile. + for (int i = 0; i < tile_count; ++i) { + const int count = thread_count / tile_count + + static_cast<int>(i < thread_count % tile_count); + if (count == 0) { + // Once we see a 0 value, all subsequent values will be 0 since it is + // supposed to be assigned in a round-robin fashion. + break; + } + max_tile_index_for_row_threads_ = i + 1; + } +#endif // defined(__ANDROID__) + return true; +} + +bool ThreadingStrategy::Reset(int thread_count) { + assert(thread_count > 0); + frame_parallel_ = true; + + // In frame parallel mode, we simply access the underlying |thread_pool_| + // directly. So ensure all the other threadpool getter functions return + // nullptr. Also, superblock row multithreading is always disabled in frame + // parallel mode. + tile_thread_count_ = 0; + max_tile_index_for_row_threads_ = 0; + + if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) { + thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count); + if (thread_pool_ == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.", + thread_count); + return false; + } + } + return true; +} + +bool InitializeThreadPoolsForFrameParallel( + int thread_count, int tile_count, int tile_columns, + std::unique_ptr<ThreadPool>* const frame_thread_pool, + FrameScratchBufferPool* const frame_scratch_buffer_pool) { + assert(*frame_thread_pool == nullptr); + thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)); + const int frame_threads = + ComputeFrameThreadCount(thread_count, tile_count, tile_columns); + if (frame_threads == 0) return true; + *frame_thread_pool = ThreadPool::Create(frame_threads); + if (*frame_thread_pool == nullptr) { + LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.", + frame_threads); + return false; + } + int remaining_threads = thread_count - frame_threads; + if (remaining_threads == 0) return true; + int threads_per_frame = remaining_threads / frame_threads; + const int extra_threads = remaining_threads % frame_threads; + Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers; + if (!frame_scratch_buffers.reserve(frame_threads)) return false; + // Create the tile thread pools. + for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) { + std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer = + frame_scratch_buffer_pool->Get(); + if (frame_scratch_buffer == nullptr) { + return false; + } + // If the number of tile threads cannot be divided equally amongst all the + // frame threads, assign one extra thread to the first |extra_threads| frame + // threads. + const int current_frame_thread_count = + threads_per_frame + static_cast<int>(i < extra_threads); + if (!frame_scratch_buffer->threading_strategy.Reset( + current_frame_thread_count)) { + return false; + } + remaining_threads -= current_frame_thread_count; + frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer)); + } + // We release the frame scratch buffers in reverse order so that the extra + // threads are allocated to buffers in the top of the stack. + for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0; + --i) { + frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i])); + } + return true; +} + +} // namespace libgav1 |