1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "src/threading_strategy.h"
#include <algorithm>
#include <cassert>
#include <memory>
#include "src/frame_scratch_buffer.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
#include "src/utils/vector.h"
namespace libgav1 {
namespace {
#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
constexpr int kFrameParallelThresholdMultiplier = 3;
#else
constexpr int kFrameParallelThresholdMultiplier =
    LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
#endif

// Returns the number of frame threads to use, or 0 when frame parallel
// decoding should not be used at all.
//
// Heuristic (based on empirical performance data):
//   * A single thread can never be frame parallel.
//   * The in-frame threading model (tile multithreading + superblock row
//     multithreading + post filter multithreading) performs better than the
//     frame parallel model until |thread_count| exceeds
//     |tile_count| * kFrameParallelThresholdMultiplier, so below that
//     threshold 0 is returned. The threshold scales with |tile_count| since
//     tile threading and superblock row multithreading only scale as a
//     factor of the tile count; in practice superblock row multithreading
//     plateaus around 4 * |tile_count|.
//   * Otherwise, the largest i satisfying i + i * tile_columns <=
//     thread_count is returned, which guarantees at least |tile_columns|
//     worker threads per frame thread. The result is clamped to a minimum
//     of 2, so this function never returns 1 and never returns a value
//     greater than |thread_count|.
int ComputeFrameThreadCount(int thread_count, int tile_count,
                            int tile_columns) {
  assert(thread_count > 0);
  // One thread: frame parallelism is impossible.
  if (thread_count == 1) return 0;
  // Below the empirical threshold, in-frame threading wins.
  if (thread_count <= tile_count * kFrameParallelThresholdMultiplier) {
    return 0;
  }
  // Leave |tile_columns| workers per frame thread; never return 1.
  const int frame_threads = thread_count / (1 + tile_columns);
  return std::max(2, frame_threads);
}
} // namespace
// Sets up the in-frame (non-frame-parallel) threading model for one frame:
// a shared worker pool whose threads are split between tile decoding and
// superblock row multithreading.
//
// Returns true on success. If thread pool creation fails, all threading
// state is cleared and false is returned.
bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
                              int thread_count) {
  assert(thread_count > 0);
  frame_parallel_ = false;
  // A single thread means no worker pool at all; the calling thread does all
  // the work.
  if (thread_count == 1) {
    thread_pool_.reset(nullptr);
    tile_thread_count_ = 0;
    max_tile_index_for_row_threads_ = 0;
    return true;
  }
  // We do work in the current thread, so it is sufficient to create
  // |thread_count|-1 threads in the threadpool.
  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
  // Reuse an existing pool only if it already has exactly the right number
  // of threads; otherwise replace it.
  if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
    thread_pool_ = ThreadPool::Create("libgav1", thread_count);
    if (thread_pool_ == nullptr) {
      LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
                   thread_count);
      tile_thread_count_ = 0;
      max_tile_index_for_row_threads_ = 0;
      return false;
    }
  }
  // Prefer tile threads first (but only if there is more than one tile).
  const int tile_count = frame_header.tile_info.tile_count;
  if (tile_count > 1) {
    // We want 1 + tile_thread_count_ <= tile_count because the current thread
    // is also used to decode tiles. This is equivalent to
    // tile_thread_count_ <= tile_count - 1.
    tile_thread_count_ = std::min(thread_count, tile_count - 1);
    thread_count -= tile_thread_count_;
    // All workers consumed by tile threading: no row multithreading.
    if (thread_count == 0) {
      max_tile_index_for_row_threads_ = 0;
      return true;
    }
  } else {
    tile_thread_count_ = 0;
  }
#if defined(__ANDROID__)
  // Assign the remaining threads for each Tile. The heuristic used here is that
  // we will assign two threads for each Tile. So for example, if |thread_count|
  // is 2, for a stream with 2 tiles the first tile would get both the threads
  // and the second tile would have row multi-threading turned off. This
  // heuristic is based on the fact that row multi-threading is fast enough only
  // when there are at least two threads to do the decoding (since one thread
  // always does the parsing).
  //
  // This heuristic might stop working when SIMD optimizations make the decoding
  // much faster and the parsing thread is only as fast as the decoding threads.
  // So we will have to revisit this later to make sure that this is still
  // optimal.
  //
  // Note that while this heuristic significantly improves performance on high
  // end devices (like the Pixel 3), there are some performance regressions in
  // some lower end devices (in some cases) and that needs to be revisited as we
  // bring in more optimizations. Overall, the gains because of this heuristic
  // seems to be much larger than the regressions.
  for (int i = 0; i < tile_count; ++i) {
    max_tile_index_for_row_threads_ = i + 1;
    thread_count -= 2;
    if (thread_count <= 0) break;
  }
#else   // !defined(__ANDROID__)
  // Assign the remaining threads to each Tile.
  for (int i = 0; i < tile_count; ++i) {
    // Round-robin distribution: the first |thread_count % tile_count| tiles
    // get one extra thread.
    const int count = thread_count / tile_count +
                      static_cast<int>(i < thread_count % tile_count);
    if (count == 0) {
      // Once we see a 0 value, all subsequent values will be 0 since it is
      // supposed to be assigned in a round-robin fashion.
      break;
    }
    max_tile_index_for_row_threads_ = i + 1;
  }
#endif  // defined(__ANDROID__)
  return true;
}
// Sets up the strategy for frame parallel mode. In this mode callers use
// |thread_pool_| directly, so every per-tile getter must report no threads:
// the tile thread count is zeroed and superblock row multithreading is
// disabled (both are always off in frame parallel mode).
//
// Returns true on success, false if the thread pool could not be created.
bool ThreadingStrategy::Reset(int thread_count) {
  assert(thread_count > 0);
  frame_parallel_ = true;
  tile_thread_count_ = 0;
  max_tile_index_for_row_threads_ = 0;
  // Keep the current pool only when it already has exactly |thread_count|
  // threads; otherwise build a fresh one.
  const bool pool_is_reusable =
      thread_pool_ != nullptr && thread_pool_->num_threads() == thread_count;
  if (pool_is_reusable) return true;
  thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
  if (thread_pool_ == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
                 thread_count);
    return false;
  }
  return true;
}
// Splits |thread_count| threads between a frame thread pool and per-frame
// tile thread pools for frame parallel decoding.
//
// On success:
//   * |*frame_thread_pool| holds the frame thread pool. It is left as
//     nullptr when the heuristic decides frame parallelism is not
//     worthwhile (the function still returns true in that case).
//   * The tile thread pools are stored in the threading strategies of frame
//     scratch buffers obtained from (and released back to)
//     |frame_scratch_buffer_pool|.
// Returns false on allocation or thread creation failure.
bool InitializeThreadPoolsForFrameParallel(
    int thread_count, int tile_count, int tile_columns,
    std::unique_ptr<ThreadPool>* const frame_thread_pool,
    FrameScratchBufferPool* const frame_scratch_buffer_pool) {
  assert(*frame_thread_pool == nullptr);
  thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
  const int frame_threads =
      ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
  // Frame parallelism will not be used; the caller falls back to the
  // in-frame threading model.
  if (frame_threads == 0) return true;
  *frame_thread_pool = ThreadPool::Create(frame_threads);
  if (*frame_thread_pool == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
                 frame_threads);
    return false;
  }
  int remaining_threads = thread_count - frame_threads;
  if (remaining_threads == 0) return true;
  int threads_per_frame = remaining_threads / frame_threads;
  const int extra_threads = remaining_threads % frame_threads;
  Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
  if (!frame_scratch_buffers.reserve(frame_threads)) return false;
  // Create the tile thread pools.
  for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
    std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
        frame_scratch_buffer_pool->Get();
    if (frame_scratch_buffer == nullptr) {
      return false;
    }
    // If the number of tile threads cannot be divided equally amongst all the
    // frame threads, assign one extra thread to the first |extra_threads| frame
    // threads.
    const int current_frame_thread_count =
        threads_per_frame + static_cast<int>(i < extra_threads);
    if (!frame_scratch_buffer->threading_strategy.Reset(
            current_frame_thread_count)) {
      return false;
    }
    remaining_threads -= current_frame_thread_count;
    frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
  }
  // We release the frame scratch buffers in reverse order so that the extra
  // threads are allocated to buffers in the top of the stack.
  for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
       --i) {
    frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
  }
  return true;
}
} // namespace libgav1
|