deadlock caused by race between start stop and wait
deadlock on master is caused by the fact that start
is calling enable
which is setting working_
to thread count:
void enable(uint32_t thread_count) {
std::lock_guard<std::mutex> lock(mutex_);
enabled_ = true;
working_ = thread_count;
}
and stop
is calling disable
which just sets enabled_
to false
void disable() {
{
std::lock_guard<std::mutex> lock(mutex_);
enabled_ = false;
}
// Notify pop so that it can exit.
cv_.notify_all();
}
some threads just exit without calling pop
:
/// @brief run function of each thread
void run() {
while (queue_.enabled()) { // <<< --- exit here without calling pop
WorkItemPtr item = queue_.pop();
if (item) {
try {
(*item)();
} catch (...) {
// catch all exceptions
}
}
}
}
and never reach the code which is decrementing working_
:
Item pop() {
std::unique_lock<std::mutex> lock(mutex_);
--working_; // <<< --- this code is never reached
// Wait for push or disable functions.
if (working_ == 0 && queue_.empty()) {
wait_cv_.notify_all();
}
cv_.wait(lock, [&]() {return (!enabled_ || !queue_.empty());});
if (!enabled_) {
return (Item());
}
so any call to wait
will cause a deadlock because working_
never reaches 0:
void wait() {
std::unique_lock<std::mutex> lock(mutex_);
// Wait for any item or for working threads to finish.
wait_cv_.wait(lock, [&]() {return (working_ == 0 && queue_.empty());}); // <<< --- deadlock here
}
to replicate apply this patch on master (UTs changes only) and they will fail:
diff --git a/src/lib/util/tests/thread_pool_unittest.cc b/src/lib/util/tests/thread_pool_unittest.cc
index 9c636c9e85..1c2e3a3efe 100644
--- a/src/lib/util/tests/thread_pool_unittest.cc
+++ b/src/lib/util/tests/thread_pool_unittest.cc
@@ -533,7 +533,7 @@ TEST_F(ThreadPoolTest, wait) {
ASSERT_EQ(thread_pool.size(), 0);
items_count = 64;
- thread_count = 16;
+ thread_count = 256;
// prepare setup
reset(thread_count);
@@ -556,15 +556,13 @@ TEST_F(ThreadPoolTest, wait) {
// calling start should create the threads and should keep the queued items
EXPECT_NO_THROW(thread_pool.start(thread_count));
- // the thread count should match
- ASSERT_EQ(thread_pool.size(), thread_count);
+ thread_pool.stop();
// wait for all items to be processed
- thread_pool.wait();
+ ASSERT_TRUE(thread_pool.wait(1));
// the item count should be 0
ASSERT_EQ(thread_pool.count(), 0);
- // the thread count should match
- ASSERT_EQ(thread_pool.size(), thread_count);
+
// all items should have been processed
ASSERT_EQ(count(), items_count);
without timeout on wait
they cause a deadlock.