mirror of https://gitee.com/bigwinds/arangodb
Fix race conditions in thread management. (#8032)
This commit is contained in:
parent
dc9722cf0b
commit
ecf4d9d62a
|
@ -99,6 +99,7 @@ Agent::~Agent() {
|
|||
// multiple times, and we do it just in case the Agent object was
|
||||
// created but never really started. Here, we exit with a fatal error
|
||||
// if the threads do not stop in time.
|
||||
shutdown(); // wait for the main Agent thread to terminate
|
||||
}
|
||||
|
||||
/// Wait until threads are terminated:
|
||||
|
@ -117,7 +118,9 @@ void Agent::waitForThreadsStop() {
|
|||
FATAL_ERROR_EXIT();
|
||||
}
|
||||
}
|
||||
shutdown(); // wait for the main Agent thread to terminate
|
||||
// initiate shutdown of main Agent thread, but do not wait for it yet
|
||||
// -> this happens in the destructor
|
||||
beginShutdown();
|
||||
}
|
||||
|
||||
/// State machine
|
||||
|
|
|
@ -34,9 +34,7 @@ Compactor::Compactor(Agent* agent)
|
|||
|
||||
/// Dtor shuts down thread
|
||||
Compactor::~Compactor() {
|
||||
if (!isStopping()) {
|
||||
shutdown();
|
||||
}
|
||||
shutdown();
|
||||
}
|
||||
|
||||
// @brief Run
|
||||
|
|
|
@ -81,9 +81,7 @@ Constituent::Constituent()
|
|||
|
||||
/// Shutdown if not already
|
||||
Constituent::~Constituent() {
|
||||
if (!isStopping()) {
|
||||
shutdown();
|
||||
}
|
||||
shutdown();
|
||||
}
|
||||
|
||||
/// Wait for sync
|
||||
|
|
|
@ -41,9 +41,7 @@ Inception::Inception(Agent* agent) : Thread("Inception"), _agent(agent) {}
|
|||
|
||||
// Shutdown if not already
|
||||
Inception::~Inception() {
|
||||
if (!isStopping()) {
|
||||
shutdown();
|
||||
}
|
||||
shutdown();
|
||||
}
|
||||
|
||||
/// Gossip to others
|
||||
|
|
|
@ -172,9 +172,7 @@ Supervision::Supervision()
|
|||
_upgraded(false) {}
|
||||
|
||||
Supervision::~Supervision() {
|
||||
if (!isStopping()) {
|
||||
shutdown();
|
||||
}
|
||||
shutdown();
|
||||
}
|
||||
|
||||
static std::string const syncPrefix = "/Sync/ServerStates/";
|
||||
|
|
|
@ -206,10 +206,10 @@ HeartbeatThread::HeartbeatThread(AgencyCallbackRegistry* agencyCallbackRegistry,
|
|||
/// @brief destroys a heartbeat thread
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
HeartbeatThread::~HeartbeatThread() {
|
||||
shutdown();
|
||||
if (_maintenanceThread) {
|
||||
_maintenanceThread->stop();
|
||||
}
|
||||
shutdown();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -606,6 +606,7 @@ class IResearchFeature::Async {
|
|||
_next(nullptr),
|
||||
_terminate(nullptr),
|
||||
_wasNotified(false) {}
|
||||
~Thread() { shutdown(); }
|
||||
virtual bool isSystem() override {
|
||||
return true;
|
||||
} // or start(...) will fail
|
||||
|
|
|
@ -48,14 +48,7 @@ struct ApplierThread : public Thread {
|
|||
TRI_ASSERT(_syncer);
|
||||
}
|
||||
|
||||
~ApplierThread() {
|
||||
shutdown();
|
||||
|
||||
{
|
||||
MUTEX_LOCKER(locker, _syncerMutex);
|
||||
_syncer.reset();
|
||||
}
|
||||
}
|
||||
~ApplierThread() {} // shutdown is called by derived implementations!
|
||||
|
||||
void run() override {
|
||||
TRI_ASSERT(_syncer != nullptr);
|
||||
|
@ -109,6 +102,8 @@ struct FullApplierThread final : public ApplierThread {
|
|||
FullApplierThread(ReplicationApplier* applier, std::shared_ptr<InitialSyncer>&& syncer)
|
||||
: ApplierThread(applier, std::move(syncer)) {}
|
||||
|
||||
~FullApplierThread() { shutdown(); }
|
||||
|
||||
Result runApplier() override {
|
||||
TRI_ASSERT(_syncer != nullptr);
|
||||
TRI_ASSERT(_applier != nullptr);
|
||||
|
@ -143,7 +138,8 @@ struct TailingApplierThread final : public ApplierThread {
|
|||
TailingApplierThread(ReplicationApplier* applier, std::shared_ptr<TailingSyncer>&& syncer)
|
||||
: ApplierThread(applier, std::move(syncer)) {}
|
||||
|
||||
public:
|
||||
~TailingApplierThread() { shutdown(); }
|
||||
|
||||
Result runApplier() override {
|
||||
TRI_ASSERT(dynamic_cast<TailingSyncer*>(_syncer.get()) != nullptr);
|
||||
return static_cast<TailingSyncer*>(_syncer.get())->run();
|
||||
|
|
|
@ -50,7 +50,7 @@ class SchedulerThread : virtual public Thread {
|
|||
public:
|
||||
explicit SchedulerThread(Scheduler& scheduler)
|
||||
: Thread("Scheduler"), _scheduler(scheduler) {}
|
||||
~SchedulerThread() { shutdown(); }
|
||||
~SchedulerThread() {} // shutdown is called by derived implementation!
|
||||
|
||||
protected:
|
||||
Scheduler& _scheduler;
|
||||
|
@ -61,6 +61,8 @@ class SchedulerCronThread : public SchedulerThread {
|
|||
explicit SchedulerCronThread(Scheduler& scheduler)
|
||||
: Thread("SchedCron"), SchedulerThread(scheduler) {}
|
||||
|
||||
~SchedulerCronThread() { shutdown(); }
|
||||
|
||||
void run() override { _scheduler.runCronThread(); }
|
||||
};
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@ class SupervisedSchedulerThread : virtual public Thread {
|
|||
public:
|
||||
explicit SupervisedSchedulerThread(SupervisedScheduler& scheduler)
|
||||
: Thread("Scheduler"), _scheduler(scheduler) {}
|
||||
~SupervisedSchedulerThread() { shutdown(); }
|
||||
~SupervisedSchedulerThread() {} // shutdown is called by derived implementation!
|
||||
|
||||
protected:
|
||||
SupervisedScheduler& _scheduler;
|
||||
|
@ -65,6 +65,7 @@ class SupervisedSchedulerManagerThread final : public SupervisedSchedulerThread
|
|||
public:
|
||||
explicit SupervisedSchedulerManagerThread(SupervisedScheduler& scheduler)
|
||||
: Thread("SchedMan"), SupervisedSchedulerThread(scheduler) {}
|
||||
~SupervisedSchedulerManagerThread() { shutdown(); }
|
||||
void run() override { _scheduler.runSupervisor(); };
|
||||
};
|
||||
|
||||
|
@ -72,6 +73,7 @@ class SupervisedSchedulerWorkerThread final : public SupervisedSchedulerThread {
|
|||
public:
|
||||
explicit SupervisedSchedulerWorkerThread(SupervisedScheduler& scheduler)
|
||||
: Thread("SchedWorker"), SupervisedSchedulerThread(scheduler) {}
|
||||
~SupervisedSchedulerWorkerThread() { shutdown(); }
|
||||
void run() override { _scheduler.runWorker(); };
|
||||
};
|
||||
|
||||
|
|
|
@ -69,7 +69,19 @@ void Thread::startThread(void* arg) {
|
|||
|
||||
LOCAL_THREAD_NAME = ptr->name().c_str();
|
||||
|
||||
auto guard = scopeGuard([&ptr]() { ptr->cleanupMe(); });
|
||||
// make sure we drop our reference when we are finished!
|
||||
auto guard = scopeGuard([ptr]() {
|
||||
LOCAL_THREAD_NAME = nullptr;
|
||||
ptr->releaseRef();
|
||||
});
|
||||
|
||||
ThreadState expected = ThreadState::STARTING;
|
||||
bool res = ptr->_state.compare_exchange_strong(expected, ThreadState::STARTED);
|
||||
if (!res) {
|
||||
TRI_ASSERT(expected == ThreadState::STOPPING);
|
||||
// we are already shutting down -> don't bother calling run!
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
ptr->runMe();
|
||||
|
@ -114,14 +126,14 @@ std::string Thread::stringify(ThreadState state) {
|
|||
switch (state) {
|
||||
case ThreadState::CREATED:
|
||||
return "created";
|
||||
case ThreadState::STARTING:
|
||||
return "starting";
|
||||
case ThreadState::STARTED:
|
||||
return "started";
|
||||
case ThreadState::STOPPING:
|
||||
return "stopping";
|
||||
case ThreadState::STOPPED:
|
||||
return "stopped";
|
||||
case ThreadState::DETACHED:
|
||||
return "detached";
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
@ -130,6 +142,7 @@ std::string Thread::stringify(ThreadState state) {
|
|||
Thread::Thread(std::string const& name, bool deleteOnExit)
|
||||
: _deleteOnExit(deleteOnExit),
|
||||
_threadStructInitialized(false),
|
||||
_refs(0),
|
||||
_name(name),
|
||||
_thread(),
|
||||
_threadNumber(0),
|
||||
|
@ -140,35 +153,18 @@ Thread::Thread(std::string const& name, bool deleteOnExit)
|
|||
|
||||
/// @brief deletes the thread
|
||||
Thread::~Thread() {
|
||||
TRI_ASSERT(_refs.load() == 0);
|
||||
|
||||
auto state = _state.load();
|
||||
LOG_TOPIC(TRACE, Logger::THREADS)
|
||||
<< "delete(" << _name << "), state: " << stringify(state);
|
||||
|
||||
if (state == ThreadState::STOPPED) {
|
||||
if (_threadStructInitialized) {
|
||||
if (TRI_IsSelfThread(&_thread)) {
|
||||
// we must ignore any errors here, but TRI_DetachThread will log them
|
||||
TRI_DetachThread(&_thread);
|
||||
} else {
|
||||
// we must ignore any errors here, but TRI_JoinThread will log them
|
||||
TRI_JoinThread(&_thread);
|
||||
}
|
||||
}
|
||||
|
||||
_state.store(ThreadState::DETACHED);
|
||||
return;
|
||||
}
|
||||
|
||||
state = _state.load();
|
||||
|
||||
if (state != ThreadState::DETACHED && state != ThreadState::CREATED) {
|
||||
if (state != ThreadState::STOPPED) {
|
||||
LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
|
||||
<< "thread '" << _name << "' is not detached but " << stringify(state)
|
||||
<< "thread '" << _name << "' is not stopped but " << stringify(state)
|
||||
<< ". shutting down hard";
|
||||
FATAL_ERROR_ABORT();
|
||||
}
|
||||
|
||||
LOCAL_THREAD_NAME = nullptr;
|
||||
}
|
||||
|
||||
/// @brief flags the thread as stopping
|
||||
|
@ -179,65 +175,40 @@ void Thread::beginShutdown() {
|
|||
ThreadState state = _state.load();
|
||||
|
||||
while (state == ThreadState::CREATED) {
|
||||
_state.compare_exchange_strong(state, ThreadState::STOPPED);
|
||||
_state.compare_exchange_weak(state, ThreadState::STOPPED);
|
||||
}
|
||||
|
||||
while (state != ThreadState::STOPPING && state != ThreadState::STOPPED &&
|
||||
state != ThreadState::DETACHED) {
|
||||
_state.compare_exchange_strong(state, ThreadState::STOPPING);
|
||||
while (state != ThreadState::STOPPING && state != ThreadState::STOPPED) {
|
||||
_state.compare_exchange_weak(state, ThreadState::STOPPING);
|
||||
}
|
||||
|
||||
LOG_TOPIC(TRACE, Logger::THREADS) << "beginShutdown(" << _name << ") reached state "
|
||||
<< stringify(_state.load());
|
||||
}
|
||||
|
||||
/// @brief derived class MUST call from its destructor
|
||||
/// @brief MUST be called from the destructor of the MOST DERIVED class
|
||||
void Thread::shutdown() {
|
||||
LOG_TOPIC(TRACE, Logger::THREADS) << "shutdown(" << _name << ")";
|
||||
|
||||
ThreadState state = _state.load();
|
||||
|
||||
while (state == ThreadState::CREATED) {
|
||||
bool res = _state.compare_exchange_strong(state, ThreadState::DETACHED);
|
||||
|
||||
if (res) {
|
||||
return;
|
||||
beginShutdown();
|
||||
if (_threadStructInitialized) {
|
||||
if (TRI_IsSelfThread(&_thread)) {
|
||||
// we must ignore any errors here, but TRI_DetachThread will log them
|
||||
TRI_DetachThread(&_thread);
|
||||
} else {
|
||||
// we must ignore any errors here, but TRI_JoinThread will log them
|
||||
TRI_JoinThread(&_thread);
|
||||
}
|
||||
}
|
||||
|
||||
if (_state.load() == ThreadState::STARTED) {
|
||||
beginShutdown();
|
||||
|
||||
if (!isSilent() && _state.load() != ThreadState::STOPPING &&
|
||||
_state.load() != ThreadState::STOPPED) {
|
||||
LOG_TOPIC(WARN, Logger::THREADS) << "forcefully shutting down thread '" << _name
|
||||
<< "' in state " << stringify(_state.load());
|
||||
}
|
||||
}
|
||||
|
||||
size_t n = 10 * 60 * 5; // * 100ms = 1s * 60 * 5
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
if (_state.load() == ThreadState::STOPPED) {
|
||||
break;
|
||||
}
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(100 * 1000));
|
||||
}
|
||||
|
||||
if (_state.load() != ThreadState::STOPPED) {
|
||||
LOG_TOPIC(FATAL, arangodb::Logger::FIXME)
|
||||
<< "cannot shutdown thread, giving up";
|
||||
FATAL_ERROR_ABORT();
|
||||
}
|
||||
TRI_ASSERT(_refs.load() == 0);
|
||||
TRI_ASSERT(_state.load() == ThreadState::STOPPED);
|
||||
}
|
||||
|
||||
/// @brief checks if the current thread was asked to stop
|
||||
bool Thread::isStopping() const {
|
||||
auto state = _state.load(std::memory_order_relaxed);
|
||||
|
||||
return state == ThreadState::STOPPING || state == ThreadState::STOPPED ||
|
||||
state == ThreadState::DETACHED;
|
||||
return state == ThreadState::STOPPING || state == ThreadState::STOPPED;
|
||||
}
|
||||
|
||||
/// @brief starts the thread
|
||||
|
@ -262,36 +233,40 @@ bool Thread::start(ConditionVariable* finishedCondition) {
|
|||
}
|
||||
|
||||
ThreadState expected = ThreadState::CREATED;
|
||||
bool res = _state.compare_exchange_strong(expected, ThreadState::STARTED);
|
||||
|
||||
if (!res) {
|
||||
if (!_state.compare_exchange_strong(expected, ThreadState::STARTING)) {
|
||||
// This should never happen! If it does, it means we have multiple calls to start().
|
||||
LOG_TOPIC(WARN, Logger::THREADS)
|
||||
<< "thread died before it could start, thread is in state "
|
||||
<< "failed to set thread to state 'starting'; thread is in unexpected state "
|
||||
<< stringify(expected);
|
||||
return false;
|
||||
FATAL_ERROR_ABORT();
|
||||
}
|
||||
|
||||
// we count two references - one for the current thread and one for the thread that
|
||||
// we are trying to start.
|
||||
_refs.fetch_add(2);
|
||||
TRI_ASSERT(_refs.load() == 2);
|
||||
|
||||
TRI_ASSERT(!_threadStructInitialized);
|
||||
memset(&_thread, 0, sizeof(thread_t));
|
||||
TRI_ASSERT(_threadStructInitialized == false);
|
||||
TRI_InitThread(&_thread);
|
||||
|
||||
bool ok = TRI_StartThread(&_thread, _name.c_str(), &startThread, this);
|
||||
|
||||
if (!ok) {
|
||||
// could not start the thread
|
||||
// could not start the thread -> decrement ref for the foreign thread
|
||||
_refs.fetch_sub(1);
|
||||
_state.store(ThreadState::STOPPED);
|
||||
LOG_TOPIC(ERR, Logger::THREADS)
|
||||
<< "could not start thread '" << _name << "': " << TRI_last_error();
|
||||
|
||||
// must cleanup to prevent memleaks
|
||||
cleanupMe();
|
||||
}
|
||||
|
||||
_threadStructInitialized = true;
|
||||
|
||||
releaseRef();
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
void Thread::markAsStopped() {
|
||||
// TODO - marked as stopped before accessing finishedCondition?
|
||||
_state.store(ThreadState::STOPPED);
|
||||
|
||||
if (_finishedCondition != nullptr) {
|
||||
|
@ -323,8 +298,10 @@ void Thread::runMe() {
|
|||
}
|
||||
}
|
||||
|
||||
void Thread::cleanupMe() {
|
||||
if (_deleteOnExit) {
|
||||
void Thread::releaseRef() {
|
||||
auto refs = _refs.fetch_sub(1) - 1;
|
||||
TRI_ASSERT(refs >= 0);
|
||||
if (refs == 0 && _deleteOnExit) {
|
||||
LOCAL_THREAD_NAME = nullptr;
|
||||
delete this;
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ class Thread {
|
|||
#error OS not supported
|
||||
#endif
|
||||
|
||||
enum class ThreadState { CREATED, STARTED, STOPPING, STOPPED, DETACHED };
|
||||
enum class ThreadState { CREATED, STARTING, STARTED, STOPPING, STOPPED };
|
||||
|
||||
static std::string stringify(ThreadState);
|
||||
|
||||
|
@ -90,17 +90,14 @@ class Thread {
|
|||
virtual bool isSilent() { return false; }
|
||||
|
||||
/// @brief flags the thread as stopping
|
||||
/// Classes that override this function must ensure that they
|
||||
/// always call Thread::beginShutdown()!
|
||||
virtual void beginShutdown();
|
||||
|
||||
bool runningInThisThread() const {
|
||||
return currentThreadNumber() == this->threadNumber();
|
||||
}
|
||||
|
||||
protected:
|
||||
/// @brief called from the destructor
|
||||
void shutdown();
|
||||
|
||||
public:
|
||||
/// @brief name of a thread
|
||||
std::string const& name() const { return _name; }
|
||||
|
||||
|
@ -123,23 +120,36 @@ class Thread {
|
|||
/// @brief starts the thread
|
||||
bool start(basics::ConditionVariable* _finishedCondition = nullptr);
|
||||
|
||||
/// @brief optional notification call when thread gets unplanned exception
|
||||
virtual void crashNotification(std::exception const&) {}
|
||||
|
||||
protected:
|
||||
/// @brief MUST be called from the destructor of the MOST DERIVED class
|
||||
///
|
||||
/// shutdown sets the _state to signal the thread that it should stop
|
||||
/// and waits for the thread to finish. This is necessary to avoid any
|
||||
/// races in the destructor.
|
||||
/// That is also the reason why it has to be called by the MOST DERIVED
|
||||
/// class (potential race on the objects vtable). Usually the call to
|
||||
/// shutdown should be the very first thing in the destructur. Any access
|
||||
/// to members of the thread that happen before the call to shutdown must
|
||||
/// be threadsafe!
|
||||
void shutdown();
|
||||
|
||||
/// @brief the thread program
|
||||
virtual void run() = 0;
|
||||
|
||||
/// @brief optional notification call when thread gets unplanned exception
|
||||
virtual void crashNotification(std::exception const&) {}
|
||||
|
||||
private:
|
||||
/// @brief static started with access to the private variables
|
||||
static void startThread(void* arg);
|
||||
void markAsStopped();
|
||||
void runMe();
|
||||
void cleanupMe();
|
||||
void releaseRef();
|
||||
|
||||
private:
|
||||
bool const _deleteOnExit;
|
||||
bool _threadStructInitialized;
|
||||
std::atomic<int> _refs;
|
||||
|
||||
// name of the thread
|
||||
std::string const _name;
|
||||
|
|
|
@ -40,7 +40,6 @@ LogThread::~LogThread() {
|
|||
Logger::_threaded = false;
|
||||
Logger::_active = false;
|
||||
|
||||
beginShutdown();
|
||||
shutdown();
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,7 @@ public:
|
|||
_wait.wait(uint64_t(1000000));
|
||||
}
|
||||
}
|
||||
~TestThread() { shutdown(); }
|
||||
|
||||
void run() override {
|
||||
CONDITION_LOCKER(guard, _wait);
|
||||
|
|
Loading…
Reference in New Issue