
revert Scheduler changes

jsteemann 2018-11-26 09:54:41 +01:00
parent 5e828ce137
commit 9658300f11
3 changed files with 56 additions and 130 deletions

View File

@@ -112,7 +112,7 @@ class arangodb::SchedulerThread : public Thread {
   size_t counter = 0;
   bool doDecrement = true;

-  while (!_scheduler->isStopping() || 0 != _scheduler->numQueued()) {
+  while (!_scheduler->isStopping()) {
     try {
       _service->run_one();
     } catch (std::exception const& ex) {
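For orientation, a minimal sketch (with stand-in atomics, not ArangoDB's packed _counters) contrasting the two loop conditions: the removed variant keeps a worker servicing the io_context until the queue has drained, the restored one exits as soon as the stopping flag is set.

    #include <atomic>
    #include <cstddef>
    #include <functional>

    std::atomic<bool> stopping{false};
    std::atomic<std::size_t> queued{0};

    // removed variant: keep servicing handlers until the queue is drained
    void runWorkerOld(std::function<void()> const& runOne) {
      while (!stopping.load() || queued.load() != 0) {
        runOne();  // stands in for _service->run_one()
      }
    }

    // restored variant: exit as soon as shutdown begins; any draining
    // has to happen in the shutdown sequence instead
    void runWorkerNew(std::function<void()> const& runOne) {
      while (!stopping.load()) {
        runOne();
      }
    }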
@@ -208,47 +208,24 @@ Scheduler::~Scheduler() {

 // do not pass callback by reference, might get deleted before execution
 void Scheduler::post(std::function<void()> const callback) {
   // increment number of queued and guard against exceptions
-  // (this incQueued() manipulates the atomic _counters in a sequentially-consistent
-  //  manner. isStopping() uses same atomic _counters)
   incQueued();

-  // implies if _ioContext still valid (defense against shutdown races)
-  if (!isStopping()) {
-    auto guardQueue = scopeGuard([this]() { decQueued(); });
+  auto guardQueue = scopeGuard([this]() { decQueued(); });

   // capture without self, ioContext will not live longer than scheduler
   _ioContext->post([this, callback]() {
     // start working
-      JobGuard jobGuard(this);
-      jobGuard.work();
-      // reduce number of queued now
-      decQueued();
-      // it is safe to execute the callback now, even with the queued counter
-      // being decreased. this is because JobGuard::work() has increased the
-      // working counter, which is also checked on shutdown
-      callback();
-    });
-    // no exception happened, cancel guard
-    guardQueue.cancel();
-  } else {
-    // increase number of working (must precede decQueue() to keep shutdown looping)
     JobGuard jobGuard(this);
     jobGuard.work();

     // reduce number of queued now
     decQueued();

-    // this post is coming late in application shutdown,
-    //  might be essential ...
-    // it is safe to execute the callback now, even with the queued counter
-    // being decreased. this is because JobGuard::work() has increased the
-    // working counter, which is also checked on shutdown
     callback();
-  } // else
+  });

+  // no exception happened, cancel guard
+  guardQueue.cancel();
 }
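The scopeGuard/cancel() pair in the restored post() is the classic scope-guard idiom: decQueued() runs automatically if _ioContext->post() throws, and is disarmed once the handler has been queued successfully. A minimal sketch of such a guard (illustrative, not ArangoDB's actual scopeGuard):

    #include <functional>
    #include <utility>

    class ScopeGuard {
     public:
      explicit ScopeGuard(std::function<void()> fn) : _fn(std::move(fn)) {}
      ~ScopeGuard() {
        if (_active) {
          _fn();  // run the cleanup unless cancel() was called
        }
      }
      void cancel() noexcept { _active = false; }  // disarm on success
      ScopeGuard(ScopeGuard const&) = delete;
      ScopeGuard& operator=(ScopeGuard const&) = delete;
     private:
      std::function<void()> _fn;
      bool _active = true;
    };

post() arms the guard right after incQueued() and cancels it only once the handler is safely queued, so the queued counter is rolled back exactly on the exception path.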
// do not pass callback by reference, might get deleted before execution // do not pass callback by reference, might get deleted before execution
@@ -386,67 +363,57 @@ std::string Scheduler::infoStatus() {
 }

 bool Scheduler::canPostDirectly(RequestPriority prio) const noexcept {
-  if (!isStopping()) {
-    auto counters = getCounters();
-    auto nrWorking = numWorking(counters);
-    auto nrQueued = numQueued(counters);
+  auto counters = getCounters();
+  auto nrWorking = numWorking(counters);
+  auto nrQueued = numQueued(counters);

   switch (prio) {
     case RequestPriority::HIGH:
       return nrWorking + nrQueued < _maxThreads;

     // the "/ 2" is an assumption that HIGH is typically responses to our outbound messages
     // where MED & LOW are incoming requests. Keep half the threads processing our work and half their work.
     case RequestPriority::MED:
     case RequestPriority::LOW:
       return nrWorking + nrQueued < _maxThreads / 2;
   }

   return false;
-  } else {
-    // during shutdown, finesse is no longer needed. post everything.
-    return true;
-  } // else
 }
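To make the "/ 2" reservation concrete, here is a small self-contained sketch of the admission rule (names are illustrative, not the Scheduler's API): with 8 threads, MED/LOW requests stop posting directly once 4 jobs are working or queued, while HIGH may continue up to 8.

    #include <cstddef>
    #include <iostream>

    enum class Prio { High, Med, Low };

    bool canPostDirectly(Prio prio, std::size_t nrWorking,
                         std::size_t nrQueued, std::size_t maxThreads) {
      std::size_t const load = nrWorking + nrQueued;
      switch (prio) {
        case Prio::High:
          return load < maxThreads;        // may use full capacity
        case Prio::Med:
        case Prio::Low:
          return load < maxThreads / 2;    // half reserved for HIGH
      }
      return false;
    }

    int main() {
      std::cout << canPostDirectly(Prio::Med, 4, 0, 8) << "\n";   // 0
      std::cout << canPostDirectly(Prio::High, 4, 0, 8) << "\n";  // 1
    }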
 bool Scheduler::pushToFifo(int64_t fifo, std::function<void()> const& callback) {
   LOG_TOPIC(TRACE, Logger::THREADS) << "Push element on fifo: " << fifo;
   TRI_ASSERT(0 <= fifo && fifo < NUMBER_FIFOS);

-  if (!isStopping()) {
-    size_t p = static_cast<size_t>(fifo);
-    auto job = std::make_unique<FifoJob>(callback);
+  size_t p = static_cast<size_t>(fifo);
+  auto job = std::make_unique<FifoJob>(callback);

   try {
     if (0 < _maxFifoSize[p] && (int64_t)_maxFifoSize[p] <= _fifoSize[p]) {
-        return false;
-      }
-
-      if (!_fifos[p]->push(job.get())) {
-        return false;
-      }
-
-      job.release();
-      ++_fifoSize[p];
-
-      // then check, otherwise we might miss to wake up a thread
-      auto counters = getCounters();
-      auto nrWorking = numRunning(counters);
-      auto nrQueued = numQueued(counters);
-
-      if (0 == nrWorking + nrQueued) {
-        post([] {
-          LOG_TOPIC(DEBUG, Logger::THREADS) << "Wakeup alarm";
-          /*wakeup call for scheduler thread*/
-        });
-      }
-    } catch (...) {
       return false;
     }
-  } else {
-    // hand this directly to post() so it can route it quickly
-    post(callback);
-  } // else
+
+    if (!_fifos[p]->push(job.get())) {
+      return false;
+    }
+
+    job.release();
+    ++_fifoSize[p];
+
+    // then check, otherwise we might miss to wake up a thread
+    auto counters = getCounters();
+    auto nrWorking = numRunning(counters);
+    auto nrQueued = numQueued(counters);
+
+    if (0 == nrWorking + nrQueued) {
+      post([] {
+        LOG_TOPIC(DEBUG, Logger::THREADS) << "Wakeup alarm";
+        /*wakeup call for scheduler thread*/
+      });
+    }
+  } catch (...) {
+    return false;
+  }

   return true;
 }
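The job.get() / job.release() sequence in pushToFifo() is an ownership hand-off: the unique_ptr keeps the job alive (and frees it on any failure or exception), and ownership passes to the queue only once push() has accepted the raw pointer. A generic sketch of the pattern, with a hypothetical queue type standing in for whatever lock-free queue backs _fifos:

    #include <memory>

    template <typename Queue, typename Job>
    bool pushOwning(Queue& queue, std::unique_ptr<Job> job) {
      if (!queue.push(job.get())) {
        return false;  // push failed: unique_ptr still owns and deletes the job
      }
      job.release();   // success: the queue now owns the raw pointer
      return true;
    }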
@@ -526,38 +493,11 @@ bool Scheduler::start() {
 }

 void Scheduler::beginShutdown() {
-  if (!setStopping()) {
-    // somebody else had set the stopping bit already, so we don't care here
+  if (isStopping()) {
     return;
   }

-  // Scheduler::post() assumes atomic _counters is manipulated in a
-  // sequentially-consistent manner so that state of _ioContext can be implied
-  // via _counters.
-
-  // push anything within fifo queues onto context queue
-  drain();
-
-  int notifyCounter = 0;
-  while (true) {
-    uint64_t const counters = _counters.load();
-
-    if (numWorking(counters) == 0 && numQueued(counters) == 0) {
-      break;
-    }
-
-    if (++notifyCounter % 500 == 0) {
-      LOG_TOPIC(DEBUG, Logger::THREADS) << "waiting for numWorking: " << numWorking(counters) << ", numQueued: " << numQueued(counters);
-    }
-
-    std::this_thread::yield();
-    std::this_thread::sleep_for(std::chrono::microseconds(2000));
-  }
-
-  // shutdown worker threads and control mechanisms
   stopRebalancer();
   _threadManager.reset();
   _managerGuard.reset();
@@ -565,24 +505,18 @@ void Scheduler::beginShutdown() {
   _serviceGuard.reset();
   _ioContext->stop();
-
-  // set the flag AFTER stopping the threads
-  setStopping();
 }

 void Scheduler::shutdown() {
-  TRI_ASSERT(isStopping());
-  int notifyCounter = 0;
   while (true) {
     uint64_t const counters = _counters.load();

-    if (numRunning(counters) == 0 && numWorking(counters) == 0
-        && numQueued(counters) == 0) {
+    if (numRunning(counters) == 0 && numWorking(counters) == 0) {
       break;
     }

-    if (++notifyCounter % 50 == 0) {
-      LOG_TOPIC(DEBUG, Logger::THREADS) << "waiting for numRunning: " << numRunning(counters) << ", numWorking: " << numWorking(counters) << ", numQueued: " << numQueued(counters);
-    }
-
     std::this_thread::yield();
     // we can be quite generous here with waiting...
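The restored shutdown() is a plain polling loop: sample the packed counters, leave once nothing is running or working, otherwise yield before probing again. A compilable sketch of that shape, with free functions standing in for the Scheduler's accessors (the bit layout and sleep interval here are assumptions for illustration, not the real encoding):

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <thread>

    std::atomic<uint64_t> counters{0};

    // hypothetical stand-ins for Scheduler::numRunning / numWorking
    uint64_t numRunning(uint64_t v) { return v & 0xFFFFULL; }
    uint64_t numWorking(uint64_t v) { return (v >> 16) & 0xFFFFULL; }

    void waitForIdle() {
      while (true) {
        uint64_t const c = counters.load();
        if (numRunning(c) == 0 && numWorking(c) == 0) {
          break;  // all scheduler threads have wound down
        }
        std::this_thread::yield();
        // we can be quite generous here with waiting...
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
      }
    }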
@@ -649,6 +583,8 @@ void Scheduler::stopRebalancer() noexcept {
   }
 }

+//
 // This routine tries to keep only the most likely needed count of threads running:
 //  - asio io_context runs less efficiently if it has too many threads, but
 //  - there is a latency hit to starting a new thread.

View File

@@ -105,18 +105,12 @@ class Scheduler {
   bool isRunning() const { return numRunning(_counters) > 0; }
   bool isStopping() const noexcept { return (_counters & (1ULL << 63)) != 0; }

-  size_t numQueued() const { return (_counters >> 32) & 0xFFFFULL; }

 private:
   void post(std::function<void()> const callback);
   void drain();

-  /// @brief set the stopping bit
-  /// returns true if the stopping bit was not set before, and
-  /// false if it was already set
-  inline bool setStopping() noexcept {
-    return !isStopping(_counters.fetch_or(1ULL << 63));
-  }
+  inline void setStopping() noexcept { _counters |= (1ULL << 63); }

   inline bool isStopping(uint64_t value) const noexcept {
     return (value & (1ULL << 63)) != 0;
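This header change is the crux of the revert: the fetch_or-based setStopping() reported whether the caller was the one that actually flipped the bit (so beginShutdown() could bail out on a lost race), while the restored version just sets it unconditionally. A sketch of both against the bit layout documented above (bit 63 = stopping flag, bits 32..47 = queued count):

    #include <atomic>
    #include <cstdint>

    std::atomic<uint64_t> counters{0};

    bool isStopping(uint64_t v) { return (v & (1ULL << 63)) != 0; }
    uint64_t numQueued(uint64_t v) { return (v >> 32) & 0xFFFFULL; }

    // reverted variant: true only for the caller that flips the bit first
    bool setStoppingOnce() {
      return !isStopping(counters.fetch_or(1ULL << 63));
    }

    // restored variant: unconditional, no information about prior state
    void setStopping() { counters |= (1ULL << 63); }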
@@ -175,7 +169,6 @@ class Scheduler {
   // the highest bytes (AA) are used only to encode a stopping bit. when this
   // bit is set, the scheduler is stopping (or already stopped)
-  // warning: _ioContext usage assumes _counters used in sequentially-consistent manner
   std::atomic<uint64_t> _counters;

   inline uint64_t getCounters() const noexcept { return _counters; }

View File

@@ -140,9 +140,6 @@ void LocalTaskQueue::enqueueCallback(std::shared_ptr<LocalCallbackTask> task) {
 //////////////////////////////////////////////////////////////////////////////

 void LocalTaskQueue::post(std::function<void()> fn) {
-  if (SchedulerFeature::SCHEDULER->isStopping()) {
-    THROW_ARANGO_EXCEPTION(TRI_ERROR_SHUTTING_DOWN);
-  }
   _poster(fn);
 }