mirror of https://gitee.com/bigwinds/arangodb
[3.5] Check scheduler queue return value (#9759)
* Add to cmake.
* Backport changes from devel.
* added CHANGELOG entry, port adjustments from devel
This commit is contained in:
parent
3dcc293224
commit
663212ba19
CHANGELOG | 11

@@ -1,6 +1,17 @@
 v3.5.1 (XXXX-XX-XX)
 -------------------

+* Consistently honor the return value of all attempts to queue tasks in the
+  internal scheduler.
+
+  Previously some call sites did not check the return value of internal queueing
+  operations, and if the scheduler queue was full, operations that were thought
+  to be requeued were silently dropped. Now, there will be reactions on such
+  failures. Requeuing an important task with a time offset (Scheduler::queueDelay)
+  is now also retried on failure (queue full) up to at most five minutes. If after
+  five minutes such a task still cannot be queued, a fatal error will be logged
+  and the server process will be aborted.
+
 * Made index selection much more deterministic in case there are
   multiple competing indexes.

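The retry behaviour described in the CHANGELOG entry above boils down to polling the queueing call until it succeeds or a timeout expires. The following standalone sketch is illustrative only: tryQueue is a hypothetical stand-in for the scheduler's queue() call, not ArangoDB code, and the timeouts are shortened for demonstration. The real implementation is the basics::function_utils::retryUntilTimeout helper added further down in this commit.

// Minimal, self-contained sketch of the retry-until-timeout idea (illustrative only).
#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

// Hypothetical stand-in for Scheduler::queue(): returns false when the queue is full.
static bool tryQueue(std::function<void()> const& fn) {
  (void)fn;  // a real scheduler would run fn() later
  return false;
}

// Retry fn() until it returns true or the timeout expires.
static bool retryUntilTimeout(std::function<bool()> fn,
                              std::chrono::nanoseconds retryInterval,
                              std::chrono::nanoseconds timeout) {
  auto start = std::chrono::steady_clock::now();
  while (std::chrono::steady_clock::now() - start < timeout) {
    if (fn()) {
      return true;
    }
    std::this_thread::sleep_for(retryInterval);
  }
  return false;
}

int main() {
  bool queued = retryUntilTimeout([] { return tryQueue([] { /* work */ }); },
                                  std::chrono::milliseconds(10),
                                  std::chrono::milliseconds(50));
  std::cout << (queued ? "queued" : "queue full, giving up") << std::endl;
}
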
@@ -70,6 +70,5 @@ bool SharedQueryState::executeContinueCallback() const {
   }
   // do NOT use scheduler->post(), can have high latency that
   // then backs up libcurl callbacks to other objects
-  scheduler->queue(RequestLane::CLIENT_AQL, _continueCallback);
-  return true;
+  return scheduler->queue(RequestLane::CLIENT_AQL, _continueCallback);
 }

@@ -101,8 +101,11 @@ void CacheManagerFeature::start() {

   auto scheduler = SchedulerFeature::SCHEDULER;
   auto postFn = [scheduler](std::function<void()> fn) -> bool {
-    scheduler->queue(RequestLane::INTERNAL_LOW, fn);
-    return true;
+    try {
+      return scheduler->queue(RequestLane::INTERNAL_LOW, fn);
+    } catch (...) {
+      return false;
+    }
   };
   _manager.reset(new Manager(postFn, _cacheSize));
   MANAGER = _manager.get();

@@ -20,6 +20,9 @@
 /// @author Simon Grätzer
 ////////////////////////////////////////////////////////////////////////////////

+#include <chrono>
+#include <thread>
+
 #include "Conductor.h"

 #include "Pregel/Aggregator.h"
@@ -30,6 +33,7 @@
 #include "Pregel/Recovery.h"
 #include "Pregel/Utils.h"

+#include "Basics/FunctionUtils.h"
 #include "Basics/MutexLocker.h"
 #include "Basics/StringUtils.h"
 #include "Basics/VelocyPackHelper.h"
@@ -305,7 +309,7 @@ VPackBuilder Conductor::finishedWorkerStep(VPackSlice const& data) {
   Scheduler* scheduler = SchedulerFeature::SCHEDULER;
   // don't block the response for workers waiting on this callback
   // this should allow workers to go into the IDLE state
-  scheduler->queue(RequestLane::INTERNAL_LOW, [this] {
+  bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [this] {
     MUTEX_LOCKER(guard, _callbackMutex);

     if (_state == ExecutionState::RUNNING) {
@@ -319,6 +323,11 @@ VPackBuilder Conductor::finishedWorkerStep(VPackSlice const& data) {
          << "No further action taken after receiving all responses";
     }
   });
+  if (!queued) {
+    LOG_TOPIC("038db", ERR, Logger::PREGEL)
+        << "No thread available to queue response, canceling execution";
+    cancel();
+  }
   return VPackBuilder();
 }

@@ -389,7 +398,18 @@ void Conductor::cancel() {
 void Conductor::cancelNoLock() {
   _callbackMutex.assertLockedByCurrentThread();
   _state = ExecutionState::CANCELED;
-  _finalizeWorkers();
+  bool ok;
+  int res;
+  std::tie(ok, res) = basics::function_utils::retryUntilTimeout<int>(
+      [this]() -> std::pair<bool, int> {
+        int res = _finalizeWorkers();
+        return std::make_pair(res != TRI_ERROR_QUEUE_FULL, res);
+      },
+      Logger::PREGEL, "cancel worker execution");
+  if (!ok) {
+    LOG_TOPIC("f8b3c", ERR, Logger::PREGEL)
+        << "Failed to cancel worker execution for five minutes, giving up.";
+  }
   _workHandle.reset();
 }

@@ -412,7 +432,8 @@ void Conductor::startRecovery() {
   TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);

   // let's wait for a final state in the cluster
-  _workHandle = SchedulerFeature::SCHEDULER->queueDelay(
+  bool queued = false;
+  std::tie(queued, _workHandle) = SchedulerFeature::SCHEDULER->queueDelay(
       RequestLane::CLUSTER_AQL, std::chrono::seconds(2), [this](bool cancelled) {
         if (cancelled || _state != ExecutionState::RECOVERING) {
           return;  // seems like we are canceled
@@ -460,6 +481,10 @@ void Conductor::startRecovery() {
           LOG_TOPIC("fefc6", ERR, Logger::PREGEL) << "Compensation failed";
         }
       });
+  if (!queued) {
+    LOG_TOPIC("92a8d", ERR, Logger::PREGEL)
+        << "No thread available to queue recovery, may be in dirty state.";
+  }
 }

 // resolves into an ordered list of shards for each collection on each server
@@ -691,12 +716,17 @@ void Conductor::finishedWorkerFinalize(VPackSlice data) {
     auto* scheduler = SchedulerFeature::SCHEDULER;
     if (scheduler) {
       uint64_t exe = _executionNumber;
-      scheduler->queue(RequestLane::CLUSTER_AQL, [exe] {
+      bool queued = scheduler->queue(RequestLane::CLUSTER_AQL, [exe] {
         auto pf = PregelFeature::instance();
         if (pf) {
           pf->cleanupConductor(exe);
         }
       });
+      if (!queued) {
+        LOG_TOPIC("038da", ERR, Logger::PREGEL)
+            << "No thread available to queue cleanup, canceling execution";
+        cancel();
+      }
     }
   }
 }
@@ -766,7 +796,7 @@ int Conductor::_sendToAllDBServers(std::string const& path, VPackBuilder const&
     TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);
     uint64_t exe = _executionNumber;
     Scheduler* scheduler = SchedulerFeature::SCHEDULER;
-    scheduler->queue(RequestLane::INTERNAL_LOW, [path, message, exe] {
+    bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [path, message, exe] {
      auto pf = PregelFeature::instance();
      if (!pf) {
        return;
@@ -778,6 +808,9 @@ int Conductor::_sendToAllDBServers(std::string const& path, VPackBuilder const&
      PregelFeature::handleWorkerRequest(vocbase, path, message.slice(), response);
     }
    });
+    if (!queued) {
+      return TRI_ERROR_QUEUE_FULL;
+    }
   }
   return TRI_ERROR_NO_ERROR;
 }

@@ -129,16 +129,25 @@ void GraphStore<V, E>::loadShards(WorkerConfig* config,
       _runningThreads++;
       Scheduler* scheduler = SchedulerFeature::SCHEDULER;
       TRI_ASSERT(scheduler);
-      scheduler->queue(RequestLane::INTERNAL_LOW,
-                       [this, vertexShard, edges] {
+      bool queued =
+          scheduler->queue(RequestLane::INTERNAL_LOW, [this, vertexShard, edges] {
            TRI_DEFER(_runningThreads--);  // exception safe
            try {
              _loadVertices(vertexShard, edges);
            } catch (std::exception const& ex) {
-             LOG_TOPIC("c87c9", WARN, Logger::PREGEL) << "caught exception while "
+             LOG_TOPIC("c87c9", WARN, Logger::PREGEL)
+                 << "caught exception while "
                  << "loading pregel graph: " << ex.what();
            }
          });
+      if (!queued) {
+        LOG_TOPIC("38da2", WARN, Logger::PREGEL)
+            << "No thread available to queue vertex loading";
+      }
+    } catch (basics::Exception const& ex) {
+      LOG_TOPIC("3f283", WARN, Logger::PREGEL)
+          << "unhandled exception while "
+          << "loading pregel graph: " << ex.what();
     } catch (...) {
       LOG_TOPIC("3f282", WARN, Logger::PREGEL) << "unhandled exception while "
                                                << "loading pregel graph";
@@ -151,7 +160,12 @@ void GraphStore<V, E>::loadShards(WorkerConfig* config,
   }

   Scheduler* scheduler = SchedulerFeature::SCHEDULER;
-  scheduler->queue(RequestLane::INTERNAL_LOW, cb);
+  bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, cb);
+  if (!queued) {
+    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
+                                   "No thread available to queue callback, "
+                                   "canceling execution");
+  }
 }

 template <typename V, typename E>
@@ -563,7 +577,7 @@ void GraphStore<V, E>::storeResults(WorkerConfig* config,
       numT << " threads";

   for (size_t i = 0; i < numT; i++) {
-    SchedulerFeature::SCHEDULER->queue(RequestLane::INTERNAL_LOW, [=]{
+    bool queued = SchedulerFeature::SCHEDULER->queue(RequestLane::INTERNAL_LOW, [=] {
       size_t startI = i * (numSegments / numT);
       size_t endI = (i + 1) * (numSegments / numT);
       TRI_ASSERT(endI <= numSegments);
@@ -584,6 +598,11 @@ void GraphStore<V, E>::storeResults(WorkerConfig* config,
         cb();
       }
     });
+    if (!queued) {
+      THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
+                                     "No thread available to queue vertex "
+                                     "storage, canceling execution");
+    }
   }
 }

@@ -308,10 +308,14 @@ void PregelFeature::cleanupWorker(uint64_t executionNumber) {
   // unmapping etc might need a few seconds
   TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);
   Scheduler* scheduler = SchedulerFeature::SCHEDULER;
-  scheduler->queue(RequestLane::INTERNAL_LOW, [this, executionNumber, instance] {
+  bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [this, executionNumber, instance] {
    MUTEX_LOCKER(guard, _mutex);
    _workers.erase(executionNumber);
   });
+  if (!queued) {
+    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
+                                   "No thread available to queue cleanup.");
+  }
 }

 void PregelFeature::cleanupAll() {

@@ -143,8 +143,13 @@ void RecoveryManager::updatedFailedServers(std::vector<ServerID> const& failed)

       TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);
       Scheduler* scheduler = SchedulerFeature::SCHEDULER;
-      scheduler->queue(RequestLane::INTERNAL_LOW,
-                       [this, shard] { _renewPrimaryServer(shard); });
+      bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [this, shard] {
+        _renewPrimaryServer(shard);
+      });
+      if (!queued) {
+        LOG_TOPIC("038de", ERR, Logger::PREGEL)
+            << "No thread available to queue pregel recovery manager request";
+      }
     }
   }
 }

@@ -166,9 +166,13 @@ void Worker<V, E, M>::setupWorker() {
     TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);
     Scheduler* scheduler = SchedulerFeature::SCHEDULER;
     auto self = shared_from_this();
-    scheduler->queue(RequestLane::INTERNAL_LOW, [self, this, cb] {
+    bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [self, this, cb] {
      _graphStore->loadShards(&_config, cb);
    });
+    if (!queued) {
+      THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
+                                     "No available thread to load shards");
+    }
   }
 }

@@ -329,7 +333,7 @@ void Worker<V, E, M>::_startProcessing() {

   auto self = shared_from_this();
   for (size_t i = 0; i < numT; i++) {
-    scheduler->queue(RequestLane::INTERNAL_LOW, [self, this, i, numT, numSegments] {
+    bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [self, this, i, numT, numSegments] {
       if (_state != WorkerState::COMPUTING) {
         LOG_TOPIC("f0e3d", WARN, Logger::PREGEL) << "Execution aborted prematurely.";
         return;
@@ -344,6 +348,10 @@ void Worker<V, E, M>::_startProcessing() {
         _finishedProcessing();  // last thread turns the lights out
       }
     });
+    if (!queued) {
+      THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
+                                     "No thread available to start processing");
+    }
   }

   // TRI_ASSERT(_runningThreads == i);
@@ -565,7 +573,8 @@ void Worker<V, E, M>::_continueAsync() {
   // wait for new messages before beginning to process
   int64_t milli = _writeCache->containedMessageCount() < _messageBatchSize ? 50 : 5;
   // start next iteration in $milli mseconds.
-  _workHandle = SchedulerFeature::SCHEDULER->queueDelay(
+  bool queued = false;
+  std::tie(queued, _workHandle) = SchedulerFeature::SCHEDULER->queueDelay(
       RequestLane::INTERNAL_LOW, std::chrono::milliseconds(milli), [this](bool cancelled) {
         if (!cancelled) {
           {  // swap these pointers atomically
@@ -583,6 +592,10 @@ void Worker<V, E, M>::_continueAsync() {
           _startProcessing();
         }
       });
+  if (!queued) {
+    THROW_ARANGO_EXCEPTION_MESSAGE(
+        TRI_ERROR_QUEUE_FULL, "No thread available to continue execution.");
+  }
 }

 template <typename V, typename E, typename M>
@@ -703,7 +716,7 @@ void Worker<V, E, M>::compensateStep(VPackSlice const& data) {
   TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);
   Scheduler* scheduler = SchedulerFeature::SCHEDULER;
   auto self = shared_from_this();
-  scheduler->queue(RequestLane::INTERNAL_LOW, [self, this] {
+  bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [self, this] {
    if (_state != WorkerState::RECOVERING) {
      LOG_TOPIC("554e2", WARN, Logger::PREGEL) << "Compensation aborted prematurely.";
      return;
@@ -740,6 +753,10 @@ void Worker<V, E, M>::compensateStep(VPackSlice const& data) {
    package.close();
    _callConductor(Utils::finishedRecoveryPath, package);
   });
+  if (!queued) {
+    THROW_ARANGO_EXCEPTION_MESSAGE(
+        TRI_ERROR_QUEUE_FULL, "No thread available to queue compensation.");
+  }
 }

 template <typename V, typename E, typename M>
@@ -768,10 +785,14 @@ void Worker<V, E, M>::_callConductor(std::string const& path, VPackBuilder const
     TRI_ASSERT(SchedulerFeature::SCHEDULER != nullptr);
     Scheduler* scheduler = SchedulerFeature::SCHEDULER;
     auto self = shared_from_this();
-    scheduler->queue(RequestLane::INTERNAL_LOW, [self, path, message] {
+    bool queued = scheduler->queue(RequestLane::INTERNAL_LOW, [self, path, message] {
      VPackBuilder response;
      PregelFeature::handleConductorRequest(path, message.slice(), response);
    });
+    if (!queued) {
+      THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
+                                     "No thread available to call conductor");
+    }
   } else {
     std::shared_ptr<ClusterComm> cc = ClusterComm::instance();
     std::string baseUrl = Utils::baseUrl(_config.database(), Utils::conductorPrefix);

@@ -22,6 +22,10 @@
 ////////////////////////////////////////////////////////////////////////////////

 #include "InitialSyncer.h"

+#include "Basics/FunctionUtils.h"
+#include "Basics/application-exit.h"
+#include "Logger/LogMacros.h"
 #include "Scheduler/Scheduler.h"
 #include "Scheduler/SchedulerFeature.h"

@@ -56,19 +60,35 @@ void InitialSyncer::startRecurringBatchExtension() {
   }

   std::weak_ptr<Syncer> self(shared_from_this());
-  _batchPingTimer = SchedulerFeature::SCHEDULER->queueDelay(
-      RequestLane::SERVER_REPLICATION, std::chrono::seconds(secs), [self](bool cancelled) {
+  bool queued = false;
+  std::tie(queued, _batchPingTimer) =
+      basics::function_utils::retryUntilTimeout<Scheduler::WorkHandle>(
+          [secs, self]() -> std::pair<bool, Scheduler::WorkHandle> {
+            return SchedulerFeature::SCHEDULER->queueDelay(
+                RequestLane::SERVER_REPLICATION, std::chrono::seconds(secs),
+                [self](bool cancelled) {
                   if (!cancelled) {
                     auto syncer = self.lock();
                     if (syncer) {
                       auto* s = static_cast<InitialSyncer*>(syncer.get());
                       if (s->_batch.id != 0 && !s->isAborted()) {
-                        s->_batch.extend(s->_state.connection, s->_progress, s->_state.syncerId);
+                        s->_batch.extend(s->_state.connection, s->_progress,
+                                         s->_state.syncerId);
                         s->startRecurringBatchExtension();
                       }
                     }
                   }
                 });
+          },
+          Logger::REPLICATION, "queue batch extension");
+  if (!queued) {
+    LOG_TOPIC("f8b3e", ERR, Logger::REPLICATION)
+        << "Failed to queue replication batch extension for 5 minutes, exiting.";
+    // don't abort, as this is not a critical error
+    // if requeueing has failed here, the replication can still go on, but
+    // it _may_ fail later because the batch has expired on the leader.
+    // but there are still chances it can continue successfully
+  }
 }

 }  // namespace arangodb

@@ -91,23 +91,23 @@ RestStatus RestShutdownHandler::execute() {
     clusterFeature->setUnregisterOnShutdown(true);
   }

-  try {
-    VPackBuilder result;
-    result.add(VPackValue("OK"));
-    generateResult(rest::ResponseCode::OK, result.slice());
-  } catch (...) {
-    // Ignore the error
-  }
   auto self = shared_from_this();
   Scheduler* scheduler = SchedulerFeature::SCHEDULER;
   // don't block the response for workers waiting on this callback
   // this should allow workers to go into the IDLE state
-  scheduler->queue(RequestLane::CLUSTER_INTERNAL, [self] {
+  bool queued = scheduler->queue(RequestLane::CLUSTER_INTERNAL, [self] {
    // Give the server 2 seconds to send the reply:
    std::this_thread::sleep_for(std::chrono::seconds(2));
    // Go down:
    ApplicationServer::server->beginShutdown();
   });
+  if (queued) {
+    VPackBuilder result;
+    result.add(VPackValue("OK"));
+    generateResult(rest::ResponseCode::OK, result.slice());
+  } else {
+    generateError(rest::ResponseCode::SERVER_ERROR, TRI_ERROR_QUEUE_FULL);
+  }

   return RestStatus::DONE;
 }

@@ -143,14 +143,15 @@ void Scheduler::runCronThread() {
     }
   }
 }

-Scheduler::WorkHandle Scheduler::queueDelay(RequestLane lane, clock::duration delay,
-                                            std::function<void(bool cancelled)> handler) {
+std::pair<bool, Scheduler::WorkHandle> Scheduler::queueDelay(
+    RequestLane lane, clock::duration delay, std::function<void(bool cancelled)> handler) {
   TRI_ASSERT(!isStopping());

   if (delay < std::chrono::milliseconds(1)) {
     // execute directly
+    bool queued =
         queue(lane, [handler = std::move(handler)]() { handler(false); });
-    return nullptr;
+    return std::make_pair(queued, nullptr);
   }

   auto item = std::make_shared<WorkItem>(std::move(handler), lane, this);
@@ -164,8 +165,9 @@ Scheduler::WorkHandle Scheduler::queueDelay(RequestLane lane, clock::duration de
     }
   }

-  return item;
+  return std::make_pair(true, item);
 }

 /*
 void Scheduler::cancelAllTasks() {
   //std::unique_lock<std::mutex> guard(_cronQueueMutex);

@@ -25,10 +25,13 @@
 #ifndef ARANGOD_SCHEDULER_SCHEDULER_H
 #define ARANGOD_SCHEDULER_SCHEDULER_H 1

+#include <chrono>
 #include <condition_variable>
 #include <mutex>
 #include <queue>

+#include "Basics/Exceptions.h"
+#include "Basics/system-compiler.h"
 #include "GeneralServer/RequestLane.h"

 namespace arangodb {
@@ -37,6 +40,7 @@ namespace velocypack {
 class Builder;
 }

+class LogTopic;
 class SchedulerThread;
 class SchedulerCronThread;

@@ -54,12 +58,15 @@ class Scheduler {
   typedef std::shared_ptr<WorkItem> WorkHandle;

   // Enqueues a task - this is implemented on the specific scheduler
-  virtual bool queue(RequestLane lane, std::function<void()>, bool allowDirectHandling = false) = 0;
+  // May throw.
+  virtual bool queue(RequestLane lane, std::function<void()>,
+                     bool allowDirectHandling = false) ADB_WARN_UNUSED_RESULT = 0;

   // Enqueues a task after delay - this uses the queue functions above.
   // WorkHandle is a shared_ptr to a WorkItem. If all references the WorkItem
-  // are dropped, the task is canceled.
-  virtual WorkHandle queueDelay(RequestLane lane, clock::duration delay,
+  // are dropped, the task is canceled. It will return true if queued, false
+  // otherwise.
+  virtual std::pair<bool, WorkHandle> queueDelay(RequestLane lane, clock::duration delay,
                                 std::function<void(bool canceled)> handler);

   class WorkItem final {
@@ -96,9 +103,11 @@ class Scheduler {
      // The following code moves the _handler into the Scheduler.
      // Thus any reference to class to self in the _handler will be released
      // as soon as the scheduler executed the _handler lambda.
-     _scheduler->queue(_lane, [handler = std::move(_handler), arg]() {
-       handler(arg);
-     });
+     bool queued = _scheduler->queue(_lane, [handler = std::move(_handler),
+                                             arg]() { handler(arg); });
+     if (!queued) {
+       THROW_ARANGO_EXCEPTION(TRI_ERROR_QUEUE_FULL);
+     }
     }
   }
 #ifdef ARANGODB_ENABLE_MAINTAINER_MODE

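With queue() now marked ADB_WARN_UNUSED_RESULT and queueDelay() returning a std::pair<bool, WorkHandle>, every call site has to consume the flag and decide what a full queue means for it. The snippet below is a condensed restatement of the call-site pattern used throughout the hunks in this commit; it assumes the surrounding ArangoDB headers and the arangodb namespace and is not compilable on its own.

// Condensed call-site pattern from this commit (assumes ArangoDB headers).
bool queued = false;
Scheduler::WorkHandle handle;
std::tie(queued, handle) = SchedulerFeature::SCHEDULER->queueDelay(
    RequestLane::INTERNAL_LOW, std::chrono::seconds(1), [](bool cancelled) {
      if (!cancelled) {
        // do the delayed work
      }
    });
if (!queued) {
  // react explicitly instead of silently dropping the task
  THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_QUEUE_FULL,
                                 "No thread available to queue task");
}
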
@@ -23,12 +23,39 @@
 #include "ManagerFeature.h"

 #include "ApplicationFeatures/ApplicationServer.h"
+#include "Basics/FunctionUtils.h"
 #include "Basics/MutexLocker.h"
+#include "Basics/application-exit.h"
 #include "Scheduler/SchedulerFeature.h"
 #include "StorageEngine/EngineSelectorFeature.h"
 #include "StorageEngine/StorageEngine.h"
 #include "Transaction/Manager.h"

+namespace {
+void queueGarbageCollection(std::mutex& mutex, arangodb::Scheduler::WorkHandle& workItem,
+                            std::function<void(bool)>& gcfunc) {
+  bool queued = false;
+  {
+    std::lock_guard<std::mutex> guard(mutex);
+    std::tie(queued, workItem) =
+        arangodb::basics::function_utils::retryUntilTimeout<arangodb::Scheduler::WorkHandle>(
+            [&gcfunc]() -> std::pair<bool, arangodb::Scheduler::WorkHandle> {
+              auto off = std::chrono::seconds(1);
+              return arangodb::SchedulerFeature::SCHEDULER->queueDelay(arangodb::RequestLane::INTERNAL_LOW,
+                                                                       off, gcfunc);
+            },
+            arangodb::Logger::TRANSACTIONS,
+            "queue transaction garbage collection");
+  }
+  if (!queued) {
+    LOG_TOPIC("f8b3d", FATAL, arangodb::Logger::TRANSACTIONS)
+        << "Failed to queue transaction garbage collection, for 5 minutes, "
+           "exiting.";
+    FATAL_ERROR_EXIT();
+  }
+}
+}  // namespace
+
 using namespace arangodb::application_features;
 using namespace arangodb::basics;
 using namespace arangodb::options;
@@ -53,11 +80,8 @@ ManagerFeature::ManagerFeature(application_features::ApplicationServer& server)

     MANAGER->garbageCollect(/*abortAll*/false);

-    auto off = std::chrono::seconds(1);
-
-    std::lock_guard<std::mutex> guard(_workItemMutex);
     if (!ApplicationServer::isStopping()) {
-      _workItem = SchedulerFeature::SCHEDULER->queueDelay(RequestLane::INTERNAL_LOW, off, _gcfunc);
+      ::queueGarbageCollection(_workItemMutex, _workItem, _gcfunc);
     }
   };
 }
@@ -71,9 +95,7 @@ void ManagerFeature::prepare() {
 void ManagerFeature::start() {
   Scheduler* scheduler = SchedulerFeature::SCHEDULER;
   if (scheduler != nullptr) {  // is nullptr in catch tests
-    auto off = std::chrono::seconds(1);
-    std::lock_guard<std::mutex> guard(_workItemMutex);
-    _workItem = scheduler->queueDelay(RequestLane::INTERNAL_LOW, off, _gcfunc);
+    ::queueGarbageCollection(_workItemMutex, _workItem, _gcfunc);
   }
 }

@@ -29,6 +29,7 @@
 #include <velocypack/Builder.h>
 #include <velocypack/velocypack-aliases.h>

+#include "Basics/FunctionUtils.h"
 #include "Basics/StringUtils.h"
 #include "Basics/tri-strings.h"
 #include "Cluster/ServerState.h"
@@ -296,20 +297,35 @@ std::function<void(bool cancelled)> Task::callbackFunction() {
     }

     // now do the work:
-    SchedulerFeature::SCHEDULER->queue(RequestLane::INTERNAL_LOW, [self, this, execContext] {
+    bool queued = basics::function_utils::retryUntilTimeout(
+        [this, self, execContext]() -> bool {
+          return SchedulerFeature::SCHEDULER->queue(RequestLane::INTERNAL_LOW, [self, this, execContext] {
            ExecContextScope scope(_user.empty() ? ExecContext::superuser()
                                                 : execContext.get());
            work(execContext.get());

            if (_periodic.load() && !application_features::ApplicationServer::isStopping()) {
              // requeue the task
-             queue(_interval);
+             bool queued = basics::function_utils::retryUntilTimeout(
+                 [this]() -> bool { return queue(_interval); }, Logger::FIXME,
+                 "queue task");
+             if (!queued) {
+               THROW_ARANGO_EXCEPTION_MESSAGE(
+                   TRI_ERROR_QUEUE_FULL,
+                   "Failed to queue task for 5 minutes, gave up.");
+             }
            } else {
              // in case of one-off tasks or in case of a shutdown, simply
              // remove the task from the list
              Task::unregisterTask(_id, true);
            }
          });
+        },
+        Logger::FIXME, "queue task");
+    if (!queued) {
+      THROW_ARANGO_EXCEPTION_MESSAGE(
+          TRI_ERROR_QUEUE_FULL, "Failed to queue task for 5 minutes, gave up.");
+    }
   };
 }

@@ -327,13 +343,21 @@ void Task::start() {
   }

   // initially queue the task
-  queue(_offset);
+  bool queued = basics::function_utils::retryUntilTimeout(
+      [this]() -> bool { return queue(_offset); }, Logger::FIXME, "queue task");
+  if (!queued) {
+    THROW_ARANGO_EXCEPTION_MESSAGE(
+        TRI_ERROR_QUEUE_FULL, "Failed to queue task for 5 minutes, gave up.");
+  }
 }

-void Task::queue(std::chrono::microseconds offset) {
+bool Task::queue(std::chrono::microseconds offset) {
   MUTEX_LOCKER(lock, _taskHandleMutex);
-  _taskHandle = SchedulerFeature::SCHEDULER->queueDelay(RequestLane::INTERNAL_LOW,
-                                                        offset, callbackFunction());
+  bool queued = false;
+  std::tie(queued, _taskHandle) =
+      SchedulerFeature::SCHEDULER->queueDelay(RequestLane::INTERNAL_LOW, offset,
+                                              callbackFunction());
+  return queued;
 }

 void Task::cancel() {
@@ -82,7 +82,7 @@ class Task : public std::enable_shared_from_this<Task> {
  private:
   void toVelocyPack(velocypack::Builder&) const;
   void work(ExecContext const*);
-  void queue(std::chrono::microseconds offset);
+  bool queue(std::chrono::microseconds offset) ADB_WARN_UNUSED_RESULT;
   std::function<void(bool cancelled)> callbackFunction();
   std::string const& name() const { return _name; }

@@ -0,0 +1,59 @@
+////////////////////////////////////////////////////////////////////////////////
+/// DISCLAIMER
+///
+/// Copyright 2014-2019 ArangoDB GmbH, Cologne, Germany
+/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
+///
+/// Licensed under the Apache License, Version 2.0 (the "License");
+/// you may not use this file except in compliance with the License.
+/// You may obtain a copy of the License at
+///
+///     http://www.apache.org/licenses/LICENSE-2.0
+///
+/// Unless required by applicable law or agreed to in writing, software
+/// distributed under the License is distributed on an "AS IS" BASIS,
+/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+/// See the License for the specific language governing permissions and
+/// limitations under the License.
+///
+/// Copyright holder is ArangoDB GmbH, Cologne, Germany
+///
+/// @author Dan Larkin-York
+////////////////////////////////////////////////////////////////////////////////
+
+#include "FunctionUtils.h"
+
+namespace arangodb {
+namespace basics {
+namespace function_utils {
+
+/**
+ * @brief Execute a lambda, retrying periodically until it succeeds or times out
+ * @param fn Lambda to run
+ * @param topic Log topic for any messages
+ * @param message Description for log messages (format below)
+ * @param retryInterval Period to wait between attempts
+ * @param timeout Total time to wait before timing out and returning
+ *
+ * If a given attempt fails, a log message will be made in the following form:
+ * "Failed to " + message + ", waiting to retry..."
+ */
+bool retryUntilTimeout(std::function<bool()> fn, LogTopic& topic,
+                       std::string const& message, std::chrono::nanoseconds retryInterval,
+                       std::chrono::nanoseconds timeout) {
+  auto start = std::chrono::steady_clock::now();
+  bool success = false;
+  while ((std::chrono::steady_clock::now() - start) < timeout) {
+    success = fn();
+    if (success) {
+      break;
+    }
+    LOG_TOPIC("18d0b", INFO, topic) << "Failed to " << message << ", waiting to retry...";
+    std::this_thread::sleep_for(retryInterval);
+  }
+  return success;
+}
+
+}  // namespace function_utils
+}  // namespace basics
+}  // namespace arangodb

@@ -0,0 +1,88 @@
+////////////////////////////////////////////////////////////////////////////////
+/// DISCLAIMER
+///
+/// Copyright 2014-2019 ArangoDB GmbH, Cologne, Germany
+/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
+///
+/// Licensed under the Apache License, Version 2.0 (the "License");
+/// you may not use this file except in compliance with the License.
+/// You may obtain a copy of the License at
+///
+///     http://www.apache.org/licenses/LICENSE-2.0
+///
+/// Unless required by applicable law or agreed to in writing, software
+/// distributed under the License is distributed on an "AS IS" BASIS,
+/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+/// See the License for the specific language governing permissions and
+/// limitations under the License.
+///
+/// Copyright holder is ArangoDB GmbH, Cologne, Germany
+///
+/// @author Dan Larkin-York
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef ARANGODB_BASICS_FUNCTION_UTILS_H
+#define ARANGODB_BASICS_FUNCTION_UTILS_H 1
+
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <thread>
+#include <tuple>
+
+#include "Logger/LogMacros.h"
+
+namespace arangodb {
+namespace basics {
+namespace function_utils {
+
+/**
+ * @brief Execute a lambda, retrying periodically until it succeeds or times out
+ * @param fn Lambda to run
+ * @param topic Log topic for any messages
+ * @param message Description for log messages (format below)
+ * @param retryInterval Period to wait between attempts
+ * @param timeout Total time to wait before timing out and returning
+ *
+ * If a given attempt fails, a log message will be made in the following form:
+ * "Failed to " + message + ", waiting to retry..."
+ */
+template <typename R>
+std::pair<bool, R> retryUntilTimeout(
+    std::function<std::pair<bool, R>()> fn, LogTopic& topic, std::string const& message,
+    std::chrono::nanoseconds retryInterval = std::chrono::seconds(1),
+    std::chrono::nanoseconds timeout = std::chrono::minutes(5)) {
+  auto start = std::chrono::steady_clock::now();
+  bool success = false;
+  R value{};
+  while ((std::chrono::steady_clock::now() - start) < timeout) {
+    std::tie(success, value) = fn();
+    if (success) {
+      break;
+    }
+    LOG_TOPIC("18d0a", INFO, topic) << "Failed to " << message << ", waiting to retry...";
+    std::this_thread::sleep_for(retryInterval);
+  }
+  return std::make_pair(success, value);
+}
+
+/**
+ * @brief Execute a lambda, retrying periodically until it succeeds or times out
+ * @param fn Lambda to run
+ * @param topic Log topic for any messages
+ * @param message Description for log messages (format below)
+ * @param retryInterval Period to wait between attempts
+ * @param timeout Total time to wait before timing out and returning
+ *
+ * If a given attempt fails, a log message will be made in the following form:
+ * "Failed to " + message + ", waiting to retry..."
+ */
+bool retryUntilTimeout(std::function<bool()> fn, LogTopic& topic, std::string const& message,
+                       std::chrono::nanoseconds retryInterval = std::chrono::seconds(1),
+                       std::chrono::nanoseconds timeout = std::chrono::minutes(5));
+
+}  // namespace function_utils
+}  // namespace basics
+}  // namespace arangodb
+
+#endif

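A condensed usage sketch of the templated retryUntilTimeout(), modeled on the transaction garbage-collection and replication call sites earlier in this commit. The lane, delay, and log message here are illustrative; the snippet assumes the ArangoDB headers above and is not compilable on its own.

// Condensed usage sketch (assumes ArangoDB headers; names partly illustrative).
bool ok = false;
arangodb::Scheduler::WorkHandle item;
std::tie(ok, item) =
    arangodb::basics::function_utils::retryUntilTimeout<arangodb::Scheduler::WorkHandle>(
        []() -> std::pair<bool, arangodb::Scheduler::WorkHandle> {
          // each attempt tries to queue the delayed work; queueDelay reports success
          return arangodb::SchedulerFeature::SCHEDULER->queueDelay(
              arangodb::RequestLane::INTERNAL_LOW, std::chrono::seconds(1),
              [](bool /*cancelled*/) { /* delayed work */ });
        },
        arangodb::Logger::FIXME, "queue example task");
if (!ok) {
  // after five minutes of retries the task still could not be queued
}
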
@@ -155,6 +155,7 @@ add_library(${LIB_ARANGO} STATIC
   Basics/DataProtector.cpp
   Basics/Exceptions.cpp
   Basics/FileUtils.cpp
+  Basics/FunctionUtils.cpp
   Basics/HybridLogicalClock.cpp
   Basics/LdapUrlParser.cpp
   Basics/LocalTaskQueue.cpp