Fix performRequests?

2016-10-13 10:57:29 +02:00 · 2016-10-13 10:57:29 +02:00 · b993de3c42
parent 3ee28ac333
commit b993de3c42
1 changed files with 68 additions and 45 deletions
--- a/arangod/Cluster/ClusterComm.cpp
+++ b/arangod/Cluster/ClusterComm.cpp
@ -995,15 +995,22 @@ size_t ClusterComm::performRequests(std::vector<ClusterCommRequest>& requests,
  std::unordered_map<OperationID, size_t> opIDtoIndex;
  try {
-    while (now <= endTime) {
+    while (true) {
      now = TRI_microtime();
      if (now > endTime) {
        break;
      }
      if (nrDone >= requests.size()) {
        // All good, report
        return nrGood;
      }
      double actionNeeded = endTime;
      // First send away what is due:
      for (size_t i = 0; i < requests.size(); i++) {
-        if (!requests[i].done && now >= dueTime[i]) {
+        if (!requests[i].done) {
          if (now >= dueTime[i]) {
            if (requests[i].headerFields.get() == nullptr) {
              requests[i].headerFields = std::make_unique<
                  std::unordered_map<std::string, std::string>>();
@ -1024,12 +1031,27 @@ size_t ClusterComm::performRequests(std::vector<ClusterCommRequest>& requests,
            opIDtoIndex.insert(std::make_pair(opId, i));
            // It is possible that an error occurs right away, we will notice
            // below after wait(), though, and retry in due course.
          } else if (dueTime[i] < actionNeeded) {
            actionNeeded = dueTime[i];
          }
        }
      }
      auto res = wait("", coordinatorTransactionID, 0, "", actionNeeded - now);
      // wait could have taken some time, so we need to update now now
      now = TRI_microtime();  
-      auto res =
+      // note that this is needed further below from here, so it is *not*
-          wait("", coordinatorTransactionID, 0, "", endTime - now);
+      // good enough to only update at the top of the loop!
      if (res.status == CL_COMM_DROPPED) {
        // this indicates that no request for this coordinatorTransactionID
        // is in flight, this is possible, since we might have scheduled
        // a retry later than now and simply wait till then
        if (now < actionNeeded) {
          usleep((actionNeeded - now) * 1000000);
        }
        continue;
      }
      auto it = opIDtoIndex.find(res.operationID);
      if (it == opIDtoIndex.end()) {
@ -1056,8 +1078,13 @@ size_t ClusterComm::performRequests(std::vector<ClusterCommRequest>& requests,
        requests[index].result = res;
        // In this case we will retry:
-          dueTime[index] =
+        double tryAgainAfter = now - startTime;
-              (std::min)(10.0, (std::max)(0.2, 2 * (now - startTime))) + now;
+        if (tryAgainAfter < 0.2) {
          tryAgainAfter = 0.2;
        } else if (tryAgainAfter > 10.0) {
          tryAgainAfter = 10.0;
        }
        dueTime[index] = tryAgainAfter + now;
        if (dueTime[index] >= endTime) {
          requests[index].done = true;
          nrDone++;
@ -1073,10 +1100,6 @@ size_t ClusterComm::performRequests(std::vector<ClusterCommRequest>& requests,
            << "got no answer from " << requests[index].destination << ":"
            << requests[index].path << " with error " << res.status;
      }
        if (nrDone >= requests.size()) {
          // We are done, all results are in!
          return nrGood;
        }
    }
  } catch (...) {
    LOG_TOPIC(ERR, Logger::CLUSTER) << "ClusterComm::performRequests: "