1
0
Fork 0

Bug fix 3.2/supervision: no longer fails to remove a server from FailedServers when it is back to GOOD (#4211)

* Do not miss the removal of a server from FailedServers
* remove resetting of FailedServers in test code
* Call abortRequestsToFailedServers at most once every 3 seconds.
This commit is contained in:
Kaveh Vahedipour 2018-01-03 23:18:11 +01:00 committed by Max Neunhöffer
parent 4b798028c9
commit 40b10a7fe2
3 changed files with 14 additions and 10 deletions

View File

@ -274,13 +274,12 @@ void handleOnStatusDBServer(
// New condition GOOD: // New condition GOOD:
if (transisted.status == Supervision::HEALTH_STATUS_GOOD) { if (transisted.status == Supervision::HEALTH_STATUS_GOOD) {
if (snapshot.has(failedServerPath)) { if (snapshot.has(failedServerPath)) {
Builder del; envelope = std::make_shared<VPackBuilder>();
{ VPackArrayBuilder c(&del); { VPackArrayBuilder a(envelope.get());
{ VPackObjectBuilder cc(&del); { VPackObjectBuilder operations (envelope.get());
del.add(VPackValue(failedServerPath)); envelope->add(VPackValue(failedServerPath));
{ VPackObjectBuilder ccc(&del); { VPackObjectBuilder ccc(envelope.get());
del.add("op", VPackValue("delete")); }}} envelope->add("op", VPackValue("delete")); }}}
singleWriteTransaction(agent, del);
} }
} else if ( // New state: FAILED persisted: GOOD (-> BAD) } else if ( // New state: FAILED persisted: GOOD (-> BAD)
persisted.status == Supervision::HEALTH_STATUS_GOOD && persisted.status == Supervision::HEALTH_STATUS_GOOD &&
@ -464,7 +463,7 @@ std::vector<check_t> Supervision::check(std::string const& type) {
pReport->add(i.key.copyString(), i.value); pReport->add(i.key.copyString(), i.value);
} }
}} // Operation }} // Operation
if (envelope != nullptr) { // Preconditions(Job) if (envelope != nullptr && envelope->slice().length()>1) { // Preconditions(Job)
TRI_ASSERT( TRI_ASSERT(
envelope->slice().isArray() && envelope->slice()[1].isObject()); envelope->slice().isArray() && envelope->slice()[1].isObject());
pReport->add(envelope->slice()[1]); pReport->add(envelope->slice()[1]);

View File

@ -1257,9 +1257,14 @@ void ClusterCommThread::abortRequestsToFailedServers() {
void ClusterCommThread::run() { void ClusterCommThread::run() {
LOG_TOPIC(DEBUG, Logger::CLUSTER) << "starting ClusterComm thread"; LOG_TOPIC(DEBUG, Logger::CLUSTER) << "starting ClusterComm thread";
std::chrono::steady_clock clock;
auto lastAbortCheck = clock.now();
while (!isStopping()) { while (!isStopping()) {
try { try {
abortRequestsToFailedServers(); if (clock.now() - lastAbortCheck > std::chrono::duration<double>(3.0)) {
abortRequestsToFailedServers();
lastAbortCheck = clock.now();
}
_cc->communicator()->work_once(); _cc->communicator()->work_once();
_cc->communicator()->wait(); _cc->communicator()->wait();
LOG_TOPIC(TRACE, Logger::CLUSTER) << "done waiting in ClusterCommThread"; LOG_TOPIC(TRACE, Logger::CLUSTER) << "done waiting in ClusterCommThread";

View File

@ -344,7 +344,7 @@ function SynchronousReplicationSuite () {
tearDown : function () { tearDown : function () {
db._drop(cn); db._drop(cn);
global.ArangoAgency.set('Target/FailedServers', {}); //global.ArangoAgency.set('Target/FailedServers', {});
}, },
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////