1
0
Fork 0

Bug fix 3.2: supervision no longer fails to remove a server from FailedServers when it is back to GOOD (#4211)

* Do not miss the removal of a server from FailedServers
* remove resetting of FailedServers in test code
* Call abortRequestsToFailedServers at most once every 3 seconds.
This commit is contained in:
Kaveh Vahedipour 2018-01-03 23:18:11 +01:00 committed by Max Neunhöffer
parent 4b798028c9
commit 40b10a7fe2
3 changed files with 14 additions and 10 deletions

View File

@ -274,13 +274,12 @@ void handleOnStatusDBServer(
// New condition GOOD:
if (transisted.status == Supervision::HEALTH_STATUS_GOOD) {
if (snapshot.has(failedServerPath)) {
Builder del;
{ VPackArrayBuilder c(&del);
{ VPackObjectBuilder cc(&del);
del.add(VPackValue(failedServerPath));
{ VPackObjectBuilder ccc(&del);
del.add("op", VPackValue("delete")); }}}
singleWriteTransaction(agent, del);
envelope = std::make_shared<VPackBuilder>();
{ VPackArrayBuilder a(envelope.get());
{ VPackObjectBuilder operations (envelope.get());
envelope->add(VPackValue(failedServerPath));
{ VPackObjectBuilder ccc(envelope.get());
envelope->add("op", VPackValue("delete")); }}}
}
} else if ( // New state: FAILED persisted: GOOD (-> BAD)
persisted.status == Supervision::HEALTH_STATUS_GOOD &&
@ -464,7 +463,7 @@ std::vector<check_t> Supervision::check(std::string const& type) {
pReport->add(i.key.copyString(), i.value);
}
}} // Operation
if (envelope != nullptr) { // Preconditions(Job)
if (envelope != nullptr && envelope->slice().length()>1) { // Preconditions(Job)
TRI_ASSERT(
envelope->slice().isArray() && envelope->slice()[1].isObject());
pReport->add(envelope->slice()[1]);

View File

@ -1257,9 +1257,14 @@ void ClusterCommThread::abortRequestsToFailedServers() {
void ClusterCommThread::run() {
LOG_TOPIC(DEBUG, Logger::CLUSTER) << "starting ClusterComm thread";
std::chrono::steady_clock clock;
auto lastAbortCheck = clock.now();
while (!isStopping()) {
try {
abortRequestsToFailedServers();
if (clock.now() - lastAbortCheck > std::chrono::duration<double>(3.0)) {
abortRequestsToFailedServers();
lastAbortCheck = clock.now();
}
_cc->communicator()->work_once();
_cc->communicator()->wait();
LOG_TOPIC(TRACE, Logger::CLUSTER) << "done waiting in ClusterCommThread";

View File

@ -344,7 +344,7 @@ function SynchronousReplicationSuite () {
tearDown : function () {
db._drop(cn);
global.ArangoAgency.set('Target/FailedServers', {});
//global.ArangoAgency.set('Target/FailedServers', {});
},
////////////////////////////////////////////////////////////////////////////////