mirror of https://gitee.com/bigwinds/arangodb
Bug fix 3.2: supervision no longer fails to remove a server from FailedServers when it is back to GOOD (#4211)
* let's not miss failedserver removal
* remove resetting of FailedServers in test code
* Only call abortRequestsToFailedServers at most every 3 seconds.
This commit is contained in:
parent
4b798028c9
commit
40b10a7fe2
|
@ -274,13 +274,12 @@ void handleOnStatusDBServer(
|
|||
// New condition GOOD:
|
||||
if (transisted.status == Supervision::HEALTH_STATUS_GOOD) {
|
||||
if (snapshot.has(failedServerPath)) {
|
||||
Builder del;
|
||||
{ VPackArrayBuilder c(&del);
|
||||
{ VPackObjectBuilder cc(&del);
|
||||
del.add(VPackValue(failedServerPath));
|
||||
{ VPackObjectBuilder ccc(&del);
|
||||
del.add("op", VPackValue("delete")); }}}
|
||||
singleWriteTransaction(agent, del);
|
||||
envelope = std::make_shared<VPackBuilder>();
|
||||
{ VPackArrayBuilder a(envelope.get());
|
||||
{ VPackObjectBuilder operations (envelope.get());
|
||||
envelope->add(VPackValue(failedServerPath));
|
||||
{ VPackObjectBuilder ccc(envelope.get());
|
||||
envelope->add("op", VPackValue("delete")); }}}
|
||||
}
|
||||
} else if ( // New state: FAILED persisted: GOOD (-> BAD)
|
||||
persisted.status == Supervision::HEALTH_STATUS_GOOD &&
|
||||
|
@ -464,7 +463,7 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
|||
pReport->add(i.key.copyString(), i.value);
|
||||
}
|
||||
}} // Operation
|
||||
if (envelope != nullptr) { // Preconditions(Job)
|
||||
if (envelope != nullptr && envelope->slice().length()>1) { // Preconditions(Job)
|
||||
TRI_ASSERT(
|
||||
envelope->slice().isArray() && envelope->slice()[1].isObject());
|
||||
pReport->add(envelope->slice()[1]);
|
||||
|
|
|
@ -1257,9 +1257,14 @@ void ClusterCommThread::abortRequestsToFailedServers() {
|
|||
void ClusterCommThread::run() {
|
||||
LOG_TOPIC(DEBUG, Logger::CLUSTER) << "starting ClusterComm thread";
|
||||
|
||||
std::chrono::steady_clock clock;
|
||||
auto lastAbortCheck = clock.now();
|
||||
while (!isStopping()) {
|
||||
try {
|
||||
abortRequestsToFailedServers();
|
||||
if (clock.now() - lastAbortCheck > std::chrono::duration<double>(3.0)) {
|
||||
abortRequestsToFailedServers();
|
||||
lastAbortCheck = clock.now();
|
||||
}
|
||||
_cc->communicator()->work_once();
|
||||
_cc->communicator()->wait();
|
||||
LOG_TOPIC(TRACE, Logger::CLUSTER) << "done waiting in ClusterCommThread";
|
||||
|
|
|
@ -344,7 +344,7 @@ function SynchronousReplicationSuite () {
|
|||
|
||||
tearDown : function () {
|
||||
db._drop(cn);
|
||||
global.ArangoAgency.set('Target/FailedServers', {});
|
||||
//global.ArangoAgency.set('Target/FailedServers', {});
|
||||
},
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
Loading…
Reference in New Issue