mirror of https://gitee.com/bigwinds/arangodb
Bug fix 3.2/supervision no longer fails to remove server from failed when back to good (#4211)
* let's not miss failedserver removal
* remove resetting of FailedServers in test code
* Only call abortRequestsToFailedServers at most every 3 seconds.
This commit is contained in:
parent 4b798028c9
commit 40b10a7fe2
@@ -274,13 +274,12 @@ void handleOnStatusDBServer(
   // New condition GOOD:
   if (transisted.status == Supervision::HEALTH_STATUS_GOOD) {
     if (snapshot.has(failedServerPath)) {
-      Builder del;
-      { VPackArrayBuilder c(&del);
-        { VPackObjectBuilder cc(&del);
-          del.add(VPackValue(failedServerPath));
-          { VPackObjectBuilder ccc(&del);
-            del.add("op", VPackValue("delete")); }}}
-      singleWriteTransaction(agent, del);
+      envelope = std::make_shared<VPackBuilder>();
+      { VPackArrayBuilder a(envelope.get());
+        { VPackObjectBuilder operations (envelope.get());
+          envelope->add(VPackValue(failedServerPath));
+          { VPackObjectBuilder ccc(envelope.get());
+            envelope->add("op", VPackValue("delete")); }}}
     }
   } else if ( // New state: FAILED persisted: GOOD (-> BAD)
     persisted.status == Supervision::HEALTH_STATUS_GOOD &&
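The hunk above stops issuing a separate singleWriteTransaction for the FailedServers cleanup and instead hands the delete operation back through the shared `envelope` builder, apparently so that it is folded into the supervision's regular report transaction (see the next hunk). Below is a minimal, self-contained sketch (not the ArangoDB source) of the VelocyPack document those nested builders produce, assuming the standalone velocypack headers; the server id in the path is a made-up example.

    #include <velocypack/Builder.h>
    #include <velocypack/velocypack-aliases.h>
    #include <iostream>
    #include <memory>
    #include <string>

    int main() {
      // Hypothetical path; in the real code this is failedServerPath.
      std::string failedServerPath = "/Target/FailedServers/PRMR-example";

      auto envelope = std::make_shared<VPackBuilder>();
      { VPackArrayBuilder a(envelope.get());             // outer transaction array
        { VPackObjectBuilder operations(envelope.get()); // operations object
          envelope->add(VPackValue(failedServerPath));   // key: the failed-server path
          { VPackObjectBuilder op(envelope.get());       // value: {"op": "delete"}
            envelope->add("op", VPackValue("delete"));
          }
        }
      }
      // Prints: [{"/Target/FailedServers/PRMR-example":{"op":"delete"}}]
      std::cout << envelope->slice().toJson() << std::endl;
      return 0;
    }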
@@ -464,7 +463,7 @@ std::vector<check_t> Supervision::check(std::string const& type) {
             pReport->add(i.key.copyString(), i.value);
           }
         }} // Operation
-        if (envelope != nullptr) { // Preconditions(Job)
+        if (envelope != nullptr && envelope->slice().length()>1) { // Preconditions(Job)
           TRI_ASSERT(
             envelope->slice().isArray() && envelope->slice()[1].isObject());
           pReport->add(envelope->slice()[1]);
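The widened condition matters because, after the first hunk, the health handler can hand back an envelope that holds only a delete operation and no precondition object, and the report code must then skip the precondition append. A minimal sketch of that guard under the same assumption (an envelope is an array of the form [operations] or [operations, preconditions]); the helper name appendPrecondition is hypothetical and only used for illustration.

    #include <velocypack/Builder.h>
    #include <velocypack/Parser.h>
    #include <velocypack/velocypack-aliases.h>
    #include <iostream>
    #include <memory>

    // Hypothetical helper: copy the precondition object (index 1) into the
    // report, but only if the envelope actually carries one.
    static void appendPrecondition(std::shared_ptr<VPackBuilder> const& envelope,
                                   VPackBuilder& report) {
      if (envelope != nullptr && envelope->slice().length() > 1) {
        report.add(envelope->slice()[1]);
      }
    }

    int main() {
      // Operations only, as built for the GOOD-again case above:
      auto opsOnly = VPackParser::fromJson(
          R"([{"/Target/FailedServers/X":{"op":"delete"}}])");
      // Operations plus preconditions, as a job would produce:
      auto withPre = VPackParser::fromJson(
          R"([{"some":"operation"},{"some":"precondition"}])");

      VPackBuilder report;
      { VPackArrayBuilder a(&report);
        appendPrecondition(opsOnly, report);   // skipped: length() == 1
        appendPrecondition(withPre, report);   // appended: length() == 2
      }
      std::cout << report.slice().toJson() << std::endl;  // [{"some":"precondition"}]
      return 0;
    }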
@@ -1257,9 +1257,14 @@ void ClusterCommThread::abortRequestsToFailedServers() {
 void ClusterCommThread::run() {
   LOG_TOPIC(DEBUG, Logger::CLUSTER) << "starting ClusterComm thread";
 
+  std::chrono::steady_clock clock;
+  auto lastAbortCheck = clock.now();
   while (!isStopping()) {
     try {
-      abortRequestsToFailedServers();
+      if (clock.now() - lastAbortCheck > std::chrono::duration<double>(3.0)) {
+        abortRequestsToFailedServers();
+        lastAbortCheck = clock.now();
+      }
       _cc->communicator()->work_once();
       _cc->communicator()->wait();
       LOG_TOPIC(TRACE, Logger::CLUSTER) << "done waiting in ClusterCommThread";
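This implements the commit message's third point: the abort scan is presumably too costly to run on every wake-up of the communicator loop, so it now runs at most once every 3 seconds. A minimal, standalone sketch of that rate-limiting pattern; the loop bound and the sleep are stand-ins for the real isStopping()/wait() cycle and are not ArangoDB code.

    #include <chrono>
    #include <iostream>
    #include <thread>

    int main() {
      std::chrono::steady_clock clock;   // monotonic clock, as in the hunk above
      auto lastAbortCheck = clock.now();

      for (int i = 0; i < 20; ++i) {     // stands in for while (!isStopping())
        // Run the expensive check at most once every 3 seconds, however often
        // the surrounding loop wakes up.
        if (clock.now() - lastAbortCheck > std::chrono::duration<double>(3.0)) {
          std::cout << "scan for requests to failed servers, iteration " << i << "\n";
          lastAbortCheck = clock.now();
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(500));  // loop work
      }
      return 0;
    }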
@@ -344,7 +344,7 @@ function SynchronousReplicationSuite () {
 
     tearDown : function () {
       db._drop(cn);
-      global.ArangoAgency.set('Target/FailedServers', {});
+      //global.ArangoAgency.set('Target/FailedServers', {});
     },
 
     ////////////////////////////////////////////////////////////////////////////////