mirror of https://gitee.com/bigwinds/arangodb
bugfix: establish unique function name & implementation for communication retry status (#6150)
* Initial check-in of isRetryOK(). Includes fixes to known code that has previously hung shutdowns by performing infinite retries. * Slight help with getting out of a loop faster during shutdown; not essential.
This commit is contained in:
parent
a5ef080a8a
commit
86ea784372
|
@ -1359,7 +1359,8 @@ AgencyCommResult AgencyComm::sendWithFailover(
|
||||||
auto serverFeature =
|
auto serverFeature =
|
||||||
application_features::ApplicationServer::getFeature<ServerFeature>(
|
application_features::ApplicationServer::getFeature<ServerFeature>(
|
||||||
"Server");
|
"Server");
|
||||||
if (serverFeature->isStopping()) {
|
if (serverFeature->isStopping()
|
||||||
|
|| !application_features::ApplicationServer::isRetryOK()) {
|
||||||
LOG_TOPIC(INFO, Logger::AGENCYCOMM)
|
LOG_TOPIC(INFO, Logger::AGENCYCOMM)
|
||||||
<< "Unsuccessful AgencyComm: Timeout because of shutdown "
|
<< "Unsuccessful AgencyComm: Timeout because of shutdown "
|
||||||
<< "errorCode: " << result.errorCode()
|
<< "errorCode: " << result.errorCode()
|
||||||
|
|
|
@ -30,6 +30,7 @@
|
||||||
#include <velocypack/Parser.h>
|
#include <velocypack/Parser.h>
|
||||||
#include <velocypack/velocypack-aliases.h>
|
#include <velocypack/velocypack-aliases.h>
|
||||||
|
|
||||||
|
#include "ApplicationFeatures/ApplicationServer.h"
|
||||||
#include "Basics/ConditionLocker.h"
|
#include "Basics/ConditionLocker.h"
|
||||||
#include "Basics/MutexLocker.h"
|
#include "Basics/MutexLocker.h"
|
||||||
#include "Logger/Logger.h"
|
#include "Logger/Logger.h"
|
||||||
|
@ -123,7 +124,8 @@ bool AgencyCallback::execute(std::shared_ptr<VPackBuilder> newData) {
|
||||||
void AgencyCallback::executeByCallbackOrTimeout(double maxTimeout) {
|
void AgencyCallback::executeByCallbackOrTimeout(double maxTimeout) {
|
||||||
// One needs to acquire the mutex of the condition variable
|
// One needs to acquire the mutex of the condition variable
|
||||||
// before entering this function!
|
// before entering this function!
|
||||||
if (!_cv.wait(static_cast<uint64_t>(maxTimeout * 1000000.0))) {
|
if (!_cv.wait(static_cast<uint64_t>(maxTimeout * 1000000.0))
|
||||||
|
&& application_features::ApplicationServer::isRetryOK()) {
|
||||||
LOG_TOPIC(DEBUG, Logger::CLUSTER)
|
LOG_TOPIC(DEBUG, Logger::CLUSTER)
|
||||||
<< "Waiting done and nothing happended. Refetching to be sure";
|
<< "Waiting done and nothing happended. Refetching to be sure";
|
||||||
// mop: watches have not triggered during our sleep...recheck to be sure
|
// mop: watches have not triggered during our sleep...recheck to be sure
|
||||||
|
|
|
@ -1362,6 +1362,9 @@ int ClusterInfo::createDatabaseCoordinator(std::string const& name,
|
||||||
|
|
||||||
agencyCallback->executeByCallbackOrTimeout(getReloadServerListTimeout() /
|
agencyCallback->executeByCallbackOrTimeout(getReloadServerListTimeout() /
|
||||||
interval);
|
interval);
|
||||||
|
if (!application_features::ApplicationServer::isRetryOK()) {
|
||||||
|
return setErrormsg(TRI_ERROR_CLUSTER_TIMEOUT, errorMsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1450,6 +1453,9 @@ int ClusterInfo::dropDatabaseCoordinator(std::string const& name,
|
||||||
}
|
}
|
||||||
|
|
||||||
agencyCallback->executeByCallbackOrTimeout(interval);
|
agencyCallback->executeByCallbackOrTimeout(interval);
|
||||||
|
if (!application_features::ApplicationServer::isRetryOK()) {
|
||||||
|
return setErrormsg(TRI_ERROR_CLUSTER_TIMEOUT, errorMsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1753,6 +1759,9 @@ int ClusterInfo::createCollectionCoordinator(std::string const& databaseName,
|
||||||
}
|
}
|
||||||
|
|
||||||
agencyCallback->executeByCallbackOrTimeout(interval);
|
agencyCallback->executeByCallbackOrTimeout(interval);
|
||||||
|
if (!application_features::ApplicationServer::isRetryOK()) {
|
||||||
|
return setErrormsg(TRI_ERROR_CLUSTER_TIMEOUT, errorMsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1901,6 +1910,9 @@ int ClusterInfo::dropCollectionCoordinator(
|
||||||
}
|
}
|
||||||
|
|
||||||
agencyCallback->executeByCallbackOrTimeout(interval);
|
agencyCallback->executeByCallbackOrTimeout(interval);
|
||||||
|
if (!application_features::ApplicationServer::isRetryOK()) {
|
||||||
|
return setErrormsg(TRI_ERROR_CLUSTER_TIMEOUT, errorMsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2291,13 +2303,13 @@ int ClusterInfo::ensureIndexCoordinator(
|
||||||
try {
|
try {
|
||||||
auto start = std::chrono::steady_clock::now();
|
auto start = std::chrono::steady_clock::now();
|
||||||
// Keep trying for 2 minutes, if it's preconditions, which are stopping us
|
// Keep trying for 2 minutes, if it's preconditions, which are stopping us
|
||||||
while (true) {
|
while (true) {
|
||||||
resultBuilder.clear();
|
resultBuilder.clear();
|
||||||
errorCode = ensureIndexCoordinatorWithoutRollback(
|
errorCode = ensureIndexCoordinatorWithoutRollback(
|
||||||
databaseName, collectionID, idString, slice, create, compare,
|
databaseName, collectionID, idString, slice, create, compare,
|
||||||
resultBuilder, errorMsg, timeout);
|
resultBuilder, errorMsg, timeout);
|
||||||
|
|
||||||
if (errorCode == (int)arangodb::rest::ResponseCode::PRECONDITION_FAILED) {
|
if (errorCode == (int)arangodb::rest::ResponseCode::PRECONDITION_FAILED) {
|
||||||
if (std::chrono::duration_cast<std::chrono::seconds>(
|
if (std::chrono::duration_cast<std::chrono::seconds>(
|
||||||
std::chrono::steady_clock::now()-start).count() < 120) {
|
std::chrono::steady_clock::now()-start).count() < 120) {
|
||||||
std::chrono::duration<size_t, std::milli>
|
std::chrono::duration<size_t, std::milli>
|
||||||
|
@ -2675,6 +2687,9 @@ int ClusterInfo::ensureIndexCoordinatorWithoutRollback(
|
||||||
}
|
}
|
||||||
|
|
||||||
agencyCallback->executeByCallbackOrTimeout(interval);
|
agencyCallback->executeByCallbackOrTimeout(interval);
|
||||||
|
if (!application_features::ApplicationServer::isRetryOK()) {
|
||||||
|
return setErrormsg(TRI_ERROR_CLUSTER_TIMEOUT, errorMsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2913,6 +2928,9 @@ int ClusterInfo::dropIndexCoordinator(std::string const& databaseName,
|
||||||
}
|
}
|
||||||
|
|
||||||
agencyCallback->executeByCallbackOrTimeout(interval);
|
agencyCallback->executeByCallbackOrTimeout(interval);
|
||||||
|
if (!application_features::ApplicationServer::isRetryOK()) {
|
||||||
|
return setErrormsg(TRI_ERROR_CLUSTER_TIMEOUT, errorMsg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -264,7 +264,7 @@ bool FollowerInfo::remove(ServerID const& sid) {
|
||||||
break; //
|
break; //
|
||||||
} else {
|
} else {
|
||||||
LOG_TOPIC(WARN, Logger::CLUSTER)
|
LOG_TOPIC(WARN, Logger::CLUSTER)
|
||||||
<< "FollowerInfo::remove, could not cas key " << path
|
<< "FollowerInfo::remove, could not cas key " << path
|
||||||
<< ". status code: " << res2._statusCode << ", incriminating body: " << res2.bodyRef();
|
<< ". status code: " << res2._statusCode << ", incriminating body: " << res2.bodyRef();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -273,7 +273,8 @@ bool FollowerInfo::remove(ServerID const& sid) {
|
||||||
<< path << " in agency.";
|
<< path << " in agency.";
|
||||||
}
|
}
|
||||||
std::this_thread::sleep_for(std::chrono::microseconds(500000));
|
std::this_thread::sleep_for(std::chrono::microseconds(500000));
|
||||||
} while (TRI_microtime() < startTime + 30);
|
} while (TRI_microtime() < startTime + 30
|
||||||
|
&& application_features::ApplicationServer::isRetryOK());
|
||||||
if (!success) {
|
if (!success) {
|
||||||
_followers = _oldFollowers;
|
_followers = _oldFollowers;
|
||||||
LOG_TOPIC(ERR, Logger::CLUSTER)
|
LOG_TOPIC(ERR, Logger::CLUSTER)
|
||||||
|
|
|
@ -136,6 +136,16 @@ class ApplicationServer {
|
||||||
return server != nullptr && server->_stopping.load();
|
return server != nullptr && server->_stopping.load();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Today this static function is simply the negation of isStopping(). The
|
||||||
|
// function name 'isStopping()' is defined in other classes and
|
||||||
|
// can cause scope confusion. It also causes confusion as to when
|
||||||
|
// the application versus an individual feature or thread has begun
|
||||||
|
// stopping. This function is intended to be used within communication
|
||||||
|
// retry loops where infinite retries have previously blocked clean "stopping".
|
||||||
|
static bool isRetryOK() {
|
||||||
|
return !isStopping();
|
||||||
|
}
|
||||||
|
|
||||||
static bool isPrepared() {
|
static bool isPrepared() {
|
||||||
if (server != nullptr) {
|
if (server != nullptr) {
|
||||||
ServerState tmp = server->_state.load(std::memory_order_relaxed);
|
ServerState tmp = server->_state.load(std::memory_order_relaxed);
|
||||||
|
@ -250,7 +260,7 @@ class ApplicationServer {
|
||||||
void registerFailCallback(std::function<void(std::string const&)> const& callback) {
|
void registerFailCallback(std::function<void(std::string const&)> const& callback) {
|
||||||
fail = callback;
|
fail = callback;
|
||||||
}
|
}
|
||||||
|
|
||||||
// setup and validate all feature dependencies, determine feature order
|
// setup and validate all feature dependencies, determine feature order
|
||||||
void setupDependencies(bool failOnMissing);
|
void setupDependencies(bool failOnMissing);
|
||||||
|
|
||||||
|
@ -320,7 +330,7 @@ class ApplicationServer {
|
||||||
|
|
||||||
// features order for prepare/start
|
// features order for prepare/start
|
||||||
std::vector<ApplicationFeature*> _orderedFeatures;
|
std::vector<ApplicationFeature*> _orderedFeatures;
|
||||||
|
|
||||||
// will be signalled when the application server is asked to shut down
|
// will be signalled when the application server is asked to shut down
|
||||||
basics::ConditionVariable _shutdownCondition;
|
basics::ConditionVariable _shutdownCondition;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue