mirror of https://gitee.com/bigwinds/arangodb
make replication timeouts configurable via startup options (#10473)
* make replication timeouts configurable via startup options The following options are available (for active failover and master-slave replication): --replication.connect-timeout --replication.request-timeout Values can be specified in seconds. If these options are used, they will be used for replication requests, overriding any hard-coded defaults or explicitly configured timeouts. Additionally, this change increases the default request timeout for replication from 10 minutes to 20 minutes. * do *not* change default value for timeouts * make tests work again * Update CHANGELOG
This commit is contained in:
parent
23fbe0af54
commit
81287811fe
|
@ -1,6 +1,12 @@
|
|||
v3.5.3 (XXXX-XX-XX)
|
||||
-------------------
|
||||
|
||||
* Make the timeouts for replication requests (for active failover and master-slave
|
||||
replication) configurable via startup options:
|
||||
|
||||
--replication.connect-timeout
|
||||
--replication.request-timeout
|
||||
|
||||
* Fixed internal issue #4647: dead Coordinators are not removed from the agency.
|
||||
|
||||
* Fixed UPSERT matching.
|
||||
|
|
|
@ -772,7 +772,13 @@ void HeartbeatThread::runSingleServer() {
|
|||
config._idleMinWaitTime = 250 * 1000; // 250ms
|
||||
config._idleMaxWaitTime = 3 * 1000 * 1000; // 3s
|
||||
TRI_ASSERT(!config._skipCreateDrop);
|
||||
config._includeFoxxQueues = true; // sync _queues and _jobs
|
||||
config._includeFoxxQueues = true; // sync _queues and _jobs
|
||||
|
||||
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
|
||||
if (feature != nullptr) {
|
||||
config._connectTimeout = feature->checkConnectTimeout(config._connectTimeout);
|
||||
config._requestTimeout = feature->checkRequestTimeout(config._requestTimeout);
|
||||
}
|
||||
|
||||
applier->forget(); // forget about any existing configuration
|
||||
applier->reconfigure(config);
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "Basics/Exceptions.h"
|
||||
#include "Cluster/ClusterFeature.h"
|
||||
#include "GeneralServer/AuthenticationFeature.h"
|
||||
#include "Replication/ReplicationFeature.h"
|
||||
|
||||
#include <velocypack/Builder.h>
|
||||
#include <velocypack/Iterator.h>
|
||||
|
@ -63,7 +64,13 @@ ReplicationApplierConfiguration::ReplicationApplierConfiguration()
|
|||
_requireFromPresent(true),
|
||||
_incremental(false),
|
||||
_verbose(false),
|
||||
_restrictType(RestrictType::None) {}
|
||||
_restrictType(RestrictType::None) {
|
||||
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
|
||||
if (feature != nullptr) {
|
||||
_requestTimeout = feature->requestTimeout();
|
||||
_connectTimeout = feature->connectTimeout();
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief reset the configuration to defaults
|
||||
void ReplicationApplierConfiguration::reset() {
|
||||
|
@ -99,6 +106,12 @@ void ReplicationApplierConfiguration::reset() {
|
|||
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
|
||||
_force32mode = false;
|
||||
#endif
|
||||
|
||||
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
|
||||
if (feature != nullptr) {
|
||||
_requestTimeout = feature->requestTimeout();
|
||||
_connectTimeout = feature->connectTimeout();
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief get a VelocyPack representation
|
||||
|
@ -219,12 +232,18 @@ ReplicationApplierConfiguration ReplicationApplierConfiguration::fromVelocyPack(
|
|||
|
||||
value = slice.get("requestTimeout");
|
||||
if (value.isNumber()) {
|
||||
configuration._requestTimeout = value.getNumber<double>();
|
||||
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
|
||||
if (feature != nullptr) {
|
||||
configuration._requestTimeout = feature->checkRequestTimeout(value.getNumber<double>());
|
||||
}
|
||||
}
|
||||
|
||||
value = slice.get("connectTimeout");
|
||||
if (value.isNumber()) {
|
||||
configuration._connectTimeout = value.getNumber<double>();
|
||||
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
|
||||
if (feature != nullptr) {
|
||||
configuration._connectTimeout = feature->checkConnectTimeout(value.getNumber<double>());
|
||||
}
|
||||
}
|
||||
|
||||
value = slice.get("maxConnectRetries");
|
||||
|
|
|
@ -43,6 +43,10 @@ ReplicationFeature* ReplicationFeature::INSTANCE = nullptr;
|
|||
|
||||
ReplicationFeature::ReplicationFeature(ApplicationServer& server)
|
||||
: ApplicationFeature(server, "Replication"),
|
||||
_connectTimeout(10.0),
|
||||
_requestTimeout(600.0),
|
||||
_forceConnectTimeout(false),
|
||||
_forceRequestTimeout(false),
|
||||
_replicationApplierAutoStart(true),
|
||||
_enableActiveFailover(false),
|
||||
_parallelTailingInvocations(0),
|
||||
|
@ -74,11 +78,21 @@ void ReplicationFeature::collectOptions(std::shared_ptr<ProgramOptions> options)
|
|||
options->addOption("--replication.active-failover",
|
||||
"Enable active-failover during asynchronous replication",
|
||||
new BooleanParameter(&_enableActiveFailover));
|
||||
|
||||
options->addOption("--replication.max-parallel-tailing-invocations",
|
||||
"Maximum number of concurrently allowed WAL tailing invocations (0 = unlimited)",
|
||||
new UInt64Parameter(&_maxParallelTailingInvocations),
|
||||
arangodb::options::makeFlags(arangodb::options::Flags::Hidden))
|
||||
.setIntroducedIn(30500);
|
||||
|
||||
options->addOption("--replication.connect-timeout",
|
||||
"Default timeout value for replication connection attempts (in seconds)",
|
||||
new DoubleParameter(&_connectTimeout))
|
||||
.setIntroducedIn(30409).setIntroducedIn(30504);
|
||||
options->addOption("--replication.request-timeout",
|
||||
"Default timeout value for replication requests (in seconds)",
|
||||
new DoubleParameter(&_requestTimeout))
|
||||
.setIntroducedIn(30409).setIntroducedIn(30504);
|
||||
}
|
||||
|
||||
void ReplicationFeature::validateOptions(std::shared_ptr<options::ProgramOptions> options) {
|
||||
|
@ -89,6 +103,20 @@ void ReplicationFeature::validateOptions(std::shared_ptr<options::ProgramOptions
|
|||
"configured";
|
||||
FATAL_ERROR_EXIT();
|
||||
}
|
||||
|
||||
if (_connectTimeout < 1.0) {
|
||||
_connectTimeout = 1.0;
|
||||
}
|
||||
if (options->processingResult().touched("--replication.connect-timeout")) {
|
||||
_forceConnectTimeout = true;
|
||||
}
|
||||
|
||||
if (_requestTimeout < 3.0) {
|
||||
_requestTimeout = 3.0;
|
||||
}
|
||||
if (options->processingResult().touched("--replication.request-timeout")) {
|
||||
_forceRequestTimeout = true;
|
||||
}
|
||||
}
|
||||
|
||||
void ReplicationFeature::prepare() {
|
||||
|
@ -165,6 +193,20 @@ void ReplicationFeature::trackTailingStart() {
|
|||
void ReplicationFeature::trackTailingEnd() noexcept {
|
||||
--_parallelTailingInvocations;
|
||||
}
|
||||
|
||||
double ReplicationFeature::checkConnectTimeout(double value) const {
|
||||
if (_forceConnectTimeout) {
|
||||
return _connectTimeout;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
double ReplicationFeature::checkRequestTimeout(double value) const {
|
||||
if (_forceRequestTimeout) {
|
||||
return _requestTimeout;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// start the replication applier for a single database
|
||||
void ReplicationFeature::startApplier(TRI_vocbase_t* vocbase) {
|
||||
|
|
|
@ -60,6 +60,24 @@ class ReplicationFeature final : public application_features::ApplicationFeature
|
|||
/// @brief stop the replication applier for a single database
|
||||
void stopApplier(TRI_vocbase_t* vocbase);
|
||||
|
||||
/// @brief returns the connect timeout for replication requests
|
||||
double connectTimeout() const {
  // plain accessor: hands back the stored value as-is
  return _connectTimeout;
}
|
||||
|
||||
/// @brief returns the request timeout for replication requests
|
||||
double requestTimeout() const {
  // plain accessor: hands back the stored value as-is
  return _requestTimeout;
}
|
||||
|
||||
/// @brief returns the connect timeout for replication requests
|
||||
/// this will return the provided value if the user has not adjusted the
|
||||
/// timeout via configuration. otherwise it will return the configured
|
||||
/// timeout value
|
||||
double checkConnectTimeout(double value) const;
|
||||
|
||||
/// @brief returns the request timeout for replication requests
|
||||
/// this will return the provided value if the user has not adjusted the
|
||||
/// timeout via configuration. otherwise it will return the configured
|
||||
/// timeout value
|
||||
double checkRequestTimeout(double value) const;
|
||||
|
||||
/// @brief automatic failover of replication using the agency
|
||||
bool isActiveFailoverEnabled() const { return _enableActiveFailover; }
|
||||
|
||||
|
@ -81,6 +99,20 @@ class ReplicationFeature final : public application_features::ApplicationFeature
|
|||
static ReplicationFeature* INSTANCE;
|
||||
|
||||
private:
|
||||
/// @brief connection timeout for replication requests
|
||||
double _connectTimeout;
|
||||
|
||||
/// @brief request timeout for replication requests
|
||||
double _requestTimeout;
|
||||
|
||||
/// @brief whether or not the user-defined connect timeout is forced to be used
|
||||
/// this is true only if the user set the connect timeout at startup
|
||||
bool _forceConnectTimeout;
|
||||
|
||||
/// @brief whether or not the user-defined request timeout is forced to be used
|
||||
/// this is true only if the user set the request timeout at startup
|
||||
bool _forceRequestTimeout;
|
||||
|
||||
bool _replicationApplierAutoStart;
|
||||
|
||||
/// Enable the active failover
|
||||
|
|
Loading…
Reference in New Issue