1
0
Fork 0

make replication timeouts configurable via startup options (#10473)

* make replication timeouts configurable via startup options

The following options are available (for active failover
and master-slave replication):

    --replication.connect-timeout
    --replication.request-timeout

Values can be specified in seconds. If these options are used, they will
be used for replication requests, overriding any hard-coded defaults or
explicitly configured timeouts.

Additionally, this change increases the default request timeout
for replication from 10 minutes to 20 minutes.

* do *not* change default value for timeouts

* make tests work again

* Update CHANGELOG
This commit is contained in:
Jan 2019-11-19 18:24:33 +01:00 committed by KVS85
parent 23fbe0af54
commit 81287811fe
5 changed files with 109 additions and 4 deletions

View File

@ -1,6 +1,12 @@
v3.5.3 (XXXX-XX-XX)
-------------------
* Make the timeouts for replication requests (for active failover and master-slave
replication) configurable via startup options:
--replication.connect-timeout
--replication.request-timeout
* Fixed internal issue #4647: dead Coordinators are not removed for agency.
* Fixed UPSERT matching.

View File

@ -772,7 +772,13 @@ void HeartbeatThread::runSingleServer() {
config._idleMinWaitTime = 250 * 1000; // 250ms
config._idleMaxWaitTime = 3 * 1000 * 1000; // 3s
TRI_ASSERT(!config._skipCreateDrop);
config._includeFoxxQueues = true; // sync _queues and _jobs
config._includeFoxxQueues = true; // sync _queues and _jobs
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
config._connectTimeout = feature->checkConnectTimeout(config._connectTimeout);
config._requestTimeout = feature->checkRequestTimeout(config._requestTimeout);
}
applier->forget(); // forget about any existing configuration
applier->reconfigure(config);

View File

@ -26,6 +26,7 @@
#include "Basics/Exceptions.h"
#include "Cluster/ClusterFeature.h"
#include "GeneralServer/AuthenticationFeature.h"
#include "Replication/ReplicationFeature.h"
#include <velocypack/Builder.h>
#include <velocypack/Iterator.h>
@ -63,7 +64,13 @@ ReplicationApplierConfiguration::ReplicationApplierConfiguration()
_requireFromPresent(true),
_incremental(false),
_verbose(false),
_restrictType(RestrictType::None) {}
_restrictType(RestrictType::None) {
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
_requestTimeout = feature->requestTimeout();
_connectTimeout = feature->connectTimeout();
}
}
/// @brief reset the configuration to defaults
void ReplicationApplierConfiguration::reset() {
@ -99,6 +106,12 @@ void ReplicationApplierConfiguration::reset() {
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
_force32mode = false;
#endif
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
_requestTimeout = feature->requestTimeout();
_connectTimeout = feature->connectTimeout();
}
}
/// @brief get a VelocyPack representation
@ -219,12 +232,18 @@ ReplicationApplierConfiguration ReplicationApplierConfiguration::fromVelocyPack(
value = slice.get("requestTimeout");
if (value.isNumber()) {
configuration._requestTimeout = value.getNumber<double>();
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
configuration._requestTimeout = feature->checkRequestTimeout(value.getNumber<double>());
}
}
value = slice.get("connectTimeout");
if (value.isNumber()) {
configuration._connectTimeout = value.getNumber<double>();
auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication");
if (feature != nullptr) {
configuration._connectTimeout = feature->checkConnectTimeout(value.getNumber<double>());
}
}
value = slice.get("maxConnectRetries");

View File

@ -43,6 +43,10 @@ ReplicationFeature* ReplicationFeature::INSTANCE = nullptr;
ReplicationFeature::ReplicationFeature(ApplicationServer& server)
: ApplicationFeature(server, "Replication"),
_connectTimeout(10.0),
_requestTimeout(600.0),
_forceConnectTimeout(false),
_forceRequestTimeout(false),
_replicationApplierAutoStart(true),
_enableActiveFailover(false),
_parallelTailingInvocations(0),
@ -74,11 +78,21 @@ void ReplicationFeature::collectOptions(std::shared_ptr<ProgramOptions> options)
options->addOption("--replication.active-failover",
"Enable active-failover during asynchronous replication",
new BooleanParameter(&_enableActiveFailover));
options->addOption("--replication.max-parallel-tailing-invocations",
"Maximum number of concurrently allowed WAL tailing invocations (0 = unlimited)",
new UInt64Parameter(&_maxParallelTailingInvocations),
arangodb::options::makeFlags(arangodb::options::Flags::Hidden))
.setIntroducedIn(30500);
options->addOption("--replication.connect-timeout",
"Default timeout value for replication connection attempts (in seconds)",
new DoubleParameter(&_connectTimeout))
.setIntroducedIn(30409).setIntroducedIn(30504);
options->addOption("--replication.request-timeout",
"Default timeout value for replication requests (in seconds)",
new DoubleParameter(&_requestTimeout))
.setIntroducedIn(30409).setIntroducedIn(30504);
}
void ReplicationFeature::validateOptions(std::shared_ptr<options::ProgramOptions> options) {
@ -89,6 +103,20 @@ void ReplicationFeature::validateOptions(std::shared_ptr<options::ProgramOptions
"configured";
FATAL_ERROR_EXIT();
}
if (_connectTimeout < 1.0) {
_connectTimeout = 1.0;
}
if (options->processingResult().touched("--replication.connect-timeout")) {
_forceConnectTimeout = true;
}
if (_requestTimeout < 3.0) {
_requestTimeout = 3.0;
}
if (options->processingResult().touched("--replication.request-timeout")) {
_forceRequestTimeout = true;
}
}
void ReplicationFeature::prepare() {
@ -165,6 +193,20 @@ void ReplicationFeature::trackTailingStart() {
void ReplicationFeature::trackTailingEnd() noexcept {
--_parallelTailingInvocations;
}
/// @brief picks the connect timeout to use for a replication request.
/// when _forceConnectTimeout is set, the feature's own _connectTimeout is
/// returned and the passed-in value is ignored; otherwise the passed-in
/// value is handed back unchanged
double ReplicationFeature::checkConnectTimeout(double value) const {
  return _forceConnectTimeout ? _connectTimeout : value;
}
/// @brief picks the request timeout to use for a replication request.
/// when _forceRequestTimeout is set, the feature's own _requestTimeout is
/// returned and the passed-in value is ignored; otherwise the passed-in
/// value is handed back unchanged
double ReplicationFeature::checkRequestTimeout(double value) const {
  return _forceRequestTimeout ? _requestTimeout : value;
}
// start the replication applier for a single database
void ReplicationFeature::startApplier(TRI_vocbase_t* vocbase) {

View File

@ -60,6 +60,24 @@ class ReplicationFeature final : public application_features::ApplicationFeature
/// @brief stop the replication applier for a single database
void stopApplier(TRI_vocbase_t* vocbase);
/// @brief accessor for the connect timeout (in seconds) stored in this
/// feature for replication requests
double connectTimeout() const {
  return _connectTimeout;
}
/// @brief accessor for the request timeout (in seconds) stored in this
/// feature for replication requests
double requestTimeout() const {
  return _requestTimeout;
}
/// @brief returns the connect timeout to use for replication requests.
/// If the user has set the connect timeout via the startup option
/// `--replication.connect-timeout`, the configured value is returned and
/// the provided value is ignored. Otherwise the provided value is returned
/// unchanged.
/// @param value the timeout the caller would use by default
/// @return the timeout that should actually be used
double checkConnectTimeout(double value) const;
/// @brief returns the request timeout to use for replication requests.
/// If the user has set the request timeout via the startup option
/// `--replication.request-timeout`, the configured value is returned and
/// the provided value is ignored. Otherwise the provided value is returned
/// unchanged.
/// @param value the timeout the caller would use by default
/// @return the timeout that should actually be used
double checkRequestTimeout(double value) const;
/// @brief automatic failover of replication using the agency
bool isActiveFailoverEnabled() const { return _enableActiveFailover; }
@ -81,6 +99,20 @@ class ReplicationFeature final : public application_features::ApplicationFeature
static ReplicationFeature* INSTANCE;
private:
/// @brief connection timeout for replication requests
double _connectTimeout;
/// @brief request timeout for replication requests
double _requestTimeout;
/// @brief whether or not the user-defined connect timeout is forced to be used
/// this is true only if the user set the connect timeout at startup
bool _forceConnectTimeout;
/// @brief whether or not the user-defined request timeout is forced to be used
/// this is true only if the user set the request timeout at startup
bool _forceRequestTimeout;
bool _replicationApplierAutoStart;
/// Enable the active failover