From abca9b01c67d3297eb76976023c83bae8eeec62f Mon Sep 17 00:00:00 2001 From: Jan Date: Tue, 19 Nov 2019 18:24:33 +0100 Subject: [PATCH] make replication timeouts configurable via startup options (#10473) * make replication timeouts configurable via startup options The following options are available (for active failover and master-slave replication): --replication.connect-timeout --replication.request-timeout Values can be specified in seconds. If these options are used, they will be used for replication requests, overriding any hard-coded defaults or explicitly configured timeouts. Additionally, this change increases the default request timeout for replication from 10 minutes to 20 minutes. * do *not* change default value for timeouts * make tests work again * Update CHANGELOG --- CHANGELOG | 6 +++ arangod/Cluster/HeartbeatThread.cpp | 8 +++- .../ReplicationApplierConfiguration.cpp | 25 +++++++++-- arangod/Replication/ReplicationFeature.cpp | 42 +++++++++++++++++++ arangod/Replication/ReplicationFeature.h | 32 ++++++++++++++ 5 files changed, 109 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b02817ecd5..896a79b031 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,12 @@ v3.5.3 (XXXX-XX-XX) ------------------- +* Make the timeouts for replication requests (for active failover and master-slave + replication) configurable via startup options: + + --replication.connect-timeout + --replication.request-timeout + * Fixed internal issue #4647: dead Coordinators are not removed for agency. * Fixed UPSERT matching. 
diff --git a/arangod/Cluster/HeartbeatThread.cpp b/arangod/Cluster/HeartbeatThread.cpp index 85e7c47edc..9dacf1968d 100644 --- a/arangod/Cluster/HeartbeatThread.cpp +++ b/arangod/Cluster/HeartbeatThread.cpp @@ -772,7 +772,13 @@ void HeartbeatThread::runSingleServer() { config._idleMinWaitTime = 250 * 1000; // 250ms config._idleMaxWaitTime = 3 * 1000 * 1000; // 3s TRI_ASSERT(!config._skipCreateDrop); - config._includeFoxxQueues = true; // sync _queues and _jobs + config._includeFoxxQueues = true; // sync _queues and _jobs + + auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication"); + if (feature != nullptr) { + config._connectTimeout = feature->checkConnectTimeout(config._connectTimeout); + config._requestTimeout = feature->checkRequestTimeout(config._requestTimeout); + } applier->forget(); // forget about any existing configuration applier->reconfigure(config); diff --git a/arangod/Replication/ReplicationApplierConfiguration.cpp b/arangod/Replication/ReplicationApplierConfiguration.cpp index 6339b6cfc5..c71fe48512 100644 --- a/arangod/Replication/ReplicationApplierConfiguration.cpp +++ b/arangod/Replication/ReplicationApplierConfiguration.cpp @@ -26,6 +26,7 @@ #include "Basics/Exceptions.h" #include "Cluster/ClusterFeature.h" #include "GeneralServer/AuthenticationFeature.h" +#include "Replication/ReplicationFeature.h" #include #include @@ -63,7 +64,13 @@ ReplicationApplierConfiguration::ReplicationApplierConfiguration() _requireFromPresent(true), _incremental(false), _verbose(false), - _restrictType(RestrictType::None) {} + _restrictType(RestrictType::None) { + auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication"); + if (feature != nullptr) { + _requestTimeout = feature->requestTimeout(); + _connectTimeout = feature->connectTimeout(); + } +} /// @brief reset the configuration to defaults void ReplicationApplierConfiguration::reset() { @@ -99,6 +106,12 @@ void ReplicationApplierConfiguration::reset() { 
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE _force32mode = false; #endif + + auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication"); + if (feature != nullptr) { + _requestTimeout = feature->requestTimeout(); + _connectTimeout = feature->connectTimeout(); + } } /// @brief get a VelocyPack representation @@ -219,12 +232,18 @@ ReplicationApplierConfiguration ReplicationApplierConfiguration::fromVelocyPack( value = slice.get("requestTimeout"); if (value.isNumber()) { - configuration._requestTimeout = value.getNumber<double>(); + auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication"); + if (feature != nullptr) { + configuration._requestTimeout = feature->checkRequestTimeout(value.getNumber<double>()); + } } value = slice.get("connectTimeout"); if (value.isNumber()) { - configuration._connectTimeout = value.getNumber<double>(); + auto* feature = application_features::ApplicationServer::lookupFeature<ReplicationFeature>("Replication"); + if (feature != nullptr) { + configuration._connectTimeout = feature->checkConnectTimeout(value.getNumber<double>()); + } } value = slice.get("maxConnectRetries"); diff --git a/arangod/Replication/ReplicationFeature.cpp b/arangod/Replication/ReplicationFeature.cpp index b619692f60..ac5e9db9c7 100644 --- a/arangod/Replication/ReplicationFeature.cpp +++ b/arangod/Replication/ReplicationFeature.cpp @@ -43,6 +43,10 @@ ReplicationFeature* ReplicationFeature::INSTANCE = nullptr; ReplicationFeature::ReplicationFeature(ApplicationServer& server) : ApplicationFeature(server, "Replication"), + _connectTimeout(10.0), + _requestTimeout(600.0), + _forceConnectTimeout(false), + _forceRequestTimeout(false), _replicationApplierAutoStart(true), _enableActiveFailover(false), _parallelTailingInvocations(0), @@ -74,11 +78,21 @@ void ReplicationFeature::collectOptions(std::shared_ptr options) options->addOption("--replication.active-failover", "Enable active-failover during asynchronous replication", new BooleanParameter(&_enableActiveFailover)); + 
options->addOption("--replication.max-parallel-tailing-invocations", "Maximum number of concurrently allowed WAL tailing invocations (0 = unlimited)", new UInt64Parameter(&_maxParallelTailingInvocations), arangodb::options::makeFlags(arangodb::options::Flags::Hidden)) .setIntroducedIn(30500); + + options->addOption("--replication.connect-timeout", + "Default timeout value for replication connection attempts (in seconds)", + new DoubleParameter(&_connectTimeout)) + .setIntroducedIn(30409).setIntroducedIn(30504); + options->addOption("--replication.request-timeout", + "Default timeout value for replication requests (in seconds)", + new DoubleParameter(&_requestTimeout)) + .setIntroducedIn(30409).setIntroducedIn(30504); } void ReplicationFeature::validateOptions(std::shared_ptr options) { @@ -89,6 +103,20 @@ void ReplicationFeature::validateOptions(std::shared_ptrprocessingResult().touched("--replication.connect-timeout")) { + _forceConnectTimeout = true; + } + + if (_requestTimeout < 3.0) { + _requestTimeout = 3.0; + } + if (options->processingResult().touched("--replication.request-timeout")) { + _forceRequestTimeout = true; + } } void ReplicationFeature::prepare() { @@ -165,6 +193,20 @@ void ReplicationFeature::trackTailingStart() { void ReplicationFeature::trackTailingEnd() noexcept { --_parallelTailingInvocations; } + +double ReplicationFeature::checkConnectTimeout(double value) const { + if (_forceConnectTimeout) { + return _connectTimeout; + } + return value; +} + +double ReplicationFeature::checkRequestTimeout(double value) const { + if (_forceRequestTimeout) { + return _requestTimeout; + } + return value; +} // start the replication applier for a single database void ReplicationFeature::startApplier(TRI_vocbase_t* vocbase) { diff --git a/arangod/Replication/ReplicationFeature.h b/arangod/Replication/ReplicationFeature.h index c96088a46f..708560dc58 100644 --- a/arangod/Replication/ReplicationFeature.h +++ b/arangod/Replication/ReplicationFeature.h @@ -60,6 
+60,24 @@ class ReplicationFeature final : public application_features::ApplicationFeature /// @brief stop the replication applier for a single database void stopApplier(TRI_vocbase_t* vocbase); + /// @brief returns the connect timeout for replication requests + double connectTimeout() const { return _connectTimeout; } + + /// @brief returns the request timeout for replication requests + double requestTimeout() const { return _requestTimeout; } + + /// @brief returns the connect timeout for replication requests + /// this will return the provided value if the user has not adjusted the + /// timeout via configuration. otherwise it will return the configured + /// timeout value + double checkConnectTimeout(double value) const; + + /// @brief returns the request timeout for replication requests + /// this will return the provided value if the user has not adjusted the + /// timeout via configuration. otherwise it will return the configured + /// timeout value + double checkRequestTimeout(double value) const; + /// @brief automatic failover of replication using the agency bool isActiveFailoverEnabled() const { return _enableActiveFailover; } @@ -81,6 +99,20 @@ class ReplicationFeature final : public application_features::ApplicationFeature static ReplicationFeature* INSTANCE; private: + /// @brief connection timeout for replication requests + double _connectTimeout; + + /// @brief request timeout for replication requests + double _requestTimeout; + + /// @brief whether or not the user-defined connect timeout is forced to be used + /// this is true only if the user set the connect timeout at startup + bool _forceConnectTimeout; + + /// @brief whether or not the user-defined request timeout is forced to be used + /// this is true only if the user set the request timeout at startup + bool _forceRequestTimeout; + bool _replicationApplierAutoStart; /// Enable the active failover