From 3f0b4ad4ee39f17420441d811b8b3ba287e124c6 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Fri, 9 Jun 2017 11:58:45 +0200 Subject: [PATCH 1/3] investigating ssl handshake bug --- arangod/Scheduler/Socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arangod/Scheduler/Socket.h b/arangod/Scheduler/Socket.h index 0d6e18a4ea..9a38310c63 100644 --- a/arangod/Scheduler/Socket.h +++ b/arangod/Scheduler/Socket.h @@ -76,7 +76,7 @@ bool doSslHandshake(T& socket) { // check if we have spent more than x seconds handshaking and then abort TRI_ASSERT(start != 0.0); - if (TRI_microtime() - start >= 3) { + if (TRI_microtime() - start >= 10000) { ec.assign(boost::asio::error::connection_reset, boost::system::generic_category()); LOG_TOPIC(DEBUG, Logger::COMMUNICATION) << "forcefully shutting down connection after wait time"; From dd89653798163d8bda9313045ab1d6e46a717d5b Mon Sep 17 00:00:00 2001 From: Andreas Streichardt Date: Fri, 9 Jun 2017 12:20:20 +0200 Subject: [PATCH 2/3] typo in error message --- arangod/Agency/FailedServer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arangod/Agency/FailedServer.cpp b/arangod/Agency/FailedServer.cpp index 5a1539b159..e70da47c3a 100644 --- a/arangod/Agency/FailedServer.cpp +++ b/arangod/Agency/FailedServer.cpp @@ -310,7 +310,7 @@ JOB_STATUS FailedServer::status() { if (deleteTodos) { LOG_TOPIC(INFO, Logger::SUPERVISION) << "Server " << _server << " is healthy again. Will try to delete" - "any jobs which have not yet started!"; + " any jobs which have not yet started!"; deleteTodos->close(); deleteTodos->close(); // Transact to agency From 29921d32a82663f4004e032a3125cb454783d639 Mon Sep 17 00:00:00 2001 From: Max Neunhoeffer Date: Fri, 9 Jun 2017 13:06:02 +0200 Subject: [PATCH 3/3] Fix behaviour of synchronous replication in dropFollower case. If we are already in shutdown, we do not drop a follower. If we cannot drop a follower (no contact to agency), we error out. --- arangod/Cluster/FollowerInfo.cpp | 17 +++++++--- arangod/Cluster/FollowerInfo.h | 2 +- arangod/Transaction/Methods.cpp | 53 ++++++++++++++++++++++++-------- js/common/bootstrap/errors.js | 1 + lib/Basics/errors.dat | 2 +- lib/Basics/voc-errors.cpp | 1 + lib/Basics/voc-errors.h | 14 +++++++++ 7 files changed, 71 insertions(+), 19 deletions(-) diff --git a/arangod/Cluster/FollowerInfo.cpp b/arangod/Cluster/FollowerInfo.cpp index 38639b3653..2f6b8d855b 100644 --- a/arangod/Cluster/FollowerInfo.cpp +++ b/arangod/Cluster/FollowerInfo.cpp @@ -24,6 +24,7 @@ #include "FollowerInfo.h" +#include "ApplicationFeatures/ApplicationServer.h" #include "Cluster/ServerState.h" #include "VocBase/LogicalCollection.h" @@ -183,7 +184,12 @@ void FollowerInfo::add(ServerID const& sid) { /// since been dropped (see `dropFollowerInfo` below). //////////////////////////////////////////////////////////////////////////////// -void FollowerInfo::remove(ServerID const& sid) { +bool FollowerInfo::remove(ServerID const& sid) { + if (application_features::ApplicationServer::isStopping()) { + // If we are already shutting down, we cannot be trusted any more with + // such an important decision like dropping a follower. + return false; + } MUTEX_LOCKER(locker, _mutex); // First check if there is anything to do: @@ -195,7 +201,7 @@ void FollowerInfo::remove(ServerID const& sid) { } } if (!found) { - return; // nothing to do + return true; // nothing to do } auto v = std::make_shared>(); @@ -207,10 +213,11 @@ void FollowerInfo::remove(ServerID const& sid) { } } } + auto _oldFollowers = _followers; _followers = v; // will cast to std::vector const #ifdef DEBUG_SYNC_REPLICATION if (!AgencyCommManager::MANAGER) { - return; + return true; } #endif // Now tell the agency, path is @@ -267,13 +274,15 @@ void FollowerInfo::remove(ServerID const& sid) { usleep(500000); } while (TRI_microtime() < startTime + 30); if (!success) { + _followers = _oldFollowers; LOG_TOPIC(ERR, Logger::CLUSTER) << "FollowerInfo::remove, timeout in agency operation for key " << path; } + return success; } ////////////////////////////////////////////////////////////////////////////// -/// @brief clear follower list, no changes in agency necesary +/// @brief clear follower list, no changes in agency necessary ////////////////////////////////////////////////////////////////////////////// void FollowerInfo::clear() { diff --git a/arangod/Cluster/FollowerInfo.h b/arangod/Cluster/FollowerInfo.h index b97b896712..9a7919fe55 100644 --- a/arangod/Cluster/FollowerInfo.h +++ b/arangod/Cluster/FollowerInfo.h @@ -67,7 +67,7 @@ class FollowerInfo { /// way. ////////////////////////////////////////////////////////////////////////////// - void remove(ServerID const& s); + bool remove(ServerID const& s); ////////////////////////////////////////////////////////////////////////////// /// @brief clear follower list, no changes in agency necesary diff --git a/arangod/Transaction/Methods.cpp b/arangod/Transaction/Methods.cpp index 720b941a26..0226e99bb3 100644 --- a/arangod/Transaction/Methods.cpp +++ b/arangod/Transaction/Methods.cpp @@ -1534,10 +1534,16 @@ OperationResult transaction::Methods::insertLocal( } if (!replicationWorked) { auto const& followerInfo = collection->followers(); - followerInfo->remove((*followers)[i]); - LOG_TOPIC(ERR, Logger::REPLICATION) - << "insertLocal: dropping follower " << (*followers)[i] - << " for shard " << collectionName; + if (followerInfo->remove((*followers)[i])) { + LOG_TOPIC(WARN, Logger::REPLICATION) + << "insertLocal: dropping follower " << (*followers)[i] + << " for shard " << collectionName; + } else { + LOG_TOPIC(ERR, Logger::REPLICATION) + << "insertLocal: could not drop follower " + << (*followers)[i] << " for shard " << collectionName; + THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER); + } } } } @@ -1850,7 +1856,16 @@ OperationResult transaction::Methods::modifyLocal( } if (!replicationWorked) { auto const& followerInfo = collection->followers(); - followerInfo->remove((*followers)[i]); + if (followerInfo->remove((*followers)[i])) { + LOG_TOPIC(WARN, Logger::REPLICATION) + << "modifyLocal: dropping follower " << (*followers)[i] + << " for shard " << collectionName; + } else { + LOG_TOPIC(ERR, Logger::REPLICATION) + << "modifyLocal: could not drop follower " + << (*followers)[i] << " for shard " << collectionName; + THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER); + } LOG_TOPIC(ERR, Logger::REPLICATION) << "modifyLocal: dropping follower " << (*followers)[i] << " for shard " << collectionName; @@ -2094,10 +2109,16 @@ OperationResult transaction::Methods::removeLocal( } if (!replicationWorked) { auto const& followerInfo = collection->followers(); - followerInfo->remove((*followers)[i]); - LOG_TOPIC(ERR, Logger::REPLICATION) - << "removeLocal: dropping follower " << (*followers)[i] - << " for shard " << collectionName; + if (followerInfo->remove((*followers)[i])) { + LOG_TOPIC(WARN, Logger::REPLICATION) + << "removeLocal: dropping follower " << (*followers)[i] + << " for shard " << collectionName; + } else { + LOG_TOPIC(ERR, Logger::REPLICATION) + << "removeLocal: could not drop follower " + << (*followers)[i] << " for shard " << collectionName; + THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER); + } } } } @@ -2270,10 +2291,16 @@ OperationResult transaction::Methods::truncateLocal( requests[i].result.answer_code == rest::ResponseCode::OK); if (!replicationWorked) { auto const& followerInfo = collection->followers(); - followerInfo->remove((*followers)[i]); - LOG_TOPIC(ERR, Logger::REPLICATION) - << "truncateLocal: dropping follower " << (*followers)[i] - << " for shard " << collectionName; + if (followerInfo->remove((*followers)[i])) { + LOG_TOPIC(WARN, Logger::REPLICATION) + << "truncateLocal: dropping follower " << (*followers)[i] + << " for shard " << collectionName; + } else { + LOG_TOPIC(ERR, Logger::REPLICATION) + << "truncateLocal: could not drop follower " + << (*followers)[i] << " for shard " << collectionName; + THROW_ARANGO_EXCEPTION(TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER); + } } } } diff --git a/js/common/bootstrap/errors.js b/js/common/bootstrap/errors.js index 7e6b0db5a0..ccacb57711 100644 --- a/js/common/bootstrap/errors.js +++ b/js/common/bootstrap/errors.js @@ -166,6 +166,7 @@ "ERROR_CLUSTER_MUST_NOT_DROP_COLL_OTHER_DISTRIBUTESHARDSLIKE" : { "code" : 1485, "message" : "must not drop collection while another has a distributeShardsLike attribute pointing to it" }, "ERROR_CLUSTER_UNKNOWN_DISTRIBUTESHARDSLIKE" : { "code" : 1486, "message" : "must not have a distributeShardsLike attribute pointing to an unknown collection" }, "ERROR_CLUSTER_INSUFFICIENT_DBSERVERS" : { "code" : 1487, "message" : "the number of current dbservers is lower than the requested replicationFactor" }, + "ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER" : { "code" : 1488, "message" : "a follower could not be dropped in agency" }, "ERROR_QUERY_KILLED" : { "code" : 1500, "message" : "query killed" }, "ERROR_QUERY_PARSE" : { "code" : 1501, "message" : "%s" }, "ERROR_QUERY_EMPTY" : { "code" : 1502, "message" : "query is empty" }, diff --git a/lib/Basics/errors.dat b/lib/Basics/errors.dat index abc95eee8d..66d9882678 100755 --- a/lib/Basics/errors.dat +++ b/lib/Basics/errors.dat @@ -202,7 +202,7 @@ ERROR_CLUSTER_CHAIN_OF_DISTRIBUTESHARDSLIKE,1484,"chain of distributeShardsLike ERROR_CLUSTER_MUST_NOT_DROP_COLL_OTHER_DISTRIBUTESHARDSLIKE,1485,"must not drop collection while another has a distributeShardsLike attribute pointing to it","Will be raised if one tries to drop a collection to which another collection points with its distributeShardsLike attribute." ERROR_CLUSTER_UNKNOWN_DISTRIBUTESHARDSLIKE,1486,"must not have a distributeShardsLike attribute pointing to an unknown collection","Will be raised if one tries to create a collection which points to an unknown collection in its distributeShardsLike attribute." ERROR_CLUSTER_INSUFFICIENT_DBSERVERS,1487,"the number of current dbservers is lower than the requested replicationFactor","Will be raised if one tries to create a collection with a replicationFactor greater than the available number of DBServers." - +ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER,1488,"a follower could not be dropped in agency","Will be raised if a follower that ought to be dropped could not be dropped in the agency (under Current)." ################################################################################ ## ArangoDB query errors diff --git a/lib/Basics/voc-errors.cpp b/lib/Basics/voc-errors.cpp index 5c6335a164..7a6e13d697 100644 --- a/lib/Basics/voc-errors.cpp +++ b/lib/Basics/voc-errors.cpp @@ -162,6 +162,7 @@ void TRI_InitializeErrorMessages () { REG_ERROR(ERROR_CLUSTER_MUST_NOT_DROP_COLL_OTHER_DISTRIBUTESHARDSLIKE, "must not drop collection while another has a distributeShardsLike attribute pointing to it"); REG_ERROR(ERROR_CLUSTER_UNKNOWN_DISTRIBUTESHARDSLIKE, "must not have a distributeShardsLike attribute pointing to an unknown collection"); REG_ERROR(ERROR_CLUSTER_INSUFFICIENT_DBSERVERS, "the number of current dbservers is lower than the requested replicationFactor"); + REG_ERROR(ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER, "a follower could not be dropped in agency"); REG_ERROR(ERROR_QUERY_KILLED, "query killed"); REG_ERROR(ERROR_QUERY_PARSE, "%s"); REG_ERROR(ERROR_QUERY_EMPTY, "query is empty"); diff --git a/lib/Basics/voc-errors.h b/lib/Basics/voc-errors.h index 886b949017..a90ebc8239 100644 --- a/lib/Basics/voc-errors.h +++ b/lib/Basics/voc-errors.h @@ -400,6 +400,9 @@ /// - 1487: @LIT{the number of current dbservers is lower than the requested replicationFactor} /// Will be raised if one tries to create a collection with a /// replicationFactor greater than the available number of DBServers. +/// - 1488: @LIT{a follower could not be dropped in agency} +/// Will be raised if a follower that ought to be dropped could not be +/// dropped in the agency (under Current). /// - 1500: @LIT{query killed} /// Will be raised when a running query is killed by an explicit admin /// command. @@ -2372,6 +2375,17 @@ void TRI_InitializeErrorMessages (); #define TRI_ERROR_CLUSTER_INSUFFICIENT_DBSERVERS (1487) +//////////////////////////////////////////////////////////////////////////////// +/// @brief 1488: ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER +/// +/// a follower could not be dropped in agency +/// +/// Will be raised if a follower that ought to be dropped could not be dropped +/// in the agency (under Current). +//////////////////////////////////////////////////////////////////////////////// + +#define TRI_ERROR_CLUSTER_COULD_NOT_DROP_FOLLOWER (1488) + //////////////////////////////////////////////////////////////////////////////// /// @brief 1500: ERROR_QUERY_KILLED ///