diff --git a/CHANGELOG b/CHANGELOG index 02de9f00da..17fd1126b0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,16 @@ v3.5.1 (XXXX-XX-XX) ------------------- +* Check for duplicate server endpoints registered in the agency in sub-keys of + `/Current/ServersRegistered`. + + Duplicate endpoints will be registered if more than one arangod instance is + started with the same value for startup option `--cluster.my-address`. This can + happen unintentionally due to typos in the configuration, copy&paste remainders etc. + + In case a duplicate endpoint is detected on startup, a warning will be written + to the log, indicating which other server has already "acquired" the same endpoint. + * Make graph operations in general-graph transaction-aware. * Fixed adding an orphan collections as the first collection in a SmartGraph. diff --git a/arangod/Cluster/ServerState.cpp b/arangod/Cluster/ServerState.cpp index 467a318eb7..16fe4ae2d1 100644 --- a/arangod/Cluster/ServerState.cpp +++ b/arangod/Cluster/ServerState.cpp @@ -25,6 +25,8 @@ #include #include +#include +#include #include #include @@ -45,9 +47,21 @@ #include "StorageEngine/EngineSelectorFeature.h" #include "VocBase/ticks.h" +#include +#include + using namespace arangodb; using namespace arangodb::basics; +namespace { +// whenever the format of the generated UUIDs changes, please make sure to +// adjust this regex too! +std::regex const uuidRegex("^(SNGL|CRDN|PRMR|AGNT)-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$"); +} + +static constexpr char const* currentServersRegisteredPref = + "/Current/ServersRegistered/"; + //////////////////////////////////////////////////////////////////////////////// /// @brief single instance of ServerState - will live as long as the server is /// running @@ -67,7 +81,6 @@ ServerState::ServerState() _host(), _state(STATE_UNDEFINED), _initialized(false), - _foxxmaster(), _foxxmasterQueueupdate(false) { setRole(ROLE_UNDEFINED); } @@ -158,6 +171,8 @@ std::string ServerState::roleToString(ServerState::RoleEnum role) { } std::string ServerState::roleToShortString(ServerState::RoleEnum role) { + // whenever anything here changes, please make sure to + // adjust ::uuidRegex too! switch (role) { case ROLE_UNDEFINED: return "NONE"; @@ -414,7 +429,55 @@ bool ServerState::integrateIntoCluster(ServerState::RoleEnum role, << roleToString(role) << " and our id is " << id; // now overwrite the entry in /Current/ServersRegistered/ - return registerAtAgencyPhase2(comm, hadPersistedId); + bool registered = registerAtAgencyPhase2(comm, hadPersistedId); + if (!registered) { + return false; + } + + // now check the configuration of the different servers for duplicate endpoints + AgencyCommResult result = comm.getValues(currentServersRegisteredPref); + + if (result.successful()) { + auto slicePath = AgencyCommManager::slicePath(currentServersRegisteredPref); + auto valueSlice = result.slice()[0].get(slicePath); + + if (valueSlice.isObject()) { + // map from server UUID to endpoint + std::unordered_map endpoints; + for (auto const& it : VPackObjectIterator(valueSlice)) { + std::string const serverId = it.key.copyString(); + + if (!isUuid(serverId)) { + continue; + } + if (!it.value.isObject()) { + continue; + } + VPackSlice endpointSlice = it.value.get("endpoint"); + if (!endpointSlice.isString()) { + continue; + } + auto it2 = endpoints.emplace(endpointSlice.copyString(), serverId); + if (!it2.second && it2.first->first != serverId) { + // duplicate entry! + LOG_TOPIC("9a134", WARN, Logger::CLUSTER) + << "found duplicate server entry for endpoint '" + << endpointSlice.copyString() << "', already used by other server " << it2.first->second + << ". it looks like this is a (mis)configuration issue"; + // anyway, continue with startup + } + } + } + } + + return true; +} + +/// @brief whether or not "value" is a server UUID +bool ServerState::isUuid(std::string const& value) const { + // whenever the format of the generated UUIDs changes, please make sure to + // adjust ::uuidRegex too! + return std::regex_match(value, ::uuidRegex); } ////////////////////////////////////////////////////////////////////////////// @@ -451,7 +514,7 @@ void mkdir(std::string const& path) { } } -std::string ServerState::getUuidFilename() { +std::string ServerState::getUuidFilename() const { auto dbpath = application_features::ApplicationServer::getFeature( "DatabasePath"); TRI_ASSERT(dbpath != nullptr); @@ -481,6 +544,8 @@ bool ServerState::writePersistedId(std::string const& id) { } std::string ServerState::generatePersistedId(RoleEnum const& role) { + // whenever the format of the generated UUID changes, please make sure to + // adjust ::uuidRegex too! std::string id = roleToShortString(role) + "-" + to_string(boost::uuids::random_generator()()); writePersistedId(id); @@ -507,9 +572,6 @@ std::string ServerState::getPersistedId() { FATAL_ERROR_EXIT(); } -static constexpr char const* currentServersRegisteredPref = - "/Current/ServersRegistered/"; - /// @brief check equality of engines with other registered servers bool ServerState::checkEngineEquality(AgencyComm& comm) { std::string engineName = EngineSelectorFeature::engineName(); @@ -540,7 +602,7 @@ bool ServerState::checkEngineEquality(AgencyComm& comm) { /// @brief create an id for a specified role ////////////////////////////////////////////////////////////////////////////// -bool ServerState::registerAtAgencyPhase1(AgencyComm& comm, const ServerState::RoleEnum& role) { +bool ServerState::registerAtAgencyPhase1(AgencyComm& comm, ServerState::RoleEnum const& role) { std::string const agencyListKey = roleToAgencyListKey(role); std::string const latestIdKey = "Latest" + roleToAgencyKey(role) + "Id"; @@ -674,13 +736,20 @@ bool ServerState::registerAtAgencyPhase1(AgencyComm& comm, const ServerState::Ro bool ServerState::registerAtAgencyPhase2(AgencyComm& comm, bool const hadPersistedId) { TRI_ASSERT(!_id.empty() && !_myEndpoint.empty()); + + std::string const serverRegistrationPath = currentServersRegisteredPref + _id; + std::string const rebootIdPath = "/Current/ServersKnown/" + _id + "/rebootId"; + + // If we generated a new UUID, this *must not* exist in the Agency, so we + // should fail to register. + std::vector pre; + if (!hadPersistedId) { + pre.emplace_back(AgencyPrecondition(rebootIdPath, AgencyPrecondition::Type::EMPTY, true)); + } while (!application_features::ApplicationServer::isStopping()) { - std::string serverRegistrationPath = currentServersRegisteredPref + _id; - std::string rebootIdPath = "/Current/ServersKnown/" + _id + "/rebootId"; - VPackBuilder builder; - try { + { VPackObjectBuilder b(&builder); builder.add("endpoint", VPackValue(_myEndpoint)); builder.add("advertisedEndpoint", VPackValue(_advertisedEndpoint)); @@ -688,18 +757,8 @@ bool ServerState::registerAtAgencyPhase2(AgencyComm& comm, bool const hadPersist builder.add("version", VPackValue(rest::Version::getNumericServerVersion())); builder.add("versionString", VPackValue(rest::Version::getServerVersion())); builder.add("engine", VPackValue(EngineSelectorFeature::engineName())); - } catch (...) { - LOG_TOPIC("de625", FATAL, arangodb::Logger::CLUSTER) << "out of memory"; - FATAL_ERROR_EXIT(); } - - // If we generated a new UUID, this *must not* exist in the Agency, so we - // should fail to register. - std::vector pre; - if (!hadPersistedId) { - pre.emplace_back(AgencyPrecondition(rebootIdPath, AgencyPrecondition::Type::EMPTY, true)); - } - + AgencyWriteTransaction trx( {AgencyOperation(serverRegistrationPath, AgencyValueOperationType::SET, builder.slice()), @@ -770,7 +829,7 @@ void ServerState::setId(std::string const& id) { /// @brief get the short server id //////////////////////////////////////////////////////////////////////////////// -uint32_t ServerState::getShortId() { +uint32_t ServerState::getShortId() const { return _shortId.load(std::memory_order_relaxed); } @@ -791,7 +850,7 @@ uint64_t ServerState::getRebootId() const { return _rebootId; } -void ServerState::setRebootId(uint64_t const rebootId) { +void ServerState::setRebootId(uint64_t rebootId) { TRI_ASSERT(rebootId > 0); _rebootId = rebootId; } diff --git a/arangod/Cluster/ServerState.h b/arangod/Cluster/ServerState.h index 9a7563c8cf..e88db6032c 100644 --- a/arangod/Cluster/ServerState.h +++ b/arangod/Cluster/ServerState.h @@ -203,14 +203,14 @@ class ServerState { void setId(std::string const&); /// @brief get the short id - uint32_t getShortId(); + uint32_t getShortId() const; /// @brief set the server short id void setShortId(uint32_t); uint64_t getRebootId() const; - void setRebootId(uint64_t const rebootId); + void setRebootId(uint64_t rebootId); /// @brief get the server endpoint std::string getEndpoint(); @@ -256,8 +256,8 @@ class ServerState { /// @brief sets server mode and propagates new mode to agency Result propagateClusterReadOnly(bool); - /// file where the server persists it's UUID - std::string getUuidFilename(); + /// file where the server persists its UUID + std::string getUuidFilename() const; private: /// @brief atomically fetches the server role @@ -278,13 +278,16 @@ class ServerState { ResultT readRebootIdFromAgency(AgencyComm& comm); /// @brief register at agency, might already be done - bool registerAtAgencyPhase1(AgencyComm&, const RoleEnum&); + bool registerAtAgencyPhase1(AgencyComm&, RoleEnum const&); /// @brief write the Current/ServersRegistered entry - bool registerAtAgencyPhase2(AgencyComm&, bool const hadPersistedId); + bool registerAtAgencyPhase2(AgencyComm&, bool hadPersistedId); void setFoxxmasterSinceNow(); + /// @brief whether or not "value" is a server UUID + bool isUuid(std::string const& value) const; + private: /// @brief server role std::atomic _role; @@ -333,11 +336,11 @@ class ServerState { /// @brief whether or not the cluster was initialized bool _initialized; + + bool _foxxmasterQueueupdate; std::string _foxxmaster; - bool _foxxmasterQueueupdate; - // @brief point in time since which this server is the Foxxmaster TRI_voc_tick_t _foxxmasterSince; };