// arangod/Cluster/TakeoverShardLeadership.cpp
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2018 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Kaveh Vahedipour
/// @author Matthew Von-Maszewski
/// @author Max Neunhoeffer
////////////////////////////////////////////////////////////////////////////////
#include "TakeoverShardLeadership.h"
#include "MaintenanceFeature.h"
#include "ApplicationFeatures/ApplicationServer.h"
#include "Basics/VelocyPackHelper.h"
#include "Cluster/ClusterComm.h"
#include "Cluster/ClusterFeature.h"
#include "Cluster/ClusterInfo.h"
#include "Cluster/FollowerInfo.h"
#include "Logger/LogMacros.h"
#include "Logger/Logger.h"
#include "Logger/LoggerStream.h"
#include "Transaction/ClusterUtils.h"
#include "Transaction/Methods.h"
#include "Transaction/StandaloneContext.h"
#include "Utils/DatabaseGuard.h"
#include "Utils/SingleCollectionTransaction.h"
#include "VocBase/LogicalCollection.h"
#include "VocBase/Methods/Collections.h"
#include "VocBase/Methods/Databases.h"
#include <velocypack/Compare.h>
#include <velocypack/Iterator.h>
#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>
using namespace arangodb;
using namespace arangodb::application_features;
using namespace arangodb::maintenance;
using namespace arangodb::methods;
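
// The constructor merely validates the action description: database,
// collection, shard, planned leader, local leader and the old Current
// counter must all be present; otherwise the action is logged and moved
// to the FAILED state immediately.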
TakeoverShardLeadership::TakeoverShardLeadership(MaintenanceFeature& feature,
                                                 ActionDescription const& desc)
    : ActionBase(feature, desc) {
  std::stringstream error;

  _labels.emplace(FAST_TRACK);

  if (!desc.has(DATABASE)) {
    error << "database must be specified. ";
  }
  TRI_ASSERT(desc.has(DATABASE));

  if (!desc.has(COLLECTION)) {
    error << "collection must be specified. ";
  }
  TRI_ASSERT(desc.has(COLLECTION));

  if (!desc.has(SHARD)) {
    error << "shard must be specified. ";
  }
  TRI_ASSERT(desc.has(SHARD));

  if (!desc.has(THE_LEADER)) {
    error << "leader must be specified. ";
  }
  TRI_ASSERT(desc.has(THE_LEADER));

  if (!desc.has(LOCAL_LEADER)) {
    error << "local leader must be specified. ";
  }
  TRI_ASSERT(desc.has(LOCAL_LEADER));

  TRI_ASSERT(desc.has(OLD_CURRENT_COUNTER));

  if (!error.str().empty()) {
    LOG_TOPIC("2aa85", ERR, Logger::MAINTENANCE)
        << "TakeoverShardLeadership: " << error.str();
    _result.reset(TRI_ERROR_INTERNAL, error.str());
    setState(FAILED);
  }
}

TakeoverShardLeadership::~TakeoverShardLeadership() {}
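
// Inform all other servers currently listed for this shard that we are
// taking over leadership from the resigned old leader. Servers that
// acknowledge the change (HTTP 200) are collected in realInsyncFollowers;
// communication failures are deliberately ignored.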
static void sendLeaderChangeRequests(std::vector<ServerID> const& currentServers,
                                     std::shared_ptr<std::vector<ServerID>>& realInsyncFollowers,
                                     std::string const& databaseName,
                                     ShardID const& shardID,
                                     std::string const& oldLeader) {
  auto cc = ClusterComm::instance();
  if (cc == nullptr) {
    // nullptr happens only during controlled shutdown
    return;
  }

  std::string const& sid = ServerState::instance()->getId();

  VPackBuilder bodyBuilder;
  {
    VPackObjectBuilder ob(&bodyBuilder);
    bodyBuilder.add("leaderId", VPackValue(sid));
    bodyBuilder.add("oldLeaderId", VPackValue(oldLeader));
    bodyBuilder.add("shard", VPackValue(shardID));
  }

  std::string const url = "/_db/" + databaseName + "/_api/replication/set-the-leader";

  std::vector<ClusterCommRequest> requests;
  auto body = std::make_shared<std::string>(bodyBuilder.toJson());
  for (auto const& srv : currentServers) {
    if (srv == sid) {
      continue;  // ignore ourself
    }
    LOG_TOPIC("42516", DEBUG, Logger::MAINTENANCE)
        << "Sending " << bodyBuilder.toJson() << " to " << srv;
    requests.emplace_back("server:" + srv, RequestType::PUT, url, body);
  }

  cc->performRequests(requests, 3.0, Logger::COMMUNICATION, false);
  // This code intentionally ignores all errors
  realInsyncFollowers = std::make_shared<std::vector<ServerID>>();
  for (auto const& req : requests) {
    ClusterCommResult const& result = req.result;
    if (result.status == CL_COMM_RECEIVED && result.errorCode == TRI_ERROR_NO_ERROR) {
      if (result.result && result.result->getHttpReturnCode() == 200) {
        realInsyncFollowers->push_back(result.serverID);
      }
    }
  }
}
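
// Adjust the local leadership of a shard to the planned leadership:
// if we are planned to lead but locally still a follower, wait for a newer
// Current snapshot, notify the other servers if the old leader has already
// resigned, take over the follower list and abort follower transactions.
// If we are planned to follow but locally the leader, resign locally and
// abort leader transactions.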
static void handleLeadership(LogicalCollection& collection,
                             std::string const& localLeader,
                             std::string const& plannedLeader,
                             std::string const& databaseName,
                             uint64_t oldCounter, MaintenanceFeature& feature) {
  auto& followers = collection.followers();

  if (plannedLeader.empty()) {   // Planned to lead
    if (!localLeader.empty()) {  // We were not leader, assume leadership
      // This blocks the thread until the maintenance main thread has fetched
      // a newer Current version.
      feature.waitForLargerCurrentCounter(oldCounter);
      auto currentInfo = ClusterInfo::instance()->getCollectionCurrent(
          databaseName, std::to_string(collection.planId()));
      if (currentInfo == nullptr) {
        // Collection has been dropped, we cannot continue here.
        return;
      }
      TRI_ASSERT(currentInfo != nullptr);
      std::vector<ServerID> currentServers = currentInfo->servers(collection.name());
      std::shared_ptr<std::vector<ServerID>> realInsyncFollowers;

      if (currentServers.size() > 0) {
        std::string& oldLeader = currentServers.at(0);
        // Check if the old leader has resigned and stopped all writes
        // (if so, we can assume that all servers are still in sync)
        if (oldLeader.at(0) == '_') {
          // Strip the resignation marker ('_' prefix) from the server id,
          // it is of no further use here.
          oldLeader = oldLeader.substr(1);
          // Update all followers and tell them that we are the leader now
          sendLeaderChangeRequests(currentServers, realInsyncFollowers,
                                   databaseName, collection.name(), oldLeader);
        }
      }

      std::vector<ServerID> failoverCandidates =
          currentInfo->failoverCandidates(collection.name());
      followers->takeOverLeadership(failoverCandidates, realInsyncFollowers);
      transaction::cluster::abortFollowerTransactionsOnShard(collection.id());
    }
  } else {  // Planned to follow
    if (localLeader.empty()) {
      // Note that the following does not delete the follower list
      // and that this is crucial, because in the planned leader
      // resign case, updateCurrentForCollections will report the
      // resignation together with the old in-sync list to the
      // agency. If this list were empty, then the supervision
      // would be very angry with us!
      followers->setTheLeader(plannedLeader);
      transaction::cluster::abortLeaderTransactionsOnShard(collection.id());
    }
    // Note that if we have been a follower to some leader
    // we do not immediately adjust the leader here, even if
    // the planned leader differs from what we have set locally.
    // The setting must only be adjusted once we have
    // synchronized with the new leader and negotiated
    // a leader/follower relationship!
  }
}
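
// Single step of the action: look up the shard in the local vocbase and
// adjust the local leadership via handleLeadership(). Any failure is
// recorded as a shard error in the maintenance feature. Always returns
// false, since there is no further phase to execute.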
bool TakeoverShardLeadership::first() {
  std::string const& database = _description.get(DATABASE);
  std::string const& collection = _description.get(COLLECTION);
  std::string const& shard = _description.get(SHARD);
  std::string const& plannedLeader = _description.get(THE_LEADER);
  std::string const& localLeader = _description.get(LOCAL_LEADER);
  std::string const& oldCounterString = _description.get(OLD_CURRENT_COUNTER);
  uint64_t oldCounter = basics::StringUtils::uint64(oldCounterString);

  try {
    DatabaseGuard guard(database);
    auto& vocbase = guard.database();

    Result found = methods::Collections::lookup(
        vocbase, shard, [&](std::shared_ptr<LogicalCollection> const& coll) -> void {
          TRI_ASSERT(coll);
          LOG_TOPIC("5632a", DEBUG, Logger::MAINTENANCE)
              << "trying to become leader of shard '" << database << "/" << shard << "'";
          // We adjust local leadership, note that the planned
          // resignation case is not handled here, since then
          // we do not appear in shards[shard] ourselves, but only
          // as "_" + ourselves.
          handleLeadership(*coll, localLeader, plannedLeader, vocbase.name(),
                           oldCounter, feature());
        });

    if (found.fail()) {
      std::stringstream error;
      error << "TakeoverShardLeadership: failed to lookup local collection "
            << shard << " in database " << database;
      LOG_TOPIC("65342", ERR, Logger::MAINTENANCE) << error.str();
      _result = actionError(TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND, error.str());
    }
  } catch (std::exception const& e) {
    std::stringstream error;
    error << "action " << _description << " failed with exception " << e.what();
    LOG_TOPIC("79443", WARN, Logger::MAINTENANCE)
        << "TakeoverShardLeadership: " << error.str();
    _result.reset(TRI_ERROR_INTERNAL, error.str());
  }

  if (_result.fail()) {
    _feature.storeShardError(database, collection, shard,
                             _description.get(SERVER_ID), _result);
  }

  notify();
  return false;
}