1
0
Fork 0

[3.5] Added precondition to ensure that server is still as seen before. (#10477)

* Added precondition to ensure that server is still as seen before.

* Removed merge conflicts.
This commit is contained in:
Lars Maier 2019-11-20 13:39:31 +01:00 committed by KVS85
parent 43677785dd
commit 9a33122c5d
2 changed files with 29 additions and 16 deletions

View File

@ -459,7 +459,7 @@ std::vector<check_t> Supervision::check(std::string const& type) {
// Do actual monitoring // Do actual monitoring
for (auto const& machine : machinesPlanned) { for (auto const& machine : machinesPlanned) {
std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime, std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime,
lastStatus, serverID(machine.first), shortName; lastStatus, serverID(machine.first), shortName;
// short name arrives asynchronous to machine registering, make sure // short name arrives asynchronous to machine registering, make sure
// it has arrived before trying to use it // it has arrived before trying to use it
@ -1153,7 +1153,7 @@ void Supervision::cleanupFinishedAndFailedJobs() {
constexpr size_t maximalFinishedJobs = 500; constexpr size_t maximalFinishedJobs = 500;
constexpr size_t maximalFailedJobs = 1000; constexpr size_t maximalFailedJobs = 1000;
auto cleanup = [&](std::string prefix, size_t limit) { auto cleanup = [&](std::string const& prefix, size_t limit) {
auto const& jobs = _snapshot.hasAsChildren(prefix).first; auto const& jobs = _snapshot.hasAsChildren(prefix).first;
if (jobs.size() <= 2 * limit) { if (jobs.size() <= 2 * limit) {
return; return;
@ -1170,8 +1170,8 @@ void Supervision::cleanupFinishedAndFailedJobs() {
} }
} }
std::sort(v.begin(), v.end(), [](keyDate const& a, keyDate const& b) -> bool { std::sort(v.begin(), v.end(), [](keyDate const& a, keyDate const& b) -> bool {
return a.second < b.second; return a.second < b.second;
}); });
size_t toBeDeleted = v.size() - limit; // known to be positive size_t toBeDeleted = v.size() - limit; // known to be positive
LOG_TOPIC("98451", INFO, Logger::AGENCY) << "Deleting " << toBeDeleted << " old jobs" LOG_TOPIC("98451", INFO, Logger::AGENCY) << "Deleting " << toBeDeleted << " old jobs"
" in " << prefix; " in " << prefix;
@ -1281,14 +1281,16 @@ void Supervision::workJobs() {
} }
} }
bool Supervision::verifyCoordinatorRebootID(std::string const& coordinatorID,
bool Supervision::verifyCoordinatorRebootID(std::string coordinatorID, uint64_t wantedRebootID) { uint64_t wantedRebootID, bool& coordinatorFound) {
// check if the coordinator exists in health // check if the coordinator exists in health
std::string const& health = serverHealth(coordinatorID); std::string const& health = serverHealth(coordinatorID);
LOG_TOPIC("44432", DEBUG, Logger::SUPERVISION) LOG_TOPIC("44432", DEBUG, Logger::SUPERVISION)
<< "verifyCoordinatorRebootID: coordinatorID=" << "verifyCoordinatorRebootID: coordinatorID="
<< coordinatorID << " health=" << health; << coordinatorID << " health=" << health;
// if the server is not found, health is an empty string // if the server is not found, health is an empty string
coordinatorFound = !health.empty();
if (health != "GOOD" && health != "BAD") { if (health != "GOOD" && health != "BAD") {
return false; return false;
} }
@ -1301,7 +1303,9 @@ bool Supervision::verifyCoordinatorRebootID(std::string coordinatorID, uint64_t
return rebootID.second && rebootID.first == wantedRebootID; return rebootID.second && rebootID.first == wantedRebootID;
} }
void Supervision::deleteBrokenDatabase(std::string const& database, std::string const& coordinatorID, uint64_t rebootID) { void Supervision::deleteBrokenDatabase(std::string const& database,
std::string const& coordinatorID,
uint64_t rebootID, bool coordinatorFound) {
auto envelope = std::make_shared<Builder>(); auto envelope = std::make_shared<Builder>();
{ {
VPackArrayBuilder trxs(envelope.get()); VPackArrayBuilder trxs(envelope.get());
@ -1330,10 +1334,15 @@ void Supervision::deleteBrokenDatabase(std::string const& database, std::string
} }
{ {
// precondition that this database is still in Plan and is building // precondition that this database is still in Plan and is building
VPackObjectBuilder precondition(envelope.get()); VPackObjectBuilder preconditions(envelope.get());
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseIsBuilding, VPackValue(true)); envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseIsBuilding, VPackValue(true));
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinatorRebootId, VPackValue(rebootID)); envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinatorRebootId, VPackValue(rebootID));
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinator, VPackValue(coordinatorID)); envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinator, VPackValue(coordinatorID));
{
VPackObjectBuilder precondition(envelope.get(), _agencyPrefix + healthPrefix + "/" + coordinatorID);
envelope->add("oldEmpty", VPackValue(!coordinatorFound));
}
} }
} }
} }
@ -1372,9 +1381,11 @@ void Supervision::checkBrokenCreatedDatabases() {
std::pair<std::string, bool> coordinatorID = db->hasAsString(StaticStrings::DatabaseCoordinator); std::pair<std::string, bool> coordinatorID = db->hasAsString(StaticStrings::DatabaseCoordinator);
bool keepDatabase = true; bool keepDatabase = true;
bool coordinatorFound = false;
if (rebootID.second && coordinatorID.second) { if (rebootID.second && coordinatorID.second) {
keepDatabase = verifyCoordinatorRebootID(coordinatorID.first, rebootID.first); keepDatabase = verifyCoordinatorRebootID(coordinatorID.first,
rebootID.first, coordinatorFound);
// incomplete data, should not happen // incomplete data, should not happen
} else { } else {
// v---- Please note this awesome log-id // v---- Please note this awesome log-id
@ -1387,7 +1398,7 @@ void Supervision::checkBrokenCreatedDatabases() {
LOG_TOPIC("fe522", INFO, Logger::SUPERVISION) LOG_TOPIC("fe522", INFO, Logger::SUPERVISION)
<< "checkBrokenCreatedDatabases: removing skeleton database with name " << dbpair.first; << "checkBrokenCreatedDatabases: removing skeleton database with name " << dbpair.first;
// delete this database and all of its collections // delete this database and all of its collections
deleteBrokenDatabase(dbpair.first, coordinatorID.first, rebootID.first); deleteBrokenDatabase(dbpair.first, coordinatorID.first, rebootID.first, coordinatorFound);
} }
} }
} }
@ -1533,11 +1544,11 @@ void Supervision::enforceReplication() {
auto const& col = *(col_.second); auto const& col = *(col_.second);
size_t replicationFactor; size_t replicationFactor;
auto replFact = col.hasAsUInt("replicationFactor"); auto replFact = col.hasAsUInt(StaticStrings::ReplicationFactor);
if (replFact.second) { if (replFact.second) {
replicationFactor = replFact.first; replicationFactor = replFact.first;
} else { } else {
auto replFact2 = col.hasAsString("replicationFactor"); auto replFact2 = col.hasAsString(StaticStrings::ReplicationFactor);
if (replFact2.second && replFact2.first == "satellite") { if (replFact2.second && replFact2.first == "satellite") {
// satellites => distribute to every server // satellites => distribute to every server
auto available = Job::availableServers(_snapshot); auto available = Job::availableServers(_snapshot);
@ -1549,7 +1560,7 @@ void Supervision::enforceReplication() {
} }
} }
bool clone = col.has("distributeShardsLike"); bool clone = col.has(StaticStrings::DistributeShardsLike);
if (!clone) { if (!clone) {
for (auto const& shard_ : col.hasAsChildren("shards").first) { // Pl shards for (auto const& shard_ : col.hasAsChildren("shards").first) { // Pl shards

View File

@ -90,7 +90,7 @@ class Supervision : public arangodb::CriticalThread {
}; };
/// @brief Construct sanity checking /// @brief Construct sanity checking
Supervision(); explicit Supervision();
/// @brief Default dtor /// @brief Default dtor
~Supervision(); ~Supervision();
@ -188,8 +188,10 @@ class Supervision : public arangodb::CriticalThread {
bool handleJobs(); bool handleJobs();
void handleShutdown(); void handleShutdown();
bool verifyCoordinatorRebootID(std::string coordinatorID, uint64_t wantedRebootID); bool verifyCoordinatorRebootID(std::string const& coordinatorID,
void deleteBrokenDatabase(std::string const& database, std::string const& coordinatorID, uint64_t rebootID); uint64_t wantedRebootID, bool& coordinatorFound);
void deleteBrokenDatabase(std::string const& database, std::string const& coordinatorID,
uint64_t rebootID, bool coordinatorFound);
/// @brief Migrate chains of distributeShardsLike to depth 1 /// @brief Migrate chains of distributeShardsLike to depth 1
void fixPrototypeChain(VPackBuilder&); void fixPrototypeChain(VPackBuilder&);