mirror of https://gitee.com/bigwinds/arangodb
[3.5] Added precondition to ensure that server is still as seen before. (#10477)
* Added precondition to ensure that server is still as seen before. * Removed merge conflicts.
This commit is contained in:
parent
43677785dd
commit
9a33122c5d
|
@ -459,7 +459,7 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
||||||
// Do actual monitoring
|
// Do actual monitoring
|
||||||
for (auto const& machine : machinesPlanned) {
|
for (auto const& machine : machinesPlanned) {
|
||||||
std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime,
|
std::string lastHeartbeatStatus, lastHeartbeatAcked, lastHeartbeatTime,
|
||||||
lastStatus, serverID(machine.first), shortName;
|
lastStatus, serverID(machine.first), shortName;
|
||||||
|
|
||||||
// short name arrives asynchronous to machine registering, make sure
|
// short name arrives asynchronous to machine registering, make sure
|
||||||
// it has arrived before trying to use it
|
// it has arrived before trying to use it
|
||||||
|
@ -1153,7 +1153,7 @@ void Supervision::cleanupFinishedAndFailedJobs() {
|
||||||
constexpr size_t maximalFinishedJobs = 500;
|
constexpr size_t maximalFinishedJobs = 500;
|
||||||
constexpr size_t maximalFailedJobs = 1000;
|
constexpr size_t maximalFailedJobs = 1000;
|
||||||
|
|
||||||
auto cleanup = [&](std::string prefix, size_t limit) {
|
auto cleanup = [&](std::string const& prefix, size_t limit) {
|
||||||
auto const& jobs = _snapshot.hasAsChildren(prefix).first;
|
auto const& jobs = _snapshot.hasAsChildren(prefix).first;
|
||||||
if (jobs.size() <= 2 * limit) {
|
if (jobs.size() <= 2 * limit) {
|
||||||
return;
|
return;
|
||||||
|
@ -1170,8 +1170,8 @@ void Supervision::cleanupFinishedAndFailedJobs() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::sort(v.begin(), v.end(), [](keyDate const& a, keyDate const& b) -> bool {
|
std::sort(v.begin(), v.end(), [](keyDate const& a, keyDate const& b) -> bool {
|
||||||
return a.second < b.second;
|
return a.second < b.second;
|
||||||
});
|
});
|
||||||
size_t toBeDeleted = v.size() - limit; // known to be positive
|
size_t toBeDeleted = v.size() - limit; // known to be positive
|
||||||
LOG_TOPIC("98451", INFO, Logger::AGENCY) << "Deleting " << toBeDeleted << " old jobs"
|
LOG_TOPIC("98451", INFO, Logger::AGENCY) << "Deleting " << toBeDeleted << " old jobs"
|
||||||
" in " << prefix;
|
" in " << prefix;
|
||||||
|
@ -1281,14 +1281,16 @@ void Supervision::workJobs() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Supervision::verifyCoordinatorRebootID(std::string const& coordinatorID,
|
||||||
bool Supervision::verifyCoordinatorRebootID(std::string coordinatorID, uint64_t wantedRebootID) {
|
uint64_t wantedRebootID, bool& coordinatorFound) {
|
||||||
// check if the coordinator exists in health
|
// check if the coordinator exists in health
|
||||||
std::string const& health = serverHealth(coordinatorID);
|
std::string const& health = serverHealth(coordinatorID);
|
||||||
LOG_TOPIC("44432", DEBUG, Logger::SUPERVISION)
|
LOG_TOPIC("44432", DEBUG, Logger::SUPERVISION)
|
||||||
<< "verifyCoordinatorRebootID: coordinatorID="
|
<< "verifyCoordinatorRebootID: coordinatorID="
|
||||||
<< coordinatorID << " health=" << health;
|
<< coordinatorID << " health=" << health;
|
||||||
|
|
||||||
// if the server is not found, health is an empty string
|
// if the server is not found, health is an empty string
|
||||||
|
// An empty health string means the coordinator has no Health entry, so it
// counts as "found" only when the string is non-empty. Callers derive the
// "oldEmpty" precondition from the negation of this flag.
coordinatorFound = !health.empty();
|
||||||
if (health != "GOOD" && health != "BAD") {
|
if (health != "GOOD" && health != "BAD") {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1301,7 +1303,9 @@ bool Supervision::verifyCoordinatorRebootID(std::string coordinatorID, uint64_t
|
||||||
return rebootID.second && rebootID.first == wantedRebootID;
|
return rebootID.second && rebootID.first == wantedRebootID;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Supervision::deleteBrokenDatabase(std::string const& database, std::string const& coordinatorID, uint64_t rebootID) {
|
void Supervision::deleteBrokenDatabase(std::string const& database,
|
||||||
|
std::string const& coordinatorID,
|
||||||
|
uint64_t rebootID, bool coordinatorFound) {
|
||||||
auto envelope = std::make_shared<Builder>();
|
auto envelope = std::make_shared<Builder>();
|
||||||
{
|
{
|
||||||
VPackArrayBuilder trxs(envelope.get());
|
VPackArrayBuilder trxs(envelope.get());
|
||||||
|
@ -1330,10 +1334,15 @@ void Supervision::deleteBrokenDatabase(std::string const& database, std::string
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// precondition that this database is still in Plan and is building
|
// precondition that this database is still in Plan and is building
|
||||||
VPackObjectBuilder precondition(envelope.get());
|
VPackObjectBuilder preconditions(envelope.get());
|
||||||
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseIsBuilding, VPackValue(true));
|
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseIsBuilding, VPackValue(true));
|
||||||
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinatorRebootId, VPackValue(rebootID));
|
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinatorRebootId, VPackValue(rebootID));
|
||||||
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinator, VPackValue(coordinatorID));
|
envelope->add(_agencyPrefix + planDBPrefix + database + "/" + StaticStrings::DatabaseCoordinator, VPackValue(coordinatorID));
|
||||||
|
|
||||||
|
{
|
||||||
|
VPackObjectBuilder precondition(envelope.get(), _agencyPrefix + healthPrefix + "/" + coordinatorID);
|
||||||
|
envelope->add("oldEmpty", VPackValue(!coordinatorFound));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1372,9 +1381,11 @@ void Supervision::checkBrokenCreatedDatabases() {
|
||||||
std::pair<std::string, bool> coordinatorID = db->hasAsString(StaticStrings::DatabaseCoordinator);
|
std::pair<std::string, bool> coordinatorID = db->hasAsString(StaticStrings::DatabaseCoordinator);
|
||||||
|
|
||||||
bool keepDatabase = true;
|
bool keepDatabase = true;
|
||||||
|
bool coordinatorFound = false;
|
||||||
|
|
||||||
if (rebootID.second && coordinatorID.second) {
|
if (rebootID.second && coordinatorID.second) {
|
||||||
keepDatabase = verifyCoordinatorRebootID(coordinatorID.first, rebootID.first);
|
keepDatabase = verifyCoordinatorRebootID(coordinatorID.first,
|
||||||
|
rebootID.first, coordinatorFound);
|
||||||
// incomplete data, should not happen
|
// incomplete data, should not happen
|
||||||
} else {
|
} else {
|
||||||
// v---- Please note this awesome log-id
|
// v---- Please note this awesome log-id
|
||||||
|
@ -1387,7 +1398,7 @@ void Supervision::checkBrokenCreatedDatabases() {
|
||||||
LOG_TOPIC("fe522", INFO, Logger::SUPERVISION)
|
LOG_TOPIC("fe522", INFO, Logger::SUPERVISION)
|
||||||
<< "checkBrokenCreatedDatabases: removing skeleton database with name " << dbpair.first;
|
<< "checkBrokenCreatedDatabases: removing skeleton database with name " << dbpair.first;
|
||||||
// delete this database and all of its collections
|
// delete this database and all of its collections
|
||||||
deleteBrokenDatabase(dbpair.first, coordinatorID.first, rebootID.first);
|
deleteBrokenDatabase(dbpair.first, coordinatorID.first, rebootID.first, coordinatorFound);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1533,11 +1544,11 @@ void Supervision::enforceReplication() {
|
||||||
auto const& col = *(col_.second);
|
auto const& col = *(col_.second);
|
||||||
|
|
||||||
size_t replicationFactor;
|
size_t replicationFactor;
|
||||||
auto replFact = col.hasAsUInt("replicationFactor");
|
auto replFact = col.hasAsUInt(StaticStrings::ReplicationFactor);
|
||||||
if (replFact.second) {
|
if (replFact.second) {
|
||||||
replicationFactor = replFact.first;
|
replicationFactor = replFact.first;
|
||||||
} else {
|
} else {
|
||||||
auto replFact2 = col.hasAsString("replicationFactor");
|
auto replFact2 = col.hasAsString(StaticStrings::ReplicationFactor);
|
||||||
if (replFact2.second && replFact2.first == "satellite") {
|
if (replFact2.second && replFact2.first == "satellite") {
|
||||||
// satellites => distribute to every server
|
// satellites => distribute to every server
|
||||||
auto available = Job::availableServers(_snapshot);
|
auto available = Job::availableServers(_snapshot);
|
||||||
|
@ -1549,7 +1560,7 @@ void Supervision::enforceReplication() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clone = col.has("distributeShardsLike");
|
bool clone = col.has(StaticStrings::DistributeShardsLike);
|
||||||
|
|
||||||
if (!clone) {
|
if (!clone) {
|
||||||
for (auto const& shard_ : col.hasAsChildren("shards").first) { // Pl shards
|
for (auto const& shard_ : col.hasAsChildren("shards").first) { // Pl shards
|
||||||
|
|
|
@ -90,7 +90,7 @@ class Supervision : public arangodb::CriticalThread {
|
||||||
};
|
};
|
||||||
|
|
||||||
/// @brief Construct sanity checking
|
/// @brief Construct sanity checking
|
||||||
Supervision();
|
explicit Supervision();
|
||||||
|
|
||||||
/// @brief Default dtor
|
/// @brief Default dtor
|
||||||
~Supervision();
|
~Supervision();
|
||||||
|
@ -188,8 +188,10 @@ class Supervision : public arangodb::CriticalThread {
|
||||||
|
|
||||||
bool handleJobs();
|
bool handleJobs();
|
||||||
void handleShutdown();
|
void handleShutdown();
|
||||||
bool verifyCoordinatorRebootID(std::string coordinatorID, uint64_t wantedRebootID);
|
bool verifyCoordinatorRebootID(std::string const& coordinatorID,
|
||||||
void deleteBrokenDatabase(std::string const& database, std::string const& coordinatorID, uint64_t rebootID);
|
uint64_t wantedRebootID, bool& coordinatorFound);
|
||||||
|
void deleteBrokenDatabase(std::string const& database, std::string const& coordinatorID,
|
||||||
|
uint64_t rebootID, bool coordinatorFound);
|
||||||
|
|
||||||
/// @brief Migrate chains of distributeShardsLike to depth 1
|
/// @brief Migrate chains of distributeShardsLike to depth 1
|
||||||
void fixPrototypeChain(VPackBuilder&);
|
void fixPrototypeChain(VPackBuilder&);
|
||||||
|
|
Loading…
Reference in New Issue