1
0
Fork 0

leader fail seems good

This commit is contained in:
Kaveh Vahedipour 2016-05-31 15:21:42 +02:00
parent 1538bb7cb0
commit bad7a6a35a
2 changed files with 106 additions and 90 deletions

View File

@ -34,16 +34,30 @@ using namespace arangodb;
using namespace arangodb::consensus;
std::string printTimestamp(Supervision::TimePoint const& t) {
std::string timepointToString(Supervision::TimePoint const& t) {
time_t tt = std::chrono::system_clock::to_time_t(t);
struct tm tb;
size_t const len (21);
char buffer[len];
TRI_gmtime(tt, &tb);
TRI_localtime(tt, &tb);
::strftime(buffer, sizeof(buffer), "%Y-%m-%dT%H:%M:%SZ", &tb);
return std::string(buffer, len-1);
}
Supervision::TimePoint stringToTimepoint(std::string const& str) {
std::tm tt;
tt.tm_year = std::stoi(str.substr(0,4)) - 1900;
tt.tm_mon = std::stoi(str.substr(5,2)) - 1;
tt.tm_mday = std::stoi(str.substr(8,2));
tt.tm_hour = std::stoi(str.substr(11,2));
tt.tm_min = std::stoi(str.substr(14,2));
tt.tm_sec = std::stoi(str.substr(17,2));
tt.tm_isdst = -1;
auto time_c = ::mktime(&tt);
return std::chrono::system_clock::from_time_t(time_c);
}
inline arangodb::consensus::write_ret_t transact (
Agent* _agent, Builder const& transaction) {
@ -58,7 +72,7 @@ inline arangodb::consensus::write_ret_t transact (
LOG_TOPIC(ERR, Logger::AGENCY) << e.what();
}
LOG(INFO) << envelope->toJson();
LOG_TOPIC(DEBUG, Logger::AGENCY) << envelope->toJson();
return _agent->write(envelope);
}
@ -120,7 +134,7 @@ bool Job::finish(std::string const& type, bool success = true) const {
finished.add(_agencyPrefix + (success ? finishedPrefix : failedPrefix)
+ _jobId, VPackValue(VPackValueType::Object));
finished.add("timeFinished",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
VPackValue(timepointToString(std::chrono::system_clock::now())));
for (auto const& obj : VPackObjectIterator(pending.slice()[0])) {
finished.add(obj.key.copyString(), obj.value);
}
@ -133,7 +147,7 @@ bool Job::finish(std::string const& type, bool success = true) const {
finished.close();
// --- Remove block
finished.add(_agencyPrefix + "/Supervision/" + type + _jobId,
finished.add(_agencyPrefix + "/Supervision/" + type,
VPackValue(VPackValueType::Object));
finished.add("op", VPackValue("delete"));
finished.close();
@ -198,7 +212,7 @@ struct FailedLeader : public Job {
todo.add("isLeader", VPackValue(true));
todo.add("jobId", VPackValue(_jobId));
todo.add("timeCreated",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
VPackValue(timepointToString(std::chrono::system_clock::now())));
todo.close(); todo.close(); todo.close();
write_ret_t res = transact(_agent, todo);
@ -245,7 +259,7 @@ struct FailedLeader : public Job {
pending.add(_agencyPrefix + pendingPrefix + _jobId,
VPackValue(VPackValueType::Object));
pending.add("timeStarted",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
VPackValue(timepointToString(std::chrono::system_clock::now())));
for (auto const& obj : VPackObjectIterator(todo.slice()[0])) {
pending.add(obj.key.copyString(), obj.value);
}
@ -336,7 +350,7 @@ struct FailedLeader : public Job {
if (planned.slice()[0] == current.slice()[0]) {
if (finish("Shards/" + _shard + "/")) {
if (finish("Shards/" + shard)) {
return FINISHED;
}
@ -406,7 +420,7 @@ struct FailedServer : public Job {
pending.add(_agencyPrefix + pendingPrefix + _jobId,
VPackValue(VPackValueType::Object));
pending.add("timeStarted",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
VPackValue(timepointToString(std::chrono::system_clock::now())));
for (auto const& obj : VPackObjectIterator(todo.slice()[0])) {
pending.add(obj.key.copyString(), obj.value);
}
@ -498,7 +512,7 @@ struct FailedServer : public Job {
todo.add("jobId", VPackValue(_jobId));
todo.add("creator", VPackValue(_creator));
todo.add("timeCreated",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
VPackValue(timepointToString(std::chrono::system_clock::now())));
todo.close(); todo.close(); todo.close();
write_ret_t res = transact(_agent, todo);
@ -537,7 +551,7 @@ struct FailedServer : public Job {
}
if (!found) {
if (finish("DBServers/" + _failed + "/")) {
if (finish("DBServers/" + _failed)) {
return FINISHED;
}
}
@ -569,6 +583,7 @@ struct CleanOutServer : public Job {
std::string const& creator, std::string const& prefix,
std::string const& server) :
Job(snapshot, agent, jobId, creator, prefix), _server(server) {
}
virtual ~CleanOutServer () {}
@ -578,33 +593,8 @@ struct CleanOutServer : public Job {
}
virtual bool create () const {
LOG_TOPIC(INFO, Logger::AGENCY) << "Todo: Clean out server " + _server;
std::string path = _agencyPrefix + toDoPrefix + _jobId;
Builder todo;
todo.openArray();
todo.openObject();
todo.add(path, VPackValue(VPackValueType::Object));
todo.add("type", VPackValue("cleanOutServer"));
todo.add("server", VPackValue(_server));
todo.add("jobId", VPackValue(_jobId));
todo.add("creator", VPackValue(_creator));
todo.add("timeCreated",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
todo.close(); todo.close(); todo.close();
write_ret_t res = transact(_agent, todo);
if (res.accepted && res.indices.size()==1 && res.indices[0]) {
return true;
}
LOG_TOPIC(INFO, Logger::AGENCY) << "Failed to insert job " + _jobId;
return false;
};
}
virtual bool start() const {
@ -624,7 +614,7 @@ struct CleanOutServer : public Job {
pending.add(_agencyPrefix + pendingPrefix + _jobId,
VPackValue(VPackValueType::Object));
pending.add("timeStarted",
VPackValue(printTimestamp(std::chrono::system_clock::now())));
VPackValue(timepointToString(std::chrono::system_clock::now())));
for (auto const& obj : VPackObjectIterator(todo.slice()[0])) {
pending.add(obj.key.copyString(), obj.value);
}
@ -731,7 +721,7 @@ void Supervision::wakeUp() {
}
static std::string const syncPrefix = "/Sync/ServerStates/";
static std::string const supervisionPrefix = "/Supervision/Health/";
static std::string const healthPrefix = "/Supervision/Health/";
static std::string const planDBServersPrefix = "/Plan/DBServers";
std::vector<check_t> Supervision::checkDBServers() {
@ -740,63 +730,65 @@ std::vector<check_t> Supervision::checkDBServers() {
_snapshot(planDBServersPrefix).children();
for (auto const& machine : machinesPlanned) {
ServerID const& serverID = machine.first;
auto it = _vitalSigns.find(serverID);
std::string lastHeartbeatTime =
_snapshot(syncPrefix + serverID + "/time").toJson();
std::string lastHeartbeatStatus =
_snapshot(syncPrefix + serverID + "/status").toJson();
if (it != _vitalSigns.end()) { // Existing server
bool good = false;
std::string lastHeartbeatTime, lastHeartbeatStatus, lastHeartbeatAcked,
lastStatus, heartbeatTime, heartbeatStatus, serverID;
query_t report = std::make_shared<Builder>();
report->openArray();
report->openArray();
report->openObject();
report->add(_agencyPrefix + supervisionPrefix + serverID,
VPackValue(VPackValueType::Object));
report->add("LastHearbeatReceived",
VPackValue(printTimestamp(it->second->myTimestamp)));
report->add("LastHearbeatSent", VPackValue(it->second->serverTimestamp));
report->add("LastHearbeatStatus", VPackValue(lastHeartbeatStatus));
if (it->second->serverTimestamp == lastHeartbeatTime) {
report->add("Status", VPackValue("DOWN"));
std::chrono::seconds t{0};
t = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now() - it->second->myTimestamp);
if (t.count() > _gracePeriod) { // Failure
if (it->second->maintenance() == "0") {
it->second->maintenance(std::to_string(_jobId++));
}
FailedServer fsj(_snapshot, _agent, it->second->maintenance(),
serverID = machine.first;
heartbeatTime = _snapshot(syncPrefix + serverID + "/time").toJson();
heartbeatStatus = _snapshot(syncPrefix + serverID + "/status").toJson();
try { // Existing
lastHeartbeatTime =
_snapshot(healthPrefix + serverID + "/LastHeartbeatSent").toJson();
lastHeartbeatStatus =
_snapshot(healthPrefix + serverID + "/LastHeartbeatStatus").toJson();
lastHeartbeatAcked =
_snapshot(healthPrefix + serverID + "/LastHeartbeatAcked").toJson();
lastStatus = _snapshot(healthPrefix + serverID + "/Status").toJson();
if (lastHeartbeatTime != heartbeatTime) { // Update
good = true;
}
} catch (...) { // New server
good = true;
}
query_t report = std::make_shared<Builder>();
report->openArray();
report->openArray();
report->openObject();
report->add(_agencyPrefix + healthPrefix + serverID,
VPackValue(VPackValueType::Object));
report->add("LastHeartbeatSent", VPackValue(heartbeatTime));
report->add("LastHeartbeatStatus", VPackValue(heartbeatStatus));
if (good) {
report->add("LastHeartbeatAcked",
VPackValue(
timepointToString(std::chrono::system_clock::now())));
report->add("Status", VPackValue("UP"));
} else {
std::chrono::seconds t{0};
t = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now()-stringToTimepoint(lastHeartbeatAcked));
if (t.count() > _gracePeriod) { // Failure
if (lastStatus == "DOWN") {
report->add("Status", VPackValue("FAILED"));
FailedServer fsj(_snapshot, _agent, std::to_string(_jobId++),
"supervision", _agencyPrefix, serverID);
}
} else {
report->add("Status", VPackValue("UP"));
it->second->update(lastHeartbeatStatus, lastHeartbeatTime);
report->add("Status", VPackValue("DOWN"));
}
report->close();
report->close();
report->close();
report->close();
_agent->write(report);
} else { // New server
_vitalSigns[serverID] =
std::make_shared<VitalSign>(lastHeartbeatStatus, lastHeartbeatTime);
}
}
auto itr = _vitalSigns.begin();
while (itr != _vitalSigns.end()) {
if (machinesPlanned.find(itr->first) == machinesPlanned.end()) {
itr = _vitalSigns.erase(itr);
} else {
++itr;
}
report->close();
report->close();
report->close();
report->close();
_agent->write(report);
}
return ret;
@ -851,13 +843,35 @@ void Supervision::run() {
// Do supervision
doChecks(timedout);
workJobs();
}
}
void Supervision::workJobs() {
for (auto const& todoEnt : _snapshot(toDoPrefix).children()) {
Node const& todo = *todoEnt.second;
if (todo("type").toJson() == "failedServer") {
FailedServer fs (
_snapshot, _agent, todo("jobId").toJson(), todo("creator").toJson(),
_agencyPrefix, todo("server").toJson());
}
}
for (auto const& todoEnt : _snapshot(pendingPrefix).children()) {
Node const& todo = *todoEnt.second;
if (todo("type").toJson() == "failedServer") {
FailedServer fs (
_snapshot, _agent, todo("jobId").toJson(), todo("creator").toJson(),
_agencyPrefix, todo("server").toJson());
}
}
}
// Start thread
bool Supervision::start() {
Thread::start();

View File

@ -152,6 +152,8 @@ class Supervision : public arangodb::Thread {
std::vector<check_t> checkDBServers();
std::vector<check_t> checkShards();
void workJobs();
/// @brief Get unique ids from agency
bool getUniqueIds();
@ -172,7 +174,7 @@ class Supervision : public arangodb::Thread {
///@brief last vital signs as reported through heartbeats to agency
///
std::map<ServerID, std::shared_ptr<VitalSign>> _vitalSigns;
// std::map<ServerID, std::shared_ptr<VitalSign>> _vitalSigns;
long _frequency;
long _gracePeriod;