1
0
Fork 0

Feature/supervision maintenance mode (#5108)

* Supervision goes to Maintenance mode, when /arango/Supervision/Maintenance exists
* coordinator route stands
* stop updates in transient, when supervision off
This commit is contained in:
Kaveh Vahedipour 2018-04-20 13:23:22 +02:00 committed by Max Neunhöffer
parent 15bb69224f
commit 3d043b35a3
6 changed files with 222 additions and 20 deletions

View File

@ -482,8 +482,8 @@ std::vector<check_t> Supervision::check(std::string const& type) {
// Take necessary actions if any
std::shared_ptr<VPackBuilder> envelope;
if (changed) {
handleOnStatus(_agent, _snapshot, persist, transist, serverID, _jobId,
envelope);
handleOnStatus(
_agent, _snapshot, persist, transist, serverID, _jobId, envelope);
}
persist = transist; // Now copy Status, SyncStatus from transient to persited
@ -577,6 +577,40 @@ bool Supervision::doChecks() {
return true;
}
void Supervision::reportStatus(std::string const& status) {
bool persist = false;
query_t report;
{ // Do I have to report to agency under
_lock.assertLockedByCurrentThread();
if (_snapshot.hasAsString("/Supervision/State/Mode").first != status) {
// This includes the case that the mode is not set, since status
// is never empty
persist = true;
}
}
report = std::make_shared<VPackBuilder>();
{ VPackArrayBuilder trx(report.get());
{ VPackObjectBuilder br(report.get());
report->add(VPackValue("/Supervision/State"));
{ VPackObjectBuilder bbr(report.get());
report->add("Mode", VPackValue(status));
report->add("Timestamp",
VPackValue(timepointToString(std::chrono::system_clock::now())));}}}
// Importatnt! No reporting in transient for Maintenance mode.
if (status != "Maintenance") {
transient(_agent, *report);
}
if (persist) {
write_ret_t res = singleWriteTransaction(_agent, *report);
}
}
void Supervision::run() {
// First wait until somebody has initialized the ArangoDB data, before
// that running the supervision does not make sense and will indeed
@ -634,8 +668,9 @@ void Supervision::run() {
// Only modify this condition with extreme care:
// Supervision needs to wait until the agent has finished leadership
// preparation or else the local agency snapshot might be behind its
// last state.
if (_agent->leading() && _agent->getPrepareLeadership() == 0) {
// last state.
if (
_agent->leading() && _agent->getPrepareLeadership() == 0) {
if (_jobId == 0 || _jobId == _jobIdMax) {
getUniqueIds(); // cannot fail but only hang
@ -643,23 +678,33 @@ void Supervision::run() {
updateSnapshot();
if (!_upgraded) {
upgradeAgency();
}
if (!_snapshot.has("Supervision/Maintenance")) {
if (_agent->leaderFor() > 10) {
try {
doChecks();
} catch (std::exception const& e) {
LOG_TOPIC(ERR, Logger::SUPERVISION)
<< e.what() << " " << __FILE__ << " " << __LINE__;
} catch (...) {
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
"Supervision::doChecks() generated an uncaught exception.";
reportStatus("Normal");
if (!_upgraded) {
upgradeAgency();
}
}
handleJobs();
if (_agent->leaderFor() > 10) {
try {
doChecks();
} catch (std::exception const& e) {
LOG_TOPIC(ERR, Logger::SUPERVISION)
<< e.what() << " " << __FILE__ << " " << __LINE__;
} catch (...) {
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
"Supervision::doChecks() generated an uncaught exception.";
}
}
handleJobs();
} else {
reportStatus("Maintenance");
}
}
}
_cv.wait(static_cast<uint64_t>(1000000 * _frequency));

View File

@ -164,6 +164,12 @@ class Supervision : public arangodb::Thread {
void shrinkCluster();
/**
* @brief Report status of supervision in agency
* @param status Status, which will show in Supervision/State
*/
void reportStatus(std::string const& status);
bool isShuttingDown();
bool handleJobs();

View File

@ -333,6 +333,9 @@ static void JS_WriteAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
static void JS_TransactAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
// Thin wrapper: hands the V8 call arguments to JS_APIAgency together with
// the name "transact"; registered as the "transact" method in
// TRI_InitV8Cluster.
JS_APIAgency("transact", args);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief JavaScript binding for the agency's "transient" API
///
/// Thin wrapper: hands the V8 call arguments to JS_APIAgency together with
/// the name "transient"; registered as the "transient" method in
/// TRI_InitV8Cluster.
////////////////////////////////////////////////////////////////////////////////
static void JS_TransientAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
JS_APIAgency("transient", args);
}
////////////////////////////////////////////////////////////////////////////////
@ -1994,6 +1997,7 @@ void TRI_InitV8Cluster(v8::Isolate* isolate, v8::Handle<v8::Context> context) {
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "read"), JS_ReadAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "write"), JS_WriteAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transact"), JS_TransactAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transient"), JS_TransientAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "cas"), JS_CasAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "createDirectory"),

View File

@ -32,6 +32,8 @@
var actions = require('@arangodb/actions');
var cluster = require('@arangodb/cluster');
var wait = require("internal").wait;
// var internal = require('internal');
var _ = require('lodash');
@ -134,6 +136,88 @@ actions.defineHttp({
// / @brief switch the cluster supervision maintenance mode on or off
// /
// / PUT /_admin/cluster/maintenance with a JSON string body, "on" or "off".
// / "on" sets /arango/Supervision/Maintenance in the agency with a TTL of
// / 3600 seconds, "off" deletes that key. The route then polls
// / /arango/Supervision/State/Mode for up to two minutes until the
// / supervision reports the requested mode ("Maintenance" or "Normal").
// /
// / Responses: error:false plus a warning text on success, HTTP 400 for a
// / missing, malformed or unknown body, HTTP 403 for any method other than
// / PUT, HTTP 504 if the mode did not change in time.
// //////////////////////////////////////////////////////////////////////////////

actions.defineHttp({
  url: '_admin/cluster/maintenance',
  allowUseDatabase: true,
  prefix: false,

  callback: function (req, res) {
    if (req.requestType !== actions.PUT) {
      actions.resultError(req, res, actions.HTTP_FORBIDDEN, 0,
        'only PUT requests are allowed');
      return;
    }

    // JSON.parse throws on a missing or malformed body (it never returns
    // undefined), so the failure has to be caught to answer with HTTP 400.
    var body;
    try {
      body = JSON.parse(req.requestBody);
    } catch (err) {
      res.responseCode = actions.HTTP_BAD;
      res.body = JSON.stringify({
        'error': true,
        'errorMessage': 'empty or malformed body'
      });
      return;
    }

    let operations = {};
    if (body === "on") {
      // The TTL makes the agency drop the key again after one hour, which
      // is what the success message below promises.
      operations['/arango/Supervision/Maintenance'] =
        {"op":"set","new":true,"ttl":3600};
    } else if (body === "off") {
      operations['/arango/Supervision/Maintenance'] = {"op":"delete"};
    } else {
      res.responseCode = actions.HTTP_BAD;
      res.body = JSON.stringify({
        'error': true,
        'errorMessage': 'state string must be "on" or "off"'
      });
      return;
    }

    // No preconditions; an exception from the agency write propagates to
    // the caller unchanged.
    let preconditions = {};
    global.ArangoAgency.write([[operations, preconditions]]);

    // Wait 2 min for supervision to report the requested mode
    var waitUntil = new Date().getTime() + 120.0*1000;
    while (true) {
      // Use a guarded lookup: the State object may not be present in the
      // read result yet, and a plain property chain would then throw.
      var mode = _.get(
        global.ArangoAgency.read([["/arango/Supervision/State/Mode"]]),
        [0, 'arango', 'Supervision', 'State', 'Mode']);

      if (body === "on" && mode === "Maintenance") {
        res.body = JSON.stringify({
          error: false,
          warning: 'Cluster supervision deactivated. It will be reactivated automatically in 60 minutes unless this call is repeated until then.'});
        break;
      } else if (body === "off" && mode === "Normal") {
        res.body = JSON.stringify({
          error: false,
          warning: 'Cluster supervision reactivated.'});
        break;
      }

      wait(0.1);

      if (new Date().getTime() > waitUntil) {
        res.responseCode = actions.HTTP_GATEWAY_TIMEOUT;
        res.body = JSON.stringify({
          'error': true,
          'errorMessage':
          'timed out while waiting for supervision to go into maintenance mode'
        });
        return;
      }
    }
  }
});
// //////////////////////////////////////////////////////////////////////////////
// / @brief was docuBlock JSF_cluster_node_version_GET
// //////////////////////////////////////////////////////////////////////////////
actions.defineHttp({
url: '_admin/clusterNodeVersion',
prefix: false,

View File

@ -2130,3 +2130,4 @@ exports.HTTP_SERVER_ERROR = 500;
exports.HTTP_NOT_IMPLEMENTED = 501;
exports.HTTP_BAD_GATEWAY = 502;
exports.HTTP_SERVICE_UNAVAILABLE = 503;
exports.HTTP_GATEWAY_TIMEOUT = 504;

View File

@ -1,5 +1,5 @@
/*jshint globalstrict:false, strict:false */
/*global assertTrue, assertEqual */
/*global assertTrue, assertEqual, ArangoAgency */
////////////////////////////////////////////////////////////////////////////////
/// @brief test moving shards in the cluster
@ -131,7 +131,6 @@ function MovingShardsSuite () {
}
////////////////////////////////////////////////////////////////////////////////
/// @brief test whether or not a server is clean
////////////////////////////////////////////////////////////////////////////////
@ -299,6 +298,34 @@ function MovingShardsSuite () {
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief Set supervision mode
////////////////////////////////////////////////////////////////////////////////
// Switches the supervision maintenance mode via the coordinator route
// PUT /_admin/cluster/maintenance on "Coordinator0001".
//
// mode: "on" or "off"; sent as the JSON-encoded request body.
// Returns true only if the request went through and the coordinator
// answered with a 2xx status, false otherwise (exception or error status).
function maintenanceMode(mode) {
  console.log("Switching supervision maintenance " + mode);

  var request = require("@arangodb/request");
  var endpointToURL = require("@arangodb/cluster").endpointToURL;
  var coordEndpoint =
    global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
  var url = endpointToURL(coordEndpoint);

  var response;
  try {
    response = request({ method: "PUT",
                         url: url + "/_admin/cluster/maintenance",
                         body: JSON.stringify(mode) });
  } catch (err) {
    console.error(
      "Exception for PUT /_admin/cluster/maintenance:", err.stack);
    return false;
  }

  // The route reports bad input and timeouts through the status code, not
  // through an exception, so the status has to be checked explicitly.
  if (!(response.statusCode >= 200 && response.statusCode < 300)) {
    console.error(
      "PUT /_admin/cluster/maintenance failed with status " +
      response.statusCode + ":", response.body);
    return false;
  }

  console.log("Supervision maintenance is " + mode);
  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief create some collections
////////////////////////////////////////////////////////////////////////////////
@ -609,6 +636,41 @@ function MovingShardsSuite () {
assertTrue(waitForSupervision());
},
////////////////////////////////////////////////////////////////////////////////
/// @brief pausing supervision for a couple of seconds
////////////////////////////////////////////////////////////////////////////////
testMaintenanceMode : function() {
createSomeCollections(1, 1, 3);
assertTrue(waitForSynchronousReplication("_system"));
var servers = findCollectionServers("_system", c[1].name());
var fromServer = servers[0];
var toServer = findServerNotOnList(servers);
var cinfo = global.ArangoClusterInfo.getCollectionInfo(
"_system", c[1].name());
var shard = Object.keys(cinfo.shards)[0];
assertTrue(maintenanceMode("on"));
assertTrue(moveShard("_system", c[1]._id, shard, fromServer, toServer));
var first = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
arango.Supervision.State, state;
var waitUntil = new Date().getTime() + 30.0*1000;
while(true) {
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
arango.Supervision.State;
assertEqual(state.Timestamp, first.Timestamp);
wait(5.0);
if (new Date().getTime() > waitUntil) {
break;
}
}
assertTrue(maintenanceMode("off"));
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
arango.Supervision.State;
assertTrue(state.Timestamp !== first.Timestamp);
assertTrue(testServerEmpty(fromServer, false, 1, 1));
assertTrue(waitForSupervision());
},
////////////////////////////////////////////////////////////////////////////////
/// @brief just to allow a trailing comma at the end of the last test
////////////////////////////////////////////////////////////////////////////////