mirror of https://gitee.com/bigwinds/arangodb
Feature/supervsion maintenance mode (#5108)
* Supervision goes to Maintenance mode, when /arango/Supervision/Maintenance exists * coordinator route stands * stop updates in transient, when supervision off
This commit is contained in:
parent
15bb69224f
commit
3d043b35a3
|
@ -482,8 +482,8 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
|||
// Take necessary actions if any
|
||||
std::shared_ptr<VPackBuilder> envelope;
|
||||
if (changed) {
|
||||
handleOnStatus(_agent, _snapshot, persist, transist, serverID, _jobId,
|
||||
envelope);
|
||||
handleOnStatus(
|
||||
_agent, _snapshot, persist, transist, serverID, _jobId, envelope);
|
||||
}
|
||||
|
||||
persist = transist; // Now copy Status, SyncStatus from transient to persited
|
||||
|
@ -577,6 +577,40 @@ bool Supervision::doChecks() {
|
|||
return true;
|
||||
}
|
||||
|
||||
void Supervision::reportStatus(std::string const& status) {
|
||||
|
||||
bool persist = false;
|
||||
query_t report;
|
||||
|
||||
{ // Do I have to report to agency under
|
||||
_lock.assertLockedByCurrentThread();
|
||||
if (_snapshot.hasAsString("/Supervision/State/Mode").first != status) {
|
||||
// This includes the case that the mode is not set, since status
|
||||
// is never empty
|
||||
persist = true;
|
||||
}
|
||||
}
|
||||
|
||||
report = std::make_shared<VPackBuilder>();
|
||||
{ VPackArrayBuilder trx(report.get());
|
||||
{ VPackObjectBuilder br(report.get());
|
||||
report->add(VPackValue("/Supervision/State"));
|
||||
{ VPackObjectBuilder bbr(report.get());
|
||||
report->add("Mode", VPackValue(status));
|
||||
report->add("Timestamp",
|
||||
VPackValue(timepointToString(std::chrono::system_clock::now())));}}}
|
||||
|
||||
// Importatnt! No reporting in transient for Maintenance mode.
|
||||
if (status != "Maintenance") {
|
||||
transient(_agent, *report);
|
||||
}
|
||||
|
||||
if (persist) {
|
||||
write_ret_t res = singleWriteTransaction(_agent, *report);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Supervision::run() {
|
||||
// First wait until somebody has initialized the ArangoDB data, before
|
||||
// that running the supervision does not make sense and will indeed
|
||||
|
@ -634,8 +668,9 @@ void Supervision::run() {
|
|||
// Only modify this condition with extreme care:
|
||||
// Supervision needs to wait until the agent has finished leadership
|
||||
// preparation or else the local agency snapshot might be behind its
|
||||
// last state.
|
||||
if (_agent->leading() && _agent->getPrepareLeadership() == 0) {
|
||||
// last state.
|
||||
if (
|
||||
_agent->leading() && _agent->getPrepareLeadership() == 0) {
|
||||
|
||||
if (_jobId == 0 || _jobId == _jobIdMax) {
|
||||
getUniqueIds(); // cannot fail but only hang
|
||||
|
@ -643,23 +678,33 @@ void Supervision::run() {
|
|||
|
||||
updateSnapshot();
|
||||
|
||||
if (!_upgraded) {
|
||||
upgradeAgency();
|
||||
}
|
||||
if (!_snapshot.has("Supervision/Maintenance")) {
|
||||
|
||||
if (_agent->leaderFor() > 10) {
|
||||
try {
|
||||
doChecks();
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
||||
<< e.what() << " " << __FILE__ << " " << __LINE__;
|
||||
} catch (...) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
|
||||
"Supervision::doChecks() generated an uncaught exception.";
|
||||
reportStatus("Normal");
|
||||
|
||||
if (!_upgraded) {
|
||||
upgradeAgency();
|
||||
}
|
||||
}
|
||||
|
||||
handleJobs();
|
||||
if (_agent->leaderFor() > 10) {
|
||||
try {
|
||||
doChecks();
|
||||
} catch (std::exception const& e) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
||||
<< e.what() << " " << __FILE__ << " " << __LINE__;
|
||||
} catch (...) {
|
||||
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
|
||||
"Supervision::doChecks() generated an uncaught exception.";
|
||||
}
|
||||
}
|
||||
|
||||
handleJobs();
|
||||
|
||||
} else {
|
||||
|
||||
reportStatus("Maintenance");
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
_cv.wait(static_cast<uint64_t>(1000000 * _frequency));
|
||||
|
|
|
@ -164,6 +164,12 @@ class Supervision : public arangodb::Thread {
|
|||
|
||||
void shrinkCluster();
|
||||
|
||||
/**
|
||||
* @brief Report status of supervision in agency
|
||||
* @param status Status, which will show in Supervision/State
|
||||
*/
|
||||
void reportStatus(std::string const& status);
|
||||
|
||||
bool isShuttingDown();
|
||||
|
||||
bool handleJobs();
|
||||
|
|
|
@ -333,6 +333,9 @@ static void JS_WriteAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
|||
static void JS_TransactAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
||||
JS_APIAgency("transact", args);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////
/// @brief V8 callback for the agency "transient" operation
///
/// Forwards the unmodified V8 call arguments to JS_APIAgency together with
/// the operation name "transient"; all argument handling and result
/// conversion is left to JS_APIAgency.
////////////////////////////////////////////////////////////////////////////////
static void JS_TransientAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
||||
JS_APIAgency("transient", args);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -1994,6 +1997,7 @@ void TRI_InitV8Cluster(v8::Isolate* isolate, v8::Handle<v8::Context> context) {
|
|||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "read"), JS_ReadAgency);
|
||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "write"), JS_WriteAgency);
|
||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transact"), JS_TransactAgency);
|
||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transient"), JS_TransientAgency);
|
||||
|
||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "cas"), JS_CasAgency);
|
||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "createDirectory"),
|
||||
|
|
|
@ -32,6 +32,8 @@
|
|||
|
||||
var actions = require('@arangodb/actions');
|
||||
var cluster = require('@arangodb/cluster');
|
||||
var wait = require("internal").wait;
|
||||
|
||||
// var internal = require('internal');
|
||||
var _ = require('lodash');
|
||||
|
||||
|
@ -134,6 +136,88 @@ actions.defineHttp({
|
|||
// / @brief was docuBlock JSF_cluster_node_version_GET
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
actions.defineHttp({
  url: '_admin/cluster/maintenance',
  allowUseDatabase: true,
  prefix: false,

  // PUT /_admin/cluster/maintenance
  //
  // The request body must be the JSON string "on" or "off".
  //   "on"  sets /arango/Supervision/Maintenance in the agency (TTL 3600s)
  //   "off" deletes that key again
  // Afterwards the handler polls /arango/Supervision/State/Mode for up to two
  // minutes until the supervision reports the requested mode ("Maintenance"
  // or "Normal") and answers with HTTP_GATEWAY_TIMEOUT if it never does.
  callback: function (req, res) {
    // Sends a JSON error document with the given HTTP status code.
    var fail = function (code, message) {
      res.responseCode = code;
      res.body = JSON.stringify({
        'error': true,
        'errorMessage': message
      });
    };

    if (req.requestType !== actions.PUT) {
      // Only PUT is handled by this route.
      actions.resultError(req, res, actions.HTTP_FORBIDDEN, 0,
                          'only PUT requests are allowed');
      return;
    }

    // JSON.parse throws on a missing or empty body, so check it up front.
    if (req.requestBody === undefined || req.requestBody === null ||
        req.requestBody === '') {
      fail(actions.HTTP_BAD, 'empty body');
      return;
    }

    var body;
    try {
      body = JSON.parse(req.requestBody);
    } catch (err) {
      // Malformed JSON is a client error, not a server failure.
      fail(actions.HTTP_BAD, 'state string must be "on" or "off"');
      return;
    }

    let operations = {};
    if (body === "on") {
      operations['/arango/Supervision/Maintenance'] =
        {"op":"set","new":true,"ttl":3600};
    } else if (body === "off") {
      operations['/arango/Supervision/Maintenance'] = {"op":"delete"};
    } else {
      fail(actions.HTTP_BAD, 'state string must be "on" or "off"');
      return;
    }

    // No preconditions; exceptions from the agency write propagate unchanged.
    let preconditions = {};
    global.ArangoAgency.write([[operations, preconditions]]);

    var switchingOn = (body === "on");
    var expectedMode = switchingOn ? "Maintenance" : "Normal";

    // Wait 2 min for supervision to report the requested mode
    var waitUntil = new Date().getTime() + 120.0*1000;
    while (true) {
      var result =
        global.ArangoAgency.read([["/arango/Supervision/State/Mode"]])[0];
      // The state entry may not exist yet; treat that as "not there yet"
      // instead of failing with a TypeError.
      var mode = _.get(result, ['arango', 'Supervision', 'State', 'Mode']);

      if (mode === expectedMode) {
        if (switchingOn) {
          res.body = JSON.stringify({
            error: false,
            warning: 'Cluster supervision deactivated. It will be reactivated automatically in 60 minutes unless this call is repeated until then.'});
        } else {
          res.body = JSON.stringify({
            error: false,
            warning: 'Cluster supervision reactivated.'});
        }
        break;
      }

      wait(0.1);

      if (new Date().getTime() > waitUntil) {
        fail(actions.HTTP_GATEWAY_TIMEOUT, switchingOn
          ? 'timed out while waiting for supervision to go into maintenance mode'
          : 'timed out while waiting for supervision to leave maintenance mode');
        return;
      }
    }
  }});
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
// / @brief was docuBlock JSF_cluster_node_version_GET
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
actions.defineHttp({
|
||||
url: '_admin/clusterNodeVersion',
|
||||
prefix: false,
|
||||
|
|
|
@ -2130,3 +2130,4 @@ exports.HTTP_SERVER_ERROR = 500;
|
|||
exports.HTTP_NOT_IMPLEMENTED = 501;
|
||||
exports.HTTP_BAD_GATEWAY = 502;
|
||||
exports.HTTP_SERVICE_UNAVAILABLE = 503;
|
||||
exports.HTTP_GATEWAY_TIMEOUT = 504;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*jshint globalstrict:false, strict:false */
|
||||
/*global assertTrue, assertEqual */
|
||||
/*global assertTrue, assertEqual, ArangoAgency */
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief test moving shards in the cluster
|
||||
|
@ -131,7 +131,6 @@ function MovingShardsSuite () {
|
|||
}
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief test whether or not a server is clean
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -299,6 +298,34 @@ function MovingShardsSuite () {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Set supervision mode
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Switches the cluster supervision maintenance mode via the coordinator
// route PUT /_admin/cluster/maintenance.
//
// @param mode  "on" or "off"; sent JSON-encoded as the request body
// @return true if the request went through without an HTTP error status,
//         false if the request threw or the coordinator answered with an
//         error status (>= 400)
function maintenanceMode(mode) {
  console.log("Switching supervision maintenance " + mode);
  var coordEndpoint =
    global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
  var request = require("@arangodb/request");
  var endpointToURL = require("@arangodb/cluster").endpointToURL;
  var url = endpointToURL(coordEndpoint);
  var req;
  try {
    req = request({ method: "PUT",
                    url: url + "/_admin/cluster/maintenance",
                    body: JSON.stringify(mode) });
  } catch (err) {
    console.error(
      "Exception for PUT /_admin/cluster/maintenance:", err.stack);
    return false;
  }
  // A completed request is not necessarily a successful one: the route
  // answers with an error status for bad input or when the supervision does
  // not switch in time.
  if (req.statusCode >= 400) {
    console.error(
      "PUT /_admin/cluster/maintenance failed with HTTP " + req.statusCode +
      ":", req.body);
    return false;
  }
  console.log("Supervision maintenance is " + mode);
  return true;
}
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief create some collections
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -609,6 +636,41 @@ function MovingShardsSuite () {
|
|||
assertTrue(waitForSupervision());
|
||||
},
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief pausing supervision for a couple of seconds
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
testMaintenanceMode : function() {
|
||||
createSomeCollections(1, 1, 3);
|
||||
assertTrue(waitForSynchronousReplication("_system"));
|
||||
var servers = findCollectionServers("_system", c[1].name());
|
||||
var fromServer = servers[0];
|
||||
var toServer = findServerNotOnList(servers);
|
||||
var cinfo = global.ArangoClusterInfo.getCollectionInfo(
|
||||
"_system", c[1].name());
|
||||
var shard = Object.keys(cinfo.shards)[0];
|
||||
assertTrue(maintenanceMode("on"));
|
||||
assertTrue(moveShard("_system", c[1]._id, shard, fromServer, toServer));
|
||||
var first = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
|
||||
arango.Supervision.State, state;
|
||||
var waitUntil = new Date().getTime() + 30.0*1000;
|
||||
while(true) {
|
||||
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
|
||||
arango.Supervision.State;
|
||||
assertEqual(state.Timestamp, first.Timestamp);
|
||||
wait(5.0);
|
||||
if (new Date().getTime() > waitUntil) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue(maintenanceMode("off"));
|
||||
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
|
||||
arango.Supervision.State;
|
||||
assertTrue(state.Timestamp !== first.Timestamp);
|
||||
assertTrue(testServerEmpty(fromServer, false, 1, 1));
|
||||
assertTrue(waitForSupervision());
|
||||
},
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief just to allow a trailing comma at the end of the last test
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
Loading…
Reference in New Issue