1
0
Fork 0

stop supervision on demand (#5109)

* stop supervision on demand
* adding tests
* Correct an error message.
This commit is contained in:
Kaveh Vahedipour 2018-04-20 11:58:47 +02:00 committed by Max Neunhöffer
parent b843dd6aec
commit 507418d9a4
7 changed files with 231 additions and 23 deletions

View File

@ -8,7 +8,7 @@ v3.3.8 (XXXX-XX-XX)
* fix lookups by `_id` in smart graph edge collections * fix lookups by `_id` in smart graph edge collections
* improve startup resilience in case there are datafile errors (MMFiles) * improve startup resilience in case there are datafile errors (MMFiles)
also allow repairing broken VERSION files automatically on startup by also allow repairing broken VERSION files automatically on startup by
specifying the option `--database.ignore-datafile-errors true` specifying the option `--database.ignore-datafile-errors true`
@ -17,16 +17,18 @@ v3.3.8 (XXXX-XX-XX)
* fixed internal issue #2148: Number of documents found by filter is misleading in web UI * fixed internal issue #2148: Number of documents found by filter is misleading in web UI
* added startup option `--database.required-directory-state` * added startup option `--database.required-directory-state`
using this option it is possible to require the database directory to be using this option it is possible to require the database directory to be
in a specific state on startup. the options for this value are: in a specific state on startup. the options for this value are:
- non-existing: database directory must not exist - non-existing: database directory must not exist
- existing: database directory must exist - existing: database directory must exist
- empty: database directory must exist but be empty - empty: database directory must exist but be empty
- populated: database directory must exist and contain specific files already - populated: database directory must exist and contain specific files already
- any: any state allowed - any: any state allowed
* supervision can be put into maintenance mode
v3.3.7 (2018-04-11) v3.3.7 (2018-04-11)
------------------- -------------------

View File

@ -481,8 +481,8 @@ std::vector<check_t> Supervision::check(std::string const& type) {
// Take necessary actions if any // Take necessary actions if any
std::shared_ptr<VPackBuilder> envelope; std::shared_ptr<VPackBuilder> envelope;
if (changed) { if (changed) {
handleOnStatus(_agent, _snapshot, persist, transist, serverID, _jobId, handleOnStatus(
envelope); _agent, _snapshot, persist, transist, serverID, _jobId, envelope);
} }
persist = transist; // Now copy Status, SyncStatus from transient to persisted persist = transist; // Now copy Status, SyncStatus from transient to persisted
@ -576,6 +576,43 @@ bool Supervision::doChecks() {
return true; return true;
} }
void Supervision::reportStatus(std::string const& status) {
bool persist = false;
query_t report;
{ // Do I have to report to agency under
_lock.assertLockedByCurrentThread();
if (_snapshot.has("/Supervision/State/Mode") &&
_snapshot("/Supervision/State/Mode").isString()) {
if (_snapshot("/Supervision/State/Mode").getString() != status) {
persist = true;
}
} else {
persist = true;
}
}
report = std::make_shared<VPackBuilder>();
{ VPackArrayBuilder trx(report.get());
{ VPackObjectBuilder br(report.get());
report->add(VPackValue("/Supervision/State"));
{ VPackObjectBuilder bbr(report.get());
report->add("Mode", VPackValue(status));
report->add("Timestamp",
VPackValue(timepointToString(std::chrono::system_clock::now())));}}}
// Importatnt! No reporting in transient for Maintenance mode.
if (status != "Maintenance") {
transient(_agent, *report);
}
if (persist) {
write_ret_t res = singleWriteTransaction(_agent, *report);
}
}
void Supervision::run() { void Supervision::run() {
// First wait until somebody has initialized the ArangoDB data, before // First wait until somebody has initialized the ArangoDB data, before
// that running the supervision does not make sense and will indeed // that running the supervision does not make sense and will indeed
@ -633,8 +670,9 @@ void Supervision::run() {
// Only modify this condition with extreme care: // Only modify this condition with extreme care:
// Supervision needs to wait until the agent has finished leadership // Supervision needs to wait until the agent has finished leadership
// preparation or else the local agency snapshot might be behind its // preparation or else the local agency snapshot might be behind its
// last state. // last state.
if (_agent->leading() && _agent->getPrepareLeadership() == 0) { if (
_agent->leading() && _agent->getPrepareLeadership() == 0) {
if (_jobId == 0 || _jobId == _jobIdMax) { if (_jobId == 0 || _jobId == _jobIdMax) {
getUniqueIds(); // cannot fail but only hang getUniqueIds(); // cannot fail but only hang
@ -642,23 +680,33 @@ void Supervision::run() {
updateSnapshot(); updateSnapshot();
if (!_upgraded) { if (!_snapshot.has("Supervision/Maintenance")) {
upgradeAgency();
}
if (_agent->leaderFor() > 10) { reportStatus("Normal");
try {
doChecks(); if (!_upgraded) {
} catch (std::exception const& e) { upgradeAgency();
LOG_TOPIC(ERR, Logger::SUPERVISION)
<< e.what() << " " << __FILE__ << " " << __LINE__;
} catch (...) {
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
"Supervision::doChecks() generated an uncaught exception.";
} }
}
handleJobs(); if (_agent->leaderFor() > 10) {
try {
doChecks();
} catch (std::exception const& e) {
LOG_TOPIC(ERR, Logger::SUPERVISION)
<< e.what() << " " << __FILE__ << " " << __LINE__;
} catch (...) {
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
"Supervision::doChecks() generated an uncaught exception.";
}
}
handleJobs();
} else {
reportStatus("Maintenance");
}
} }
} }
_cv.wait(static_cast<uint64_t>(1000000 * _frequency)); _cv.wait(static_cast<uint64_t>(1000000 * _frequency));

View File

@ -164,6 +164,12 @@ class Supervision : public arangodb::Thread {
void shrinkCluster(); void shrinkCluster();
/**
 * @brief Report status of supervision in agency
 *
 * Sets `Mode` and `Timestamp` below Supervision/State. The report is sent
 * to the transient store for every status except "Maintenance", and is
 * written persistently when the mode in the current snapshot is missing,
 * not a string, or different from @p status. Asserts that the calling
 * thread holds the supervision lock.
 *
 * @param status Status, which will show in Supervision/State
 */
void reportStatus(std::string const& status);
bool isShuttingDown(); bool isShuttingDown();
bool handleJobs(); bool handleJobs();

View File

@ -333,6 +333,9 @@ static void JS_WriteAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
static void JS_TransactAgency(v8::FunctionCallbackInfo<v8::Value> const& args) { static void JS_TransactAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
JS_APIAgency("transact", args); JS_APIAgency("transact", args);
} }
/// @brief forwards the script call unchanged to JS_APIAgency, selecting the
/// "transient" agency operation
static void JS_TransientAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
JS_APIAgency("transient", args);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -1994,6 +1997,7 @@ void TRI_InitV8Cluster(v8::Isolate* isolate, v8::Handle<v8::Context> context) {
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "read"), JS_ReadAgency); TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "read"), JS_ReadAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "write"), JS_WriteAgency); TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "write"), JS_WriteAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transact"), JS_TransactAgency); TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transact"), JS_TransactAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transient"), JS_TransientAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "cas"), JS_CasAgency); TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "cas"), JS_CasAgency);
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "createDirectory"), TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "createDirectory"),

View File

@ -33,6 +33,8 @@
var actions = require('@arangodb/actions'); var actions = require('@arangodb/actions');
var cluster = require('@arangodb/cluster'); var cluster = require('@arangodb/cluster');
var wait = require("internal").wait;
// var internal = require('internal'); // var internal = require('internal');
var _ = require('lodash'); var _ = require('lodash');
@ -220,6 +222,89 @@ actions.defineHttp({
// / @brief was docuBlock JSF_cluster_node_version_GET // / @brief was docuBlock JSF_cluster_node_version_GET
// ////////////////////////////////////////////////////////////////////////////// // //////////////////////////////////////////////////////////////////////////////
// PUT _admin/cluster/maintenance
// Body is the JSON string "on" or "off". Sets or deletes the agency key
// /arango/Supervision/Maintenance and then polls
// /arango/Supervision/State/Mode until the supervision reports the
// requested mode, or until two minutes have passed.
actions.defineHttp({
  url: '_admin/cluster/maintenance',
  allowUseDatabase: true,
  prefix: false,

  callback: function (req, res) {
    if (req.requestType !== actions.PUT) {
      actions.resultError(req, res, actions.HTTP_FORBIDDEN, 0,
        'only PUT requests are allowed');
      return;
    }

    // JSON.parse throws on a missing or malformed body and never returns
    // undefined, so a parse failure is mapped to "no usable body" here.
    var body;
    try {
      body = JSON.parse(req.requestBody);
    } catch (e) {
      body = undefined;
    }
    if (body === undefined) {
      res.responseCode = actions.HTTP_BAD;
      res.body = JSON.stringify({
        'error': true,
        'errorMessage': 'empty or invalid JSON body'
      });
      return;
    }

    let operations = {};
    if (body === "on") {
      // The ttl matches the 60 minutes announced in the response below.
      operations['/arango/Supervision/Maintenance'] =
        {"op":"set","new":true,"ttl":3600};
    } else if (body === "off") {
      operations['/arango/Supervision/Maintenance'] = {"op":"delete"};
    } else {
      res.responseCode = actions.HTTP_BAD;
      res.body = JSON.stringify({
        'error': true,
        'errorMessage': 'state string must be "on" or "off"'
      });
      return;
    }
    let preconditions = {};
    // Errors from the agency write propagate to the caller unchanged.
    global.ArangoAgency.write([[operations, preconditions]]);

    var expectedMode = (body === "on") ? "Maintenance" : "Normal";

    // Wait 2 min for supervision to report the requested mode
    var waitUntil = new Date().getTime() + 120.0*1000;
    while (true) {
      // The State entry may not exist yet; treat that as "mode unknown"
      // and keep polling instead of failing on a missing property.
      var result =
        global.ArangoAgency.read([["/arango/Supervision/State/Mode"]])[0];
      var state = result && result.arango && result.arango.Supervision &&
        result.arango.Supervision.State;
      var mode = state ? state.Mode : undefined;

      if (mode === expectedMode) {
        if (body === "on") {
          res.body = JSON.stringify({
            error: false,
            warning: 'Cluster supervision deactivated. It will be reactivated automatically in 60 minutes unless this call is repeated until then.'});
        } else {
          res.body = JSON.stringify({
            error: false,
            warning: 'Cluster supervision reactivated.'});
        }
        return;
      }

      wait(0.1);

      if (new Date().getTime() > waitUntil) {
        res.responseCode = actions.HTTP_GATEWAY_TIMEOUT;
        res.body = JSON.stringify({
          'error': true,
          'errorMessage':
          'timed out while waiting for supervision to go into ' +
            (body === "on" ? 'maintenance' : 'normal') + ' mode'
        });
        return;
      }
    }
  }});
// //////////////////////////////////////////////////////////////////////////////
// / @brief was docuBlock JSF_cluster_node_version_GET
// //////////////////////////////////////////////////////////////////////////////
actions.defineHttp({ actions.defineHttp({
url: '_admin/clusterNodeVersion', url: '_admin/clusterNodeVersion',
prefix: false, prefix: false,

View File

@ -2130,3 +2130,4 @@ exports.HTTP_SERVER_ERROR = 500;
exports.HTTP_NOT_IMPLEMENTED = 501; exports.HTTP_NOT_IMPLEMENTED = 501;
exports.HTTP_BAD_GATEWAY = 502; exports.HTTP_BAD_GATEWAY = 502;
exports.HTTP_SERVICE_UNAVAILABLE = 503; exports.HTTP_SERVICE_UNAVAILABLE = 503;
exports.HTTP_GATEWAY_TIMEOUT = 504;

View File

@ -1,5 +1,5 @@
/*jshint globalstrict:false, strict:false */ /*jshint globalstrict:false, strict:false */
/*global assertTrue, assertEqual */ /*global assertTrue, assertEqual, ArangoAgency */
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// @brief test moving shards in the cluster /// @brief test moving shards in the cluster
@ -131,7 +131,6 @@ function MovingShardsSuite () {
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// @brief test whether or not a server is clean /// @brief test whether or not a server is clean
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -299,6 +298,34 @@ function MovingShardsSuite () {
} }
} }
////////////////////////////////////////////////////////////////////////////////
/// @brief Set supervision mode
////////////////////////////////////////////////////////////////////////////////
// Switches supervision maintenance "on" or "off" through the coordinator's
// PUT /_admin/cluster/maintenance endpoint.
// Returns true only if the request went through and was answered with a
// 2xx status; returns false on an exception or on any other status.
function maintenanceMode(mode) {
  console.log("Switching supervision maintenance " + mode);
  var coordEndpoint =
    global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
  var request = require("@arangodb/request");
  var endpointToURL = require("@arangodb/cluster").endpointToURL;
  var url = endpointToURL(coordEndpoint);
  var req;
  try {
    req = request({ method: "PUT",
                    url: url + "/_admin/cluster/maintenance",
                    body: JSON.stringify(mode) });
  } catch (err) {
    console.error(
      "Exception for PUT /_admin/cluster/maintenance:", err.stack);
    return false;
  }
  // An error reply (bad body, timeout while waiting for the supervision)
  // arrives as a regular response, so the status has to be checked here.
  if (!req || req.statusCode < 200 || req.statusCode >= 300) {
    console.error(
      "PUT /_admin/cluster/maintenance failed:",
      req ? req.statusCode + " " + req.body : "no response");
    return false;
  }
  console.log("Supervision maintenance is " + mode);
  return true;
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// @brief create some collections /// @brief create some collections
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -609,6 +636,41 @@ function MovingShardsSuite () {
assertTrue(waitForSupervision()); assertTrue(waitForSupervision());
}, },
////////////////////////////////////////////////////////////////////////////////
/// @brief pausing supervision for a couple of seconds
////////////////////////////////////////////////////////////////////////////////
testMaintenanceMode : function() {
createSomeCollections(1, 1, 3);
assertTrue(waitForSynchronousReplication("_system"));
var servers = findCollectionServers("_system", c[1].name());
var fromServer = servers[0];
var toServer = findServerNotOnList(servers);
var cinfo = global.ArangoClusterInfo.getCollectionInfo(
"_system", c[1].name());
var shard = Object.keys(cinfo.shards)[0];
assertTrue(maintenanceMode("on"));
assertTrue(moveShard("_system", c[1]._id, shard, fromServer, toServer));
var first = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
arango.Supervision.State, state;
var waitUntil = new Date().getTime() + 30.0*1000;
while(true) {
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
arango.Supervision.State;
assertEqual(state.Timestamp, first.Timestamp);
wait(5.0);
if (new Date().getTime() > waitUntil) {
break;
}
}
assertTrue(maintenanceMode("off"));
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
arango.Supervision.State;
assertTrue(state.Timestamp !== first.Timestamp);
assertTrue(testServerEmpty(fromServer, false, 1, 1));
assertTrue(waitForSupervision());
},
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
/// @brief just to allow a trailing comma at the end of the last test /// @brief just to allow a trailing comma at the end of the last test
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////