mirror of https://gitee.com/bigwinds/arangodb
stop supervision on demand (#5109)
* stop supervision on demand * adding tests * Correct an error message.
This commit is contained in:
parent
b843dd6aec
commit
507418d9a4
|
@ -8,7 +8,7 @@ v3.3.8 (XXXX-XX-XX)
|
||||||
* fix lookups by `_id` in smart graph edge collections
|
* fix lookups by `_id` in smart graph edge collections
|
||||||
|
|
||||||
* improve startup resilience in case there are datafile errors (MMFiles)
|
* improve startup resilience in case there are datafile errors (MMFiles)
|
||||||
|
|
||||||
also allow repairing broken VERSION files automatically on startup by
|
also allow repairing broken VERSION files automatically on startup by
|
||||||
specifying the option `--database.ignore-datafile-errors true`
|
specifying the option `--database.ignore-datafile-errors true`
|
||||||
|
|
||||||
|
@ -17,16 +17,18 @@ v3.3.8 (XXXX-XX-XX)
|
||||||
* fixed internal issue #2148: Number of documents found by filter is misleading in web UI
|
* fixed internal issue #2148: Number of documents found by filter is misleading in web UI
|
||||||
|
|
||||||
* added startup option `--database.required-directory-state`
|
* added startup option `--database.required-directory-state`
|
||||||
|
|
||||||
using this option it is possible to require the database directory to be
|
using this option it is possible to require the database directory to be
|
||||||
in a specific state on startup. the options for this value are:
|
in a specific state on startup. the options for this value are:
|
||||||
|
|
||||||
- non-existing: database directory must not exist
|
- non-existing: database directory must not exist
|
||||||
- existing: database directory must exist
|
- existing: database directory must exist
|
||||||
- empty: database directory must exist but be empty
|
- empty: database directory must exist but be empty
|
||||||
- populated: database directory must exist and contain specific files already
|
- populated: database directory must exist and contain specific files already
|
||||||
- any: any state allowed
|
- any: any state allowed
|
||||||
|
|
||||||
|
* supervision can be put into maintenance mode by sending `"on"` (or `"off"` to
  leave it again) via `PUT /_admin/cluster/maintenance`
|
||||||
|
|
||||||
|
|
||||||
v3.3.7 (2018-04-11)
|
v3.3.7 (2018-04-11)
|
||||||
-------------------
|
-------------------
|
||||||
|
|
|
@ -481,8 +481,8 @@ std::vector<check_t> Supervision::check(std::string const& type) {
|
||||||
// Take necessary actions if any
|
// Take necessary actions if any
|
||||||
std::shared_ptr<VPackBuilder> envelope;
|
std::shared_ptr<VPackBuilder> envelope;
|
||||||
if (changed) {
|
if (changed) {
|
||||||
handleOnStatus(_agent, _snapshot, persist, transist, serverID, _jobId,
|
handleOnStatus(
|
||||||
envelope);
|
_agent, _snapshot, persist, transist, serverID, _jobId, envelope);
|
||||||
}
|
}
|
||||||
|
|
||||||
persist = transist; // Now copy Status, SyncStatus from transient to persited
|
persist = transist; // Now copy Status, SyncStatus from transient to persited
|
||||||
|
@ -576,6 +576,43 @@ bool Supervision::doChecks() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Supervision::reportStatus(std::string const& status) {
|
||||||
|
|
||||||
|
bool persist = false;
|
||||||
|
query_t report;
|
||||||
|
|
||||||
|
{ // Do I have to report to agency under
|
||||||
|
_lock.assertLockedByCurrentThread();
|
||||||
|
if (_snapshot.has("/Supervision/State/Mode") &&
|
||||||
|
_snapshot("/Supervision/State/Mode").isString()) {
|
||||||
|
if (_snapshot("/Supervision/State/Mode").getString() != status) {
|
||||||
|
persist = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
persist = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
report = std::make_shared<VPackBuilder>();
|
||||||
|
{ VPackArrayBuilder trx(report.get());
|
||||||
|
{ VPackObjectBuilder br(report.get());
|
||||||
|
report->add(VPackValue("/Supervision/State"));
|
||||||
|
{ VPackObjectBuilder bbr(report.get());
|
||||||
|
report->add("Mode", VPackValue(status));
|
||||||
|
report->add("Timestamp",
|
||||||
|
VPackValue(timepointToString(std::chrono::system_clock::now())));}}}
|
||||||
|
|
||||||
|
// Importatnt! No reporting in transient for Maintenance mode.
|
||||||
|
if (status != "Maintenance") {
|
||||||
|
transient(_agent, *report);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (persist) {
|
||||||
|
write_ret_t res = singleWriteTransaction(_agent, *report);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
void Supervision::run() {
|
void Supervision::run() {
|
||||||
// First wait until somebody has initialized the ArangoDB data, before
|
// First wait until somebody has initialized the ArangoDB data, before
|
||||||
// that running the supervision does not make sense and will indeed
|
// that running the supervision does not make sense and will indeed
|
||||||
|
@ -633,8 +670,9 @@ void Supervision::run() {
|
||||||
// Only modify this condition with extreme care:
|
// Only modify this condition with extreme care:
|
||||||
// Supervision needs to wait until the agent has finished leadership
|
// Supervision needs to wait until the agent has finished leadership
|
||||||
// preparation or else the local agency snapshot might be behind its
|
// preparation or else the local agency snapshot might be behind its
|
||||||
// last state.
|
// last state.
|
||||||
if (_agent->leading() && _agent->getPrepareLeadership() == 0) {
|
if (
|
||||||
|
_agent->leading() && _agent->getPrepareLeadership() == 0) {
|
||||||
|
|
||||||
if (_jobId == 0 || _jobId == _jobIdMax) {
|
if (_jobId == 0 || _jobId == _jobIdMax) {
|
||||||
getUniqueIds(); // cannot fail but only hang
|
getUniqueIds(); // cannot fail but only hang
|
||||||
|
@ -642,23 +680,33 @@ void Supervision::run() {
|
||||||
|
|
||||||
updateSnapshot();
|
updateSnapshot();
|
||||||
|
|
||||||
if (!_upgraded) {
|
if (!_snapshot.has("Supervision/Maintenance")) {
|
||||||
upgradeAgency();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (_agent->leaderFor() > 10) {
|
reportStatus("Normal");
|
||||||
try {
|
|
||||||
doChecks();
|
if (!_upgraded) {
|
||||||
} catch (std::exception const& e) {
|
upgradeAgency();
|
||||||
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
|
||||||
<< e.what() << " " << __FILE__ << " " << __LINE__;
|
|
||||||
} catch (...) {
|
|
||||||
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
|
|
||||||
"Supervision::doChecks() generated an uncaught exception.";
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
handleJobs();
|
if (_agent->leaderFor() > 10) {
|
||||||
|
try {
|
||||||
|
doChecks();
|
||||||
|
} catch (std::exception const& e) {
|
||||||
|
LOG_TOPIC(ERR, Logger::SUPERVISION)
|
||||||
|
<< e.what() << " " << __FILE__ << " " << __LINE__;
|
||||||
|
} catch (...) {
|
||||||
|
LOG_TOPIC(ERR, Logger::SUPERVISION) <<
|
||||||
|
"Supervision::doChecks() generated an uncaught exception.";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
handleJobs();
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
reportStatus("Maintenance");
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_cv.wait(static_cast<uint64_t>(1000000 * _frequency));
|
_cv.wait(static_cast<uint64_t>(1000000 * _frequency));
|
||||||
|
|
|
@ -164,6 +164,12 @@ class Supervision : public arangodb::Thread {
|
||||||
|
|
||||||
void shrinkCluster();
|
void shrinkCluster();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Report status of supervision in agency
|
||||||
|
* @param status Status, which will show in Supervision/State
|
||||||
|
*/
|
||||||
|
void reportStatus(std::string const& status);
|
||||||
|
|
||||||
bool isShuttingDown();
|
bool isShuttingDown();
|
||||||
|
|
||||||
bool handleJobs();
|
bool handleJobs();
|
||||||
|
|
|
@ -333,6 +333,9 @@ static void JS_WriteAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
||||||
static void JS_TransactAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
static void JS_TransactAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
||||||
JS_APIAgency("transact", args);
|
JS_APIAgency("transact", args);
|
||||||
}
|
}
|
||||||
|
/// @brief V8 callback that forwards its arguments to JS_APIAgency using the
/// "transient" operation name
static void JS_TransientAgency(v8::FunctionCallbackInfo<v8::Value> const& args) {
|
||||||
|
JS_APIAgency("transient", args);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -1994,6 +1997,7 @@ void TRI_InitV8Cluster(v8::Isolate* isolate, v8::Handle<v8::Context> context) {
|
||||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "read"), JS_ReadAgency);
|
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "read"), JS_ReadAgency);
|
||||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "write"), JS_WriteAgency);
|
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "write"), JS_WriteAgency);
|
||||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transact"), JS_TransactAgency);
|
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transact"), JS_TransactAgency);
|
||||||
|
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "transient"), JS_TransientAgency);
|
||||||
|
|
||||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "cas"), JS_CasAgency);
|
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "cas"), JS_CasAgency);
|
||||||
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "createDirectory"),
|
TRI_AddMethodVocbase(isolate, rt, TRI_V8_ASCII_STRING(isolate, "createDirectory"),
|
||||||
|
|
|
@ -33,6 +33,8 @@
|
||||||
|
|
||||||
var actions = require('@arangodb/actions');
|
var actions = require('@arangodb/actions');
|
||||||
var cluster = require('@arangodb/cluster');
|
var cluster = require('@arangodb/cluster');
|
||||||
|
var wait = require("internal").wait;
|
||||||
|
|
||||||
// var internal = require('internal');
|
// var internal = require('internal');
|
||||||
var _ = require('lodash');
|
var _ = require('lodash');
|
||||||
|
|
||||||
|
@ -220,6 +222,89 @@ actions.defineHttp({
|
||||||
// / @brief was docuBlock JSF_cluster_node_version_GET
|
// / @brief was docuBlock JSF_cluster_node_version_GET
|
||||||
// //////////////////////////////////////////////////////////////////////////////
|
// //////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// PUT _admin/cluster/maintenance
//
// Switches the cluster supervision's maintenance mode. The request body must
// be the JSON string "on" or "off":
//  - "on"  sets /arango/Supervision/Maintenance with a TTL of 3600 seconds
//  - "off" deletes that key again
// The handler then polls /arango/Supervision/State/Mode for up to two minutes
// until the supervision reports the requested mode, and answers with
// HTTP_GATEWAY_TIMEOUT if that does not happen in time.
actions.defineHttp({
  url: '_admin/cluster/maintenance',
  allowUseDatabase: true,
  prefix: false,

  callback: function (req, res) {
    const maintenanceKey = '/arango/Supervision/Maintenance';
    const modeKey = '/arango/Supervision/State/Mode';

    // Answer with HTTP 400 and the given error message.
    const badRequest = function (message) {
      res.responseCode = actions.HTTP_BAD;
      res.body = JSON.stringify({
        'error': true,
        'errorMessage': message
      });
    };

    if (req.requestType !== actions.PUT) {
      actions.resultError(req, res, actions.HTTP_FORBIDDEN, 0,
        'only PUT requests are allowed');
      return;
    }

    if (req.requestBody === undefined || req.requestBody === null ||
        req.requestBody === '') {
      badRequest('empty body');
      return;
    }

    // JSON.parse throws on malformed input; report that as a client error
    // instead of letting the exception escape the handler.
    let body;
    try {
      body = JSON.parse(req.requestBody);
    } catch (e) {
      badRequest('body must be the JSON string "on" or "off"');
      return;
    }

    const switchOn = (body === "on");
    if (!switchOn && body !== "off") {
      badRequest('state string must be "on" or "off"');
      return;
    }

    const operations = {};
    operations[maintenanceKey] = switchOn
      ? {"op":"set","new":true,"ttl":3600}
      : {"op":"delete"};
    const preconditions = {};
    global.ArangoAgency.write([[operations, preconditions]]);

    // Wait up to 2 min for the supervision to report the requested mode
    const targetMode = switchOn ? "Maintenance" : "Normal";
    const waitUntil = new Date().getTime() + 120.0 * 1000;
    while (true) {
      // The state may not exist yet, so do not assume the full path is there.
      const result = global.ArangoAgency.read([[modeKey]])[0];
      const mode = _.get(result, ['arango', 'Supervision', 'State', 'Mode']);

      if (mode === targetMode) {
        res.body = JSON.stringify({
          error: false,
          warning: switchOn
            ? 'Cluster supervision deactivated. It will be reactivated automatically in 60 minutes unless this call is repeated until then.'
            : 'Cluster supervision reactivated.'
        });
        return;
      }

      wait(0.1);

      if (new Date().getTime() > waitUntil) {
        res.responseCode = actions.HTTP_GATEWAY_TIMEOUT;
        res.body = JSON.stringify({
          'error': true,
          'errorMessage': switchOn
            ? 'timed out while waiting for supervision to go into maintenance mode'
            : 'timed out while waiting for supervision to leave maintenance mode'
        });
        return;
      }
    }
  }
});
|
||||||
|
|
||||||
|
// //////////////////////////////////////////////////////////////////////////////
|
||||||
|
// / @brief was docuBlock JSF_cluster_node_version_GET
|
||||||
|
// //////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
actions.defineHttp({
|
actions.defineHttp({
|
||||||
url: '_admin/clusterNodeVersion',
|
url: '_admin/clusterNodeVersion',
|
||||||
prefix: false,
|
prefix: false,
|
||||||
|
|
|
@ -2130,3 +2130,4 @@ exports.HTTP_SERVER_ERROR = 500;
|
||||||
exports.HTTP_NOT_IMPLEMENTED = 501;
|
exports.HTTP_NOT_IMPLEMENTED = 501;
|
||||||
exports.HTTP_BAD_GATEWAY = 502;
|
exports.HTTP_BAD_GATEWAY = 502;
|
||||||
exports.HTTP_SERVICE_UNAVAILABLE = 503;
|
exports.HTTP_SERVICE_UNAVAILABLE = 503;
|
||||||
|
exports.HTTP_GATEWAY_TIMEOUT = 504;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*jshint globalstrict:false, strict:false */
|
/*jshint globalstrict:false, strict:false */
|
||||||
/*global assertTrue, assertEqual */
|
/*global assertTrue, assertEqual, ArangoAgency */
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief test moving shards in the cluster
|
/// @brief test moving shards in the cluster
|
||||||
|
@ -131,7 +131,6 @@ function MovingShardsSuite () {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief test whether or not a server is clean
|
/// @brief test whether or not a server is clean
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -299,6 +298,34 @@ function MovingShardsSuite () {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/// @brief Set supervision mode
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Switches the supervision maintenance mode through the coordinator's
// PUT /_admin/cluster/maintenance route.
//
// mode: "on" or "off"; it is sent as a JSON string in the request body.
// Returns true if the request went through and was not answered with an
// HTTP error status, false otherwise.
function maintenanceMode(mode) {
  console.log("Switching supervision maintenance " + mode);
  var coordEndpoint =
      global.ArangoClusterInfo.getServerEndpoint("Coordinator0001");
  var request = require("@arangodb/request");
  var endpointToURL = require("@arangodb/cluster").endpointToURL;
  var url = endpointToURL(coordEndpoint);
  var response;
  try {
    response = request({ method: "PUT",
                         url: url + "/_admin/cluster/maintenance",
                         body: JSON.stringify(mode) });
  } catch (err) {
    console.error(
      "Exception for PUT /_admin/cluster/maintenance:", err.stack);
    return false;
  }
  // A 4xx/5xx reply (e.g. the route's timeout answer) means the switch did
  // not happen, so it must not be reported as success.
  if (response.statusCode >= 400) {
    console.error(
      "PUT /_admin/cluster/maintenance failed with status " +
      response.statusCode + ": " + response.body);
    return false;
  }
  console.log("Supervision maintenance is " + mode);
  return true;
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief create some collections
|
/// @brief create some collections
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -609,6 +636,41 @@ function MovingShardsSuite () {
|
||||||
assertTrue(waitForSupervision());
|
assertTrue(waitForSupervision());
|
||||||
},
|
},
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/// @brief pausing supervision for a couple of seconds
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
testMaintenanceMode : function() {
|
||||||
|
createSomeCollections(1, 1, 3);
|
||||||
|
assertTrue(waitForSynchronousReplication("_system"));
|
||||||
|
var servers = findCollectionServers("_system", c[1].name());
|
||||||
|
var fromServer = servers[0];
|
||||||
|
var toServer = findServerNotOnList(servers);
|
||||||
|
var cinfo = global.ArangoClusterInfo.getCollectionInfo(
|
||||||
|
"_system", c[1].name());
|
||||||
|
var shard = Object.keys(cinfo.shards)[0];
|
||||||
|
assertTrue(maintenanceMode("on"));
|
||||||
|
assertTrue(moveShard("_system", c[1]._id, shard, fromServer, toServer));
|
||||||
|
var first = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
|
||||||
|
arango.Supervision.State, state;
|
||||||
|
var waitUntil = new Date().getTime() + 30.0*1000;
|
||||||
|
while(true) {
|
||||||
|
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
|
||||||
|
arango.Supervision.State;
|
||||||
|
assertEqual(state.Timestamp, first.Timestamp);
|
||||||
|
wait(5.0);
|
||||||
|
if (new Date().getTime() > waitUntil) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(maintenanceMode("off"));
|
||||||
|
state = global.ArangoAgency.transient([["/arango/Supervision/State"]])[0].
|
||||||
|
arango.Supervision.State;
|
||||||
|
assertTrue(state.Timestamp !== first.Timestamp);
|
||||||
|
assertTrue(testServerEmpty(fromServer, false, 1, 1));
|
||||||
|
assertTrue(waitForSupervision());
|
||||||
|
},
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @brief just to allow a trailing comma at the end of the last test
|
/// @brief just to allow a trailing comma at the end of the last test
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
Loading…
Reference in New Issue