1
0
Fork 0

Feature/engine version added to agent configuration (#7481) (#7524)

* agents' is obtained from leader's configuration
* corrections in Supervision for advertised endpoints
* change log
* Updated Documentation for cluster/health.
* Unified naming convention.
* Fixed missing update of volatile fields.
* Set version in right order.
* Removed debug output.
* Fixed jslint - missing ;
This commit is contained in:
Lars Maier 2018-11-29 14:25:40 +01:00 committed by Max Neunhöffer
parent 17a95710bd
commit 52cff7ad55
6 changed files with 111 additions and 57 deletions

View File

@ -146,6 +146,9 @@ v3.4.0-rc.5 (XXXX-XX-XX)
* fix move leader shard: wait until all but the old leader are in sync.
This fixes some unstable tests.
* cluster health features more elaborate agent records
* agency's supervision edited for advertised endpoints
v3.4.0-rc.4 (2018-11-04)
------------------------

View File

@ -12,14 +12,23 @@ Queries the health of the cluster for monitoring purposes. The response is a JSO
- `Endpoint`: A string representing the network endpoint of the server.
- `Role`: The role the server plays. Possible values are `"AGENT"`, `"COORDINATOR"`, and `"DBSERVER"`.
- `CanBeDeleted`: Boolean representing whether the node can safely be removed from the cluster.
Additionally, if the node is a Coordinator or DBServer, it will also have the following attributes:
- `AdvertisedEndpoint`: A string representing the advertised endpoint (e.g. external IP address or load balancer, optional)
- `Status`: A string indicating the health of the node as assessed by the supervision (agency). This should be considered primary source of truth for node health. If the node is responding normally to requests, it is `"GOOD"`. If it has missed one heartbeat, it is `"BAD"`. If it has been declared failed by the supervision, which occurs after missing heartbeats for about 15 seconds, it will be marked `"FAILED"`.
- `SyncStatus`: The last sync status reported by the node. This value is primarily used to determine the value of `Status`. Possible values include `"UNKNOWN"`, `"UNDEFINED"`, `"STARTUP"`, `"STOPPING"`, `"STOPPED"`, `"SERVING"`, `"SHUTDOWN"`.
- `ShortName`: A string representing the shortname of the server, e.g. `"DBServer1"`.
- `Timestamp`: ISO 8601 timestamp specifying the last heartbeat received.
- `Host`: An optional string, specifying the host machine if known.
- `Version`: Version string of ArangoDB used by that node.
- `Engine`: Storage engine used by that node.
- `Status`: A string indicating the health of the node as assessed by the supervision (agency). This should be considered the primary source of truth for Coordinator and DBServer node health. If the node is responding normally to requests, it is `"GOOD"`. If it has missed one heartbeat, it is `"BAD"`. If it has been declared failed by the supervision, which occurs after missing heartbeats for about 15 seconds, it will be marked `"FAILED"`.
Additionally, it will also have the following attributes for:
- Coordinators and DBServers:
- `SyncStatus`: The last sync status reported by the node. This value is primarily used to determine the value of `Status`. Possible values include `"UNKNOWN"`, `"UNDEFINED"`, `"STARTUP"`, `"STOPPING"`, `"STOPPED"`, `"SERVING"`, `"SHUTDOWN"`.
- `ShortName`: A string representing the shortname of the server, e.g. `"Coordinator0001"`.
- `Timestamp`: ISO 8601 timestamp specifying the last heartbeat received.
- `Host`: An optional string, specifying the host machine if known.
- Only Coordinators:
- `AdvertisedEndpoint`: A string representing the advertised endpoint, if set (e.g. external IP address or load balancer; optional).
- Agents:
- `Leader`: ID of the agent this node regards as leader.
- `Leading`: Whether this agent is the leader (true) or not (false).
- `LastAckedTime`: Time since last `acked` in seconds.
@RESTRETURNCODES

View File

@ -33,6 +33,7 @@
#include "Logger/Logger.h"
#include "Rest/HttpRequest.h"
#include "Rest/Version.h"
#include "StorageEngine/EngineSelectorFeature.h"
using namespace arangodb;
@ -547,6 +548,8 @@ RestStatus RestAgencyHandler::handleConfig() {
body.add("commitIndex", Value(last));
_agent->lastAckedAgo(body);
body.add("configuration", _agent->config().toBuilder()->slice());
body.add("engine", VPackValue(EngineSelectorFeature::engineName()));
body.add("version", VPackValue(ARANGODB_VERSION));
}
generateResult(rest::ResponseCode::OK, body.slice());

View File

@ -61,18 +61,14 @@ struct HealthRecord {
HealthRecord(
std::string const& sn, std::string const& ep, std::string const& ho,
std::string const& en, std::string const& sv) :
shortName(sn), endpoint(ep), hostId(ho), serverVersion(sv),
engine(en), version(0) {}
std::string const& en, std::string const& sv, std::string const& ae) :
shortName(sn), endpoint(ep), advertisedEndpoint(ae), hostId(ho),
serverVersion(sv), engine(en), version(0) {}
explicit HealthRecord(Node const& node) {
*this = node;
}
HealthRecord(HealthRecord const& other) {
*this = other;
}
HealthRecord& operator=(Node const& node) {
version = 0;
if (shortName.empty()) {
@ -98,12 +94,16 @@ struct HealthRecord {
if (node.has("AdvertisedEndpoint")) {
version = 3;
advertisedEndpoint = node.hasAsString("AdvertisedEndpoint").first;
if (node.has("Engine") && node.has("Version")) {
version = 4;
engine = node.hasAsString("Engine").first;
serverVersion = node.hasAsString("Version").first;
}
} else {
advertisedEndpoint.clear();
}
if (node.has("Engine") && node.has("Version")) {
version = 4;
engine = node.hasAsString("Engine").first;
serverVersion = node.hasAsString("Version").first;
} else {
engine.clear();
serverVersion.clear();
}
} else if (node.has("LastHeartbeatStatus")) {
version = 1;
@ -119,29 +119,18 @@ struct HealthRecord {
return *this;
}
HealthRecord& operator=(HealthRecord const& other) {
shortName = other.shortName;
syncStatus = other.syncStatus;
status = other.status;
advertisedEndpoint = other.advertisedEndpoint;
endpoint = other.endpoint;
hostId = other.hostId;
engine = other.engine;
serverVersion = other.serverVersion;
version = other.version;
return *this;
}
void toVelocyPack(VPackBuilder& obj) const {
TRI_ASSERT(obj.isOpenObject());
obj.add("ShortName", VPackValue(shortName));
obj.add("Endpoint", VPackValue(endpoint));
obj.add("AdvertisedEndpoint", VPackValue(advertisedEndpoint));
obj.add("Host", VPackValue(hostId));
obj.add("SyncStatus", VPackValue(syncStatus));
obj.add("Status", VPackValue(status));
obj.add("Version", VPackValue(serverVersion));
obj.add("Engine", VPackValue(engine));
if (!advertisedEndpoint.empty()) {
obj.add("AdvertisedEndpoint", VPackValue(advertisedEndpoint));
}
if (syncTime.empty()) {
obj.add("Timestamp",
VPackValue(timepointToString(std::chrono::system_clock::now())));
@ -152,7 +141,13 @@ struct HealthRecord {
}
bool statusDiff(HealthRecord const& other) {
return (status != other.status || syncStatus != other.syncStatus);
return status != other.status ||
syncStatus != other.syncStatus ||
advertisedEndpoint != other.advertisedEndpoint ||
serverVersion != other.serverVersion ||
engine != other.engine ||
hostId != other.hostId ||
endpoint != other.endpoint;
}
friend std::ostream& operator<<(std::ostream& o, HealthRecord const& hr) {
@ -481,18 +476,18 @@ std::vector<check_t> Supervision::check(std::string const& type) {
if (serversRegistered.has(enPath)) {
engine = serversRegistered.hasAsString(enPath).first;
}
// "/arango/Current/<serverId>/externalEndpoint"
/*std::string externalEndpoint;
std::string extEndPath = serverID + "/externalEndpoint";
//"/arango/Current/<serverId>/externalEndpoint"
std::string externalEndpoint;
std::string extEndPath = serverID + "/advertisedEndpoint";
if (serversRegistered.has(extEndPath)) {
externalEndpoint = serversRegistered.hasAsString(extEndPath).first;
}*/
}
// Health records from persistence, from transience and a new one
HealthRecord transist(shortName, endpoint, hostId, engine, serverVersion);
HealthRecord persist(shortName, endpoint, hostId, engine, serverVersion);
HealthRecord transist(
shortName, endpoint, hostId, engine, serverVersion, externalEndpoint);
HealthRecord persist(
shortName, endpoint, hostId, engine, serverVersion, externalEndpoint);
// Get last health entries from transient and persistent key value stores
if (_transient.has(healthPrefix + serverID)) {
@ -519,6 +514,13 @@ std::vector<check_t> Supervision::check(std::string const& type) {
transist.syncTime = syncTime;
transist.syncStatus = syncStatus;
// update volatile values that may change
transist.advertisedEndpoint = externalEndpoint;
transist.serverVersion = serverVersion;
transist.engine = engine;
transist.hostId = hostId;
transist.endpoint = endpoint;
// Calculate elapsed since lastAcked
auto elapsed = std::chrono::duration<double>(
std::chrono::system_clock::now() - lastAckedTime);

View File

@ -429,18 +429,13 @@ static void JS_Agency(v8::FunctionCallbackInfo<v8::Value> const& args) {
}
VPackBuilder builder;
{ VPackArrayBuilder a(&builder);
{ VPackArrayBuilder b(&builder);
builder.add(VPackValue("/.agency"));
}
}
AgencyComm comm;
AgencyCommResult result =
comm.sendWithFailover(
arangodb::rest::RequestType::POST,
arangodb::rest::RequestType::GET,
AgencyCommManager::CONNECTION_OPTIONS._requestTimeout,
std::string("/_api/agency/read"), builder.slice());
std::string("/_api/agency/config"), builder.slice());
if (!result.successful()) {
THROW_AGENCY_EXCEPTION(result);

View File

@ -188,7 +188,7 @@ actions.defineHttp({
while (true) {
var mode = global.ArangoAgency.read([["/arango/Supervision/State/Mode"]])[0].
arango.Supervision.State.Mode;
if (body === "on" && mode === "Maintenance") {
res.body = JSON.stringify({
error: false,
@ -202,7 +202,7 @@ actions.defineHttp({
}
wait(0.1);
if (new Date().getTime() > waitUntil) {
res.responseCode = actions.HTTP_GATEWAY_TIMEOUT;
res.body = JSON.stringify({
@ -212,10 +212,10 @@ actions.defineHttp({
});
return;
}
}
return ;
return ;
}});
// //////////////////////////////////////////////////////////////////////////////
@ -540,8 +540,50 @@ actions.defineHttp({
return Health;
}, Health);
Object.entries(agency[0]['.agency'].pool).forEach(([key, value]) => {
Health[key] = {Endpoint: value, Role: 'Agent', CanBeDeleted: false};
Object.entries(agency.configuration.pool).forEach(([key, value]) => {
if (Health.hasOwnProperty(key)) {
Health[key].Endpoint = value;
Health[key].Role = 'Agent';
Health[key].CanBeDeleted = false;
} else {
Health[key] = {Endpoint: value, Role: 'Agent', CanBeDeleted: false};
}
var options = { timeout: 5 };
var op = ArangoClusterComm.asyncRequest(
'GET', value, req.database, '/_api/agency/config', '', {}, options);
var r = ArangoClusterComm.wait(op);
if (r.status === 'RECEIVED') {
var record = JSON.parse(r.body);
Health[key].Version = record.version;
Health[key].Engine = record.engine;
Health[key].Leader = record.leaderId;
if (record.hasOwnProperty("lastAcked")) {
Health[key].Leading = true;
Object.entries(record.lastAcked).forEach(([k,v]) => {
if (Health.hasOwnProperty(k)) {
Health[k].LastAckedTime = v.lastAckedTime;
} else {
Health[k] = {LastAckedTime: v.lastAckedTime};
}
});
}
Health[key].Status = "GOOD";
} else {
Health[key].Status = "BAD";
if (r.status === 'TIMEOUT') {
Health[key].Error = "TIMEOUT";
} else {
try {
Health[key].Error = JSON.parse(r.body);
} catch (err) {
Health[key].Error = "UNKNOWN";
}
}
}
});
actions.resultOk(req, res, actions.HTTP_OK, {Health, ClusterId: clusterId});