1
0
Fork 0

Work on cluster shutdown:

- when running in valgrind be more graceful
  - fix incidents when somebody else already picked up the exit result
  - fix write access to the results array - run.pids is an object.
  - fix copying arangod in case of crash in cluster case
  - fix cluster shutdown structure analysis
This commit is contained in:
Willi Goesgens 2015-05-07 13:48:58 +02:00
parent 537b115eac
commit a571f12634
2 changed files with 69 additions and 28 deletions

View File

@ -419,7 +419,7 @@ launchActions.startServers = function (dispatchers, cmd, isRelaunch) {
for (i = 0;i < endpoints.length;i++) {
var timeout = 50;
if (cmd.valgrind !== '') {
timeout *= 1000;
timeout *= 10000;
}
if (! waitForServerUp(endpoints[i], timeout)) {
error = true;
@ -542,26 +542,48 @@ shutdownActions.startServers = function (dispatchers, cmd, run) {
// we cannot do much with the result...
}
console.info("Waiting 8 seconds for servers to shutdown gracefully...");
wait(8);
var shutdownWait = 8;
if (cmd.valgrind !== '') {
shutdownWait *= 10000;
}
console.info("Waiting " + shutdownWait + " seconds for servers to shutdown gracefully...");
var j = 0;
var runpids = run.pids.length;
while ((j < shutdownWait) && (runpids > 0)) {
wait(1);
j++;
for (i = 0; i < run.pids.length; i++) {
for (i = 0;i < run.pids.length;i++) {
var s = statusExternal(run.pids[i]);
if (s.status !== "TERMINATED") {
if (s.hasOwnProperty('signal')) {
error = true;
console.error("shuting down %s %s done - with problems: " + s,
run.roles[i],
run.endpointNames[i],
JSON.stringify(run.pids[i]));
if (serverStates[JSON.stringify(run.pids[i].pid)] === undefined) {
var s = statusExternal(run.pids[i]);
if ((s.status === "NOT-FOUND") ||
(s.status === "TERMINATED") ||
s.hasOwnProperty('signal')) {
runpids -=1;
serverStates[JSON.stringify(run.pids[i])] = s;
error = true;
}
else if (j > shutdownWait) {
if (s.status !== "TERMINATED") {
if (s.hasOwnProperty('signal')) {
error = true;
console.error("shuting down %s %s done - with problems: " + s,
run.roles[i],
run.endpointNames[i],
JSON.stringify(run.pids[i]));
}
else {
console.info("Shutting down %s the hard way...",
JSON.stringify(run.pids[i]));
s.killedState = killExternal(run.pids[i]);
console.info("done.");
runpids -=1;
}
serverStates[JSON.stringify(run.pids[i])] = s;
}
}
}
else {
console.info("Shutting down %s the hard way...",
JSON.stringify(run.pids[i]));
s.killedState = killExternal(run.pids[i]);
console.info("done.");
}
serverStates[run.pids[i]] = s;
}
}
return {"error": error, "isStartServers": true, "serverStates" : serverStates};

View File

@ -462,7 +462,6 @@ function checkInstanceAlive(instanceInfo, options) {
var ret = res.status === "RUNNING";
if (! ret) {
print("ArangoD with PID " + instanceInfo.pid.pid + " gone:");
instanceInfo.exitStatus = res;
print(instanceInfo);
if (res.hasOwnProperty('signal') &&
((res.signal === 11) ||
@ -485,9 +484,10 @@ function checkInstanceAlive(instanceInfo, options) {
statusExternal(instanceInfo.monitor, true);
}
else {
copy("bin/arangod", instanceInfo.tmpDataDir);
copy("bin/arangod", storeArangodPath);
}
}
instanceInfo.exitStatus = res;
}
if (!ret) {
serverCrashed = true;
@ -506,11 +506,21 @@ function checkInstanceAlive(instanceInfo, options) {
storeArangodPath = "/var/tmp/arangod_" + checkpid.pid;
print("Core dump written; copying arangod to " +
storeArangodPath + " for later analysis.");
instanceInfo.exitStatus = ress;
ress.gdbHint = "Run debugger with 'gdb " +
storeArangodPath +
" /var/tmp/core*" + checkpid.pid + "*'";
copy("bin/arangod", storeArangodPath);
if (require("internal").platform.substr(0,3) === 'win') {
copy("bin\\arangod.exe", instanceInfo.tmpDataDir + "\\arangod.exe");
copy("bin\\arangod.pdb", instanceInfo.tmpDataDir + "\\arangod.pdb");
// Windows: wait for procdump to do its job...
statusExternal(instanceInfo.monitor, true);
}
else {
copy("bin/arangod", storeArangodPath);
}
instanceInfo.exitStatus = ress;
ClusterFit = false;
}
}
@ -539,15 +549,24 @@ function shutdownInstance (instanceInfo, options) {
instanceInfo.kickstarter.cleanup();
}
if (rc.error) {
for (var i in rc.serverStates) {
if (rc.serverStates.hasOwnProperty(i)){
if (rc.serverStates[i].hasOwnProperty('signal')) {
print("Server shut down with : " + yaml.safeDump(rc.serverStates[i]) + " marking run as crashy.");
serverCrashed = true;
for (var i = 0; i < rc.results.length; i++ ) {
if (rc.results[i].hasOwnProperty('isStartServers') &&
(rc.results[i].isStartServers === true)) {
for (var serverState in rc.results[i].serverStates) {
if (rc.results[i].serverStates.hasOwnProperty(serverState)){
if ((rc.results[i].serverStates[serverState].status === "NOT-FOUND") ||
(rc.results[i].serverStates[serverState].hasOwnProperty('signal'))) {
print("Server " + serverState + " shut down with:\n" +
yaml.safeDump(rc.results[i].serverStates[serverState]) +
" marking run as crashy.");
serverCrashed = true;
}
}
}
}
}
}
}
else {
if (typeof(instanceInfo.exitStatus) === 'undefined') {