1
0
Fork 0

Work on cluster shutdown:

- when running in valgrind be more graceful
  - fix incidents when somebody else already picked up the exit result
  - fix write access to the results array - run.pids is an object.
  - fix copying arangod in case of crash in cluster case
  - fix cluster shutdown structure analysis
This commit is contained in:
Willi Goesgens 2015-05-07 13:48:58 +02:00
parent 537b115eac
commit a571f12634
2 changed files with 69 additions and 28 deletions

View File

@ -419,7 +419,7 @@ launchActions.startServers = function (dispatchers, cmd, isRelaunch) {
for (i = 0;i < endpoints.length;i++) { for (i = 0;i < endpoints.length;i++) {
var timeout = 50; var timeout = 50;
if (cmd.valgrind !== '') { if (cmd.valgrind !== '') {
timeout *= 1000; timeout *= 10000;
} }
if (! waitForServerUp(endpoints[i], timeout)) { if (! waitForServerUp(endpoints[i], timeout)) {
error = true; error = true;
@ -542,26 +542,48 @@ shutdownActions.startServers = function (dispatchers, cmd, run) {
// we cannot do much with the result... // we cannot do much with the result...
} }
console.info("Waiting 8 seconds for servers to shutdown gracefully..."); var shutdownWait = 8;
wait(8); if (cmd.valgrind !== '') {
shutdownWait *= 10000;
}
console.info("Waiting " + shutdownWait + " seconds for servers to shutdown gracefully...");
var j = 0;
var runpids = run.pids.length;
while ((j < shutdownWait) && (runpids > 0)) {
wait(1);
j++;
for (i = 0; i < run.pids.length; i++) {
for (i = 0;i < run.pids.length;i++) { if (serverStates[JSON.stringify(run.pids[i].pid)] === undefined) {
var s = statusExternal(run.pids[i]); var s = statusExternal(run.pids[i]);
if (s.status !== "TERMINATED") {
if (s.hasOwnProperty('signal')) { if ((s.status === "NOT-FOUND") ||
error = true; (s.status === "TERMINATED") ||
console.error("shuting down %s %s done - with problems: " + s, s.hasOwnProperty('signal')) {
run.roles[i], runpids -=1;
run.endpointNames[i], serverStates[JSON.stringify(run.pids[i])] = s;
JSON.stringify(run.pids[i])); error = true;
}
else if (j > shutdownWait) {
if (s.status !== "TERMINATED") {
if (s.hasOwnProperty('signal')) {
error = true;
console.error("shuting down %s %s done - with problems: " + s,
run.roles[i],
run.endpointNames[i],
JSON.stringify(run.pids[i]));
}
else {
console.info("Shutting down %s the hard way...",
JSON.stringify(run.pids[i]));
s.killedState = killExternal(run.pids[i]);
console.info("done.");
runpids -=1;
}
serverStates[JSON.stringify(run.pids[i])] = s;
}
}
} }
else {
console.info("Shutting down %s the hard way...",
JSON.stringify(run.pids[i]));
s.killedState = killExternal(run.pids[i]);
console.info("done.");
}
serverStates[run.pids[i]] = s;
} }
} }
return {"error": error, "isStartServers": true, "serverStates" : serverStates}; return {"error": error, "isStartServers": true, "serverStates" : serverStates};

View File

@ -462,7 +462,6 @@ function checkInstanceAlive(instanceInfo, options) {
var ret = res.status === "RUNNING"; var ret = res.status === "RUNNING";
if (! ret) { if (! ret) {
print("ArangoD with PID " + instanceInfo.pid.pid + " gone:"); print("ArangoD with PID " + instanceInfo.pid.pid + " gone:");
instanceInfo.exitStatus = res;
print(instanceInfo); print(instanceInfo);
if (res.hasOwnProperty('signal') && if (res.hasOwnProperty('signal') &&
((res.signal === 11) || ((res.signal === 11) ||
@ -485,9 +484,10 @@ function checkInstanceAlive(instanceInfo, options) {
statusExternal(instanceInfo.monitor, true); statusExternal(instanceInfo.monitor, true);
} }
else { else {
copy("bin/arangod", instanceInfo.tmpDataDir); copy("bin/arangod", storeArangodPath);
} }
} }
instanceInfo.exitStatus = res;
} }
if (!ret) { if (!ret) {
serverCrashed = true; serverCrashed = true;
@ -506,11 +506,21 @@ function checkInstanceAlive(instanceInfo, options) {
storeArangodPath = "/var/tmp/arangod_" + checkpid.pid; storeArangodPath = "/var/tmp/arangod_" + checkpid.pid;
print("Core dump written; copying arangod to " + print("Core dump written; copying arangod to " +
storeArangodPath + " for later analysis."); storeArangodPath + " for later analysis.");
instanceInfo.exitStatus = ress;
ress.gdbHint = "Run debugger with 'gdb " + ress.gdbHint = "Run debugger with 'gdb " +
storeArangodPath + storeArangodPath +
" /var/tmp/core*" + checkpid.pid + "*'"; " /var/tmp/core*" + checkpid.pid + "*'";
copy("bin/arangod", storeArangodPath);
if (require("internal").platform.substr(0,3) === 'win') {
copy("bin\\arangod.exe", instanceInfo.tmpDataDir + "\\arangod.exe");
copy("bin\\arangod.pdb", instanceInfo.tmpDataDir + "\\arangod.pdb");
// Windows: wait for procdump to do its job...
statusExternal(instanceInfo.monitor, true);
}
else {
copy("bin/arangod", storeArangodPath);
}
instanceInfo.exitStatus = ress;
ClusterFit = false; ClusterFit = false;
} }
} }
@ -539,15 +549,24 @@ function shutdownInstance (instanceInfo, options) {
instanceInfo.kickstarter.cleanup(); instanceInfo.kickstarter.cleanup();
} }
if (rc.error) { if (rc.error) {
for (var i in rc.serverStates) { for (var i = 0; i < rc.results.length; i++ ) {
if (rc.serverStates.hasOwnProperty(i)){ if (rc.results[i].hasOwnProperty('isStartServers') &&
if (rc.serverStates[i].hasOwnProperty('signal')) { (rc.results[i].isStartServers === true)) {
print("Server shut down with : " + yaml.safeDump(rc.serverStates[i]) + " marking run as crashy."); for (var serverState in rc.results[i].serverStates) {
serverCrashed = true; if (rc.results[i].serverStates.hasOwnProperty(serverState)){
if ((rc.results[i].serverStates[serverState].status === "NOT-FOUND") ||
(rc.results[i].serverStates[serverState].hasOwnProperty('signal'))) {
print("Server " + serverState + " shut down with:\n" +
yaml.safeDump(rc.results[i].serverStates[serverState]) +
" marking run as crashy.");
serverCrashed = true;
}
}
} }
} }
} }
} }
} }
else { else {
if (typeof(instanceInfo.exitStatus) === 'undefined') { if (typeof(instanceInfo.exitStatus) === 'undefined') {