From a571f1263445bca28f4ae60dd25bc1da44de03de Mon Sep 17 00:00:00 2001 From: Willi Goesgens Date: Thu, 7 May 2015 13:48:58 +0200 Subject: [PATCH] Work on cluster shutdown: - when running in valgrind be more graceful - fix incidents when somebody else already picked up the exit result - fix write access to the results array - run.pids is an object. - fix copying arangod in case of crash in cluster case - fix cluster shutdown structure analysis --- .../org/arangodb/cluster/kickstarter.js | 60 +++++++++++++------ js/server/modules/org/arangodb/testing.js | 37 +++++++++--- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/js/server/modules/org/arangodb/cluster/kickstarter.js b/js/server/modules/org/arangodb/cluster/kickstarter.js index 8f268ea30b..da7e03e996 100644 --- a/js/server/modules/org/arangodb/cluster/kickstarter.js +++ b/js/server/modules/org/arangodb/cluster/kickstarter.js @@ -419,7 +419,7 @@ launchActions.startServers = function (dispatchers, cmd, isRelaunch) { for (i = 0;i < endpoints.length;i++) { var timeout = 50; if (cmd.valgrind !== '') { - timeout *= 1000; + timeout *= 10000; } if (! waitForServerUp(endpoints[i], timeout)) { error = true; @@ -542,26 +542,48 @@ shutdownActions.startServers = function (dispatchers, cmd, run) { // we cannot do much with the result... 
} - console.info("Waiting 8 seconds for servers to shutdown gracefully..."); - wait(8); + var shutdownWait = 8; + if (cmd.valgrind !== '') { + shutdownWait *= 10000; + } + console.info("Waiting " + shutdownWait + " seconds for servers to shutdown gracefully..."); + var j = 0; + var runpids = run.pids.length; + while ((j < shutdownWait) && (runpids > 0)) { + wait(1); + j++; + for (i = 0; i < run.pids.length; i++) { - for (i = 0;i < run.pids.length;i++) { - var s = statusExternal(run.pids[i]); - if (s.status !== "TERMINATED") { - if (s.hasOwnProperty('signal')) { - error = true; - console.error("shuting down %s %s done - with problems: " + s, - run.roles[i], - run.endpointNames[i], - JSON.stringify(run.pids[i])); + if (serverStates[JSON.stringify(run.pids[i].pid)] === undefined) { + var s = statusExternal(run.pids[i]); + + if ((s.status === "NOT-FOUND") || + (s.status === "TERMINATED") || + s.hasOwnProperty('signal')) { + runpids -=1; + serverStates[JSON.stringify(run.pids[i])] = s; + error = true; + } + else if (j > shutdownWait) { + if (s.status !== "TERMINATED") { + if (s.hasOwnProperty('signal')) { + error = true; + console.error("shuting down %s %s done - with problems: " + s, + run.roles[i], + run.endpointNames[i], + JSON.stringify(run.pids[i])); + } + else { + console.info("Shutting down %s the hard way...", + JSON.stringify(run.pids[i])); + s.killedState = killExternal(run.pids[i]); + console.info("done."); + runpids -=1; + } + serverStates[JSON.stringify(run.pids[i])] = s; + } + } } - else { - console.info("Shutting down %s the hard way...", - JSON.stringify(run.pids[i])); - s.killedState = killExternal(run.pids[i]); - console.info("done."); - } - serverStates[run.pids[i]] = s; } } return {"error": error, "isStartServers": true, "serverStates" : serverStates}; diff --git a/js/server/modules/org/arangodb/testing.js b/js/server/modules/org/arangodb/testing.js index 31ad870514..bacad050c2 100644 --- a/js/server/modules/org/arangodb/testing.js +++ 
b/js/server/modules/org/arangodb/testing.js @@ -462,7 +462,6 @@ function checkInstanceAlive(instanceInfo, options) { var ret = res.status === "RUNNING"; if (! ret) { print("ArangoD with PID " + instanceInfo.pid.pid + " gone:"); - instanceInfo.exitStatus = res; print(instanceInfo); if (res.hasOwnProperty('signal') && ((res.signal === 11) || @@ -485,9 +484,10 @@ function checkInstanceAlive(instanceInfo, options) { statusExternal(instanceInfo.monitor, true); } else { - copy("bin/arangod", instanceInfo.tmpDataDir); + copy("bin/arangod", storeArangodPath); } } + instanceInfo.exitStatus = res; } if (!ret) { serverCrashed = true; @@ -506,11 +506,21 @@ function checkInstanceAlive(instanceInfo, options) { storeArangodPath = "/var/tmp/arangod_" + checkpid.pid; print("Core dump written; copying arangod to " + storeArangodPath + " for later analysis."); - instanceInfo.exitStatus = ress; ress.gdbHint = "Run debugger with 'gdb " + storeArangodPath + " /var/tmp/core*" + checkpid.pid + "*'"; - copy("bin/arangod", storeArangodPath); + + if (require("internal").platform.substr(0,3) === 'win') { + copy("bin\\arangod.exe", instanceInfo.tmpDataDir + "\\arangod.exe"); + copy("bin\\arangod.pdb", instanceInfo.tmpDataDir + "\\arangod.pdb"); + // Windows: wait for procdump to do its job... 
+ statusExternal(instanceInfo.monitor, true); + } + else { + copy("bin/arangod", storeArangodPath); + } + + instanceInfo.exitStatus = ress; ClusterFit = false; } } @@ -539,15 +549,24 @@ function shutdownInstance (instanceInfo, options) { instanceInfo.kickstarter.cleanup(); } if (rc.error) { - for (var i in rc.serverStates) { - if (rc.serverStates.hasOwnProperty(i)){ - if (rc.serverStates[i].hasOwnProperty('signal')) { - print("Server shut down with : " + yaml.safeDump(rc.serverStates[i]) + " marking run as crashy."); - serverCrashed = true; + for (var i = 0; i < rc.results.length; i++ ) { + if (rc.results[i].hasOwnProperty('isStartServers') && + (rc.results[i].isStartServers === true)) { + for (var serverState in rc.results[i].serverStates) { + if (rc.results[i].serverStates.hasOwnProperty(serverState)){ + if ((rc.results[i].serverStates[serverState].status === "NOT-FOUND") || + (rc.results[i].serverStates[serverState].hasOwnProperty('signal'))) { + print("Server " + serverState + " shut down with:\n" + + yaml.safeDump(rc.results[i].serverStates[serverState]) + + " marking run as crashy."); + serverCrashed = true; + } + } } } } } + } else { if (typeof(instanceInfo.exitStatus) === 'undefined') {