From 7c0575b27d34a548e360346942fd7c6212cad60b Mon Sep 17 00:00:00 2001
From: Max Neunhoeffer
Date: Fri, 3 Feb 2017 10:33:25 +0100
Subject: [PATCH 1/2] Fix timeout in cluster operation and wal flush bugs.

---
 js/server/modules/@arangodb/cluster.js | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/js/server/modules/@arangodb/cluster.js b/js/server/modules/@arangodb/cluster.js
index d0b79ac34f..899f68583f 100644
--- a/js/server/modules/@arangodb/cluster.js
+++ b/js/server/modules/@arangodb/cluster.js
@@ -120,14 +120,12 @@ function startReadLockOnLeader (endpoint, database, collName, timeout) {
   const id = r.id;
 
   var body = { 'id': id, 'collection': collName, 'ttl': timeout };
-  r = request({ url: url + '/_api/replication/holdReadLockCollection',
-                body: JSON.stringify(body),
-                method: 'POST', headers: {'x-arango-async': true} });
-  if (r.status !== 202) {
-    console.error('startReadLockOnLeader: Could not start read lock for shard',
-                  collName, r);
-    return false;
-  }
+  request({ url: url + '/_api/replication/holdReadLockCollection',
+            body: JSON.stringify(body),
+            method: 'POST', headers: {'x-arango-async': true} });
+  // Intentionally do not look at the outcome, even in case of an error
+  // we must make sure that the read lock on the leader is not active!
+  // This is done automatically below.
 
   var count = 0;
   while (++count < 20) { // wait for some time until read lock established:
@@ -170,7 +168,10 @@
 // /////////////////////////////////////////////////////////////////////////////
 
 function cancelReadLockOnLeader (endpoint, database, lockJobId) {
-  var url = endpointToURL(endpoint) + '/_db/' + database +
+  // Note that we always use the _system database here because the actual
+  // database might be gone already on the leader and we need to cancel
+  // the read lock under all circumstances.
+  var url = endpointToURL(endpoint) + '/_db/_system' +
     '/_api/replication/holdReadLockCollection';
   var r;
   var body = {'id': lockJobId};
@@ -181,7 +182,8 @@
     return false;
   }
   if (r.status !== 200) {
-    console.error('cancelReadLockOnLeader: error', r);
+    console.error('cancelReadLockOnLeader: error', lockJobId, r.status,
+                  r.message, r.body, r.json);
     return false;
   }
   console.debug('cancelReadLockOnLeader: success');

From 72fa45cc9487b9bc8c8555daaa8ea41d0b3d666b Mon Sep 17 00:00:00 2001
From: Max Neunhoeffer
Date: Fri, 3 Feb 2017 10:47:13 +0100
Subject: [PATCH 2/2] Port better reporting of longish Schmutz behaviour to devel.
---
 arangod/Cluster/DBServerAgencySync.cpp |  7 +++++
 js/server/modules/@arangodb/cluster.js | 37 ++++++++++++++++++++------
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/arangod/Cluster/DBServerAgencySync.cpp b/arangod/Cluster/DBServerAgencySync.cpp
index d1f741caad..2769b52eff 100644
--- a/arangod/Cluster/DBServerAgencySync.cpp
+++ b/arangod/Cluster/DBServerAgencySync.cpp
@@ -54,6 +54,8 @@ void DBServerAgencySync::work() {
 
 DBServerAgencySyncResult DBServerAgencySync::execute() {
   // default to system database
+  double startTime = TRI_microtime();
+
   LOG_TOPIC(DEBUG, Logger::HEARTBEAT) << "DBServerAgencySync::execute starting";
   DatabaseFeature* database =
       ApplicationServer::getFeature<DatabaseFeature>("Database");
@@ -80,6 +82,11 @@ DBServerAgencySyncResult DBServerAgencySync::execute() {
     return result;
   }
 
+  double now = TRI_microtime();
+  if (now - startTime > 5.0) {
+    LOG(INFO) << "DBServerAgencySync::execute took more than 5s to get free V8 context, starting handle-plan-change now";
+  }
+
   TRI_DEFER(V8DealerFeature::DEALER->exitContext(context));
 
   auto isolate = context->_isolate;

diff --git a/js/server/modules/@arangodb/cluster.js b/js/server/modules/@arangodb/cluster.js
index 899f68583f..15eee2fb78 100644
--- a/js/server/modules/@arangodb/cluster.js
+++ b/js/server/modules/@arangodb/cluster.js
@@ -455,6 +455,7 @@ function synchronizeOneShard (database, shard, planId, leader) {
   // synchronize this shard from the leader
   // this function will throw if anything goes wrong
 
+  var startTime = new Date();
   var isStopping = require('internal').isStopping;
   var ourselves = global.ArangoServerState.id();
 
@@ -487,8 +488,9 @@
         planned[0] !== leader) {
       // Things have changed again, simply terminate:
       terminateAndStartOther();
-      console.debug('synchronizeOneShard: cancelled, %s/%s, %s/%s',
-        database, shard, database, planId);
+      let endTime = new Date();
+      console.debug('synchronizeOneShard: cancelled, %s/%s, %s/%s, started %s, ended %s',
+        database, shard, database, planId, startTime.toString(), endTime.toString());
       return;
     }
     var current = [];
@@ -502,8 +504,9 @@
       }
       // We are already there, this is rather strange, but never mind:
       terminateAndStartOther();
-      console.debug('synchronizeOneShard: already done, %s/%s, %s/%s',
-        database, shard, database, planId);
+      let endTime = new Date();
+      console.debug('synchronizeOneShard: already done, %s/%s, %s/%s, started %s, ended %s',
+        database, shard, database, planId, startTime.toString(), endTime.toString());
       return;
     }
     console.debug('synchronizeOneShard: waiting for leader, %s/%s, %s/%s',
       database, shard, database, planId);
@@ -524,9 +527,16 @@
       if (isStopping()) {
         throw 'server is shutting down';
       }
+      let startTime = new Date();
       sy = rep.syncCollection(shard,
         { endpoint: ep, incremental: true, keepBarrier: true,
           useCollectionId: false });
+      let endTime = new Date();
+      let longSync = false;
+      if (endTime - startTime > 5000) {
+        console.error('synchronizeOneShard: long call to syncCollection for shard', shard, JSON.stringify(sy), "start time: ", startTime.toString(), "end time: ", endTime.toString());
+        longSync = true;
+      }
       if (sy.error) {
         console.error('synchronizeOneShard: could not initially synchronize',
           'shard ', shard, sy);
       } else {
         if (sy.collections.length === 0 ||
             sy.collections[0].name !== shard) {
+          if (longSync) {
+            console.error('synchronizeOneShard: long sync, before cancelBarrier',
+              new Date().toString());
+          }
           cancelBarrier(ep, database, sy.barrierId);
+          if (longSync) {
+            console.error('synchronizeOneShard: long sync, after cancelBarrier',
+              new Date().toString());
+          }
           throw 'Shard ' + shard + ' seems to be gone from leader!';
         } else {
           // Now start a read transaction to stop writes:
@@ -594,14 +612,17 @@
         } else if (err2 && err2.errorNum === 1402 && err2.errorMessage.match(/HTTP 404/)) {
           logLevel = 'debug';
         }
-        console[logLevel]("synchronization of local shard '%s/%s' for central '%s/%s' failed: %s",
-          database, shard, database, planId, JSON.stringify(err2));
+        let endTime = new Date();
+        console[logLevel]("synchronization of local shard '%s/%s' for central '%s/%s' failed: %s, started: %s, ended: %s",
+          database, shard, database, planId, JSON.stringify(err2),
+          startTime.toString(), endTime.toString());
       }
     }
     // Tell others that we are done:
     terminateAndStartOther();
-    console.debug('synchronizeOneShard: done, %s/%s, %s/%s',
-      database, shard, database, planId);
+    let endTime = new Date();
+    console.debug('synchronizeOneShard: done, %s/%s, %s/%s, started: %s, ended: %s',
+      database, shard, database, planId, startTime.toString(), endTime.toString());
 }
 
 // /////////////////////////////////////////////////////////////////////////////
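
Note on the pattern in PATCH 1: the read lock is started as a fire-and-forget
asynchronous job, its establishment is then confirmed by polling, and it is
cancelled unconditionally afterwards. The sketch below illustrates that
lifecycle. It is an illustration only, not code from cluster.js: the helper
name and the check/cancel verbs (PUT/DELETE) are assumptions; only the POST
with the x-arango-async header, the endpointToURL helper, the synchronous
request module, and the _system cancel path are taken from the patch itself.

// A minimal sketch of PATCH 1's read-lock lifecycle (hypothetical helper, not
// part of cluster.js). Assumes the synchronous `request` and `endpointToURL`
// helpers from cluster.js's scope, and that the job `id` was already obtained
// from the leader.
function holdReadLockExample (endpoint, database, collName, id, ttl) {
  var url = endpointToURL(endpoint) + '/_db/' + database +
    '/_api/replication/holdReadLockCollection';
  // 1. Fire-and-forget: start the lock as an async job and deliberately
  //    ignore the response; success is decided by polling below, and the
  //    lock is cancelled under all circumstances afterwards.
  request({ url: url,
            body: JSON.stringify({ id: id, collection: collName, ttl: ttl }),
            method: 'POST', headers: { 'x-arango-async': true } });
  // 2. Poll until the leader reports the lock as held (assumed verb: PUT).
  var locked = false;
  for (var count = 0; count < 20 && !locked; ++count) {
    var r = request({ url: url, body: JSON.stringify({ id: id }),
                      method: 'PUT' });
    locked = (r.status === 200);
    if (!locked) { require('internal').wait(0.5); }
  }
  // ... synchronization work would happen here while the lock is held ...
  // 3. Cancel via _system, not `database`: as PATCH 1 notes, the database may
  //    already be gone on the leader, yet the lock must still be released
  //    (assumed verb: DELETE).
  request({ url: endpointToURL(endpoint) +
              '/_db/_system/_api/replication/holdReadLockCollection',
            body: JSON.stringify({ id: id }), method: 'DELETE' });
  return locked;
}

The design point both patches share is defensive cleanup: never trust the
happy path (the start request's outcome, or the original database name) when
a leftover read lock on the leader could block writes indefinitely.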