From bc30fbe1ef3046888871af82fb27418ec03c8baa Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Mon, 23 Jan 2017 11:14:25 +0100 Subject: [PATCH 1/4] handling agency comm failures --- arangod/Agency/AgencyComm.cpp | 38 ++++++++++++++++------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/arangod/Agency/AgencyComm.cpp b/arangod/Agency/AgencyComm.cpp index 51ef5def2c..7773d4c60f 100644 --- a/arangod/Agency/AgencyComm.cpp +++ b/arangod/Agency/AgencyComm.cpp @@ -1365,26 +1365,7 @@ AgencyCommResult AgencyComm::sendWithFailover( // break on a watch timeout (drop connection) if (result._statusCode == 0) { - AgencyCommManager::MANAGER->failed(std::move(connection), endpoint); - endpoint.clear(); - connection = AgencyCommManager::MANAGER->acquire(endpoint); - continue; - } - - // sometimes the agency will return a 307 (temporary redirect) - // in this case we have to pick it up and use the new location returned - if (result._statusCode == - (int)arangodb::rest::ResponseCode::TEMPORARY_REDIRECT) { - endpoint = AgencyCommManager::MANAGER->redirect( - std::move(connection), endpoint, result._location, url); - connection = AgencyCommManager::MANAGER->acquire(endpoint); - waitInterval = std::chrono::duration(.25); - continue; - } - // Precondition failed. - - if (result._statusCode == 412 && !clientId.empty()) { VPackBuilder b; { VPackArrayBuilder ab(&b); @@ -1392,7 +1373,7 @@ AgencyCommResult AgencyComm::sendWithFailover( } LOG_TOPIC(INFO, Logger::AGENCYCOMM) << - "Got precondition failed! Inquiring about clientId " << clientId << ": "; + "Failed agency comm! Inquiring about clientId " << clientId << ": "; AgencyCommResult inq = send( connection.get(), method, conTimeout, "/_api/agency/inquire", @@ -1439,9 +1420,24 @@ AgencyCommResult AgencyComm::sendWithFailover( "with error. Keep trying ..."; return result; } - + + AgencyCommManager::MANAGER->failed(std::move(connection), endpoint); + endpoint.clear(); + connection = AgencyCommManager::MANAGER->acquire(endpoint); + continue; } + // sometimes the agency will return a 307 (temporary redirect) + // in this case we have to pick it up and use the new location returned + if (result._statusCode == + (int)arangodb::rest::ResponseCode::TEMPORARY_REDIRECT) { + endpoint = AgencyCommManager::MANAGER->redirect( + std::move(connection), endpoint, result._location, url); + connection = AgencyCommManager::MANAGER->acquire(endpoint); + waitInterval = std::chrono::duration(.25); + continue; + } + // do not retry on client errors if (result._statusCode >= 400 && result._statusCode <= 499) { AgencyCommManager::MANAGER->release(std::move(connection), endpoint); From 8d70132c9d8633fefc4c9b96d7a6bef26940a096 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Mon, 23 Jan 2017 11:16:23 +0100 Subject: [PATCH 2/4] handling agency comm failures --- arangod/Agency/AgencyComm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arangod/Agency/AgencyComm.cpp b/arangod/Agency/AgencyComm.cpp index 7773d4c60f..93cfa54e0a 100644 --- a/arangod/Agency/AgencyComm.cpp +++ b/arangod/Agency/AgencyComm.cpp @@ -625,7 +625,7 @@ std::string AgencyCommManager::redirect( _endpoints.end()); LOG_TOPIC(WARN, Logger::AGENCYCOMM) - << "got an agency redirect from '" << endpoint + << "Got an agency redirect from '" << endpoint << "' to '" << specification << "'"; _endpoints.push_front(specification); From 981e71953d728ae49cca019f8206a32d289e5459 Mon Sep 17 00:00:00 2001 From: Kaveh Vahedipour Date: Mon, 23 Jan 2017 11:26:03 +0100 Subject: [PATCH 3/4] handling agency comm failures --- .../tests/resilience/resilience-synchronous-repl-cluster.js | 1 - 1 file changed, 1 deletion(-) diff --git a/js/server/tests/resilience/resilience-synchronous-repl-cluster.js b/js/server/tests/resilience/resilience-synchronous-repl-cluster.js index 940724f702..2733a16d95 100644 --- a/js/server/tests/resilience/resilience-synchronous-repl-cluster.js +++ b/js/server/tests/resilience/resilience-synchronous-repl-cluster.js @@ -825,7 +825,6 @@ function SynchronousReplicationSuite () { wait(5); } healLeader(); - assertTrue(waitForSynchronousReplication("_system")); }, //////////////////////////////////////////////////////////////////////////////// From 4beec0a94c4e6531b3c487f0ebf261e33ac106d3 Mon Sep 17 00:00:00 2001 From: Michael Hackstein Date: Mon, 23 Jan 2017 11:42:06 +0100 Subject: [PATCH 4/4] Reverted numberOfShards from Test. The test-code is not deterministic. --- js/server/tests/aql/aql-optimizer-rule-use-index-for-sort.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/server/tests/aql/aql-optimizer-rule-use-index-for-sort.js b/js/server/tests/aql/aql-optimizer-rule-use-index-for-sort.js index 305b6fe468..e37eccdd98 100644 --- a/js/server/tests/aql/aql-optimizer-rule-use-index-for-sort.js +++ b/js/server/tests/aql/aql-optimizer-rule-use-index-for-sort.js @@ -112,7 +112,7 @@ function optimizerRuleTestSuite() { var loopto = 10; internal.db._drop(colName); - skiplist = internal.db._create(colName, {numberOfShards: 4}); + skiplist = internal.db._create(colName, {numberOfShards: 1}); var i, j; for (j = 1; j <= loopto; ++j) { for (i = 1; i <= loopto; ++i) { @@ -127,7 +127,7 @@ function optimizerRuleTestSuite() { skiplist.ensureIndex({ type: "hash", fields: [ "c" ], unique: false }); internal.db._drop(colNameOther); - skiplist2 = internal.db._create(colNameOther, {numberOfShards: 4}); + skiplist2 = internal.db._create(colNameOther, {numberOfShards: 1}); for (j = 1; j <= loopto; ++j) { for (i = 1; i <= loopto; ++i) { skiplist2.save({ "f" : i, "g": j , "h": j, "i": i, "j": i, "joinme" : "aoeu " + j});