1
0
Fork 0

Cancel connections to known failed servers (#10546)

* test

* rename shit

* rename shit

* Cancel connections to known failed servers

* do not uncomment code

* Retry on 503 responses

* Make test grey again
This commit is contained in:
Simon 2019-11-28 00:34:43 +08:00 committed by KVS85
parent 119aa7bfd8
commit 2ae30242c0
7 changed files with 117 additions and 83 deletions

View File

@ -53,7 +53,8 @@ StatusCode constexpr StatusNotAcceptable = 406;
StatusCode constexpr StatusConflict = 409;
StatusCode constexpr StatusPreconditionFailed = 412;
StatusCode constexpr StatusInternalError = 500;
StatusCode constexpr StatusUnavailable = 505;
StatusCode constexpr StatusUnavailable = 503;
StatusCode constexpr StatusVersionNotSupported = 505;
// -----------------------------------------------------------------------------
// --SECTION-- enum class ErrorCondition

View File

@ -34,21 +34,18 @@ void resolveConnect(detail::ConnectionConfiguration const& config,
asio_ns::ip::tcp::resolver& resolver,
SocketT& socket,
F&& done) {
auto cb = [&socket, done = std::forward<F>(done)]
(asio_ns::error_code const& ec,
asio_ns::ip::tcp::resolver::iterator it) {
if (ec) { // error
auto cb = [&socket, done(std::forward<F>(done))](auto ec, auto it) mutable {
if (ec) { // error in address resolver
done(ec);
return;
}
// A successful resolve operation is guaranteed to pass a
// non-empty range to the handler.
auto cb = [done](asio_ns::error_code const& ec,
asio_ns::ip::tcp::resolver::iterator const&) {
done(ec);
};
asio_ns::async_connect(socket, it, std::move(cb));
asio_ns::async_connect(socket, it,
[done(std::move(done))](auto ec, auto it) mutable {
std::forward<F>(done)(ec);
});
};
// windows does not like async_resolve
@ -114,7 +111,9 @@ struct Socket<fuerte::SocketType::Ssl> {
template<typename F>
void connect(detail::ConnectionConfiguration const& config, F&& done) {
auto cb = [this, &config, done = std::forward<F>(done)](asio_ns::error_code const& ec) {
bool verify = config._verifyHost;
resolveConnect(config, resolver, socket.next_layer(),
[=, done(std::forward<F>(done))](auto const& ec) mutable {
if (ec) {
done(ec);
return;
@ -122,7 +121,7 @@ struct Socket<fuerte::SocketType::Ssl> {
// Perform SSL handshake and verify the remote host's certificate.
socket.next_layer().set_option(asio_ns::ip::tcp::no_delay(true));
if (config._verifyHost) {
if (verify) {
socket.set_verify_mode(asio_ns::ssl::verify_peer);
socket.set_verify_callback(asio_ns::ssl::rfc2818_verification(config._host));
} else {
@ -130,9 +129,7 @@ struct Socket<fuerte::SocketType::Ssl> {
}
socket.async_handshake(asio_ns::ssl::stream_base::client, std::move(done));
};
resolveConnect(config, resolver, socket.next_layer(), std::move(cb));
});
}
void shutdown() {

View File

@ -151,25 +151,18 @@ void ConnectionPool::pruneConnections() {
}
/// @brief cancel connections to this endpoint
void ConnectionPool::cancelConnections(std::string const& endpoint) {
fuerte::ConnectionBuilder builder;
builder.endpoint(endpoint);
builder.protocolType(_config.protocol); // always overwrite protocol
std::string normalized = builder.normalizedEndpoint();
size_t ConnectionPool::cancelConnections(std::string const& endpoint) {
WRITE_LOCKER(guard, _lock);
auto const& it = _connections.find(normalized);
auto const& it = _connections.find(endpoint);
if (it != _connections.end()) {
// {
// ConnectionList& list = *(it->second);
// std::lock_guard<std::mutex> guard(list.mutex);
// for (auto& c : list.connections) {
// c->shutdown();
// }
// }
size_t n = it->second->list.size();
for (auto& c : it->second->list) {
c.fuerte->cancel();
}
_connections.erase(it);
return n;
}
return 0;
}
/// @brief return the number of open connections
@ -231,6 +224,7 @@ ConnectionPtr ConnectionPool::selectConnection(std::string const& endpoint,
fuerte::ConnectionBuilder builder;
builder.endpoint(endpoint); // picks the socket type
builder.verifyHost(_config.verifyHosts);
builder.protocolType(_config.protocol); // always overwrite protocol
TRI_ASSERT(builder.socketType() != SocketType::Undefined);

View File

@ -89,7 +89,7 @@ class ConnectionPool final {
void pruneConnections();
/// @brief cancel connections to this endpoint
void cancelConnections(std::string const& endpoint);
size_t cancelConnections(std::string const& endpoint);
/// @brief return the number of open connections
size_t numOpenConnections() const;

View File

@ -107,6 +107,10 @@ FutureRes sendRequest(ConnectionPool* pool, DestinationId dest, RestVerb type,
return futures::makeFuture(Response{std::move(dest), Error::Canceled, nullptr});
}
LOG_TOPIC("2713a", DEBUG, Logger::COMMUNICATION)
<< "request to '" << dest
<< "' '" << fuerte::to_string(type) << " " << path << "'";
arangodb::network::EndpointSpec spec;
int res = resolveDestination(*pool->config().clusterInfo, dest, spec);
if (res != TRI_ERROR_NO_ERROR) { // FIXME return an error ?!
@ -236,20 +240,7 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
switch (err) {
case fuerte::Error::NoError: {
TRI_ASSERT(res);
if (res->statusCode() == fuerte::StatusOK || res->statusCode() == fuerte::StatusCreated ||
res->statusCode() == fuerte::StatusAccepted ||
res->statusCode() == fuerte::StatusNoContent) {
callResponse(Error::NoError, std::move(res));
break;
} else if (res->statusCode() == fuerte::StatusNotFound && _options.retryNotFound &&
TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND ==
network::errorCodeFromBody(res->slice())) {
LOG_TOPIC("5a8e9", DEBUG, Logger::COMMUNICATION)
<< "retrying request";
} else { // a "proper error" which has to be returned to the client
LOG_TOPIC("5a8d9", DEBUG, Logger::COMMUNICATION)
<< "canceling request";
callResponse(err, std::move(res));
if (checkResponse(err, req, res)) {
break;
}
[[fallthrough]];
@ -257,7 +248,8 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
case fuerte::Error::CouldNotConnect:
case fuerte::Error::ConnectionClosed:
case fuerte::Error::Timeout: {
case fuerte::Error::Timeout:
case fuerte::Error::Canceled: {
// Note that this case includes the refusal of a leader to accept
// the operation, in which case we have to flush ClusterInfo:
@ -283,8 +275,40 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
}
}
/// @brief inspect the HTTP status of a received response and either
/// hand it to the response promise or signal that a retry is wanted
/// @param err error code forwarded to callResponse for non-success statuses
/// @param req the request belonging to the response (not used in this body)
/// @param res the received response; moved into callResponse when done
/// @return true if callResponse was invoked (done), false if the caller
///         should continue with its retry handling
bool checkResponse(fuerte::Error err,
                   std::unique_ptr<fuerte::Request>& req,
                   std::unique_ptr<fuerte::Response>& res) {
  auto const status = res->statusCode();

  // success statuses: deliver the response without an error
  bool const succeeded =
      status == fuerte::StatusOK || status == fuerte::StatusCreated ||
      status == fuerte::StatusAccepted || status == fuerte::StatusNoContent;
  if (succeeded) {
    callResponse(Error::NoError, std::move(res));
    return true;  // done
  }

  // 503: the server is unavailable, let the caller retry
  if (status == fuerte::StatusUnavailable) {
    return false;  // goto retry
  }

  // 404 is only retried when enabled in the options and the body carries
  // the "data source not found" error code
  if (status == fuerte::StatusNotFound && _options.retryNotFound &&
      TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND == network::errorCodeFromBody(res->slice())) {
    return false;  // goto retry
  }

  // everything else is a "proper error" which has to be returned to the client
  callResponse(err, std::move(res));
  return true;  // done
}
/// @brief schedule calling the response promise
void callResponse(Error err, std::unique_ptr<fuerte::Response> res) {
LOG_TOPIC_IF("2713d", DEBUG, Logger::COMMUNICATION, err != fuerte::Error::NoError)
<< "error on request to '" << _destination
<< "' '" << fuerte::to_string(_type) << " " << _path
<< "' '" << fuerte::to_string(err) << "'";
Scheduler* sch = SchedulerFeature::SCHEDULER;
if (_options.skipScheduler || sch == nullptr) {
_promise.setValue(Response{std::move(_destination), err, std::move(res)});
@ -303,6 +327,11 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
}
void retryLater(std::chrono::steady_clock::duration tryAgainAfter) {
LOG_TOPIC("2713e", DEBUG, Logger::COMMUNICATION)
<< "retry request to '" << _destination
<< "' '" << fuerte::to_string(_type) << " " << _path << "'";
auto* sch = SchedulerFeature::SCHEDULER;
if (ADB_UNLIKELY(sch == nullptr)) {
_promise.setValue(Response{std::move(_destination), fuerte::Error::Canceled, nullptr});
@ -338,6 +367,10 @@ FutureRes sendRequestRetry(ConnectionPool* pool, DestinationId destination,
return futures::makeFuture(Response{destination, Error::Canceled, nullptr});
}
LOG_TOPIC("2713b", DEBUG, Logger::COMMUNICATION)
<< "request to '" << destination
<< "' '" << fuerte::to_string(type) << " " << path << "'";
// auto req = prepareRequest(type, path, std::move(payload), timeout, headers);
auto rs = std::make_shared<RequestsState>(pool, std::move(destination),
type, std::move(path),

View File

@ -97,27 +97,6 @@ void NetworkFeature::collectOptions(std::shared_ptr<options::ProgramOptions> opt
options->addOption("--network.verify-hosts", "verify hosts when using TLS",
new BooleanParameter(&_verifyHosts))
.setIntroducedIn(30600);
_gcfunc = [this](bool canceled) {
if (canceled) {
return;
}
_pool->pruneConnections();
if (server().hasFeature<ClusterFeature>()) {
auto& ci = server().getFeature<ClusterFeature>().clusterInfo();
auto failed = ci.getFailedServers();
for (ServerID const& f : failed) {
_pool->cancelConnections(f);
}
}
if (!server().isStopping() && !canceled) {
std::chrono::seconds off(12);
::queueGarbageCollection(_workItemMutex, _workItem, _gcfunc, off);
}
};
}
void NetworkFeature::validateOptions(std::shared_ptr<options::ProgramOptions>) {
@ -131,17 +110,45 @@ void NetworkFeature::validateOptions(std::shared_ptr<options::ProgramOptions>) {
}
void NetworkFeature::prepare() {
ClusterInfo* ci = nullptr;
if (server().hasFeature<ClusterFeature>() && server().isEnabled<ClusterFeature>()) {
ci = &server().getFeature<ClusterFeature>().clusterInfo();
}
network::ConnectionPool::Config config;
config.numIOThreads = static_cast<unsigned>(_numIOThreads);
config.maxOpenConnections = _maxOpenConnections;
config.idleConnectionMilli = _idleTtlMilli;
config.verifyHosts = _verifyHosts;
if (server().hasFeature<ClusterFeature>() && server().isEnabled<ClusterFeature>()) {
config.clusterInfo = &server().getFeature<ClusterFeature>().clusterInfo();
}
config.clusterInfo = ci;
_pool = std::make_unique<network::ConnectionPool>(config);
_poolPtr.store(_pool.get(), std::memory_order_release);
_gcfunc = [this, ci](bool canceled) {
if (canceled) {
return;
}
_pool->pruneConnections();
if (ci != nullptr) {
auto failed = ci->getFailedServers();
for (ServerID const& srvId : failed) {
std::string endpoint = ci->getServerEndpoint(srvId);
size_t n = _pool->cancelConnections(endpoint);
LOG_TOPIC_IF("15d94", INFO, Logger::COMMUNICATION, n > 0)
<< "canceling " << n << " connections to failed server '"
<< srvId << "' on endpoint '" << endpoint << "'";
}
}
if (!server().isStopping() && !canceled) {
std::chrono::seconds off(12);
::queueGarbageCollection(_workItemMutex, _workItem, _gcfunc, off);
}
};
}
void NetworkFeature::start() {

View File

@ -454,12 +454,12 @@ function TaskSuite () {
assertEqual(1, t.length);
var tries = 0;
while (tries++ < 10) {
while (tries++ < 15) {
if (db[cn].count() === 1) {
return; // alright
}
internal.wait(1);
internal.wait(2);
}
fail();
@ -490,12 +490,12 @@ function TaskSuite () {
assertEqual("_system", task.database);
var tries = 0;
while (tries++ < 10) {
while (tries++ < 15) {
if (db[cn].count() === 1) {
return; // alright
}
internal.wait(1);
internal.wait(2);
}
fail();
@ -525,13 +525,15 @@ function TaskSuite () {
assertEqual(5, task.offset);
assertEqual("_system", task.database);
internal.wait(5);
var tries = 0;
while (tries++ < 20) {
while (tries++ < 15) {
if (db[cn].count() === 1) {
return; // alright
}
internal.wait(1);
internal.wait(2);
}
// task hasn't been executed
@ -553,13 +555,13 @@ function TaskSuite () {
var task = tasks.register({
name: "UnitTests1",
command: command,
offset: 10,
offset: 15,
params: 23
});
assertEqual("UnitTests1", task.name);
assertEqual("timed", task.type);
assertEqual(10, task.offset);
assertEqual(15, task.offset);
assertEqual("_system", task.database);
tasks.unregister(task);
@ -599,13 +601,13 @@ function TaskSuite () {
assertEqual("_system", task.database);
var tries = 0;
while (tries++ < 20) {
while (tries++ < 15) {
if (db[cn].count() > 0) {
assertTrue(db[cn].byExample({ value: 17 }).toArray().length > 0);
return; // alright
}
internal.wait(1);
internal.wait(2);
}
fail();
@ -639,13 +641,13 @@ function TaskSuite () {
assertEqual("_system", task.database);
var tries = 0;
while (tries++ < 20) {
while (tries++ < 15) {
if (db[cn].count() > 0) {
assertTrue(db[cn].byExample({ value: 42 }).toArray().length > 0);
return; // alright
}
internal.wait(1);
internal.wait(2);
}
fail();