mirror of https://gitee.com/bigwinds/arangodb
Cancel connections to known failed servers (#10546)
* test * rename shit * rename shit * Cancel connections to known failed servers * do not uncomment code * Retry on 503 responses * Make test grey again
This commit is contained in:
parent
119aa7bfd8
commit
2ae30242c0
|
@ -53,7 +53,8 @@ StatusCode constexpr StatusNotAcceptable = 406;
|
|||
StatusCode constexpr StatusConflict = 409;
|
||||
StatusCode constexpr StatusPreconditionFailed = 412;
|
||||
StatusCode constexpr StatusInternalError = 500;
|
||||
StatusCode constexpr StatusUnavailable = 505;
|
||||
StatusCode constexpr StatusUnavailable = 503;
|
||||
StatusCode constexpr StatusVersionNotSupported = 505;
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// --SECTION-- enum class ErrorCondition
|
||||
|
|
|
@ -34,21 +34,18 @@ void resolveConnect(detail::ConnectionConfiguration const& config,
|
|||
asio_ns::ip::tcp::resolver& resolver,
|
||||
SocketT& socket,
|
||||
F&& done) {
|
||||
auto cb = [&socket, done = std::forward<F>(done)]
|
||||
(asio_ns::error_code const& ec,
|
||||
asio_ns::ip::tcp::resolver::iterator it) {
|
||||
if (ec) { // error
|
||||
auto cb = [&socket, done(std::forward<F>(done))](auto ec, auto it) mutable {
|
||||
if (ec) { // error in address resolver
|
||||
done(ec);
|
||||
return;
|
||||
}
|
||||
|
||||
// A successful resolve operation is guaranteed to pass a
|
||||
// non-empty range to the handler.
|
||||
auto cb = [done](asio_ns::error_code const& ec,
|
||||
asio_ns::ip::tcp::resolver::iterator const&) {
|
||||
done(ec);
|
||||
};
|
||||
asio_ns::async_connect(socket, it, std::move(cb));
|
||||
asio_ns::async_connect(socket, it,
|
||||
[done(std::move(done))](auto ec, auto it) mutable {
|
||||
std::forward<F>(done)(ec);
|
||||
});
|
||||
};
|
||||
|
||||
// windows does not like async_resolve
|
||||
|
@ -114,7 +111,9 @@ struct Socket<fuerte::SocketType::Ssl> {
|
|||
|
||||
template<typename F>
|
||||
void connect(detail::ConnectionConfiguration const& config, F&& done) {
|
||||
auto cb = [this, &config, done = std::forward<F>(done)](asio_ns::error_code const& ec) {
|
||||
bool verify = config._verifyHost;
|
||||
resolveConnect(config, resolver, socket.next_layer(),
|
||||
[=, done(std::forward<F>(done))](auto const& ec) mutable {
|
||||
if (ec) {
|
||||
done(ec);
|
||||
return;
|
||||
|
@ -122,7 +121,7 @@ struct Socket<fuerte::SocketType::Ssl> {
|
|||
|
||||
// Perform SSL handshake and verify the remote host's certificate.
|
||||
socket.next_layer().set_option(asio_ns::ip::tcp::no_delay(true));
|
||||
if (config._verifyHost) {
|
||||
if (verify) {
|
||||
socket.set_verify_mode(asio_ns::ssl::verify_peer);
|
||||
socket.set_verify_callback(asio_ns::ssl::rfc2818_verification(config._host));
|
||||
} else {
|
||||
|
@ -130,9 +129,7 @@ struct Socket<fuerte::SocketType::Ssl> {
|
|||
}
|
||||
|
||||
socket.async_handshake(asio_ns::ssl::stream_base::client, std::move(done));
|
||||
};
|
||||
|
||||
resolveConnect(config, resolver, socket.next_layer(), std::move(cb));
|
||||
});
|
||||
}
|
||||
|
||||
void shutdown() {
|
||||
|
|
|
@ -151,25 +151,18 @@ void ConnectionPool::pruneConnections() {
|
|||
}
|
||||
|
||||
/// @brief cancel connections to this endpoint
|
||||
void ConnectionPool::cancelConnections(std::string const& endpoint) {
|
||||
fuerte::ConnectionBuilder builder;
|
||||
builder.endpoint(endpoint);
|
||||
builder.protocolType(_config.protocol); // always overwrite protocol
|
||||
|
||||
std::string normalized = builder.normalizedEndpoint();
|
||||
|
||||
size_t ConnectionPool::cancelConnections(std::string const& endpoint) {
|
||||
WRITE_LOCKER(guard, _lock);
|
||||
auto const& it = _connections.find(normalized);
|
||||
auto const& it = _connections.find(endpoint);
|
||||
if (it != _connections.end()) {
|
||||
// {
|
||||
// ConnectionList& list = *(it->second);
|
||||
// std::lock_guard<std::mutex> guard(list.mutex);
|
||||
// for (auto& c : list.connections) {
|
||||
// c->shutdown();
|
||||
// }
|
||||
// }
|
||||
size_t n = it->second->list.size();
|
||||
for (auto& c : it->second->list) {
|
||||
c.fuerte->cancel();
|
||||
}
|
||||
_connections.erase(it);
|
||||
return n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// @brief return the number of open connections
|
||||
|
@ -231,6 +224,7 @@ ConnectionPtr ConnectionPool::selectConnection(std::string const& endpoint,
|
|||
|
||||
fuerte::ConnectionBuilder builder;
|
||||
builder.endpoint(endpoint); // picks the socket type
|
||||
builder.verifyHost(_config.verifyHosts);
|
||||
builder.protocolType(_config.protocol); // always overwrite protocol
|
||||
TRI_ASSERT(builder.socketType() != SocketType::Undefined);
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ class ConnectionPool final {
|
|||
void pruneConnections();
|
||||
|
||||
/// @brief cancel connections to this endpoint
|
||||
void cancelConnections(std::string const& endpoint);
|
||||
size_t cancelConnections(std::string const& endpoint);
|
||||
|
||||
/// @brief return the number of open connections
|
||||
size_t numOpenConnections() const;
|
||||
|
|
|
@ -107,6 +107,10 @@ FutureRes sendRequest(ConnectionPool* pool, DestinationId dest, RestVerb type,
|
|||
return futures::makeFuture(Response{std::move(dest), Error::Canceled, nullptr});
|
||||
}
|
||||
|
||||
LOG_TOPIC("2713a", DEBUG, Logger::COMMUNICATION)
|
||||
<< "request to '" << dest
|
||||
<< "' '" << fuerte::to_string(type) << " " << path << "'";
|
||||
|
||||
arangodb::network::EndpointSpec spec;
|
||||
int res = resolveDestination(*pool->config().clusterInfo, dest, spec);
|
||||
if (res != TRI_ERROR_NO_ERROR) { // FIXME return an error ?!
|
||||
|
@ -236,20 +240,7 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
|
|||
switch (err) {
|
||||
case fuerte::Error::NoError: {
|
||||
TRI_ASSERT(res);
|
||||
if (res->statusCode() == fuerte::StatusOK || res->statusCode() == fuerte::StatusCreated ||
|
||||
res->statusCode() == fuerte::StatusAccepted ||
|
||||
res->statusCode() == fuerte::StatusNoContent) {
|
||||
callResponse(Error::NoError, std::move(res));
|
||||
break;
|
||||
} else if (res->statusCode() == fuerte::StatusNotFound && _options.retryNotFound &&
|
||||
TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND ==
|
||||
network::errorCodeFromBody(res->slice())) {
|
||||
LOG_TOPIC("5a8e9", DEBUG, Logger::COMMUNICATION)
|
||||
<< "retrying request";
|
||||
} else { // a "proper error" which has to be returned to the client
|
||||
LOG_TOPIC("5a8d9", DEBUG, Logger::COMMUNICATION)
|
||||
<< "canceling request";
|
||||
callResponse(err, std::move(res));
|
||||
if (checkResponse(err, req, res)) {
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
|
@ -257,7 +248,8 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
|
|||
|
||||
case fuerte::Error::CouldNotConnect:
|
||||
case fuerte::Error::ConnectionClosed:
|
||||
case fuerte::Error::Timeout: {
|
||||
case fuerte::Error::Timeout:
|
||||
case fuerte::Error::Canceled: {
|
||||
// Note that this case includes the refusal of a leader to accept
|
||||
// the operation, in which case we have to flush ClusterInfo:
|
||||
|
||||
|
@ -283,8 +275,40 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
|
|||
}
|
||||
}
|
||||
|
||||
/// @brief decide from the HTTP status code of `res` whether the request is
/// finished or whether the caller has to retry it
/// @param err  error code that is handed on to callResponse() when the status
///             is treated as a "proper error"
/// @param req  request belonging to `res`; it is not read in this function
/// @param res  received response; it is dereferenced unconditionally, so it
///             must not be null. It is moved into callResponse() on every path
///             that returns true and is left untouched when false is returned.
/// @return true if callResponse() was invoked ("done"); false ("goto retry")
///         for fuerte::StatusUnavailable, and for fuerte::StatusNotFound when
///         _options.retryNotFound is set and the body carries the error code
///         TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND
bool checkResponse(fuerte::Error err,
|
||||
std::unique_ptr<fuerte::Request>& req,
|
||||
std::unique_ptr<fuerte::Response>& res) {
|
||||
switch (res->statusCode()) {
|
||||
case fuerte::StatusOK:
|
||||
case fuerte::StatusCreated:
|
||||
case fuerte::StatusAccepted:
|
||||
case fuerte::StatusNoContent:
|
||||
callResponse(Error::NoError, std::move(res));
|
||||
return true; // done
|
||||
|
||||
case fuerte::StatusUnavailable:
|
||||
return false; // goto retry
|
||||
|
||||
case fuerte::StatusNotFound:
|
||||
if (_options.retryNotFound &&
|
||||
TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND == network::errorCodeFromBody(res->slice())) {
|
||||
return false; // goto retry
|
||||
}
|
||||
[[fallthrough]];
|
||||
default: // a "proper error" which has to be returned to the client
|
||||
callResponse(err, std::move(res));
|
||||
return true; // done
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief schedule calling the response promise
|
||||
void callResponse(Error err, std::unique_ptr<fuerte::Response> res) {
|
||||
|
||||
LOG_TOPIC_IF("2713d", DEBUG, Logger::COMMUNICATION, err != fuerte::Error::NoError)
|
||||
<< "error on request to '" << _destination
|
||||
<< "' '" << fuerte::to_string(_type) << " " << _path
|
||||
<< "' '" << fuerte::to_string(err) << "'";
|
||||
|
||||
Scheduler* sch = SchedulerFeature::SCHEDULER;
|
||||
if (_options.skipScheduler || sch == nullptr) {
|
||||
_promise.setValue(Response{std::move(_destination), err, std::move(res)});
|
||||
|
@ -303,6 +327,11 @@ class RequestsState final : public std::enable_shared_from_this<RequestsState> {
|
|||
}
|
||||
|
||||
void retryLater(std::chrono::steady_clock::duration tryAgainAfter) {
|
||||
|
||||
LOG_TOPIC("2713e", DEBUG, Logger::COMMUNICATION)
|
||||
<< "retry request to '" << _destination
|
||||
<< "' '" << fuerte::to_string(_type) << " " << _path << "'";
|
||||
|
||||
auto* sch = SchedulerFeature::SCHEDULER;
|
||||
if (ADB_UNLIKELY(sch == nullptr)) {
|
||||
_promise.setValue(Response{std::move(_destination), fuerte::Error::Canceled, nullptr});
|
||||
|
@ -338,6 +367,10 @@ FutureRes sendRequestRetry(ConnectionPool* pool, DestinationId destination,
|
|||
return futures::makeFuture(Response{destination, Error::Canceled, nullptr});
|
||||
}
|
||||
|
||||
LOG_TOPIC("2713b", DEBUG, Logger::COMMUNICATION)
|
||||
<< "request to '" << destination
|
||||
<< "' '" << fuerte::to_string(type) << " " << path << "'";
|
||||
|
||||
// auto req = prepareRequest(type, path, std::move(payload), timeout, headers);
|
||||
auto rs = std::make_shared<RequestsState>(pool, std::move(destination),
|
||||
type, std::move(path),
|
||||
|
|
|
@ -97,27 +97,6 @@ void NetworkFeature::collectOptions(std::shared_ptr<options::ProgramOptions> opt
|
|||
options->addOption("--network.verify-hosts", "verify hosts when using TLS",
|
||||
new BooleanParameter(&_verifyHosts))
|
||||
.setIntroducedIn(30600);
|
||||
|
||||
_gcfunc = [this](bool canceled) {
|
||||
if (canceled) {
|
||||
return;
|
||||
}
|
||||
|
||||
_pool->pruneConnections();
|
||||
|
||||
if (server().hasFeature<ClusterFeature>()) {
|
||||
auto& ci = server().getFeature<ClusterFeature>().clusterInfo();
|
||||
auto failed = ci.getFailedServers();
|
||||
for (ServerID const& f : failed) {
|
||||
_pool->cancelConnections(f);
|
||||
}
|
||||
}
|
||||
|
||||
if (!server().isStopping() && !canceled) {
|
||||
std::chrono::seconds off(12);
|
||||
::queueGarbageCollection(_workItemMutex, _workItem, _gcfunc, off);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
void NetworkFeature::validateOptions(std::shared_ptr<options::ProgramOptions>) {
|
||||
|
@ -131,17 +110,45 @@ void NetworkFeature::validateOptions(std::shared_ptr<options::ProgramOptions>) {
|
|||
}
|
||||
|
||||
void NetworkFeature::prepare() {
|
||||
|
||||
ClusterInfo* ci = nullptr;
|
||||
if (server().hasFeature<ClusterFeature>() && server().isEnabled<ClusterFeature>()) {
|
||||
ci = &server().getFeature<ClusterFeature>().clusterInfo();
|
||||
}
|
||||
|
||||
network::ConnectionPool::Config config;
|
||||
config.numIOThreads = static_cast<unsigned>(_numIOThreads);
|
||||
config.maxOpenConnections = _maxOpenConnections;
|
||||
config.idleConnectionMilli = _idleTtlMilli;
|
||||
config.verifyHosts = _verifyHosts;
|
||||
if (server().hasFeature<ClusterFeature>() && server().isEnabled<ClusterFeature>()) {
|
||||
config.clusterInfo = &server().getFeature<ClusterFeature>().clusterInfo();
|
||||
}
|
||||
config.clusterInfo = ci;
|
||||
|
||||
_pool = std::make_unique<network::ConnectionPool>(config);
|
||||
_poolPtr.store(_pool.get(), std::memory_order_release);
|
||||
|
||||
_gcfunc = [this, ci](bool canceled) {
|
||||
if (canceled) {
|
||||
return;
|
||||
}
|
||||
|
||||
_pool->pruneConnections();
|
||||
|
||||
if (ci != nullptr) {
|
||||
auto failed = ci->getFailedServers();
|
||||
for (ServerID const& srvId : failed) {
|
||||
std::string endpoint = ci->getServerEndpoint(srvId);
|
||||
size_t n = _pool->cancelConnections(endpoint);
|
||||
LOG_TOPIC_IF("15d94", INFO, Logger::COMMUNICATION, n > 0)
|
||||
<< "canceling " << n << " connections to failed server '"
|
||||
<< srvId << "' on endpoint '" << endpoint << "'";
|
||||
}
|
||||
}
|
||||
|
||||
if (!server().isStopping() && !canceled) {
|
||||
std::chrono::seconds off(12);
|
||||
::queueGarbageCollection(_workItemMutex, _workItem, _gcfunc, off);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
void NetworkFeature::start() {
|
||||
|
|
|
@ -454,12 +454,12 @@ function TaskSuite () {
|
|||
assertEqual(1, t.length);
|
||||
|
||||
var tries = 0;
|
||||
while (tries++ < 10) {
|
||||
while (tries++ < 15) {
|
||||
if (db[cn].count() === 1) {
|
||||
return; // alright
|
||||
}
|
||||
|
||||
internal.wait(1);
|
||||
internal.wait(2);
|
||||
}
|
||||
|
||||
fail();
|
||||
|
@ -490,12 +490,12 @@ function TaskSuite () {
|
|||
assertEqual("_system", task.database);
|
||||
|
||||
var tries = 0;
|
||||
while (tries++ < 10) {
|
||||
while (tries++ < 15) {
|
||||
if (db[cn].count() === 1) {
|
||||
return; // alright
|
||||
}
|
||||
|
||||
internal.wait(1);
|
||||
internal.wait(2);
|
||||
}
|
||||
|
||||
fail();
|
||||
|
@ -525,13 +525,15 @@ function TaskSuite () {
|
|||
assertEqual(5, task.offset);
|
||||
assertEqual("_system", task.database);
|
||||
|
||||
internal.wait(5);
|
||||
|
||||
var tries = 0;
|
||||
while (tries++ < 20) {
|
||||
while (tries++ < 15) {
|
||||
if (db[cn].count() === 1) {
|
||||
return; // alright
|
||||
}
|
||||
|
||||
internal.wait(1);
|
||||
internal.wait(2);
|
||||
}
|
||||
|
||||
// task hasn't been executed
|
||||
|
@ -553,13 +555,13 @@ function TaskSuite () {
|
|||
var task = tasks.register({
|
||||
name: "UnitTests1",
|
||||
command: command,
|
||||
offset: 10,
|
||||
offset: 15,
|
||||
params: 23
|
||||
});
|
||||
|
||||
assertEqual("UnitTests1", task.name);
|
||||
assertEqual("timed", task.type);
|
||||
assertEqual(10, task.offset);
|
||||
assertEqual(15, task.offset);
|
||||
assertEqual("_system", task.database);
|
||||
|
||||
tasks.unregister(task);
|
||||
|
@ -599,13 +601,13 @@ function TaskSuite () {
|
|||
assertEqual("_system", task.database);
|
||||
|
||||
var tries = 0;
|
||||
while (tries++ < 20) {
|
||||
while (tries++ < 15) {
|
||||
if (db[cn].count() > 0) {
|
||||
assertTrue(db[cn].byExample({ value: 17 }).toArray().length > 0);
|
||||
return; // alright
|
||||
}
|
||||
|
||||
internal.wait(1);
|
||||
internal.wait(2);
|
||||
}
|
||||
|
||||
fail();
|
||||
|
@ -639,13 +641,13 @@ function TaskSuite () {
|
|||
assertEqual("_system", task.database);
|
||||
|
||||
var tries = 0;
|
||||
while (tries++ < 20) {
|
||||
while (tries++ < 15) {
|
||||
if (db[cn].count() > 0) {
|
||||
assertTrue(db[cn].byExample({ value: 42 }).toArray().length > 0);
|
||||
return; // alright
|
||||
}
|
||||
|
||||
internal.wait(1);
|
||||
internal.wait(2);
|
||||
}
|
||||
|
||||
fail();
|
Loading…
Reference in New Issue