1
0
Fork 0

Cleanup of error handling for asyncRequest and syncRequest.

I have added a thorough description of events to the comments
in ClusterComm.h. This should enable everybody to do proper
error handling when using ClusterComm::asyncRequest and
ClusterComm::syncRequest.
This commit is contained in:
Max Neunhoeffer 2016-06-04 23:05:48 -06:00
parent a2d237f84d
commit 20ef93d76b
8 changed files with 145 additions and 34 deletions

View File

@ -1091,18 +1091,19 @@ static bool throwExceptionAfterBadSyncRequest(ClusterCommResult* res,
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_CLUSTER_TIMEOUT, errorMessage);
}
if (res->status == CL_COMM_BACKEND_UNAVAILABLE) {
// there is no result
std::string errorMessage =
std::string("Empty result in communication with shard '") +
std::string(res->shardID) + std::string("' on cluster node '") +
std::string(res->serverID) + std::string("'");
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_CLUSTER_CONNECTION_LOST,
errorMessage);
}
if (res->status == CL_COMM_ERROR) {
std::string errorMessage;
// This could be a broken connection or an Http error:
if (res->result == nullptr || !res->result->isComplete()) {
// there is no result
errorMessage +=
std::string("Empty result in communication with shard '") +
std::string(res->shardID) + std::string("' on cluster node '") +
std::string(res->serverID) + std::string("'");
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_CLUSTER_CONNECTION_LOST,
errorMessage);
}
TRI_ASSERT(nullptr != res->result);
StringBuffer const& responseBodyBuf(res->result->getBody());

View File

@ -65,7 +65,7 @@ void ClusterCommResult::setDestination(std::string const& dest,
serverID = (*resp)[0];
} else {
serverID = "";
status = CL_COMM_ERROR;
status = CL_COMM_BACKEND_UNAVAILABLE;
if (logConnectionErrors) {
LOG(ERR) << "cannot find responsible server for shard '"
<< shardID << "'";
@ -89,7 +89,7 @@ void ClusterCommResult::setDestination(std::string const& dest,
shardID = "";
serverID = "";
endpoint = "";
status = CL_COMM_ERROR;
status = CL_COMM_BACKEND_UNAVAILABLE;
errorMessage = "did not understand destination'" + dest + "'";
if (logConnectionErrors) {
LOG(ERR) << "did not understand destination '" << dest << "'";
@ -102,7 +102,7 @@ void ClusterCommResult::setDestination(std::string const& dest,
auto ci = ClusterInfo::instance();
endpoint = ci->getServerEndpoint(serverID);
if (endpoint.empty()) {
status = CL_COMM_ERROR;
status = CL_COMM_BACKEND_UNAVAILABLE;
errorMessage = "did not find endpoint of server '" + serverID + "'";
if (logConnectionErrors) {
LOG(ERR) << "did not find endpoint of server '" << serverID
@ -253,8 +253,7 @@ OperationID ClusterComm::asyncRequest(
op->result.setDestination(destination, logConnectionErrors());
if (op->result.status == CL_COMM_ERROR) {
// In the non-singleRequest mode we want to put it into the received
// queue right away for backward compatibility:
// We put it into the received queue right away for error reporting:
ClusterCommResult const resCopy(op->result);
LOG(DEBUG) << "In asyncRequest, putting failed request "
<< resCopy.operationID << " directly into received queue.";
@ -262,7 +261,17 @@ OperationID ClusterComm::asyncRequest(
received.push_back(op.get());
op.release();
auto q = received.end();
receivedByOpID[resCopy.operationID] = --q;
receivedByOpID[opId] = --q;
if (nullptr != callback) {
op.reset(*q);
if ( (*callback.get())(&(op->result)) ) {
auto i = receivedByOpID.find(opId);
receivedByOpID.erase(i);
received.erase(q);
} else {
op.release();
}
}
somethingReceived.broadcast();
return opId;
}
@ -370,7 +379,7 @@ std::unique_ptr<ClusterCommResult> ClusterComm::syncRequest(
res->setDestination(destination, logConnectionErrors());
if (res->status == CL_COMM_ERROR) {
if (res->status == CL_COMM_BACKEND_UNAVAILABLE) {
return res;
}
@ -886,9 +895,10 @@ std::string ClusterComm::processAnswer(std::string& coordinatorHeader,
if ((*op->callback.get())(&op->result)) {
// This is fully processed, so let's remove it from the queue:
QueueIterator q = i->second;
std::unique_ptr<ClusterCommOperation> o(op);
receivedByOpID.erase(i);
received.erase(q);
delete op;
return std::string("");
}
}
} else {
@ -909,9 +919,10 @@ std::string ClusterComm::processAnswer(std::string& coordinatorHeader,
if ((*op->callback)(&op->result)) {
// This is fully processed, so let's remove it from the queue:
QueueIterator q = i->second;
std::unique_ptr<ClusterCommOperation> o(op);
toSendByOpID.erase(i);
toSend.erase(q);
delete op;
return std::string("");
}
}
} else {
@ -1140,8 +1151,7 @@ size_t ClusterComm::performRequests(std::vector<ClusterCommRequest>& requests,
LOG_TOPIC(TRACE, logTopic) << "ClusterComm::performRequests: "
<< "got BACKEND_UNAVAILABLE or TIMEOUT from "
<< requests[index].destination << ":"
<< requests[index].path << " with return code "
<< (int) res.answer_code;
<< requests[index].path;
// In this case we will retry at the dueTime
} else { // a "proper error"
requests[index].result = res;
@ -1329,13 +1339,29 @@ void ClusterCommThread::run() {
CONDITION_LOCKER(locker, cc->somethingReceived);
ClusterComm::QueueIterator q;
for (q = cc->received.begin(); q != cc->received.end(); ++q) {
for (q = cc->received.begin(); q != cc->received.end(); ) {
bool deleted = false;
op = *q;
if (op->result.status == CL_COMM_SENT) {
if (op->endTime < currentTime) {
op->result.status = CL_COMM_TIMEOUT;
if (nullptr != op->callback.get()) {
if ( (*op->callback.get())(&op->result) ) {
// This is fully processed, so let's remove it from the queue:
auto i = cc->receivedByOpID.find(op->result.operationID);
TRI_ASSERT(i != cc->receivedByOpID.end());
cc->receivedByOpID.erase(i);
std::unique_ptr<ClusterCommOperation> o(op);
auto qq = q++;
cc->received.erase(qq);
deleted = true;
}
}
}
}
if (!deleted) {
++q;
}
}
}

View File

@ -70,7 +70,7 @@ enum ClusterCommOpStatus {
CL_COMM_SENT = 3, // initial request sent, response available
CL_COMM_TIMEOUT = 4, // no answer received until timeout
CL_COMM_RECEIVED = 5, // answer received
CL_COMM_ERROR = 6, // original request could not be sent
CL_COMM_ERROR = 6, // original request could not be sent or HTTP error
CL_COMM_DROPPED = 7, // operation was dropped, not known
// this is only used to report an error
// in the wait or enquire methods
@ -82,7 +82,87 @@ enum ClusterCommOpStatus {
////////////////////////////////////////////////////////////////////////////////
/// @brief used to report the status, progress and possibly result of
/// an operation
/// an operation, this is used for the asyncRequest (with singleRequest
/// equal to true or to false), and for syncRequest.
///
/// Here is a complete overview of how the request can happen and how this
/// is reflected in the ClusterCommResult. We first cover the asyncRequest
/// case and then describe the differences for syncRequest:
///
/// First, the actual destination is determined. If the responsible server
/// for a shard is not found or the endpoint for a named server is not found,
/// or if the given endpoint is no known protocol (currently "tcp://" or
/// "ssl://", then `status` is set to CL_COMM_BACKEND_UNAVAILABLE,
/// `errorMessage` is set but `result` and `answer` are both set
/// to nullptr. The flag `sendWasComplete` remains false and the
/// `answer_code` remains GeneralResponse::ResponseCode::PROCESSING.
/// A potentially given ClusterCommCallback is called.
///
/// If no error occurs so far, the status is set to CL_COMM_SUBMITTED.
/// Still, `result`, `answer` and `answer_code` are not yet set.
/// A call to ClusterComm::enquire can return a result with this status.
/// A call to ClusterComm::wait cannot return a result wuth this status.
/// The request is queued for sending.
///
/// As soon as the sending thread discovers the submitted request, it
/// sets its status to CL_COMM_SENDING and tries to open a connection
/// or reuse an existing connection. If opening a connection fails
/// the status is set to CL_COMM_BACKEND_UNAVAILABLE. If the given timeout
/// is already reached, the status is set to CL_COMM_TIMEOUT. In both
/// error cases `result`, `answer` and `answer_code` are still unset.
///
/// If the connection was successfully created the request is sent.
/// If the request ended with a timeout, `status` is set to
/// CL_COMM_TIMEOUT as above. If another communication error (broken
/// connection) happens, `status` is set to CL_COMM_BACKEND_UNAVAILABLE.
/// In both cases, `result` can be set or can still be a nullptr.
/// `answer` and `answer_code` are still unset.
///
/// If the request is completed, but an HTTP status code >= 400 occurred,
/// the status is set to CL_COMM_ERROR, but `result` is set correctly
/// to indicate the error. If all is well, `status` is set to CL_COMM_SENT.
///
/// In the `singleRequest==true` mode, the operation is finished at this
/// stage. The callback is called, and the result either left in the
/// receiving queue or dropped. A call to ClusterComm::enquire or
/// ClusterComm::wait can return a result in this state. Note that
/// `answer` and `answer_code` are still not set. The flag
/// `sendWasComplete` is correctly set, though.
///
/// In the `singleRequest==false` mode, an asynchronous operation happens
/// at the server side and eventually, an HTTP request in the opposite
/// direction is issued. During that time, `status` remains CL_COMM_SENT.
/// A call to ClusterComm::enquire can return a result in this state.
/// A call to ClusterComm::wait does not.
///
/// If the answer does not arrive in the specified timeout, `status`
/// is set to CL_COMM_TIMEOUT and a potential callback is called. If
/// From then on, ClusterComm::wait will return it (unless deleted
/// by the callback returning true).
///
/// If an answer comes in in time, then `answer` and `answer_code`
/// are finally set, and `status` is set to CL_COMM_RECEIVED. The callback
/// is called, and the result either left in the received queue for
/// pickup by ClusterComm::wait or deleted. Note that if we get this
/// far, `status` is set to CL_COMM_RECEIVED, even if the status code
/// of the answer is >= 400.
///
/// Summing up, we have the following outcomes:
/// `status` `result` set `answer` set wait() returns
/// CL_COMM_SUBMITTED no no no
/// CL_COMM_SENDING no no no
/// CL_COMM_SENT yes no yes if single
/// CL_COMM_BACKEND_UN... yes or no no yes
/// CL_COMM_TIMEOUT yes or no no yes
/// CL_COMM_ERROR yes no yes
/// CL_COMM_RECEIVED yes yes yes
/// CL_COMM_DROPPED no no yes
///
/// The syncRequest behaves essentially in the same way, except that
/// no callback is ever called, the outcome cannot be CL_COMM_RECEIVED
/// or CL_COMM_DROPPED, and CL_COMM_SENT indicates a successful completion.
/// CL_COMM_ERROR means that the request was complete, but an HTTP error
/// occurred.
////////////////////////////////////////////////////////////////////////////////
struct ClusterCommResult {

View File

@ -1536,7 +1536,7 @@ int ClusterInfo::ensureIndexCoordinator(
AgencyCommResult previous = ac.getValues(key);
if (!res.successful()) {
if (!previous.successful()) {
return TRI_ERROR_CLUSTER_READING_PLAN_AGENCY;
}

View File

@ -52,6 +52,8 @@ static int handleGeneralCommErrors(ClusterCommResult const* res) {
return TRI_ERROR_CLUSTER_TIMEOUT;
} else if (res->status == CL_COMM_ERROR) {
// This could be a broken connection or an Http error:
// This case cannot actually happen, but it is not very expensive
// to check, just to be on the safe side:
if (res->result == nullptr || !res->result->isComplete()) {
// there is not result
return TRI_ERROR_CLUSTER_CONNECTION_LOST;

View File

@ -1671,7 +1671,7 @@ static void JS_AsyncRequest(v8::FunctionCallbackInfo<v8::Value> const& args) {
clientTransactionID, coordTransactionID, destination, reqType, path, body,
headerFields, 0, timeout, singleRequest);
ClusterCommResult res = cc->enquire(opId);
if (res.status == CL_COMM_ERROR) {
if (res.status == CL_COMM_BACKEND_UNAVAILABLE) {
TRI_V8_THROW_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL,
"couldn't queue async request");
}

View File

@ -799,15 +799,16 @@ void RestReplicationHandler::handleTrampolineCoordinator() {
"timeout within cluster");
return;
}
if (res->status == CL_COMM_BACKEND_UNAVAILABLE) {
// there is no result
generateError(GeneralResponse::ResponseCode::BAD,
TRI_ERROR_CLUSTER_CONNECTION_LOST,
"lost connection within cluster");
return;
}
if (res->status == CL_COMM_ERROR) {
// This could be a broken connection or an Http error:
if (res->result == nullptr || !res->result->isComplete()) {
// there is no result
generateError(GeneralResponse::ResponseCode::BAD,
TRI_ERROR_CLUSTER_CONNECTION_LOST,
"lost connection within cluster");
return;
}
TRI_ASSERT(nullptr != res->result && res->result->isComplete());
// In this case a proper HTTP error was reported by the DBserver,
// we simply forward the result.
// We intentionally fall through here.

View File

@ -1187,7 +1187,8 @@ static bool clusterSendToAllServers(
cc->drop("", coordTransactionID, 0, "");
return TRI_ERROR_CLUSTER_TIMEOUT;
}
if (res.status == CL_COMM_ERROR || res.status == CL_COMM_DROPPED) {
if (res.status == CL_COMM_ERROR || res.status == CL_COMM_DROPPED ||
res.status == CL_COMM_BACKEND_UNAVAILABLE) {
cc->drop("", coordTransactionID, 0, "");
return TRI_ERROR_INTERNAL;
}