////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Max Neunhoeffer
////////////////////////////////////////////////////////////////////////////////

#include "ClusterMethods.h"

#include "Basics/conversions.h"
#include "Basics/StaticStrings.h"
#include "Basics/StringRef.h"
#include "Basics/StringUtils.h"
#include "Basics/tri-strings.h"
#include "Basics/VelocyPackHelper.h"
#include "Cluster/ClusterComm.h"
#include "Cluster/ClusterInfo.h"
#include "Indexes/Index.h"
#include "VocBase/Traverser.h"
#include "VocBase/server.h"

#include <velocypack/Builder.h>
#include <velocypack/Iterator.h>
#include <velocypack/Parser.h>
#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb::basics;
using namespace arangodb::rest;

static double const CL_DEFAULT_TIMEOUT = 60.0;

namespace arangodb {

static int handleGeneralCommErrors(ClusterCommResult const* res) {
  // This function creates an error code from a ClusterCommResult,
  // but only if it is a communication error. If the communication
  // was successful and there was an HTTP error code, this function
  // returns TRI_ERROR_NO_ERROR.
  // If TRI_ERROR_NO_ERROR is returned, then the result was CL_COMM_RECEIVED
  // and .answer can safely be inspected.
  if (res->status == CL_COMM_TIMEOUT) {
    // No reply, we give up:
    return TRI_ERROR_CLUSTER_TIMEOUT;
  } else if (res->status == CL_COMM_ERROR) {
    return TRI_ERROR_CLUSTER_CONNECTION_LOST;
  } else if (res->status == CL_COMM_BACKEND_UNAVAILABLE) {
    if (res->result == nullptr) {
      return TRI_ERROR_CLUSTER_CONNECTION_LOST;
    }
    if (!res->result->isComplete()) {
      // there is no result
      return TRI_ERROR_CLUSTER_CONNECTION_LOST;
    }
    return TRI_ERROR_CLUSTER_BACKEND_UNAVAILABLE;
  }

  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief extracts a numeric value from a hierarchical VelocyPack
////////////////////////////////////////////////////////////////////////////////

template <typename T>
static T ExtractFigure(VPackSlice const& slice, char const* group,
                       char const* name) {
  TRI_ASSERT(slice.isObject());
  VPackSlice g = slice.get(group);

  if (!g.isObject()) {
    return static_cast<T>(0);
  }
  return arangodb::basics::VelocyPackHelper::getNumericValue<T>(g, name, 0);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief extracts answer from response into a VPackBuilder.
///        If there was an error extracting the answer the builder will be
///        empty.
///        No Error can be thrown.
////////////////////////////////////////////////////////////////////////////////

static std::shared_ptr<VPackBuilder> ExtractAnswer(
    ClusterCommResult const& res) {
  try {
    return VPackParser::fromJson(res.answer->body());
  } catch (...)
  {
    // Return an empty Builder
    return std::make_shared<VPackBuilder>();
  }
}

////////////////////////////////////////////////////////////////////////////////
/// @brief merge the baby-object results.
///        The shard map contains the ordering of elements, the vector in this
///        Map is expected to be sorted from front to back.
///        The second map contains the answers for each shard.
///        The builder in the third parameter will be cleared and will contain
///        the resulting array. It is guaranteed that the resulting array
///        indexes are equal to the original request ordering before it was
///        destructured for babies.
////////////////////////////////////////////////////////////////////////////////

static void mergeResults(
    std::vector<std::pair<ShardID, VPackValueLength>> const& reverseMapping,
    std::unordered_map<ShardID, std::shared_ptr<VPackBuilder>> const& resultMap,
    std::shared_ptr<VPackBuilder>& resultBody) {
  resultBody->clear();
  resultBody->openArray();
  for (auto const& pair : reverseMapping) {
    VPackSlice arr = resultMap.find(pair.first)->second->slice();
    resultBody->add(arr.at(pair.second));
  }
  resultBody->close();
}

////////////////////////////////////////////////////////////////////////////////
/// @brief merge the baby-object results. (all shards version)
///        results contains the result from all shards in any order.
///        resultBody will be cleared and contains the merged result after this
///        function.
///        errorCounter will correctly compute the NOT_FOUND counter, all other
///        codes remain unmodified.
///
///        The merge is executed the following way:
///        FOR every expected document we iterate over the corresponding
///        response of each shard. If any of them returned sth. different than
///        NOT_FOUND we take this result as correct.
///        If none returned sth. different than NOT_FOUND we return NOT_FOUND
///        as well.
////////////////////////////////////////////////////////////////////////////////

static void mergeResultsAllShards(
    std::vector<std::shared_ptr<VPackBuilder>> const& results,
    std::shared_ptr<VPackBuilder>& resultBody,
    std::unordered_map<int, size_t>& errorCounter,
    VPackValueLength const expectedResults) {
  // errorCounter is not allowed to contain any NOT_FOUND entry.
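  // Illustration (hypothetical shard responses): with expectedResults == 2,
  // if shard A returns [NOT_FOUND, doc1] and shard B returns [doc0, NOT_FOUND],
  // the merged array becomes [doc0, doc1]. Only documents that every shard
  // reported as NOT_FOUND are counted in realNotFound below.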
  TRI_ASSERT(errorCounter.find(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND) ==
             errorCounter.end());
  size_t realNotFound = 0;
  VPackBuilder cmp;
  cmp.openObject();
  cmp.add("error", VPackValue(true));
  cmp.add("errorNum", VPackValue(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND));
  cmp.close();
  VPackSlice notFound = cmp.slice();

  resultBody->clear();
  resultBody->openArray();
  for (VPackValueLength currentIndex = 0; currentIndex < expectedResults;
       ++currentIndex) {
    bool foundRes = false;
    for (auto const& it : results) {
      VPackSlice oneRes = it->slice();
      TRI_ASSERT(oneRes.isArray());
      oneRes = oneRes.at(currentIndex);
      if (!oneRes.equals(notFound)) {
        // This is the correct result
        // Use it
        resultBody->add(oneRes);
        foundRes = true;
        break;
      }
    }
    if (!foundRes) {
      // Found none, use NOT_FOUND
      resultBody->add(notFound);
      realNotFound++;
    }
  }
  resultBody->close();
  if (realNotFound > 0) {
    errorCounter.emplace(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND, realNotFound);
  }
}

////////////////////////////////////////////////////////////////////////////////
/// @brief Extract all baby-style error codes and store them in a map
////////////////////////////////////////////////////////////////////////////////

static void extractErrorCodes(ClusterCommResult const& res,
                              std::unordered_map<int, size_t>& errorCounter,
                              bool includeNotFound) {
  auto resultHeaders = res.answer->headers();
  auto codes = resultHeaders.find(StaticStrings::ErrorCodes);
  if (codes != resultHeaders.end()) {
    auto parsedCodes = VPackParser::fromJson(codes->second);
    VPackSlice codesSlice = parsedCodes->slice();
    TRI_ASSERT(codesSlice.isObject());
    for (auto const& code : VPackObjectIterator(codesSlice)) {
      VPackValueLength codeLength;
      char const* codeString = code.key.getString(codeLength);
      int codeNr = static_cast<int>(arangodb::basics::StringUtils::int64(
          codeString, static_cast<size_t>(codeLength)));
      if (includeNotFound || codeNr != TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND) {
        errorCounter[codeNr] += code.value.getNumericValue<size_t>();
      }
    }
  }
}

////////////////////////////////////////////////////////////////////////////////
/// @brief Distribute one document onto a shard map. If this returns
///        TRI_ERROR_NO_ERROR the correct shard could be determined, if
///        it returns sth. else this document is NOT contained in the shardMap
////////////////////////////////////////////////////////////////////////////////

static int distributeBabyOnShards(
    std::unordered_map<ShardID, std::vector<VPackValueLength>>& shardMap,
    ClusterInfo* ci, std::string const& collid,
    std::shared_ptr<CollectionInfo> collinfo,
    std::vector<std::pair<ShardID, VPackValueLength>>& reverseMapping,
    VPackSlice const node, VPackValueLength const index) {
  // Now find the responsible shard:
  bool usesDefaultShardingAttributes;
  ShardID shardID;
  int error = ci->getResponsibleShard(collid, node, false, shardID,
                                      usesDefaultShardingAttributes);
  if (error == TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND) {
    return TRI_ERROR_CLUSTER_SHARD_GONE;
  }
  if (error != TRI_ERROR_NO_ERROR) {
    // We can not find a responsible shard
    return error;
  }

  // We found the responsible shard. Add it to the list.
  auto it = shardMap.find(shardID);
  if (it == shardMap.end()) {
    std::vector<VPackValueLength> counter({index});
    shardMap.emplace(shardID, counter);
    reverseMapping.emplace_back(shardID, 0);
  } else {
    it->second.emplace_back(index);
    reverseMapping.emplace_back(shardID, it->second.size() - 1);
  }
  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief Distribute one document onto a shard map. If this returns
///        TRI_ERROR_NO_ERROR the correct shard could be determined, if
///        it returns sth. else this document is NOT contained in the shardMap.
/// Also generates a key if necessary. //////////////////////////////////////////////////////////////////////////////// static int distributeBabyOnShards( std::unordered_map>>& shardMap, ClusterInfo* ci, std::string const& collid, std::shared_ptr collinfo, std::vector>& reverseMapping, VPackSlice const node, VPackValueLength const index) { ShardID shardID; bool userSpecifiedKey = false; std::string _key = ""; if (!node.isObject()) { // We have invalid input at this point. // However we can work with the other babies. // This is for compatibility with single server // We just asign it to any shard and pretend the user has given a key std::shared_ptr> shards = ci->getShardList(collid); shardID = shards->at(0); userSpecifiedKey = true; } else { // Sort out the _key attribute: // The user is allowed to specify _key, provided that _key is the one // and only sharding attribute, because in this case we can delegate // the responsibility to make _key attributes unique to the responsible // shard. Otherwise, we ensure uniqueness here and now by taking a // cluster-wide unique number. Note that we only know the sharding // attributes a bit further down the line when we have determined // the responsible shard. VPackSlice keySlice = node.get(StaticStrings::KeyString); if (keySlice.isNone()) { // The user did not specify a key, let's create one: uint64_t uid = ci->uniqid(); _key = arangodb::basics::StringUtils::itoa(uid); } else { userSpecifiedKey = true; } // Now find the responsible shard: bool usesDefaultShardingAttributes; int error = TRI_ERROR_NO_ERROR; if (userSpecifiedKey) { error = ci->getResponsibleShard(collid, node, true, shardID, usesDefaultShardingAttributes); } else { error = ci->getResponsibleShard(collid, node, true, shardID, usesDefaultShardingAttributes, _key); } if (error == TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND) { return TRI_ERROR_CLUSTER_SHARD_GONE; } // Now perform the above mentioned check: if (userSpecifiedKey && (!usesDefaultShardingAttributes || !collinfo->allowUserKeys())) { return TRI_ERROR_CLUSTER_MUST_NOT_SPECIFY_KEY; } } // We found the responsible shard. Add it to the list. auto it = shardMap.find(shardID); if (it == shardMap.end()) { std::vector> counter( {{index, _key}}); shardMap.emplace(shardID, counter); reverseMapping.emplace_back(shardID, 0); } else { it->second.emplace_back(index, _key); reverseMapping.emplace_back(shardID, it->second.size() - 1); } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief Collect the results from all shards (fastpath variant) /// All result bodies are stored in resultMap //////////////////////////////////////////////////////////////////////////////// template static void collectResultsFromAllShards( std::unordered_map> const& shardMap, std::vector& requests, std::unordered_map& errorCounter, std::unordered_map>& resultMap, GeneralResponse::ResponseCode& responseCode) { // If none of the shards responds we return a SERVER_ERROR; responseCode = GeneralResponse::ResponseCode::SERVER_ERROR; for (auto const& req : requests) { auto res = req.result; int commError = handleGeneralCommErrors(&res); if (commError != TRI_ERROR_NO_ERROR) { auto tmpBuilder = std::make_shared(); // If there was no answer whatsoever, we cannot rely on the shardId // being present in the result struct: ShardID sId = req.destination.substr(6); auto weSend = shardMap.find(sId); TRI_ASSERT(weSend != shardMap.end()); // We send sth there earlier. 
size_t count = weSend->second.size(); for (size_t i = 0; i < count; ++i) { tmpBuilder->openObject(); tmpBuilder->add("error", VPackValue(true)); tmpBuilder->add("errorNum", VPackValue(commError)); tmpBuilder->close(); } resultMap.emplace(sId, tmpBuilder); } else { TRI_ASSERT(res.answer != nullptr); resultMap.emplace(res.shardID, res.answer->toVelocyPack(&VPackOptions::Defaults)); extractErrorCodes(res, errorCounter, true); responseCode = res.answer_code; } } } //////////////////////////////////////////////////////////////////////////////// /// @brief creates a copy of all HTTP headers to forward //////////////////////////////////////////////////////////////////////////////// std::unordered_map getForwardableRequestHeaders( arangodb::GeneralRequest* request) { std::unordered_map const& headers = request->headers(); std::unordered_map::const_iterator it = headers.begin(); std::unordered_map result; while (it != headers.end()) { std::string const& key = (*it).first; // ignore the following headers if (key != "x-arango-async" && key != "authorization" && key != "content-length" && key != "connection" && key != "expect" && key != "host" && key != "origin" && key != StaticStrings::ErrorCodes && key.substr(0, 14) != "access-control") { result.emplace(key, (*it).second); } ++it; } if (request != nullptr) { result["content-length"] = StringUtils::itoa(request->contentLength()); } return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief check if a list of attributes have the same values in two vpack /// documents //////////////////////////////////////////////////////////////////////////////// bool shardKeysChanged(std::string const& dbname, std::string const& collname, VPackSlice const& oldValue, VPackSlice const& newValue, bool isPatch) { if (!oldValue.isObject() || !newValue.isObject()) { // expecting two objects. everything else is an error return true; } ClusterInfo* ci = ClusterInfo::instance(); std::shared_ptr c = ci->getCollection(dbname, collname); std::vector const& shardKeys = c->shardKeys(); for (size_t i = 0; i < shardKeys.size(); ++i) { if (shardKeys[i] == StaticStrings::KeyString) { continue; } VPackSlice n = newValue.get(shardKeys[i]); if (n.isNone() && isPatch) { // attribute not set in patch document. 
this means no update continue; } // a temporary buffer to hold a null value char buffer[1]; VPackSlice nullValue = arangodb::velocypack::buildNullValue(&buffer[0], sizeof(buffer)); VPackSlice o = oldValue.get(shardKeys[i]); if (o.isNone()) { // if attribute is undefined, use "null" instead o = nullValue; } if (n.isNone()) { // if attribute is undefined, use "null" instead n = nullValue; } if (arangodb::basics::VelocyPackHelper::compare(n, o, false) != 0) { return true; } } return false; } //////////////////////////////////////////////////////////////////////////////// /// @brief returns revision for a sharded collection //////////////////////////////////////////////////////////////////////////////// int revisionOnCoordinator(std::string const& dbname, std::string const& collname, TRI_voc_rid_t& rid) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } rid = 0; // If we get here, the sharding attributes are not only _key, therefore // we have to contact everybody: auto shards = collinfo->shardIds(); CoordTransactionID coordTransactionID = TRI_NewTickServer(); for (auto const& p : *shards) { auto headers = std::make_unique>(); cc->asyncRequest( "", coordTransactionID, "shard:" + p.first, arangodb::GeneralRequest::RequestType::GET, "/_db/" + StringUtils::urlEncode(dbname) + "/_api/collection/" + StringUtils::urlEncode(p.first) + "/revision", std::shared_ptr(), headers, nullptr, 300.0); } // Now listen to the results: int count; int nrok = 0; for (count = (int)shards->size(); count > 0; count--) { auto res = cc->wait("", coordTransactionID, 0, "", 0.0); if (res.status == CL_COMM_RECEIVED) { if (res.answer_code == arangodb::GeneralResponse::ResponseCode::OK) { std::shared_ptr answerBuilder = ExtractAnswer(res); VPackSlice answer = answerBuilder->slice(); if (answer.isObject()) { VPackSlice r = answer.get("revision"); if (r.isString()) { TRI_voc_rid_t cmp = StringUtils::uint64(r.copyString()); if (cmp > rid) { // get the maximum value rid = cmp; } } nrok++; } } } } if (nrok != (int)shards->size()) { return TRI_ERROR_INTERNAL; } return TRI_ERROR_NO_ERROR; // the cluster operation was OK, however, // the DBserver could have reported an error. 
} //////////////////////////////////////////////////////////////////////////////// /// @brief returns figures for a sharded collection //////////////////////////////////////////////////////////////////////////////// int figuresOnCoordinator(std::string const& dbname, std::string const& collname, TRI_doc_collection_info_t*& result) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } // prefill with 0s result = (TRI_doc_collection_info_t*)TRI_Allocate( TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_doc_collection_info_t), true); if (result == nullptr) { return TRI_ERROR_OUT_OF_MEMORY; } // If we get here, the sharding attributes are not only _key, therefore // we have to contact everybody: auto shards = collinfo->shardIds(); CoordTransactionID coordTransactionID = TRI_NewTickServer(); for (auto const& p : *shards) { auto headers = std::make_unique>(); cc->asyncRequest( "", coordTransactionID, "shard:" + p.first, arangodb::GeneralRequest::RequestType::GET, "/_db/" + StringUtils::urlEncode(dbname) + "/_api/collection/" + StringUtils::urlEncode(p.first) + "/figures", std::shared_ptr(), headers, nullptr, 300.0); } // Now listen to the results: int count; int nrok = 0; for (count = (int)shards->size(); count > 0; count--) { auto res = cc->wait("", coordTransactionID, 0, "", 0.0); if (res.status == CL_COMM_RECEIVED) { if (res.answer_code == arangodb::GeneralResponse::ResponseCode::OK) { std::shared_ptr answerBuilder = ExtractAnswer(res); VPackSlice answer = answerBuilder->slice(); if (answer.isObject()) { VPackSlice figures = answer.get("figures"); if (figures.isObject()) { // add to the total result->_numberAlive += ExtractFigure(figures, "alive", "count"); result->_numberDead += ExtractFigure(figures, "dead", "count"); result->_numberDeletions += ExtractFigure(figures, "dead", "deletion"); result->_numberIndexes += ExtractFigure(figures, "indexes", "count"); result->_sizeAlive += ExtractFigure(figures, "alive", "size"); result->_sizeDead += ExtractFigure(figures, "dead", "size"); result->_sizeIndexes += ExtractFigure(figures, "indexes", "size"); result->_numberDatafiles += ExtractFigure(figures, "datafiles", "count"); result->_numberJournalfiles += ExtractFigure(figures, "journals", "count"); result->_numberCompactorfiles += ExtractFigure(figures, "compactors", "count"); result->_datafileSize += ExtractFigure(figures, "datafiles", "fileSize"); result->_journalfileSize += ExtractFigure(figures, "journals", "fileSize"); result->_compactorfileSize += ExtractFigure(figures, "compactors", "fileSize"); result->_numberDocumentDitches += arangodb::basics::VelocyPackHelper::getNumericValue( figures, "documentReferences", 0); } nrok++; } } } } if (nrok != (int)shards->size()) { TRI_Free(TRI_UNKNOWN_MEM_ZONE, result); result = 0; return TRI_ERROR_INTERNAL; } return TRI_ERROR_NO_ERROR; // the cluster operation was OK, however, // the DBserver could have reported an error. 
} //////////////////////////////////////////////////////////////////////////////// /// @brief counts number of documents in a coordinator //////////////////////////////////////////////////////////////////////////////// int countOnCoordinator(std::string const& dbname, std::string const& collname, uint64_t& result) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); result = 0; // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } auto shards = collinfo->shardIds(); std::vector requests; auto body = std::make_shared(); for (auto const& p : *shards) { requests.emplace_back("shard:" + p.first, arangodb::GeneralRequest::RequestType::GET, "/_db/" + StringUtils::urlEncode(dbname) + "/_api/collection/" + StringUtils::urlEncode(p.first) + "/count", body); } size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::QUERIES); for (auto& req : requests) { auto& res = req.result; if (res.status == CL_COMM_RECEIVED) { if (res.answer_code == arangodb::GeneralResponse::ResponseCode::OK) { std::shared_ptr answerBuilder = ExtractAnswer(res); VPackSlice answer = answerBuilder->slice(); if (answer.isObject()) { // add to the total result += arangodb::basics::VelocyPackHelper::getNumericValue( answer, "count", 0); } else { return TRI_ERROR_INTERNAL; } } else { return static_cast(res.answer_code); } } else { return TRI_ERROR_CLUSTER_BACKEND_UNAVAILABLE; } } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief creates one or many documents in a coordinator /// /// In case of many documents (slice is a VPackArray) it will send to each /// shard all the relevant documents for this shard only. /// If one of them fails, this error is reported. /// There is NO guarantee for the stored documents of all other shards, they may /// be stored or not. All answers of these shards are dropped. /// If we return with NO_ERROR it is guaranteed that all shards reported success /// for their documents. 
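/// Illustration (hypothetical shards): for a babies array [d0, d1, d2] where
/// d0 and d2 are sharded onto s1 and d1 onto s2, shard s1 receives the array
/// [d0, d2] and shard s2 receives [d1]; mergeResults() afterwards uses
/// reverseMapping to put the per-shard answers back into the original order.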
//////////////////////////////////////////////////////////////////////////////// int createDocumentOnCoordinator( std::string const& dbname, std::string const& collname, arangodb::OperationOptions const& options, VPackSlice const& slice, arangodb::GeneralResponse::ResponseCode& responseCode, std::unordered_map& errorCounter, std::shared_ptr& resultBody) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } std::string const collid = StringUtils::itoa(collinfo->id()); std::unordered_map< ShardID, std::vector>> shardMap; std::vector> reverseMapping; bool useMultiple = slice.isArray(); int res = TRI_ERROR_NO_ERROR; if (useMultiple) { VPackValueLength length = slice.length(); for (VPackValueLength idx = 0; idx < length; ++idx) { res = distributeBabyOnShards(shardMap, ci, collid, collinfo, reverseMapping, slice.at(idx), idx); if (res != TRI_ERROR_NO_ERROR) { return res; } } } else { res = distributeBabyOnShards(shardMap, ci, collid, collinfo, reverseMapping, slice, 0); if (res != TRI_ERROR_NO_ERROR) { return res; } } std::string const baseUrl = "/_db/" + StringUtils::urlEncode(dbname) + "/_api/document?collection="; std::string const optsUrlPart = std::string("&waitForSync=") + (options.waitForSync ? "true" : "false") + "&returnNew=" + (options.returnNew ? "true" : "false") + "&returnOld=" + (options.returnOld ? "true" : "false"); VPackBuilder reqBuilder; // Now prepare the requests: std::vector requests; auto body = std::make_shared(); for (auto const& it : shardMap) { if (!useMultiple) { TRI_ASSERT(it.second.size() == 1); auto idx = it.second.front(); if (idx.second.empty()) { body = std::make_shared(slice.toJson()); } else { reqBuilder.clear(); reqBuilder.openObject(); reqBuilder.add(StaticStrings::KeyString, VPackValue(idx.second)); TRI_SanitizeObject(slice, reqBuilder); reqBuilder.close(); body = std::make_shared(reqBuilder.slice().toJson()); } } else { reqBuilder.clear(); reqBuilder.openArray(); for (auto const& idx : it.second) { if (idx.second.empty()) { reqBuilder.add(slice.at(idx.first)); } else { reqBuilder.openObject(); reqBuilder.add(StaticStrings::KeyString, VPackValue(idx.second)); TRI_SanitizeObject(slice.at(idx.first), reqBuilder); reqBuilder.close(); } } reqBuilder.close(); body = std::make_shared(reqBuilder.slice().toJson()); } requests.emplace_back( "shard:" + it.first, arangodb::GeneralRequest::RequestType::POST, baseUrl + StringUtils::urlEncode(it.first) + optsUrlPart, body); } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { TRI_ASSERT(requests.size() == 1); auto const& req = requests[0]; auto& res = req.result; int commError = handleGeneralCommErrors(&res); if (commError != TRI_ERROR_NO_ERROR) { return commError; } responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); return TRI_ERROR_NO_ERROR; } std::unordered_map> resultMap; collectResultsFromAllShards>( shardMap, requests, errorCounter, resultMap, responseCode); responseCode = (options.waitForSync ? 
GeneralResponse::ResponseCode::CREATED : GeneralResponse::ResponseCode::ACCEPTED); mergeResults(reverseMapping, resultMap, resultBody); // the cluster operation was OK, however, // the DBserver could have reported an error. return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief deletes a document in a coordinator //////////////////////////////////////////////////////////////////////////////// int deleteDocumentOnCoordinator( std::string const& dbname, std::string const& collname, VPackSlice const slice, arangodb::OperationOptions const& options, arangodb::GeneralResponse::ResponseCode& responseCode, std::unordered_map& errorCounter, std::shared_ptr& resultBody) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } bool useDefaultSharding = collinfo->usesDefaultShardKeys(); std::string collid = StringUtils::itoa(collinfo->id()); bool useMultiple = slice.isArray(); std::string const baseUrl = "/_db/" + StringUtils::urlEncode(dbname) + "/_api/document/"; std::string const optsUrlPart = std::string("?waitForSync=") + (options.waitForSync ? "true" : "false") + "&returnOld=" + (options.returnOld ? "true" : "false") + "&ignoreRevs=" + (options.ignoreRevs ? "true" : "false"); VPackBuilder reqBuilder; if (useDefaultSharding) { // fastpath we know which server is responsible. // decompose the input into correct shards. // Send the correct documents to the correct shards // Merge the results with static merge helper std::unordered_map> shardMap; std::vector> reverseMapping; auto workOnOneNode = [&shardMap, &ci, &collid, &collinfo, &reverseMapping]( VPackSlice const node, VPackValueLength const index) -> int { // Sort out the _key attribute and identify the shard responsible for it. StringRef _key(Transaction::extractKeyPart(node)); ShardID shardID; if (_key.empty()) { // We have invalid input at this point. // However we can work with the other babies. // This is for compatibility with single server // We just asign it to any shard and pretend the user has given a key std::shared_ptr> shards = ci->getShardList(collid); shardID = shards->at(0); } else { // Now find the responsible shard: bool usesDefaultShardingAttributes; int error = ci->getResponsibleShard( collid, arangodb::basics::VelocyPackHelper::EmptyObjectValue(), true, shardID, usesDefaultShardingAttributes, _key.toString()); if (error == TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND) { return TRI_ERROR_CLUSTER_SHARD_GONE; } } // We found the responsible shard. Add it to the list. auto it = shardMap.find(shardID); if (it == shardMap.end()) { std::vector counter({index}); shardMap.emplace(shardID, counter); reverseMapping.emplace_back(shardID, 0); } else { it->second.emplace_back(index); reverseMapping.emplace_back(shardID, it->second.size() - 1); } return TRI_ERROR_NO_ERROR; }; if (useMultiple) { for (VPackValueLength idx = 0; idx < slice.length(); ++idx) { int res = workOnOneNode(slice.at(idx), idx); if (res != TRI_ERROR_NO_ERROR) { // Is early abortion correct? return res; } } } else { int res = workOnOneNode(slice, 0); if (res != TRI_ERROR_NO_ERROR) { return res; } } // We sorted the shards correctly. 
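    // Illustration (hypothetical keys/shards): for input [k0, k1, k2] with k0
    // and k2 living on shard s1 and k1 on shard s2, shardMap ends up as
    // {s1: [0, 2], s2: [1]} and reverseMapping as [(s1,0), (s2,0), (s1,1)],
    // which mergeResults() uses further down to restore the request order.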
// Now prepare the requests: std::vector requests; auto body = std::make_shared(); for (auto const& it : shardMap) { if (!useMultiple) { TRI_ASSERT(it.second.size() == 1); body = std::make_shared(slice.toJson()); } else { reqBuilder.clear(); reqBuilder.openArray(); for (auto const& idx : it.second) { reqBuilder.add(slice.at(idx)); } reqBuilder.close(); body = std::make_shared(reqBuilder.slice().toJson()); } requests.emplace_back( "shard:" + it.first, arangodb::GeneralRequest::RequestType::DELETE_REQ, baseUrl + StringUtils::urlEncode(it.first) + optsUrlPart, body); } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { TRI_ASSERT(requests.size() == 1); auto const& req = requests[0]; auto& res = req.result; int commError = handleGeneralCommErrors(&res); if (commError != TRI_ERROR_NO_ERROR) { return commError; } responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); return TRI_ERROR_NO_ERROR; } std::unordered_map> resultMap; collectResultsFromAllShards( shardMap, requests, errorCounter, resultMap, responseCode); mergeResults(reverseMapping, resultMap, resultBody); return TRI_ERROR_NO_ERROR; // the cluster operation was OK, however, // the DBserver could have reported an error. } // slowpath we do not know which server is responsible ask all of them. // We simply send the body to all shards and await their results. // As soon as we have the results we merge them in the following way: // For 1 .. slice.length() // for res : allResults // if res != NOT_FOUND => insert this result. skip other results // end // if (!skipped) => insert NOT_FOUND auto body = std::make_shared(slice.toJson()); std::vector requests; auto shardList = ci->getShardList(collid); for (auto const& shard : *shardList) { requests.emplace_back( "shard:" + shard, arangodb::GeneralRequest::RequestType::DELETE_REQ, baseUrl + StringUtils::urlEncode(shard) + optsUrlPart, body); } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { // Only one can answer, we react a bit differently size_t count; int nrok = 0; for (count = requests.size(); count > 0; count--) { auto const& req = requests[count - 1]; auto res = req.result; if (res.status == CL_COMM_RECEIVED) { if (res.answer_code != arangodb::GeneralResponse::ResponseCode::NOT_FOUND || (nrok == 0 && count == 1)) { nrok++; responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); } } } // Note that nrok is always at least 1! if (nrok > 1) { return TRI_ERROR_CLUSTER_GOT_CONTRADICTING_ANSWERS; } return TRI_ERROR_NO_ERROR; // the cluster operation was OK, however, // the DBserver could have reported an error. } // We select all results from all shards an merge them back again. 
std::vector> allResults; allResults.reserve(shardList->size()); // If no server responds we return 500 responseCode = GeneralResponse::ResponseCode::SERVER_ERROR; for (auto const& req : requests) { auto res = req.result; int error = handleGeneralCommErrors(&res); if (error != TRI_ERROR_NO_ERROR) { // Local data structures are automatically freed return error; } if (res.answer_code == GeneralResponse::ResponseCode::OK || res.answer_code == GeneralResponse::ResponseCode::ACCEPTED) { responseCode = res.answer_code; } TRI_ASSERT(res.answer != nullptr); allResults.emplace_back(res.answer->toVelocyPack(&VPackOptions::Defaults)); extractErrorCodes(res, errorCounter, false); } // If we get here we get exactly one result for every shard. TRI_ASSERT(allResults.size() == shardList->size()); mergeResultsAllShards(allResults, resultBody, errorCounter, static_cast(slice.length())); return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief truncate a cluster collection on a coordinator //////////////////////////////////////////////////////////////////////////////// int truncateCollectionOnCoordinator(std::string const& dbname, std::string const& collname) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } // Some stuff to prepare cluster-intern requests: // We have to contact everybody: auto shards = collinfo->shardIds(); CoordTransactionID coordTransactionID = TRI_NewTickServer(); for (auto const& p : *shards) { auto headers = std::make_unique>(); cc->asyncRequest("", coordTransactionID, "shard:" + p.first, arangodb::GeneralRequest::RequestType::PUT, "/_db/" + StringUtils::urlEncode(dbname) + "/_api/collection/" + p.first + "/truncate", std::shared_ptr(), headers, nullptr, 60.0); } // Now listen to the results: unsigned int count; unsigned int nrok = 0; for (count = (unsigned int)shards->size(); count > 0; count--) { auto res = cc->wait("", coordTransactionID, 0, "", 0.0); if (res.status == CL_COMM_RECEIVED) { if (res.answer_code == arangodb::GeneralResponse::ResponseCode::OK) { nrok++; } } } // Note that nrok is always at least 1! if (nrok < shards->size()) { return TRI_ERROR_CLUSTER_COULD_NOT_TRUNCATE_COLLECTION; } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief get a document in a coordinator //////////////////////////////////////////////////////////////////////////////// int getDocumentOnCoordinator( std::string const& dbname, std::string const& collname, VPackSlice const slice, OperationOptions const& options, std::unique_ptr>& headers, arangodb::GeneralResponse::ResponseCode& responseCode, std::unordered_map& errorCounter, std::shared_ptr& resultBody) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } std::string collid = StringUtils::itoa(collinfo->id()); // If _key is the one and only sharding attribute, we can do this quickly, // because we can easily determine which shard is responsible for the // document. 
Otherwise we have to contact all shards and ask them to // delete the document. All but one will not know it. // Now find the responsible shard(s) ShardID shardID; std::unordered_map> shardMap; std::vector> reverseMapping; bool useMultiple = slice.isArray(); int res = TRI_ERROR_NO_ERROR; bool canUseFastPath = true; if (useMultiple) { VPackValueLength length = slice.length(); for (VPackValueLength idx = 0; idx < length; ++idx) { res = distributeBabyOnShards(shardMap, ci, collid, collinfo, reverseMapping, slice.at(idx), idx); if (res != TRI_ERROR_NO_ERROR) { canUseFastPath = false; shardMap.clear(); reverseMapping.clear(); break; } } } else { res = distributeBabyOnShards(shardMap, ci, collid, collinfo, reverseMapping, slice, 0); if (res != TRI_ERROR_NO_ERROR) { canUseFastPath = false; } } // Some stuff to prepare cluster-internal requests: std::string baseUrl = "/_db/" + StringUtils::urlEncode(dbname) + "/_api/document/"; std::string optsUrlPart = std::string("?ignoreRevs=") + (options.ignoreRevs ? "true" : "false"); arangodb::GeneralRequest::RequestType reqType; if (!useMultiple) { if (options.silent) { reqType = arangodb::GeneralRequest::RequestType::HEAD; } else { reqType = arangodb::GeneralRequest::RequestType::GET; } } else { reqType = arangodb::GeneralRequest::RequestType::PUT; if (options.silent) { optsUrlPart += std::string("&silent=true"); } optsUrlPart += std::string("&onlyget=true"); } if (canUseFastPath) { // All shard keys are known in all documents. // Contact all shards directly with the correct information. VPackBuilder reqBuilder; // Now prepare the requests: std::vector requests; auto body = std::make_shared(); for (auto const& it : shardMap) { if (!useMultiple) { TRI_ASSERT(it.second.size() == 1); if (!options.ignoreRevs && slice.hasKey(StaticStrings::RevString)) { headers->emplace("if-match", slice.get(StaticStrings::RevString).copyString()); } VPackSlice keySlice = slice; if (slice.isObject()) { keySlice = slice.get(StaticStrings::KeyString); } // We send to single endpoint requests.emplace_back( "shard:" + it.first, reqType, baseUrl + StringUtils::urlEncode(it.first) + "/" + StringUtils::urlEncode(keySlice.copyString()) + optsUrlPart, body); requests[0].setHeaders(headers); } else { reqBuilder.clear(); reqBuilder.openArray(); for (auto const& idx : it.second) { reqBuilder.add(slice.at(idx)); } reqBuilder.close(); body = std::make_shared(reqBuilder.slice().toJson()); // We send to Babies endpoint requests.emplace_back( "shard:" + it.first, reqType, baseUrl + StringUtils::urlEncode(it.first) + optsUrlPart, body); } } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { TRI_ASSERT(requests.size() == 1); auto const& req = requests[0]; auto res = req.result; int commError = handleGeneralCommErrors(&res); if (commError != TRI_ERROR_NO_ERROR) { return commError; } responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); return TRI_ERROR_NO_ERROR; } std::unordered_map> resultMap; collectResultsFromAllShards( shardMap, requests, errorCounter, resultMap, responseCode); mergeResults(reverseMapping, resultMap, resultBody); // the cluster operation was OK, however, // the DBserver could have reported an error. return TRI_ERROR_NO_ERROR; } // Not all shard keys are known in all documents. 
// We contact all shards with the complete body and ignore NOT_FOUND std::vector requests; auto shardList = ci->getShardList(collid); if (!useMultiple) { if (!options.ignoreRevs && slice.hasKey(StaticStrings::RevString)) { headers->emplace("if-match", slice.get(StaticStrings::RevString).copyString()); } for (auto const& shard : *shardList) { VPackSlice keySlice = slice; if (slice.isObject()) { keySlice = slice.get(StaticStrings::KeyString); } ClusterCommRequest req( "shard:" + shard, reqType, baseUrl + StringUtils::urlEncode(shard) + "/" + StringUtils::urlEncode(keySlice.copyString()) + optsUrlPart, nullptr); auto headersCopy = std::make_unique>( *headers); req.setHeaders(headersCopy); requests.emplace_back(std::move(req)); } } else { auto body = std::make_shared(slice.toJson()); for (auto const& shard : *shardList) { requests.emplace_back( "shard:" + shard, reqType, baseUrl + StringUtils::urlEncode(shard) + optsUrlPart, body); } } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { // Only one can answer, we react a bit differently size_t count; int nrok = 0; int commError = TRI_ERROR_NO_ERROR; for (count = requests.size(); count > 0; count--) { auto const& req = requests[count - 1]; auto res = req.result; if (res.status == CL_COMM_RECEIVED) { if (res.answer_code != arangodb::GeneralResponse::ResponseCode::NOT_FOUND || (nrok == 0 && count == 1 && commError == TRI_ERROR_NO_ERROR)) { nrok++; responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); } } else { commError = handleGeneralCommErrors(&res); } } if (nrok == 0) { // This can only happen, if a commError was encountered! return commError; } if (nrok > 1) { return TRI_ERROR_CLUSTER_GOT_CONTRADICTING_ANSWERS; } return TRI_ERROR_NO_ERROR; // the cluster operation was OK, however, // the DBserver could have reported an error. } // We select all results from all shards and merge them back again. std::vector> allResults; allResults.reserve(shardList->size()); // If no server responds we return 500 responseCode = GeneralResponse::ResponseCode::SERVER_ERROR; for (auto const& req : requests) { auto& res = req.result; int error = handleGeneralCommErrors(&res); if (error != TRI_ERROR_NO_ERROR) { // Local data structores are automatically freed return error; } if (res.answer_code == GeneralResponse::ResponseCode::OK || res.answer_code == GeneralResponse::ResponseCode::ACCEPTED) { responseCode = res.answer_code; } TRI_ASSERT(res.answer != nullptr); allResults.emplace_back(res.answer->toVelocyPack(&VPackOptions::Defaults)); extractErrorCodes(res, errorCounter, false); } // If we get here we get exactly one result for every shard. 
TRI_ASSERT(allResults.size() == shardList->size()); mergeResultsAllShards(allResults, resultBody, errorCounter, static_cast(slice.length())); return TRI_ERROR_NO_ERROR; } static void insertIntoShardMap( ClusterInfo* ci, std::string const& dbname, std::string const& documentId, std::unordered_map>& shardMap) { std::vector splitId = arangodb::basics::StringUtils::split(documentId, '/'); TRI_ASSERT(splitId.size() == 2); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, splitId[0]); if (collinfo->empty()) { THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND, "Collection not found: " + splitId[0]); } std::string collid = StringUtils::itoa(collinfo->id()); if (collinfo->usesDefaultShardKeys()) { // We only need add one resp. shard VPackBuilder partial; partial.openObject(); partial.add(StaticStrings::KeyString, VPackValue(splitId[1])); partial.close(); bool usesDefaultShardingAttributes; ShardID shardID; int error = ci->getResponsibleShard(collid, partial.slice(), true, shardID, usesDefaultShardingAttributes); if (error != TRI_ERROR_NO_ERROR) { THROW_ARANGO_EXCEPTION(error); } TRI_ASSERT(usesDefaultShardingAttributes); // If this is false the if // condition should be false in // the first place auto it = shardMap.find(shardID); if (it == shardMap.end()) { shardMap.emplace(shardID, std::vector({splitId[1]})); } else { it->second.push_back(splitId[1]); } } else { // Sorry we do not know the responsible shard yet // Ask all of them auto shardList = ci->getShardList(collid); for (auto const& shard : *shardList) { auto it = shardMap.find(shard); if (it == shardMap.end()) { shardMap.emplace(shard, std::vector({splitId[1]})); } else { it->second.push_back(splitId[1]); } } } } //////////////////////////////////////////////////////////////////////////////// /// @brief get a list of filtered documents in a coordinator /// All found documents will be inserted into result. /// After execution documentIds will contain all id's of documents /// that could not be found. //////////////////////////////////////////////////////////////////////////////// int getFilteredDocumentsOnCoordinator( std::string const& dbname, std::vector const& expressions, std::unordered_set& documentIds, std::unordered_map>>& result) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); std::unordered_map> shardRequestMap; for (auto const& doc : documentIds) { try { insertIntoShardMap(ci, dbname, doc, shardRequestMap); } catch (...) { } } // Now start the request. // We do not have to care for shard attributes esp. shard by key. // If it is by key the key was only added to one key list, if not // it is contained multiple times. 
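  // Illustration (hypothetical ids): for documentIds {"c1/k1", "c2/k2"} where
  // c1 uses the default _key sharding and c2 does not, shardRequestMap maps
  // c1's single responsible shard to ["k1"], while "k2" is added to the key
  // list of every shard of c2.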
std::vector requests; VPackBuilder bodyBuilder; for (auto const& shard : shardRequestMap) { bodyBuilder.clear(); bodyBuilder.openObject(); bodyBuilder.add("collection", VPackValue(shard.first)); bodyBuilder.add("keys", VPackValue(VPackValueType::Array)); for (auto const& key : shard.second) { bodyBuilder.add(VPackValue(key)); } bodyBuilder.close(); // keys if (!expressions.empty()) { bodyBuilder.add("filter", VPackValue(VPackValueType::Array)); for (auto const& e : expressions) { e->toVelocyPack(bodyBuilder); } bodyBuilder.close(); // filter } bodyBuilder.close(); // Object auto bodyString = std::make_shared(bodyBuilder.toJson()); requests.emplace_back("shard:" + shard.first, arangodb::GeneralRequest::RequestType::PUT, "/_db/" + StringUtils::urlEncode(dbname) + "/_api/simple/lookup-by-keys", bodyString); } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // All requests send, now collect results. for (auto const& req : requests) { auto& res = req.result; if (res.status == CL_COMM_RECEIVED) { std::shared_ptr resultBody = res.answer->toVelocyPack(&VPackOptions::Defaults); VPackSlice resSlice = resultBody->slice(); if (!resSlice.isObject()) { THROW_ARANGO_EXCEPTION_MESSAGE( TRI_ERROR_INTERNAL, "Received an invalid result in cluster."); } bool isError = arangodb::basics::VelocyPackHelper::getBooleanValue( resSlice, "error", false); if (isError) { return arangodb::basics::VelocyPackHelper::getNumericValue( resSlice, "errorNum", TRI_ERROR_INTERNAL); } VPackSlice documents = resSlice.get("documents"); if (!documents.isArray()) { THROW_ARANGO_EXCEPTION_MESSAGE( TRI_ERROR_INTERNAL, "Received an invalid result in cluster."); } for (auto const& element : VPackArrayIterator(documents)) { std::string id = arangodb::basics::VelocyPackHelper::getStringValue( element, StaticStrings::IdString, ""); VPackBuilder tmp; tmp.add(element); result.emplace(id, tmp.steal()); } VPackSlice filtered = resSlice.get("filtered"); if (filtered.isArray()) { for (auto const& element : VPackArrayIterator(filtered)) { if (element.isString()) { std::string id = element.copyString(); documentIds.erase(id); } } } } } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief get all edges on coordinator using a Traverser Filter //////////////////////////////////////////////////////////////////////////////// int getFilteredEdgesOnCoordinator( std::string const& dbname, std::string const& collname, std::string const& vertex, TRI_edge_direction_e const& direction, std::vector const& expressions, arangodb::GeneralResponse::ResponseCode& responseCode, VPackBuilder& result) { TRI_ASSERT(result.isOpenObject()); // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } auto shards = collinfo->shardIds(); std::string queryParameters = "?vertex=" + StringUtils::urlEncode(vertex); if (direction == TRI_EDGE_IN) { queryParameters += "&direction=in"; } else if (direction == TRI_EDGE_OUT) { queryParameters += "&direction=out"; } auto reqBodyString = std::make_shared(); if (!expressions.empty()) { VPackBuilder bodyBuilder; bodyBuilder.openArray(); for (auto& e : expressions) { e->toVelocyPack(bodyBuilder); } bodyBuilder.close(); 
reqBodyString->append(bodyBuilder.toJson()); } std::vector requests; std::string baseUrl = "/_db/" + StringUtils::urlEncode(dbname) + "/_api/edges/"; for (auto const& p : *shards) { requests.emplace_back( "shard:" + p.first, arangodb::GeneralRequest::RequestType::PUT, baseUrl + StringUtils::urlEncode(p.first) + queryParameters, reqBodyString); } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); size_t filtered = 0; size_t scannedIndex = 0; responseCode = arangodb::GeneralResponse::ResponseCode::OK; result.add("edges", VPackValue(VPackValueType::Array)); // All requests send, now collect results. for (auto const& req : requests) { auto& res = req.result; int error = handleGeneralCommErrors(&res); if (error != TRI_ERROR_NO_ERROR) { // Cluster is in bad state. Report. return error; } TRI_ASSERT(res.answer != nullptr); std::shared_ptr shardResult = res.answer->toVelocyPack(&VPackOptions::Defaults); if (shardResult == nullptr) { return TRI_ERROR_INTERNAL; } VPackSlice shardSlice = shardResult->slice(); if (!shardSlice.isObject()) { return TRI_ERROR_INTERNAL; } bool const isError = arangodb::basics::VelocyPackHelper::getBooleanValue( shardSlice, "error", false); if (isError) { // shard returned an error return arangodb::basics::VelocyPackHelper::getNumericValue( shardSlice, "errorNum", TRI_ERROR_INTERNAL); } VPackSlice docs = shardSlice.get("edges"); if (!docs.isArray()) { return TRI_ERROR_INTERNAL; } for (auto const& doc : VPackArrayIterator(docs)) { result.add(doc); } VPackSlice stats = shardSlice.get("stats"); if (stats.isObject()) { filtered += arangodb::basics::VelocyPackHelper::getNumericValue( stats, "filtered", 0); scannedIndex += arangodb::basics::VelocyPackHelper::getNumericValue( stats, "scannedIndex", 0); } } result.close(); // edges result.add("stats", VPackValue(VPackValueType::Object)); result.add("scannedIndex", VPackValue(scannedIndex)); result.add("filtered", VPackValue(filtered)); result.close(); // stats // Leave outer Object open return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief modify a document in a coordinator //////////////////////////////////////////////////////////////////////////////// int modifyDocumentOnCoordinator( std::string const& dbname, std::string const& collname, VPackSlice const& slice, arangodb::OperationOptions const& options, bool isPatch, std::unique_ptr>& headers, arangodb::GeneralResponse::ResponseCode& responseCode, std::unordered_map& errorCounter, std::shared_ptr& resultBody) { // Set a few variables needed for our work: ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); // First determine the collection ID from the name: std::shared_ptr collinfo = ci->getCollection(dbname, collname); if (collinfo->empty()) { return TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND; } std::string collid = StringUtils::itoa(collinfo->id()); // We have a fast path and a slow path. The fast path only asks one shard // to do the job and the slow path asks them all and expects to get // "not found" from all but one shard. 
We have to cover the following // cases: // isPatch == false (this is a "replace" operation) // Here, the complete new document is given, we assume that we // can read off the responsible shard, therefore can use the fast // path, this is always true if _key is the one and only sharding // attribute, however, if there is any other sharding attribute, // it is possible that the user has changed the values in any of // them, in that case we will get a "not found" or a "sharding // attributes changed answer" in the fast path. In the first case // we have to delegate to the slow path. // isPatch == true (this is an "update" operation) // In this case we might or might not have all sharding attributes // specified in the partial document given. If _key is the one and // only sharding attribute, it is always given, if not all sharding // attributes are explicitly given (at least as value `null`), we must // assume that the fast path cannot be used. If all sharding attributes // are given, we first try the fast path, but might, as above, // have to use the slow path after all. ShardID shardID; std::unordered_map> shardMap; std::vector> reverseMapping; bool useMultiple = slice.isArray(); int res = TRI_ERROR_NO_ERROR; bool canUseFastPath = true; if (useMultiple) { VPackValueLength length = slice.length(); for (VPackValueLength idx = 0; idx < length; ++idx) { res = distributeBabyOnShards(shardMap, ci, collid, collinfo, reverseMapping, slice.at(idx), idx); if (res != TRI_ERROR_NO_ERROR) { if (!isPatch) { return res; } canUseFastPath = false; shardMap.clear(); reverseMapping.clear(); break; } } } else { res = distributeBabyOnShards(shardMap, ci, collid, collinfo, reverseMapping, slice, 0); if (res != TRI_ERROR_NO_ERROR) { if (!isPatch) { return res; } canUseFastPath = false; } } // Some stuff to prepare cluster-internal requests: std::string baseUrl = "/_db/" + StringUtils::urlEncode(dbname) + "/_api/document/"; std::string optsUrlPart = std::string("?waitForSync=") + (options.waitForSync ? "true" : "false"); optsUrlPart += std::string("&ignoreRevs=") + (options.ignoreRevs ? "true" : "false"); arangodb::GeneralRequest::RequestType reqType; if (isPatch) { reqType = arangodb::GeneralRequest::RequestType::PATCH; if (!options.keepNull) { optsUrlPart += "&keepNull=false"; } if (options.mergeObjects) { optsUrlPart += "&mergeObjects=true"; } else { optsUrlPart += "&mergeObjects=false"; } } else { reqType = arangodb::GeneralRequest::RequestType::PUT; } if (options.returnNew) { optsUrlPart += "&returnNew=true"; } if (options.returnOld) { optsUrlPart += "&returnOld=true"; } if (canUseFastPath) { // All shard keys are known in all documents. // Contact all shards directly with the correct information. 
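    // Example (hypothetical): replacing {"_key": "k", "value": 1} in a
    // collection sharded by _key alone resolves to exactly one shard here;
    // a patch that omits one of several shard keys would instead have set
    // canUseFastPath to false above and taken the all-shards path below.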
std::vector requests; VPackBuilder reqBuilder; auto body = std::make_shared(); for (auto const& it : shardMap) { if (!useMultiple) { TRI_ASSERT(it.second.size() == 1); body = std::make_shared(slice.toJson()); // We send to single endpoint requests.emplace_back( "shard:" + it.first, reqType, baseUrl + StringUtils::urlEncode(it.first) + "/" + slice.get(StaticStrings::KeyString).copyString() + optsUrlPart, body); } else { reqBuilder.clear(); reqBuilder.openArray(); for (auto const& idx : it.second) { reqBuilder.add(slice.at(idx)); } reqBuilder.close(); body = std::make_shared(reqBuilder.slice().toJson()); // We send to Babies endpoint requests.emplace_back( "shard:" + it.first, reqType, baseUrl + StringUtils::urlEncode(it.first) + optsUrlPart, body); } } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { TRI_ASSERT(requests.size() == 1); auto res = requests[0].result; int commError = handleGeneralCommErrors(&res); if (commError != TRI_ERROR_NO_ERROR) { return commError; } responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); return TRI_ERROR_NO_ERROR; } std::unordered_map> resultMap; collectResultsFromAllShards( shardMap, requests, errorCounter, resultMap, responseCode); mergeResults(reverseMapping, resultMap, resultBody); // the cluster operation was OK, however, // the DBserver could have reported an error. return TRI_ERROR_NO_ERROR; } // Not all shard keys are known in all documents. // We contact all shards with the complete body and ignore NOT_FOUND std::vector requests; auto body = std::make_shared(slice.toJson()); auto shardList = ci->getShardList(collid); if (!useMultiple) { std::string key = slice.get(StaticStrings::KeyString).copyString(); for (auto const& shard : *shardList) { requests.emplace_back( "shard:" + shard, reqType, baseUrl + StringUtils::urlEncode(shard) + "/" + key + optsUrlPart, body); } } else { for (auto const& shard : *shardList) { requests.emplace_back( "shard:" + shard, reqType, baseUrl + StringUtils::urlEncode(shard) + optsUrlPart, body); } } // Perform the requests size_t nrDone = 0; cc->performRequests(requests, CL_DEFAULT_TIMEOUT, nrDone, Logger::REQUESTS); // Now listen to the results: if (!useMultiple) { // Only one can answer, we react a bit differently int nrok = 0; int commError = TRI_ERROR_NO_ERROR; for (size_t count = shardList->size(); count > 0; count--) { auto const& req = requests[count - 1]; auto res = req.result; if (res.status == CL_COMM_RECEIVED) { if (res.answer_code != arangodb::GeneralResponse::ResponseCode::NOT_FOUND || (nrok == 0 && count == 1 && commError == TRI_ERROR_NO_ERROR)) { nrok++; responseCode = res.answer_code; TRI_ASSERT(res.answer != nullptr); auto parsedResult = res.answer->toVelocyPack(&VPackOptions::Defaults); resultBody.swap(parsedResult); } } else { commError = handleGeneralCommErrors(&res); } } if (nrok == 0) { // This can only happen, if a commError was encountered! return commError; } if (nrok > 1) { return TRI_ERROR_CLUSTER_GOT_CONTRADICTING_ANSWERS; } return TRI_ERROR_NO_ERROR; // the cluster operation was OK, however, // the DBserver could have reported an error. } responseCode = GeneralResponse::ResponseCode::SERVER_ERROR; // We select all results from all shards an merge them back again. 
std::vector> allResults; allResults.reserve(requests.size()); for (auto const& req : requests) { auto res = req.result; int error = handleGeneralCommErrors(&res); if (error != TRI_ERROR_NO_ERROR) { // Cluster is in bad state. Just report. // Local data structores are automatically freed return error; } if (res.answer_code == GeneralResponse::ResponseCode::OK || res.answer_code == GeneralResponse::ResponseCode::ACCEPTED) { responseCode = res.answer_code; } TRI_ASSERT(res.answer != nullptr); allResults.emplace_back(res.answer->toVelocyPack(&VPackOptions::Defaults)); extractErrorCodes(res, errorCounter, false); } // If we get here we get exactly one result for every shard. TRI_ASSERT(allResults.size() == shardList->size()); mergeResultsAllShards(allResults, resultBody, errorCounter, static_cast(slice.length())); return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief flush Wal on all DBservers //////////////////////////////////////////////////////////////////////////////// int flushWalOnAllDBServers(bool waitForSync, bool waitForCollector) { ClusterInfo* ci = ClusterInfo::instance(); ClusterComm* cc = ClusterComm::instance(); std::vector DBservers = ci->getCurrentDBServers(); CoordTransactionID coordTransactionID = TRI_NewTickServer(); std::string url = std::string("/_admin/wal/flush?waitForSync=") + (waitForSync ? "true" : "false") + "&waitForCollector=" + (waitForCollector ? "true" : "false"); auto body = std::make_shared(); for (auto it = DBservers.begin(); it != DBservers.end(); ++it) { auto headers = std::make_unique>(); // set collection name (shard id) cc->asyncRequest("", coordTransactionID, "server:" + *it, arangodb::GeneralRequest::RequestType::PUT, url, body, headers, nullptr, 120.0); } // Now listen to the results: int count; int nrok = 0; for (count = (int)DBservers.size(); count > 0; count--) { auto res = cc->wait("", coordTransactionID, 0, "", 0.0); if (res.status == CL_COMM_RECEIVED) { if (res.answer_code == arangodb::GeneralResponse::ResponseCode::OK) { nrok++; } } } if (nrok != (int)DBservers.size()) { return TRI_ERROR_INTERNAL; } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief compute a shard distribution for a new collection, the list /// dbServers must be a list of DBserver ids to distribute across. /// If this list is empty, the complete current list of DBservers is /// fetched from ClusterInfo and with random_shuffle to mix it up. 
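/// Illustrative example (hypothetical servers): numberOfShards = 4 and
/// replicationFactor = 2 with dbServers = {A, B, C} yields the round-robin
/// replica lists [A,B], [C,A], [B,C], [A,B] for the four new shards; the
/// shard names themselves ("s" + a cluster-unique id) depend on the ids
/// handed out by ClusterInfo::uniqid().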
////////////////////////////////////////////////////////////////////////////////

std::map<std::string, std::vector<std::string>> distributeShards(
    uint64_t numberOfShards, uint64_t replicationFactor,
    std::vector<std::string>& dbServers) {
  std::map<std::string, std::vector<std::string>> shards;

  ClusterInfo* ci = ClusterInfo::instance();
  if (dbServers.size() == 0) {
    dbServers = ci->getCurrentDBServers();
    if (dbServers.empty()) {
      return shards;
    }
    random_shuffle(dbServers.begin(), dbServers.end());
  }

  // fetch a unique id for each shard to create
  uint64_t const id = ci->uniqid(numberOfShards);

  // now create the shards
  size_t count = 0;
  for (uint64_t i = 0; i < numberOfShards; ++i) {
    // determine responsible server(s)
    std::vector<std::string> serverIds;
    for (uint64_t j = 0; j < replicationFactor; ++j) {
      std::string candidate;
      size_t count2 = 0;
      bool found = true;
      do {
        candidate = dbServers[count++];
        if (count >= dbServers.size()) {
          count = 0;
        }
        if (++count2 == dbServers.size() + 1) {
          LOG(WARN) << "createCollectionCoordinator: replicationFactor is "
                       "too large for the number of DBservers";
          found = false;
          break;
        }
      } while (std::find(serverIds.begin(), serverIds.end(), candidate) !=
               serverIds.end());
      if (found) {
        serverIds.push_back(candidate);
      }
    }

    // determine shard id
    std::string shardId = "s" + StringUtils::itoa(id + 1 + i);

    shards.insert(std::make_pair(shardId, serverIds));
  }

  return shards;
}

}  // namespace arangodb