//////////////////////////////////////////////////////////////////////////////// /// DISCLAIMER /// /// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Jan Steemann //////////////////////////////////////////////////////////////////////////////// #include "MMFilesOptimizerRules.h" #include "Aql/Collection.h" #include "Aql/Condition.h" #include "Aql/ExecutionNode.h" #include "Aql/ExecutionPlan.h" #include "Aql/Function.h" #include "Aql/IndexNode.h" #include "Aql/Optimizer.h" #include "Aql/OptimizerRule.h" #include "Aql/OptimizerRulesFeature.h" #include "Aql/SortNode.h" #include "Cluster/ServerState.h" #include "Indexes/Index.h" #include "VocBase/LogicalCollection.h" using namespace arangodb; using namespace arangodb::aql; using EN = arangodb::aql::ExecutionNode; void MMFilesOptimizerRules::registerResources() { // patch update statements OptimizerRulesFeature::registerRule("geo-index-optimizer", geoIndexRule, OptimizerRule::applyGeoIndexRule, false, true); // remove SORT RAND() if appropriate OptimizerRulesFeature::registerRule("remove-sort-rand", removeSortRandRule, OptimizerRule::removeSortRandRule_pass5, false, true); } struct MMFilesGeoIndexInfo { operator bool() const { return distanceNode && valid; } void invalidate() { valid = false; } MMFilesGeoIndexInfo() : collectionNode(nullptr) , executionNode(nullptr) , indexNode(nullptr) , setter(nullptr) , expressionParent(nullptr) , expressionNode(nullptr) , distanceNode(nullptr) , index(nullptr) , range(nullptr) , executionNodeType(EN::NORESULTS) , within(false) , lessgreaterequal(false) , valid(true) , constantPair{nullptr,nullptr} {} EnumerateCollectionNode* collectionNode; // node that will be replaced by (geo) IndexNode ExecutionNode* executionNode; // start node that is a sort or filter IndexNode* indexNode; // AstNode that is the parent of the Node CalculationNode* setter; // node that has contains the condition for filter or sort AstNode* expressionParent; // AstNode that is the parent of the Node AstNode* expressionNode; // AstNode that contains the sort/filter condition AstNode* distanceNode; // AstNode that contains the distance parameters std::shared_ptr index; //pointer to geoindex AstNode const* range; // range for within ExecutionNode::NodeType executionNodeType; // type of execution node sort or filter bool within; // is this a within lookup bool lessgreaterequal; // is this a check for le/ge (true) or lt/gt (false) bool valid; // contains this node a valid condition std::vector longitude; // access path to longitude std::vector latitude; // access path to latitude std::pair constantPair; }; //candidate checking AstNode* isValueOrRefNode(AstNode* node){ //TODO - implement me return node; } MMFilesGeoIndexInfo isDistanceFunction(AstNode* distanceNode, AstNode* expressionParent){ // the expression must exist and it must be a function call auto rv = MMFilesGeoIndexInfo{}; if(distanceNode->type != NODE_TYPE_FCALL) { return rv; } //get the ast node of the expression auto func = static_cast(distanceNode->getData()); // we're looking for "DISTANCE()", which is a function call // with an empty parameters array if ( func->externalName != "DISTANCE" || distanceNode->numMembers() != 1 ) { return rv; } rv.distanceNode = distanceNode; rv.expressionNode = distanceNode; rv.expressionParent = expressionParent; return rv; } MMFilesGeoIndexInfo isGeoFilterExpression(AstNode* node, AstNode* expressionParent){ // binary compare must be on top bool dist_first = true; bool lessEqual = true; auto rv = MMFilesGeoIndexInfo{}; if( node->type != NODE_TYPE_OPERATOR_BINARY_GE && node->type != NODE_TYPE_OPERATOR_BINARY_GT && node->type != NODE_TYPE_OPERATOR_BINARY_LE && node->type != NODE_TYPE_OPERATOR_BINARY_LT) { return rv; } if (node->type == NODE_TYPE_OPERATOR_BINARY_GE || node->type == NODE_TYPE_OPERATOR_BINARY_GT) { dist_first = false; } if (node->type == NODE_TYPE_OPERATOR_BINARY_GT || node->type == NODE_TYPE_OPERATOR_BINARY_LT) { lessEqual = false; } if (node->numMembers() != 2){ return rv; } AstNode* first = node->getMember(0); AstNode* second = node->getMember(1); auto eval_stuff = [](bool dist_first, bool lessEqual, MMFilesGeoIndexInfo&& dist_fun, AstNode* value_node){ if (dist_first && dist_fun && value_node) { dist_fun.within = true; dist_fun.range = value_node; dist_fun.lessgreaterequal = lessEqual; } else { dist_fun.invalidate(); } return dist_fun; }; rv = eval_stuff(dist_first, lessEqual, isDistanceFunction(first, expressionParent), isValueOrRefNode(second)); if (!rv) { rv = eval_stuff(dist_first, lessEqual, isDistanceFunction(second, expressionParent), isValueOrRefNode(first)); } if(rv){ //this must be set after checking if the node contains a distance node. rv.expressionNode = node; } return rv; } MMFilesGeoIndexInfo iterativePreorderWithCondition(EN::NodeType type, AstNode* root, MMFilesGeoIndexInfo(*condition)(AstNode*, AstNode*)){ // returns on first hit if (!root){ return MMFilesGeoIndexInfo{}; } std::vector> nodestack; nodestack.push_back({root, nullptr}); while(nodestack.size()){ auto current = nodestack.back(); nodestack.pop_back(); MMFilesGeoIndexInfo rv = condition(current.first,current.second); if (rv) { return rv; } if (type == EN::FILTER){ if (current.first->type == NODE_TYPE_OPERATOR_BINARY_AND || current.first->type == NODE_TYPE_OPERATOR_NARY_AND ){ for (std::size_t i = 0; i < current.first->numMembers(); ++i){ nodestack.push_back({current.first->getMember(i),current.first}); } } } else if (type == EN::SORT) { // must be the only sort condition } } return MMFilesGeoIndexInfo{}; } MMFilesGeoIndexInfo geoDistanceFunctionArgCheck(std::pair const& pair, ExecutionPlan* plan, MMFilesGeoIndexInfo info){ std::pair> attributeAccess1; std::pair> attributeAccess2; // first and second should be based on the same document - need to provide the document // in order to see which collection is bound to it and if that collections supports geo-index if (!pair.first->isAttributeAccessForVariable(attributeAccess1) || !pair.second->isAttributeAccessForVariable(attributeAccess2)) { info.invalidate(); return info; } TRI_ASSERT(attributeAccess1.first != nullptr); TRI_ASSERT(attributeAccess2.first != nullptr); // expect access of the for doc.attribute auto setter1 = plan->getVarSetBy(attributeAccess1.first->id); auto setter2 = plan->getVarSetBy(attributeAccess2.first->id); if (setter1 != nullptr && setter2 != nullptr && setter1 == setter2 && setter1->getType() == EN::ENUMERATE_COLLECTION) { auto collNode = reinterpret_cast(setter1); auto coll = collNode->collection(); //what kind of indexes does it have on what attributes auto lcoll = coll->getCollection(); // TODO - check collection for suitable geo-indexes for(auto indexShardPtr : lcoll->getIndexes()){ // get real index arangodb::Index& index = *indexShardPtr.get(); // check if current index is a geo-index if( index.type() != arangodb::Index::IndexType::TRI_IDX_TYPE_GEO1_INDEX && index.type() != arangodb::Index::IndexType::TRI_IDX_TYPE_GEO2_INDEX) { continue; } TRI_ASSERT(index.fields().size() == 2); //check access paths of attributes in ast and those in index match if (index.fields()[0] == attributeAccess1.second && index.fields()[1] == attributeAccess2.second) { info.collectionNode = collNode; info.index = indexShardPtr; TRI_AttributeNamesJoinNested(attributeAccess1.second, info.longitude, true); TRI_AttributeNamesJoinNested(attributeAccess2.second, info.latitude, true); return info; } } } info.invalidate(); return info; } bool checkDistanceArguments(MMFilesGeoIndexInfo& info, ExecutionPlan* plan){ if(!info){ return false; } auto const& functionArguments = info.distanceNode->getMember(0); if(functionArguments->numMembers() < 4){ return false; } std::pair argPair1 = { functionArguments->getMember(0), functionArguments->getMember(1) }; std::pair argPair2 = { functionArguments->getMember(2), functionArguments->getMember(3) }; MMFilesGeoIndexInfo result1 = geoDistanceFunctionArgCheck(argPair1, plan, info /*copy*/); MMFilesGeoIndexInfo result2 = geoDistanceFunctionArgCheck(argPair2, plan, info /*copy*/); //info now conatins access path to collection // xor only one argument pair shall have a geoIndex if ( ( !result1 && !result2 ) || ( result1 && result2 ) ){ info.invalidate(); return false; } MMFilesGeoIndexInfo res; if(result1){ info = std::move(result1); info.constantPair = std::move(argPair2); } else { info = std::move(result2); info.constantPair = std::move(argPair1); } return true; } //checks a single sort or filter node MMFilesGeoIndexInfo identifyGeoOptimizationCandidate(ExecutionNode::NodeType type, ExecutionPlan* plan, ExecutionNode* n){ ExecutionNode* setter = nullptr; auto rv = MMFilesGeoIndexInfo{}; switch(type){ case EN::SORT: { auto node = static_cast(n); auto& elements = node->getElements(); // we're looking for "SORT DISTANCE(x,y,a,b) ASC", which has just one sort criterion if ( !(elements.size() == 1 && elements[0].ascending)) { //test on second makes sure the SORT is ascending return rv; } //variable of sort expression auto variable = elements[0].var; TRI_ASSERT(variable != nullptr); //// find the expression that is bound to the variable // get the expression node that holds the calculation setter = plan->getVarSetBy(variable->id); } break; case EN::FILTER: { auto node = static_cast(n); // filter nodes always have one input variable auto varsUsedHere = node->getVariablesUsedHere(); TRI_ASSERT(varsUsedHere.size() == 1); // now check who introduced our variable auto variable = varsUsedHere[0]; setter = plan->getVarSetBy(variable->id); } break; default: return rv; } // common part - extract astNode from setter witch is a calculation node if (setter == nullptr || setter->getType() != EN::CALCULATION) { return rv; } auto expression = static_cast(setter)->expression(); // the expression must exist and it must have an astNode if (expression == nullptr || expression->node() == nullptr){ // not the right type of node return rv; } AstNode* node = expression->nodeForModification(); //FIXME -- technical debt -- code duplication / not all cases covered switch(type){ case EN::SORT: { // check comma separated parts of condition cond0, cond1, cond2 rv = isDistanceFunction(node,nullptr); } break; case EN::FILTER: { rv = iterativePreorderWithCondition(type, node, &isGeoFilterExpression); } break; default: rv.invalidate(); // not required but make sure the result is invalid } rv.executionNode = n; rv.executionNodeType = type; rv.setter = static_cast(setter); checkDistanceArguments(rv, plan); return rv; }; //modify plan // builds a condition that can be used with the index interface and // contains all parameters required by the MMFilesGeoIndex std::unique_ptr buildGeoCondition(ExecutionPlan* plan, MMFilesGeoIndexInfo& info) { AstNode* lat = info.constantPair.first; AstNode* lon = info.constantPair.second; auto ast = plan->getAst(); auto varAstNode = ast->createNodeReference(info.collectionNode->outVariable()); auto args = ast->createNodeArray(info.within ? 4 : 3); args->addMember(varAstNode); // collection args->addMember(lat); // latitude args->addMember(lon); // longitude AstNode* cond = nullptr; if (info.within) { // WITHIN args->addMember(info.range); auto lessValue = ast->createNodeValueBool(info.lessgreaterequal); args->addMember(lessValue); cond = ast->createNodeFunctionCall("WITHIN", args); } else { // NEAR cond = ast->createNodeFunctionCall("NEAR", args); } TRI_ASSERT(cond != nullptr); auto condition = std::make_unique(ast); condition->andCombine(cond); condition->normalize(plan); return condition; } void replaceGeoCondition(ExecutionPlan* plan, MMFilesGeoIndexInfo& info){ if (info.expressionParent && info.executionNodeType == EN::FILTER) { auto ast = plan->getAst(); CalculationNode* newNode = nullptr; Expression* expr = new Expression(ast, static_cast(info.setter)->expression()->nodeForModification()->clone(ast)); try { newNode = new CalculationNode(plan, plan->nextId(), expr, static_cast(info.setter)->outVariable()); } catch (...) { delete expr; throw; } plan->registerNode(newNode); plan->replaceNode(info.setter, newNode); bool done = false; ast->traverseAndModify(newNode->expression()->nodeForModification(),[&done](AstNode* node, void* data) { if (done) { return node; } if (node->type == NODE_TYPE_OPERATOR_BINARY_AND) { for (std::size_t i = 0; i < node->numMembers(); i++){ if (isGeoFilterExpression(node->getMemberUnchecked(i),node)) { done = true; return node->getMemberUnchecked(i ? 0 : 1); } } } return node; }, nullptr); if(done){ return; } auto replaceInfo = iterativePreorderWithCondition(EN::FILTER, newNode->expression()->nodeForModification(), &isGeoFilterExpression); if (newNode->expression()->nodeForModification() == replaceInfo.expressionParent) { if (replaceInfo.expressionParent->type == NODE_TYPE_OPERATOR_BINARY_AND){ for (std::size_t i = 0; i < replaceInfo.expressionParent->numMembers(); ++i) { if (replaceInfo.expressionParent->getMember(i) != replaceInfo.expressionNode) { newNode->expression()->replaceNode(replaceInfo.expressionParent->getMember(i)); return; } } } } //else { // // COULD BE IMPROVED // if(replaceInfo.expressionParent->type == NODE_TYPE_OPERATOR_BINARY_AND){ // // delete ast node - we would need the parent of expression parent to delete the node // // we do not have it available here so we just replace the the node with true // return; // } //} //fallback auto replacement = ast->createNodeValueBool(true); for (std::size_t i = 0; i < replaceInfo.expressionParent->numMembers(); ++i) { if (replaceInfo.expressionParent->getMember(i) == replaceInfo.expressionNode) { replaceInfo.expressionParent->removeMemberUnchecked(i); replaceInfo.expressionParent->addMember(replacement); } } } } // applys the optimization for a candidate bool applyGeoOptimization(bool near, ExecutionPlan* plan, MMFilesGeoIndexInfo& first, MMFilesGeoIndexInfo& second) { if (!first && !second) { return false; } if (!first) { first = std::move(second); second.invalidate(); } // We are not allowed to be a inner loop if (first.collectionNode->isInInnerLoop() && first.executionNodeType == EN::SORT) { return false; } std::unique_ptr condition(buildGeoCondition(plan, first)); auto inode = new IndexNode( plan, plan->nextId(), first.collectionNode->vocbase(), first.collectionNode->collection(), first.collectionNode->outVariable(), std::vector{transaction::Methods::IndexHandle{first.index}}, condition.get(), false); plan->registerNode(inode); condition.release(); plan->replaceNode(first.collectionNode,inode); replaceGeoCondition(plan, first); replaceGeoCondition(plan, second); // if executionNode is sort OR a filter without further sub conditions // the node can be unlinked auto unlinkNode = [&](MMFilesGeoIndexInfo& info) { if (info && !info.expressionParent) { if (!arangodb::ServerState::instance()->isCoordinator() || info.executionNodeType == EN::FILTER) { plan->unlinkNode(info.executionNode); } else if (info.executionNodeType == EN::SORT) { //make sure sort is not reinserted in cluster static_cast(info.executionNode)->_reinsertInCluster = false; } } }; unlinkNode(first); unlinkNode(second); //signal that plan has been changed return true; } void MMFilesOptimizerRules::geoIndexRule(Optimizer* opt, std::unique_ptr plan, OptimizerRule const* rule) { SmallVector::allocator_type::arena_type a; SmallVector nodes{a}; bool modified = false; //inspect each return node and work upwards to SingletonNode plan->findEndNodes(nodes, true); for (auto& node : nodes) { MMFilesGeoIndexInfo sortInfo{}; MMFilesGeoIndexInfo filterInfo{}; auto current = node; while (current) { switch(current->getType()) { case EN::SORT:{ sortInfo = identifyGeoOptimizationCandidate(EN::SORT, plan.get(), current); break; } case EN::FILTER: { filterInfo = identifyGeoOptimizationCandidate(EN::FILTER, plan.get(), current); break; } case EN::ENUMERATE_COLLECTION: { EnumerateCollectionNode* collnode = static_cast(current); if( (sortInfo && sortInfo.collectionNode!= collnode) ||(filterInfo && filterInfo.collectionNode != collnode) ){ filterInfo.invalidate(); sortInfo.invalidate(); break; } if (applyGeoOptimization(true, plan.get(), filterInfo, sortInfo)){ modified = true; filterInfo.invalidate(); sortInfo.invalidate(); } break; } case EN::INDEX: case EN::COLLECT:{ filterInfo.invalidate(); sortInfo.invalidate(); break; } default: { //skip - do nothing break; } } current = current->getFirstDependency(); //inspect next node } } opt->addPlan(std::move(plan), rule, modified); } /// @brief remove SORT RAND() if appropriate void MMFilesOptimizerRules::removeSortRandRule(Optimizer* opt, std::unique_ptr plan, OptimizerRule const* rule) { SmallVector::allocator_type::arena_type a; SmallVector nodes{a}; plan->findNodesOfType(nodes, EN::SORT, true); bool modified = false; for (auto const& n : nodes) { auto node = static_cast(n); auto const& elements = node->getElements(); if (elements.size() != 1) { // we're looking for "SORT RAND()", which has just one sort criterion continue; } auto const variable = elements[0].var; TRI_ASSERT(variable != nullptr); auto setter = plan->getVarSetBy(variable->id); if (setter == nullptr || setter->getType() != EN::CALCULATION) { continue; } auto cn = static_cast(setter); auto const expression = cn->expression(); if (expression == nullptr || expression->node() == nullptr || expression->node()->type != NODE_TYPE_FCALL) { // not the right type of node continue; } auto funcNode = expression->node(); auto func = static_cast(funcNode->getData()); // we're looking for "RAND()", which is a function call // with an empty parameters array if (func->externalName != "RAND" || funcNode->numMembers() != 1 || funcNode->getMember(0)->numMembers() != 0) { continue; } // now we're sure we got SORT RAND() ! // we found what we were looking for! // now check if the dependencies qualify if (!n->hasDependency()) { break; } auto current = n->getFirstDependency(); ExecutionNode* collectionNode = nullptr; while (current != nullptr) { if (current->canThrow()) { // we shouldn't bypass a node that can throw collectionNode = nullptr; break; } switch (current->getType()) { case EN::SORT: case EN::COLLECT: case EN::FILTER: case EN::SUBQUERY: case EN::ENUMERATE_LIST: case EN::TRAVERSAL: case EN::SHORTEST_PATH: case EN::INDEX: { // if we found another SortNode, a CollectNode, FilterNode, a // SubqueryNode, an EnumerateListNode, a TraversalNode or an IndexNode // this means we cannot apply our optimization collectionNode = nullptr; current = nullptr; continue; // this will exit the while loop } case EN::ENUMERATE_COLLECTION: { if (collectionNode == nullptr) { // note this node collectionNode = current; break; } else { // we already found another collection node before. this means we // should not apply our optimization collectionNode = nullptr; current = nullptr; continue; // this will exit the while loop } // cannot get here TRI_ASSERT(false); } default: { // ignore all other nodes } } if (!current->hasDependency()) { break; } current = current->getFirstDependency(); } if (collectionNode != nullptr) { // we found a node to modify! TRI_ASSERT(collectionNode->getType() == EN::ENUMERATE_COLLECTION); // set the random iteration flag for the EnumerateCollectionNode static_cast(collectionNode)->setRandom(); // remove the SortNode // note: the CalculationNode will be removed by // "remove-unnecessary-calculations" // rule if not used plan->unlinkNode(n); modified = true; } } opt->addPlan(std::move(plan), rule, modified); }