1
0
Fork 0
arangodb/arangod/MMFiles/MMFilesOptimizerRules.cpp

714 lines
23 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Jan Steemann
////////////////////////////////////////////////////////////////////////////////
#include "MMFilesOptimizerRules.h"
#include "Aql/Collection.h"
#include "Aql/Condition.h"
#include "Aql/ExecutionNode.h"
#include "Aql/ExecutionPlan.h"
#include "Aql/Function.h"
#include "Aql/IndexNode.h"
#include "Aql/Optimizer.h"
#include "Aql/OptimizerRule.h"
#include "Aql/OptimizerRulesFeature.h"
#include "Aql/SortNode.h"
#include "Cluster/ServerState.h"
#include "Indexes/Index.h"
#include "VocBase/LogicalCollection.h"
using namespace arangodb;
using namespace arangodb::aql;
using EN = arangodb::aql::ExecutionNode;
void MMFilesOptimizerRules::registerResources() {
// patch update statements
OptimizerRulesFeature::registerRule("geo-index-optimizer", geoIndexRule,
OptimizerRule::applyGeoIndexRule, false, true);
// remove SORT RAND() if appropriate
OptimizerRulesFeature::registerRule("remove-sort-rand", removeSortRandRule,
OptimizerRule::removeSortRandRule_pass5, false, true);
}
struct MMFilesGeoIndexInfo {
operator bool() const { return distanceNode && valid; }
void invalidate() { valid = false; }
MMFilesGeoIndexInfo()
: collectionNode(nullptr)
, executionNode(nullptr)
, indexNode(nullptr)
, setter(nullptr)
, expressionParent(nullptr)
, expressionNode(nullptr)
, distanceNode(nullptr)
, index(nullptr)
, range(nullptr)
, executionNodeType(EN::NORESULTS)
, within(false)
, lessgreaterequal(false)
, valid(true)
, constantPair{nullptr,nullptr}
{}
EnumerateCollectionNode* collectionNode; // node that will be replaced by (geo) IndexNode
ExecutionNode* executionNode; // start node that is a sort or filter
IndexNode* indexNode; // AstNode that is the parent of the Node
CalculationNode* setter; // node that has contains the condition for filter or sort
AstNode* expressionParent; // AstNode that is the parent of the Node
AstNode* expressionNode; // AstNode that contains the sort/filter condition
AstNode* distanceNode; // AstNode that contains the distance parameters
std::shared_ptr<arangodb::Index> index; //pointer to geoindex
AstNode const* range; // range for within
ExecutionNode::NodeType executionNodeType; // type of execution node sort or filter
bool within; // is this a within lookup
bool lessgreaterequal; // is this a check for le/ge (true) or lt/gt (false)
bool valid; // contains this node a valid condition
std::vector<std::string> longitude; // access path to longitude
std::vector<std::string> latitude; // access path to latitude
std::pair<AstNode*,AstNode*> constantPair;
};
//candidate checking
AstNode* isValueOrRefNode(AstNode* node){
//TODO - implement me
return node;
}
MMFilesGeoIndexInfo isDistanceFunction(AstNode* distanceNode, AstNode* expressionParent){
// the expression must exist and it must be a function call
auto rv = MMFilesGeoIndexInfo{};
if(distanceNode->type != NODE_TYPE_FCALL) {
return rv;
}
//get the ast node of the expression
auto func = static_cast<Function const*>(distanceNode->getData());
// we're looking for "DISTANCE()", which is a function call
// with an empty parameters array
if ( func->externalName != "DISTANCE" || distanceNode->numMembers() != 1 ) {
return rv;
}
rv.distanceNode = distanceNode;
rv.expressionNode = distanceNode;
rv.expressionParent = expressionParent;
return rv;
}
MMFilesGeoIndexInfo isGeoFilterExpression(AstNode* node, AstNode* expressionParent){
// binary compare must be on top
bool dist_first = true;
bool lessEqual = true;
auto rv = MMFilesGeoIndexInfo{};
if( node->type != NODE_TYPE_OPERATOR_BINARY_GE
&& node->type != NODE_TYPE_OPERATOR_BINARY_GT
&& node->type != NODE_TYPE_OPERATOR_BINARY_LE
&& node->type != NODE_TYPE_OPERATOR_BINARY_LT) {
return rv;
}
if (node->type == NODE_TYPE_OPERATOR_BINARY_GE || node->type == NODE_TYPE_OPERATOR_BINARY_GT) {
dist_first = false;
}
if (node->type == NODE_TYPE_OPERATOR_BINARY_GT || node->type == NODE_TYPE_OPERATOR_BINARY_LT) {
lessEqual = false;
}
if (node->numMembers() != 2){
return rv;
}
AstNode* first = node->getMember(0);
AstNode* second = node->getMember(1);
auto eval_stuff = [](bool dist_first, bool lessEqual, MMFilesGeoIndexInfo&& dist_fun, AstNode* value_node){
if (dist_first && dist_fun && value_node) {
dist_fun.within = true;
dist_fun.range = value_node;
dist_fun.lessgreaterequal = lessEqual;
} else {
dist_fun.invalidate();
}
return dist_fun;
};
rv = eval_stuff(dist_first, lessEqual, isDistanceFunction(first, expressionParent), isValueOrRefNode(second));
if (!rv) {
rv = eval_stuff(dist_first, lessEqual, isDistanceFunction(second, expressionParent), isValueOrRefNode(first));
}
if(rv){
//this must be set after checking if the node contains a distance node.
rv.expressionNode = node;
}
return rv;
}
MMFilesGeoIndexInfo iterativePreorderWithCondition(EN::NodeType type, AstNode* root, MMFilesGeoIndexInfo(*condition)(AstNode*, AstNode*)){
// returns on first hit
if (!root){
return MMFilesGeoIndexInfo{};
}
std::vector<std::pair<AstNode*,AstNode*>> nodestack;
nodestack.push_back({root, nullptr});
while(nodestack.size()){
auto current = nodestack.back();
nodestack.pop_back();
MMFilesGeoIndexInfo rv = condition(current.first,current.second);
if (rv) {
return rv;
}
if (type == EN::FILTER){
if (current.first->type == NODE_TYPE_OPERATOR_BINARY_AND || current.first->type == NODE_TYPE_OPERATOR_NARY_AND ){
for (std::size_t i = 0; i < current.first->numMembers(); ++i){
nodestack.push_back({current.first->getMember(i),current.first});
}
}
} else if (type == EN::SORT) {
// must be the only sort condition
}
}
return MMFilesGeoIndexInfo{};
}
MMFilesGeoIndexInfo geoDistanceFunctionArgCheck(std::pair<AstNode const*, AstNode const*> const& pair,
ExecutionPlan* plan, MMFilesGeoIndexInfo info){
std::pair<Variable const*, std::vector<arangodb::basics::AttributeName>> attributeAccess1;
std::pair<Variable const*, std::vector<arangodb::basics::AttributeName>> attributeAccess2;
// first and second should be based on the same document - need to provide the document
// in order to see which collection is bound to it and if that collections supports geo-index
if (!pair.first->isAttributeAccessForVariable(attributeAccess1) ||
!pair.second->isAttributeAccessForVariable(attributeAccess2)) {
info.invalidate();
return info;
}
TRI_ASSERT(attributeAccess1.first != nullptr);
TRI_ASSERT(attributeAccess2.first != nullptr);
// expect access of the for doc.attribute
auto setter1 = plan->getVarSetBy(attributeAccess1.first->id);
auto setter2 = plan->getVarSetBy(attributeAccess2.first->id);
if (setter1 != nullptr &&
setter2 != nullptr &&
setter1 == setter2 &&
setter1->getType() == EN::ENUMERATE_COLLECTION) {
auto collNode = reinterpret_cast<EnumerateCollectionNode*>(setter1);
auto coll = collNode->collection(); //what kind of indexes does it have on what attributes
auto lcoll = coll->getCollection();
// TODO - check collection for suitable geo-indexes
for(auto indexShardPtr : lcoll->getIndexes()){
// get real index
arangodb::Index& index = *indexShardPtr.get();
// check if current index is a geo-index
if( index.type() != arangodb::Index::IndexType::TRI_IDX_TYPE_GEO1_INDEX
&& index.type() != arangodb::Index::IndexType::TRI_IDX_TYPE_GEO2_INDEX) {
continue;
}
TRI_ASSERT(index.fields().size() == 2);
//check access paths of attributes in ast and those in index match
if (index.fields()[0] == attributeAccess1.second &&
index.fields()[1] == attributeAccess2.second) {
info.collectionNode = collNode;
info.index = indexShardPtr;
TRI_AttributeNamesJoinNested(attributeAccess1.second, info.longitude, true);
TRI_AttributeNamesJoinNested(attributeAccess2.second, info.latitude, true);
return info;
}
}
}
info.invalidate();
return info;
}
bool checkDistanceArguments(MMFilesGeoIndexInfo& info, ExecutionPlan* plan){
if(!info){
return false;
}
auto const& functionArguments = info.distanceNode->getMember(0);
if(functionArguments->numMembers() < 4){
return false;
}
std::pair<AstNode*,AstNode*> argPair1 = { functionArguments->getMember(0), functionArguments->getMember(1) };
std::pair<AstNode*,AstNode*> argPair2 = { functionArguments->getMember(2), functionArguments->getMember(3) };
MMFilesGeoIndexInfo result1 = geoDistanceFunctionArgCheck(argPair1, plan, info /*copy*/);
MMFilesGeoIndexInfo result2 = geoDistanceFunctionArgCheck(argPair2, plan, info /*copy*/);
//info now conatins access path to collection
// xor only one argument pair shall have a geoIndex
if ( ( !result1 && !result2 ) || ( result1 && result2 ) ){
info.invalidate();
return false;
}
MMFilesGeoIndexInfo res;
if(result1){
info = std::move(result1);
info.constantPair = std::move(argPair2);
} else {
info = std::move(result2);
info.constantPair = std::move(argPair1);
}
return true;
}
//checks a single sort or filter node
MMFilesGeoIndexInfo identifyGeoOptimizationCandidate(ExecutionNode::NodeType type, ExecutionPlan* plan, ExecutionNode* n){
ExecutionNode* setter = nullptr;
auto rv = MMFilesGeoIndexInfo{};
switch(type){
case EN::SORT: {
auto node = static_cast<SortNode*>(n);
auto& elements = node->getElements();
// we're looking for "SORT DISTANCE(x,y,a,b) ASC", which has just one sort criterion
if ( !(elements.size() == 1 && elements[0].ascending)) {
//test on second makes sure the SORT is ascending
return rv;
}
//variable of sort expression
auto variable = elements[0].var;
TRI_ASSERT(variable != nullptr);
//// find the expression that is bound to the variable
// get the expression node that holds the calculation
setter = plan->getVarSetBy(variable->id);
}
break;
case EN::FILTER: {
auto node = static_cast<FilterNode*>(n);
// filter nodes always have one input variable
auto varsUsedHere = node->getVariablesUsedHere();
TRI_ASSERT(varsUsedHere.size() == 1);
// now check who introduced our variable
auto variable = varsUsedHere[0];
setter = plan->getVarSetBy(variable->id);
}
break;
default:
return rv;
}
// common part - extract astNode from setter witch is a calculation node
if (setter == nullptr || setter->getType() != EN::CALCULATION) {
return rv;
}
auto expression = static_cast<CalculationNode*>(setter)->expression();
// the expression must exist and it must have an astNode
if (expression == nullptr || expression->node() == nullptr){
// not the right type of node
return rv;
}
AstNode* node = expression->nodeForModification();
//FIXME -- technical debt -- code duplication / not all cases covered
switch(type){
case EN::SORT: {
// check comma separated parts of condition cond0, cond1, cond2
rv = isDistanceFunction(node,nullptr);
}
break;
case EN::FILTER: {
rv = iterativePreorderWithCondition(type, node, &isGeoFilterExpression);
}
break;
default:
rv.invalidate(); // not required but make sure the result is invalid
}
rv.executionNode = n;
rv.executionNodeType = type;
rv.setter = static_cast<CalculationNode*>(setter);
checkDistanceArguments(rv, plan);
return rv;
};
//modify plan
// builds a condition that can be used with the index interface and
// contains all parameters required by the MMFilesGeoIndex
std::unique_ptr<Condition> buildGeoCondition(ExecutionPlan* plan, MMFilesGeoIndexInfo& info) {
AstNode* lat = info.constantPair.first;
AstNode* lon = info.constantPair.second;
auto ast = plan->getAst();
auto varAstNode = ast->createNodeReference(info.collectionNode->outVariable());
auto args = ast->createNodeArray(info.within ? 4 : 3);
args->addMember(varAstNode); // collection
args->addMember(lat); // latitude
args->addMember(lon); // longitude
AstNode* cond = nullptr;
if (info.within) {
// WITHIN
args->addMember(info.range);
auto lessValue = ast->createNodeValueBool(info.lessgreaterequal);
args->addMember(lessValue);
cond = ast->createNodeFunctionCall("WITHIN", args);
} else {
// NEAR
cond = ast->createNodeFunctionCall("NEAR", args);
}
TRI_ASSERT(cond != nullptr);
auto condition = std::make_unique<Condition>(ast);
condition->andCombine(cond);
condition->normalize(plan);
return condition;
}
void replaceGeoCondition(ExecutionPlan* plan, MMFilesGeoIndexInfo& info){
if (info.expressionParent && info.executionNodeType == EN::FILTER) {
auto ast = plan->getAst();
CalculationNode* newNode = nullptr;
Expression* expr = new Expression(ast, static_cast<CalculationNode*>(info.setter)->expression()->nodeForModification()->clone(ast));
try {
newNode = new CalculationNode(plan, plan->nextId(), expr, static_cast<CalculationNode*>(info.setter)->outVariable());
} catch (...) {
delete expr;
throw;
}
plan->registerNode(newNode);
plan->replaceNode(info.setter, newNode);
bool done = false;
ast->traverseAndModify(newNode->expression()->nodeForModification(),[&done](AstNode* node, void* data) {
if (done) {
return node;
}
if (node->type == NODE_TYPE_OPERATOR_BINARY_AND) {
for (std::size_t i = 0; i < node->numMembers(); i++){
if (isGeoFilterExpression(node->getMemberUnchecked(i),node)) {
done = true;
return node->getMemberUnchecked(i ? 0 : 1);
}
}
}
return node;
},
nullptr);
if(done){
return;
}
auto replaceInfo = iterativePreorderWithCondition(EN::FILTER, newNode->expression()->nodeForModification(), &isGeoFilterExpression);
if (newNode->expression()->nodeForModification() == replaceInfo.expressionParent) {
if (replaceInfo.expressionParent->type == NODE_TYPE_OPERATOR_BINARY_AND){
for (std::size_t i = 0; i < replaceInfo.expressionParent->numMembers(); ++i) {
if (replaceInfo.expressionParent->getMember(i) != replaceInfo.expressionNode) {
newNode->expression()->replaceNode(replaceInfo.expressionParent->getMember(i));
return;
}
}
}
}
//else {
// // COULD BE IMPROVED
// if(replaceInfo.expressionParent->type == NODE_TYPE_OPERATOR_BINARY_AND){
// // delete ast node - we would need the parent of expression parent to delete the node
// // we do not have it available here so we just replace the the node with true
// return;
// }
//}
//fallback
auto replacement = ast->createNodeValueBool(true);
for (std::size_t i = 0; i < replaceInfo.expressionParent->numMembers(); ++i) {
if (replaceInfo.expressionParent->getMember(i) == replaceInfo.expressionNode) {
replaceInfo.expressionParent->removeMemberUnchecked(i);
replaceInfo.expressionParent->addMember(replacement);
}
}
}
}
// applys the optimization for a candidate
bool applyGeoOptimization(bool near, ExecutionPlan* plan, MMFilesGeoIndexInfo& first, MMFilesGeoIndexInfo& second) {
if (!first && !second) {
return false;
}
if (!first) {
first = std::move(second);
second.invalidate();
}
// We are not allowed to be a inner loop
if (first.collectionNode->isInInnerLoop() && first.executionNodeType == EN::SORT) {
return false;
}
std::unique_ptr<Condition> condition(buildGeoCondition(plan, first));
auto inode = new IndexNode(
plan, plan->nextId(), first.collectionNode->vocbase(),
first.collectionNode->collection(), first.collectionNode->outVariable(),
std::vector<transaction::Methods::IndexHandle>{transaction::Methods::IndexHandle{first.index}},
condition.get(), false);
plan->registerNode(inode);
condition.release();
plan->replaceNode(first.collectionNode,inode);
replaceGeoCondition(plan, first);
replaceGeoCondition(plan, second);
// if executionNode is sort OR a filter without further sub conditions
// the node can be unlinked
auto unlinkNode = [&](MMFilesGeoIndexInfo& info) {
if (info && !info.expressionParent) {
if (!arangodb::ServerState::instance()->isCoordinator() || info.executionNodeType == EN::FILTER) {
plan->unlinkNode(info.executionNode);
} else if (info.executionNodeType == EN::SORT) {
//make sure sort is not reinserted in cluster
static_cast<SortNode*>(info.executionNode)->_reinsertInCluster = false;
}
}
};
unlinkNode(first);
unlinkNode(second);
//signal that plan has been changed
return true;
}
void MMFilesOptimizerRules::geoIndexRule(Optimizer* opt,
std::unique_ptr<ExecutionPlan> plan,
OptimizerRule const* rule) {
SmallVector<ExecutionNode*>::allocator_type::arena_type a;
SmallVector<ExecutionNode*> nodes{a};
bool modified = false;
//inspect each return node and work upwards to SingletonNode
plan->findEndNodes(nodes, true);
for (auto& node : nodes) {
MMFilesGeoIndexInfo sortInfo{};
MMFilesGeoIndexInfo filterInfo{};
auto current = node;
while (current) {
switch(current->getType()) {
case EN::SORT:{
sortInfo = identifyGeoOptimizationCandidate(EN::SORT, plan.get(), current);
break;
}
case EN::FILTER: {
filterInfo = identifyGeoOptimizationCandidate(EN::FILTER, plan.get(), current);
break;
}
case EN::ENUMERATE_COLLECTION: {
EnumerateCollectionNode* collnode = static_cast<EnumerateCollectionNode*>(current);
if( (sortInfo && sortInfo.collectionNode!= collnode)
||(filterInfo && filterInfo.collectionNode != collnode)
){
filterInfo.invalidate();
sortInfo.invalidate();
break;
}
if (applyGeoOptimization(true, plan.get(), filterInfo, sortInfo)){
modified = true;
filterInfo.invalidate();
sortInfo.invalidate();
}
break;
}
case EN::INDEX:
case EN::COLLECT:{
filterInfo.invalidate();
sortInfo.invalidate();
break;
}
default: {
//skip - do nothing
break;
}
}
current = current->getFirstDependency(); //inspect next node
}
}
opt->addPlan(std::move(plan), rule, modified);
}
/// @brief remove SORT RAND() if appropriate
void MMFilesOptimizerRules::removeSortRandRule(Optimizer* opt, std::unique_ptr<ExecutionPlan> plan,
OptimizerRule const* rule) {
SmallVector<ExecutionNode*>::allocator_type::arena_type a;
SmallVector<ExecutionNode*> nodes{a};
plan->findNodesOfType(nodes, EN::SORT, true);
bool modified = false;
for (auto const& n : nodes) {
auto node = static_cast<SortNode*>(n);
auto const& elements = node->getElements();
if (elements.size() != 1) {
// we're looking for "SORT RAND()", which has just one sort criterion
continue;
}
auto const variable = elements[0].var;
TRI_ASSERT(variable != nullptr);
auto setter = plan->getVarSetBy(variable->id);
if (setter == nullptr || setter->getType() != EN::CALCULATION) {
continue;
}
auto cn = static_cast<CalculationNode*>(setter);
auto const expression = cn->expression();
if (expression == nullptr || expression->node() == nullptr ||
expression->node()->type != NODE_TYPE_FCALL) {
// not the right type of node
continue;
}
auto funcNode = expression->node();
auto func = static_cast<Function const*>(funcNode->getData());
// we're looking for "RAND()", which is a function call
// with an empty parameters array
if (func->externalName != "RAND" || funcNode->numMembers() != 1 ||
funcNode->getMember(0)->numMembers() != 0) {
continue;
}
// now we're sure we got SORT RAND() !
// we found what we were looking for!
// now check if the dependencies qualify
if (!n->hasDependency()) {
break;
}
auto current = n->getFirstDependency();
ExecutionNode* collectionNode = nullptr;
while (current != nullptr) {
if (current->canThrow()) {
// we shouldn't bypass a node that can throw
collectionNode = nullptr;
break;
}
switch (current->getType()) {
case EN::SORT:
case EN::COLLECT:
case EN::FILTER:
case EN::SUBQUERY:
case EN::ENUMERATE_LIST:
case EN::TRAVERSAL:
case EN::SHORTEST_PATH:
case EN::INDEX: {
// if we found another SortNode, a CollectNode, FilterNode, a
// SubqueryNode, an EnumerateListNode, a TraversalNode or an IndexNode
// this means we cannot apply our optimization
collectionNode = nullptr;
current = nullptr;
continue; // this will exit the while loop
}
case EN::ENUMERATE_COLLECTION: {
if (collectionNode == nullptr) {
// note this node
collectionNode = current;
break;
} else {
// we already found another collection node before. this means we
// should not apply our optimization
collectionNode = nullptr;
current = nullptr;
continue; // this will exit the while loop
}
// cannot get here
TRI_ASSERT(false);
}
default: {
// ignore all other nodes
}
}
if (!current->hasDependency()) {
break;
}
current = current->getFirstDependency();
}
if (collectionNode != nullptr) {
// we found a node to modify!
TRI_ASSERT(collectionNode->getType() == EN::ENUMERATE_COLLECTION);
// set the random iteration flag for the EnumerateCollectionNode
static_cast<EnumerateCollectionNode*>(collectionNode)->setRandom();
// remove the SortNode
// note: the CalculationNode will be removed by
// "remove-unnecessary-calculations"
// rule if not used
plan->unlinkNode(n);
modified = true;
}
}
opt->addPlan(std::move(plan), rule, modified);
}