1
0
Fork 0

[3.5] Make AQL's constrained heap play nice with fullCount (#10007)

* Make AQL's constrained heap play nice with fullCount (#9981)

* Update CHANGELOG
This commit is contained in:
Tobias Gödderz 2019-09-13 11:28:28 +02:00 committed by KVS85
parent e0e05903fb
commit 6f16c3deef
6 changed files with 180 additions and 36 deletions

View File

@ -1,6 +1,8 @@
v3.5.1 (XXXX-XX-XX)
-------------------
* Fixed AQL constrained-heap sort in conjunction with fullCount.
* Added support for AQL expressions such as `a NOT LIKE b`, `a NOT =~ b` and
`a NOT !~ b`. Previous versions of ArangoDB did not support these expressions,
and using them in an AQL query resulted in a parse error.

View File

@ -138,6 +138,8 @@ ConstrainedSortExecutor::ConstrainedSortExecutor(Fetcher& fetcher, SortExecutorI
_state(ExecutionState::HASMORE),
_returnNext(0),
_rowsPushed(0),
_rowsRead(0),
_skippedAfter(0),
_heapBuffer(_infos._manager.requestBlock(_infos._limit,
_infos.numberOfOutputRegisters())),
_cmpHeap(std::make_unique<ConstrainedLessThan>(_infos.trx(), _infos.sortRegisters())),
@ -147,11 +149,23 @@ ConstrainedSortExecutor::ConstrainedSortExecutor(Fetcher& fetcher, SortExecutorI
TRI_ASSERT(_infos._limit > 0);
_rows.reserve(infos._limit);
_cmpHeap->setBuffer(_heapBuffer.get());
};
}
ConstrainedSortExecutor::~ConstrainedSortExecutor() = default;
std::pair<ExecutionState, NoStats> ConstrainedSortExecutor::produceRows(OutputAqlItemRow& output) {
bool ConstrainedSortExecutor::doneProducing() const noexcept {
  // The read cursor may catch up to the number of buffered rows, but must
  // never pass it.
  TRI_ASSERT(_returnNext <= _rows.size());
  // Producing is finished only when the upstream is exhausted AND every row
  // held in the heap has been emitted.
  if (_state != ExecutionState::DONE) {
    return false;
  }
  return _returnNext >= _rows.size();
}
bool ConstrainedSortExecutor::doneSkipping() const noexcept {
  // Rows handed out plus rows skipped past must never exceed the rows read
  // from upstream.
  TRI_ASSERT(_returnNext + _skippedAfter <= _rowsRead);
  // Skipping (relevant for fullCount) is finished once the upstream is
  // exhausted and every row ever read has been either emitted or skipped.
  bool const upstreamExhausted = (_state == ExecutionState::DONE);
  bool const allRowsAccountedFor = (_returnNext + _skippedAfter >= _rowsRead);
  return upstreamExhausted && allRowsAccountedFor;
}
ExecutionState ConstrainedSortExecutor::consumeInput() {
while (_state != ExecutionState::DONE) {
TRI_IF_FAILURE("SortBlock::doSorting") {
THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
@ -161,37 +175,65 @@ std::pair<ExecutionState, NoStats> ConstrainedSortExecutor::produceRows(OutputAq
std::tie(_state, input) = _fetcher.fetchRow();
if (_state == ExecutionState::WAITING) {
return {_state, NoStats{}};
return _state;
}
if (!input.isInitialized()) {
TRI_ASSERT(_state == ExecutionState::DONE);
} else {
++_rowsRead;
if (_rowsPushed < _infos._limit || !compareInput(_rows.front(), input)) {
// Push this row into the heap
pushRow(input);
}
}
}
if (_returnNext >= _rows.size()) {
// Happens if we either have no upstream (e.g. _rows is empty),
// or if the dependency is pulling too often (which should not happen)
TRI_ASSERT(_state == ExecutionState::DONE);
return _state;
}
std::pair<ExecutionState, NoStats> ConstrainedSortExecutor::produceRows(OutputAqlItemRow& output) {
  // Produce exactly one row from the sorted heap per call.
  // NOTE(review): this body removes two stale lines that appear to be leftover
  // removed-diff residue: a duplicate `_rows[_returnNext++]` fetch (which would
  // redeclare heapRowPosition and advance _returnNext twice per call, skipping
  // every other row), and an early `return DONE` when the heap is exhausted,
  // which contradicts the deliberate "always report HASMORE" below that a
  // following LIMIT block with fullCount relies on.
  {
    // First drain the upstream into the heap; propagate WAITING untouched.
    ExecutionState state = consumeInput();
    TRI_ASSERT(state == _state);
    if (state == ExecutionState::WAITING) {
      return {ExecutionState::WAITING, NoStats{}};
    }
    TRI_ASSERT(state == ExecutionState::DONE);
  }

  if (doneProducing()) {
    if (doneSkipping()) {
      // Now we're really done
      return {ExecutionState::DONE, NoStats{}};
    }
    // We should never get here, as the following LIMIT block should never fetch
    // more than our limit. It may only skip after that. Thus:
    TRI_ASSERT(false);
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_INTERNAL_AQL,
        "Overfetch during constrained heap sort. Please report this error! Try "
        "turning off the sort-limit optimizer rule to get your query working.");
  }

  if (_returnNext == 0) {
    // Only once sort the rows again, s.t. the
    // contained list of elements is in the right ordering.
    std::sort(_rows.begin(), _rows.end(), *_cmpHeap);
  }

  // Now our heap is full and sorted, we just need to return it line by line
  TRI_ASSERT(_returnNext < _rows.size());
  auto const heapRowPosition = _rows[_returnNext];
  ++_returnNext;
  InputAqlItemRow heapRow(_heapBuffer, heapRowPosition);
  TRI_ASSERT(heapRow.isInitialized());
  TRI_ASSERT(heapRowPosition < _rowsPushed);
  output.copyRow(heapRow);

  // Lie, we may have a possible LIMIT block with fullCount to work.
  // We emitted at least one row at this point, so this is fine.
  return {ExecutionState::HASMORE, NoStats{}};
}
@ -213,8 +255,57 @@ std::pair<ExecutionState, size_t> ConstrainedSortExecutor::expectedNumberOfRows(
// We have exactly the following rows available:
rowsLeft = _rows.size() - _returnNext;
}
if (rowsLeft > 0) {
return {ExecutionState::HASMORE, rowsLeft};
}
if (rowsLeft == 0) {
if (doneSkipping()) {
return {ExecutionState::DONE, rowsLeft};
}
// We always report at least 1 row here, for a possible LIMIT block with fullCount to work.
// However, we should never have to do this if the LIMIT block doesn't overfetch with getSome.
rowsLeft = 1;
}
return {ExecutionState::HASMORE, rowsLeft};
}
std::tuple<ExecutionState, NoStats, size_t> ConstrainedSortExecutor::skipRows(size_t toSkipRequested) {
  // Skips up to toSkipRequested rows: first out of the sorted heap, then —
  // for fullCount — "virtually" over the rows that were dropped during the
  // constrained sort. Returns {state, stats, number actually skipped}.
  {
    // Drain the upstream into the heap first; propagate WAITING untouched.
    ExecutionState state = consumeInput();
    TRI_ASSERT(state == _state);
    if (state == ExecutionState::WAITING) {
      return {ExecutionState::WAITING, NoStats{}, 0};
    }
    TRI_ASSERT(state == ExecutionState::DONE);
  }

  if (_returnNext == 0) {
    // Only once sort the rows again, s.t. the
    // contained list of elements is in the right ordering.
    std::sort(_rows.begin(), _rows.end(), *_cmpHeap);
  }

  size_t skipped = 0;

  // Skip rows in the heap
  if (!doneProducing()) {
    TRI_ASSERT(_rows.size() >= _returnNext);
    auto const rowsLeftInHeap = _rows.size() - _returnNext;
    auto const skipNum = (std::min)(toSkipRequested, rowsLeftInHeap);
    _returnNext += skipNum;
    skipped += skipNum;
  }

  // Skip rows we've dropped
  if (skipped < toSkipRequested && !doneSkipping()) {
    TRI_ASSERT(doneProducing());
    auto const rowsLeftToSkip = _rowsRead - (_rows.size() + _skippedAfter);
    // BUGFIX: only the *remainder* of the request may be skipped here; the
    // heap above may already have satisfied part of it. Using the full
    // toSkipRequested would over-skip and break the invariant asserted below.
    auto const skipNum = (std::min)(toSkipRequested - skipped, rowsLeftToSkip);
    _skippedAfter += skipNum;
    skipped += skipNum;
  }

  TRI_ASSERT(skipped <= toSkipRequested);
  auto const state = doneSkipping() ? ExecutionState::DONE : ExecutionState::HASMORE;
  return {state, NoStats{}, skipped};
}

View File

@ -56,8 +56,6 @@ struct SortRegister;
*/
class ConstrainedSortExecutor {
public:
friend class Sorter;
struct Properties {
static const bool preservesOrder = false;
static const bool allowsBlockPassthrough = false;
@ -78,6 +76,8 @@ class ConstrainedSortExecutor {
*/
std::pair<ExecutionState, Stats> produceRows(OutputAqlItemRow& output);
std::tuple<ExecutionState, Stats, size_t> skipRows(size_t toSkipRequested);
/**
* @brief This Executor knows how many rows it will produce at most by itself
* It also knows that it could produce less if the upstream only has fewer rows.
@ -88,6 +88,16 @@ class ConstrainedSortExecutor {
bool compareInput(size_t const& rosPos, InputAqlItemRow& row) const;
arangodb::Result pushRow(InputAqlItemRow& row);
// We're done producing when we've emitted all rows from our heap.
bool doneProducing() const noexcept;
// We're done skipping when we've emitted all rows from our heap,
// AND emitted (in this case, skipped) all rows that were dropped during the
// sort as well. This is for fullCount queries only.
bool doneSkipping() const noexcept;
ExecutionState consumeInput();
private:
Infos& _infos;
Fetcher& _fetcher;
@ -95,6 +105,8 @@ class ConstrainedSortExecutor {
size_t _returnNext;
std::vector<size_t> _rows;
size_t _rowsPushed;
size_t _rowsRead;
size_t _skippedAfter;
SharedAqlItemBlockPtr _heapBuffer;
std::unique_ptr<ConstrainedLessThan> _cmpHeap; // in pointer to avoid
OutputAqlItemRow _heapOutputRow;

View File

@ -190,7 +190,12 @@ std::pair<ExecutionState, SharedAqlItemBlockPtr> ExecutionBlockImpl<Executor>::g
TRI_ASSERT(state == ExecutionState::HASMORE);
// When we're passing blocks through we have no control over the size of the
// output block.
if /* constexpr */ (!Executor::Properties::allowsBlockPassthrough) {
// Plus, the ConstrainedSortExecutor will report an expectedNumberOfRows
// according to its heap size, thus resulting in a smaller allocated output
// block. However, it won't report DONE after, because a LIMIT block with
// fullCount must continue to count after the sorted output.
if /* constexpr */ (!Executor::Properties::allowsBlockPassthrough &&
!std::is_same<Executor, ConstrainedSortExecutor>::value) {
TRI_ASSERT(_outputItemRow->numRowsWritten() == atMost);
}
@ -281,11 +286,13 @@ static SkipVariants constexpr skipType() {
std::is_same<Executor, IResearchViewMergeExecutor<false>>::value ||
std::is_same<Executor, IResearchViewMergeExecutor<true>>::value ||
std::is_same<Executor, EnumerateCollectionExecutor>::value ||
std::is_same<Executor, LimitExecutor>::value),
std::is_same<Executor, LimitExecutor>::value ||
std::is_same<Executor, ConstrainedSortExecutor>::value),
"Unexpected executor for SkipVariants::EXECUTOR");
// The LimitExecutor will not work correctly with SkipVariants::FETCHER!
static_assert(!std::is_same<Executor, LimitExecutor>::value || useFetcher,
static_assert(
!std::is_same<Executor, LimitExecutor>::value || useFetcher,
"LimitExecutor needs to implement skipRows() to work correctly");
if (useExecutor) {
@ -510,7 +517,11 @@ namespace arangodb {
namespace aql {
// The constant "PASSTHROUGH" is somehow reserved with MSVC.
enum class RequestWrappedBlockVariant { DEFAULT , PASS_THROUGH , INPUTRESTRICTED };
enum class RequestWrappedBlockVariant {
DEFAULT,
PASS_THROUGH,
INPUTRESTRICTED
};
// Specifying the namespace here is important to MSVC.
template <enum arangodb::aql::RequestWrappedBlockVariant>
@ -545,9 +556,9 @@ struct RequestWrappedBlock<RequestWrappedBlockVariant::PASS_THROUGH> {
typename Executor::Infos const& infos,
#endif
Executor& executor, ExecutionEngine& engine, size_t nrItems, RegisterCount nrRegs) {
static_assert(
Executor::Properties::allowsBlockPassthrough,
"This function can only be used with executors supporting `allowsBlockPassthrough`");
static_assert(Executor::Properties::allowsBlockPassthrough,
"This function can only be used with executors supporting "
"`allowsBlockPassthrough`");
static_assert(hasFetchBlockForPassthrough<Executor>::value,
"An Executor with allowsBlockPassthrough must implement "
"fetchBlockForPassthrough");
@ -601,9 +612,9 @@ struct RequestWrappedBlock<RequestWrappedBlockVariant::INPUTRESTRICTED> {
typename Executor::Infos const&,
#endif
Executor& executor, ExecutionEngine& engine, size_t nrItems, RegisterCount nrRegs) {
static_assert(
Executor::Properties::inputSizeRestrictsOutputSize,
"This function can only be used with executors supporting `inputSizeRestrictsOutputSize`");
static_assert(Executor::Properties::inputSizeRestrictsOutputSize,
"This function can only be used with executors supporting "
"`inputSizeRestrictsOutputSize`");
static_assert(hasExpectedNumberOfRows<Executor>::value,
"An Executor with inputSizeRestrictsOutputSize must "
"implement expectedNumberOfRows");

View File

@ -229,8 +229,8 @@ class OutputAqlItemRow {
#endif
_baseIndex = index;
}
// Use this function with caution! We need it for the SortedCollectExecutor
// and CountCollectExecutor.
// Use this function with caution! We need it for the SortedCollectExecutor,
// CountCollectExecutor, and the ConstrainedSortExecutor.
void setAllowSourceRowUninitialized() {
_allowSourceRowUninitialized = true;
}
@ -307,7 +307,7 @@ class OutputAqlItemRow {
bool _setBaseIndexNotUsed;
#endif
// Need this special bool for allowing an empty AqlValue inside the
// SortedCollectExecutor and CountCollectExecutor.
// SortedCollectExecutor, CountCollectExecutor and ConstrainedSortExecutor.
bool _allowSourceRowUninitialized;
private:

View File

@ -1,5 +1,5 @@
/*jshint globalstrict:false, strict:false, maxlen: 500 */
/*global assertEqual, AQL_EXPLAIN */
/*global assertEqual, AQL_EXPLAIN, AQL_EXECUTE */
////////////////////////////////////////////////////////////////////////////////
/// @brief tests for query language, limit optimizations
@ -28,11 +28,10 @@
/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
var jsunity = require("jsunity");
var internal = require("internal");
var helper = require("@arangodb/aql-helper");
var getQueryResults = helper.getQueryResults;
var db = internal.db;
const jsunity = require("jsunity");
const internal = require("internal");
const helper = require("@arangodb/aql-helper");
const getQueryResults = helper.getQueryResults;
////////////////////////////////////////////////////////////////////////////////
/// @brief test suite
@ -55,7 +54,7 @@ function ahuacatlQueryOptimizerLimitTestSuite () {
setUp : function () {
internal.db._drop(cn);
collection = internal.db._create(cn);
collection = internal.db._create(cn, {numberOfShards: 9});
for (var i = 0; i < docCount; ++i) {
collection.save({ _key: "test" + i, value : i });
@ -303,6 +302,35 @@ function ahuacatlQueryOptimizerLimitTestSuite () {
assertEqual(sorts[0].strategy, "constrained-heap");
},
////////////////////////////////////////////////////////////////////////////////
/// @brief check limit optimization with sort and fullCount
/// This is a regression test, the constrained heap did not play well with
/// fullCount when 3.5 was released.
////////////////////////////////////////////////////////////////////////////////
testLimitFullCollectionSortWithFullCount : function () {
const query = "FOR c IN " + cn + " SORT c.value LIMIT 20, 10 RETURN c";
const queryResult = AQL_EXECUTE(query, {}, {fullCount: true});
const values = queryResult.json;
const fullCount = queryResult.stats.fullCount;
assertEqual(10, values.length);
assertEqual(20, values[0].value);
assertEqual(21, values[1].value);
assertEqual(22, values[2].value);
assertEqual(29, values[9].value);
assertEqual(fullCount, 1000);
const sorts = getSorts(query);
assertEqual(sorts.length, 1);
assertEqual(sorts[0].limit, 30);
assertEqual(sorts[0].strategy, "constrained-heap");
},
};
}