From d10e79c5005c9436b7150b47b4152357376f5248 Mon Sep 17 00:00:00 2001 From: Jan Steemann Date: Tue, 21 Apr 2015 16:08:22 +0200 Subject: [PATCH] updated documentation --- CHANGELOG | 101 ++++++++++++++++++ Documentation/Books/Users/Aql/Basics.mdpp | 2 +- Documentation/Books/Users/Aql/Operations.mdpp | 71 +++++++++--- .../tests/aql-optimizer-collect-methods.js | 73 +++++++++++++ 4 files changed, 232 insertions(+), 15 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 678c393c63..5dfd7725b7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,104 @@ v2.6.0 (XXXX-XX-XX) ------------------- +* added alternative implementation for AQL COLLECT + + The alternative method uses a hash table for grouping and does not require its input elements + to be sorted. It will be taken into account by the optimizer for `COLLECT` statements that do + not use an `INTO` clause. + + In case a `COLLECT` statement can use the hash table variant, the optimizer will create an extra + plan for it at the beginning of the planning phase. In this plan, no extra `SORT` node will be + added in front of the `COLLECT` because the hash table variant of `COLLECT` does not require + sorted input. Instead, a `SORT` node will be added after it to sort its output. This `SORT` node + may be optimized away again in later stages. If the sort order of the result is irrelevant to + the user, adding an extra `SORT null` after a hash `COLLECT` operation will allow the optimizer to + remove the sorts altogether. + + In addition to the hash table variant of `COLLECT`, the optimizer will modify the original plan + to use the regular `COLLECT` implementation. As this implementation requires sorted input, the + optimizer will insert a `SORT` node in front of the `COLLECT`. This `SORT` node may be optimized + away in later stages. + + The created plans will then be shipped through the regular optimization pipeline. In the end, + the optimizer will pick the plan with the lowest estimated total cost as usual. 
The hash table + variant does not require an up-front sort of the input, and will thus be preferred over the + regular `COLLECT` if the optimizer estimates many input elements for the `COLLECT` node and + cannot use an index to sort them. + + The optimizer can be explicitly told to use the regular *sorted* variant of `COLLECT` by + suffixing a `COLLECT` statement with `OPTIONS { "method" : "sorted" }`. This will override the + optimizer guesswork and only produce the *sorted* variant of `COLLECT`. + +* refactored HTTP REST API for cursors + + The HTTP REST API for cursors (`/_api/cursor`) has been refactored to improve its performance + and use less memory. + + A post showing some of the performance improvements can be found here: + http://jsteemann.github.io/blog/2015/04/01/improvements-for-the-cursor-api/ + +* simplified return value syntax for data-modification AQL queries + + Since version 2.4, ArangoDB allows returning results from data-modification AQL queries. The + syntax for this was quite limited and verbose: + + FOR i IN 1..10 + INSERT { value: i } IN test + LET inserted = NEW + RETURN inserted + + The `LET inserted = NEW RETURN inserted` was required literally to return the inserted + documents. No calculations could be made using the inserted documents. + + This is now more flexible. After a data-modification clause (e.g. `INSERT`, `UPDATE`, `REPLACE`, + `REMOVE`, `UPSERT`), any number of `LET` calculations can follow. These calculations can + refer to the pseudo-values `OLD` and `NEW` that are created by the data-modification statements. + + This allows returning projections of inserted or updated documents, e.g.: + + FOR i IN 1..10 + INSERT { value: i } IN test + RETURN { _key: NEW._key, value: i } + + Still, not every construct is allowed after a data-modification clause. For example, no functions + can be called that may access documents. 
+ + More information can be found here: + http://jsteemann.github.io/blog/2015/03/27/improvements-for-data-modification-queries/ + +* added AQL `UPSERT` statement + + This adds an `UPSERT` statement to AQL that is a combination of both `INSERT` and `UPDATE` / + `REPLACE`. The `UPSERT` will search for a matching document using a user-provided example. + If no document matches the example, the *insert* part of the `UPSERT` statement will be + executed. If there is a match, the *update* / *replace* part will be carried out: + + UPSERT { page: 'index.html' } /* search example */ + INSERT { page: 'index.html', pageViews: 1 } /* insert part */ + UPDATE { pageViews: OLD.pageViews + 1 } /* update part */ + IN pageViews + + `UPSERT` can be used with an `UPDATE` or `REPLACE` clause. The `UPDATE` clause will perform + a partial update of the found document, whereas the `REPLACE` clause will replace the found + document entirely. The `UPDATE` or `REPLACE` parts can refer to the pseudo-value `OLD`, which + contains all attributes of the found document. + + `UPSERT` statements can optionally return values. In the following query, the return + attribute `found` will return the found document before the `UPDATE` was applied. If no + document was found, `found` will contain a value of `null`. The `updated` result attribute will + contain the inserted / updated document: + + UPSERT { page: 'index.html' } /* search example */ + INSERT { page: 'index.html', pageViews: 1 } /* insert part */ + UPDATE { pageViews: OLD.pageViews + 1 } /* update part */ + IN pageViews + RETURN { found: OLD, updated: NEW } + + A more detailed description of `UPSERT` can be found here: + http://jsteemann.github.io/blog/2015/03/27/preview-of-the-upsert-command/ + + * adjusted default configuration value for `--server.backlog-size` from 10 to 64. 
* issue #1231: bug xor feature in AQL: LENGTH(null) == 4 @@ -59,6 +157,9 @@ v2.6.0 (XXXX-XX-XX) To support the feature, arangoimp also has a new command line option `--on-duplicate` which can have one of the values `error`, `update`, `replace`, `ignore`. The default value is `error`. + A few examples for using arangoimp with the `--on-duplicate` option can be found here: + http://jsteemann.github.io/blog/2015/04/14/updating-documents-with-arangoimp/ + * changed behavior of `db._query()` in the ArangoShell: if the command's result is printed in the shell, the first 10 results will be printed. Previously diff --git a/Documentation/Books/Users/Aql/Basics.mdpp b/Documentation/Books/Users/Aql/Basics.mdpp index f94c445535..cae3c8a158 100644 --- a/Documentation/Books/Users/Aql/Basics.mdpp +++ b/Documentation/Books/Users/Aql/Basics.mdpp @@ -57,7 +57,7 @@ On the top level, AQL offers the following operations: - `UPDATE`: (partial) update of existing documents - `REPLACE`: replacement of existing documents - `REMOVE`: removal of existing documents -- `UPSERT`: insertion or (partial) update of existing documents +- `UPSERT`: insertion or update of existing documents Each of the above operations can be initiated in a query by using a keyword of the same name. An AQL query can (and typically does) consist of multiple of the diff --git a/Documentation/Books/Users/Aql/Operations.mdpp b/Documentation/Books/Users/Aql/Operations.mdpp index edfed0cd8d..d5c3e7fc0b 100644 --- a/Documentation/Books/Users/Aql/Operations.mdpp +++ b/Documentation/Books/Users/Aql/Operations.mdpp @@ -232,22 +232,14 @@ available. 
The general syntaxes for *COLLECT* are: ``` -COLLECT variable-name = expression -COLLECT variable-name = expression INTO groups-variable -COLLECT variable-name = expression INTO groups-variable = projection-expression -COLLECT variable-name = expression INTO groups-variable KEEP keep-variable -COLLECT variable-name = expression WITH COUNT INTO count-variable -COLLECT WITH COUNT INTO count-variable +COLLECT variable-name = expression options +COLLECT variable-name = expression INTO groups-variable options +COLLECT variable-name = expression INTO groups-variable = projection-expression options +COLLECT variable-name = expression INTO groups-variable KEEP keep-variable options +COLLECT variable-name = expression WITH COUNT INTO count-variable options +COLLECT WITH COUNT INTO count-variable options ``` -Note: the first four forms of *COLLECT* require their input be sorted by -the group criteria. To ensure correctness of the result, the AQL optimizer -will automatically insert a *SORT* statement into the query in front of the -*COLLECT* statement. The optimizer can optimize away the *SORT* statement -later if a sorted index is present on the group criteria. This will make -*COLLECT* operations more efficient. - - !SUBSUBSECTION Grouping syntaxes The first syntax form of *COLLECT* only groups the result by the defined group @@ -410,6 +402,57 @@ FOR u IN users Note: the *WITH COUNT* clause can only be used together with an *INTO* clause. +!SUBSUBSECTION COLLECT variants + +Since ArangoDB 2.6, there are two variants of *COLLECT* that the optimizer can +choose from: the *sorted* variant and the *hash* variant. The *hash* variant only becomes a +candidate for *COLLECT* statements that do not use an *INTO* clause. + +The optimizer will always generate a plan that employs the *sorted* method. The *sorted* method +requires its input to be sorted by the group criteria specified in the *COLLECT* clause. 
+To ensure correctness of the result, the AQL optimizer will automatically insert a *SORT* +statement into the query in front of the *COLLECT* statement. The optimizer may be able to +optimize away that *SORT* statement later if a sorted index is present on the group criteria. + +In case a *COLLECT* qualifies for using the *hash* variant, the optimizer will create an extra +plan for it at the beginning of the planning phase. In this plan, no extra *SORT* statement will be +added in front of the *COLLECT*. This is because the *hash* variant of *COLLECT* does not require +sorted input. Instead, a *SORT* statement will be added after the *COLLECT* to sort its output. +This *SORT* statement may be optimized away again in later stages. +If the sort order of the *COLLECT* result is irrelevant to the user, adding the extra instruction *SORT null* +after the *COLLECT* will allow the optimizer to remove the sorts altogether. + +Which *COLLECT* variant is used by the optimizer depends on the optimizer's cost estimates. The +created plans with the different *COLLECT* variants will be shipped through the regular optimization +pipeline. In the end, the optimizer will pick the plan with the lowest estimated total cost as usual. + +In general, the *sorted* variant of *COLLECT* should be preferred in cases where there is a sorted index +present on the group criteria. In this case the optimizer can eliminate the *SORT* statement in front +of the *COLLECT*, so that no *SORT* will be left. + +If there is no sorted index available on the group criteria, the up-front sort required by the *sorted* +variant can be expensive. In this case it is likely that the optimizer will prefer the *hash* variant +of *COLLECT*, which does not require its input to be sorted. + +Which variant of *COLLECT* was actually used can be determined by inspecting the execution plan of +a query, specifically the *AggregateNode* and its *aggregationOptions* attribute. 
+ + +!SUBSUBSECTION Setting COLLECT options + +*options* can be used in a *COLLECT* statement to inform the optimizer about the preferred *COLLECT* +method. When specifying the following appendix to a *COLLECT* statement, the optimizer will always use +the *sorted* variant of *COLLECT* and not even create a plan using the *hash* variant: + +``` +OPTIONS { method: "sorted" } +``` + +Note that specifying *hash* as method will not make the optimizer use the *hash* variant. This is +because the *hash* variant is not eligible for all queries. Instead, if no options or any other method +than *sorted* are specified in *OPTIONS*, the optimizer will use its regular cost estimations. + + !SUBSECTION REMOVE The *REMOVE* keyword can be used to remove documents from a collection. On a diff --git a/js/server/tests/aql-optimizer-collect-methods.js b/js/server/tests/aql-optimizer-collect-methods.js index 17951f3d9d..6b2d938596 100644 --- a/js/server/tests/aql-optimizer-collect-methods.js +++ b/js/server/tests/aql-optimizer-collect-methods.js @@ -256,6 +256,79 @@ function optimizerCollectMethodsTestSuite () { var results = AQL_EXECUTE(query[0]); assertEqual(query[1], results.json.length); }); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test override of collect method +//////////////////////////////////////////////////////////////////////////////// + + testOverrideMethodWithHashIgnored : function () { + c.ensureIndex({ type: "skiplist", fields: [ "group" ] }); + c.ensureIndex({ type: "skiplist", fields: [ "group", "value" ] }); + + // the expectation is that the optimizer will still use the 'sorted' method here as there are + // sorted indexes supporting it + var queries = [ + "FOR j IN " + c.name() + " COLLECT value = j.group INTO g OPTIONS { method: 'hash' } RETURN [ value, g ]", + "FOR j IN " + c.name() + " COLLECT value = j.group OPTIONS { method: 'hash' } RETURN value", + "FOR j IN " + c.name() + " COLLECT value1 = 
j.group, value2 = j.value OPTIONS { method: 'hash' } RETURN [ value1, value2 ]", + "FOR j IN " + c.name() + " COLLECT value = j.group WITH COUNT INTO l OPTIONS { method: 'hash' } RETURN [ value, l ]", + "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value WITH COUNT INTO l OPTIONS { method: 'hash' } RETURN [ value1, value2, l ]" + ]; + + queries.forEach(function(query) { + var plan = AQL_EXPLAIN(query).plan; + + var aggregateNodes = 0; + var sortNodes = 0; + plan.nodes.map(function(node) { + if (node.type === "AggregateNode") { + ++aggregateNodes; + assertEqual("sorted", node.aggregationOptions.method); + } + if (node.type === "SortNode") { + ++sortNodes; + } + }); + + assertEqual(1, aggregateNodes); + assertEqual(0, sortNodes); + }); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test override of collect method +//////////////////////////////////////////////////////////////////////////////// + + testOverrideMethodSortedUsed : function () { + // the expectation is that the optimizer will use the 'sorted' method here because we + // explicitly ask for it + var queries = [ + "FOR j IN " + c.name() + " COLLECT value = j.group INTO g OPTIONS { method: 'sorted' } RETURN [ value, g ]", + "FOR j IN " + c.name() + " COLLECT value = j.group OPTIONS { method: 'sorted' } RETURN value", + "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value OPTIONS { method: 'sorted' } RETURN [ value1, value2 ]", + "FOR j IN " + c.name() + " COLLECT value = j.group WITH COUNT INTO l OPTIONS { method: 'sorted' } RETURN [ value, l ]", + "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value WITH COUNT INTO l OPTIONS { method: 'sorted' } RETURN [ value1, value2, l ]" + ]; + + queries.forEach(function(query) { + var plan = AQL_EXPLAIN(query).plan; + + var aggregateNodes = 0; + var sortNodes = 0; + plan.nodes.map(function(node) { + if (node.type === "AggregateNode") { + ++aggregateNodes; + 
assertEqual("sorted", node.aggregationOptions.method); + } + if (node.type === "SortNode") { + ++sortNodes; + } + }); + + assertEqual(1, aggregateNodes); + assertEqual(1, sortNodes); + }); } };