From d10e79c5005c9436b7150b47b4152357376f5248 Mon Sep 17 00:00:00 2001
From: Jan Steemann <jan@arangodb.com>
Date: Tue, 21 Apr 2015 16:08:22 +0200
Subject: [PATCH] updated documentation

---
 CHANGELOG                                     | 101 ++++++++++++++++++
 Documentation/Books/Users/Aql/Basics.mdpp     |   2 +-
 Documentation/Books/Users/Aql/Operations.mdpp |  71 +++++++++---
 .../tests/aql-optimizer-collect-methods.js    |  73 +++++++++++++
 4 files changed, 232 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 678c393c63..5dfd7725b7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,104 @@
 v2.6.0 (XXXX-XX-XX)
 -------------------
 
+* added alternative implementation for AQL COLLECT
+
+  The alternative method uses a hash table for grouping and does not require its input elements
+  to be sorted. It will be taken into account by the optimizer for `COLLECT` statements that do
+  not use an `INTO` clause. 
+  
+  In case a `COLLECT` statement can use the hash table variant, the optimizer will create an extra 
+  plan for it at the beginning of the planning phase. In this plan, no extra `SORT` node will be
+  added in front of the `COLLECT` because the hash table variant of `COLLECT` does not require
+  sorted input. Instead, a `SORT` node will be added after it to sort its output. This `SORT` node
+  may be optimized away again in later stages. If the sort order of the result is irrelevant to
+  the user, adding an extra `SORT null` after a hash `COLLECT` operation will allow the optimizer to
+  remove the sorts altogether.
+  
+  In addition to the hash table variant of `COLLECT`, the optimizer will modify the original plan
+  to use the regular `COLLECT` implementation. As this implementation requires sorted input, the
+  optimizer will insert a `SORT` node in front of the `COLLECT`. This `SORT` node may be optimized
+  away in later stages.
+  
+  The created plans will then be shipped through the regular optimization pipeline. In the end,
+  the optimizer will pick the plan with the lowest estimated total cost as usual. The hash table
+  variant does not require an up-front sort of the input, and will thus be preferred over the
+  regular `COLLECT` if the optimizer estimates many input elements for the `COLLECT` node and 
+  cannot use an index to sort them.
+
+  The optimizer can be explicitly told to use the regular *sorted* variant of `COLLECT` by
+  suffixing a `COLLECT` statement with `OPTIONS { "method" : "sorted" }`. This will override the
+  optimizer guesswork and only produce the *sorted* variant of `COLLECT`.
+
+* refactored HTTP REST API for cursors
+
+  The HTTP REST API for cursors (`/_api/cursor`) has been refactored to improve its performance
+  and use less memory.
+
+  A post showing some of the performance improvements can be found here:
+  http://jsteemann.github.io/blog/2015/04/01/improvements-for-the-cursor-api/
+
+* simplified return value syntax for data-modification AQL queries
+
+  Since version 2.4, ArangoDB allows returning results from data-modification AQL queries. The
+  syntax for this was quite limited and verbose:
+  
+      FOR i IN 1..10
+        INSERT { value: i } IN test
+        LET inserted = NEW
+        RETURN inserted
+
+  The `LET inserted = NEW RETURN inserted` was required literally to return the inserted 
+  documents. No calculations could be made using the inserted documents. 
+
+  This is now more flexible. After a data-modification clause (e.g. `INSERT`, `UPDATE`, `REPLACE`,
+  `REMOVE`, `UPSERT`) there can follow any number of `LET` calculations. These calculations can 
+  refer to the pseudo-values `OLD` and `NEW` that are created by the data-modification statements.
+  
+  This allows returning projections of inserted or updated documents, e.g.:
+
+      FOR i IN 1..10
+        INSERT { value: i } IN test
+        RETURN { _key: NEW._key, value: i }
+
+  Still not every construct is allowed after a data-modification clause. For example, no functions
+  can be called that may access documents.
+
+  More information can be found here:
+  http://jsteemann.github.io/blog/2015/03/27/improvements-for-data-modification-queries/
+
+* added AQL `UPSERT` statement
+
+  This adds an `UPSERT` statement to AQL that is a combination of both `INSERT` and `UPDATE` /
+  `REPLACE`. The `UPSERT` will search for a matching document using a user-provided example.
+  If no document matches the example, the *insert* part of the `UPSERT` statement will be
+  executed. If there is a match, the *update* / *replace* part will be carried out:
+
+      UPSERT { page: 'index.html' }                 /* search example */
+        INSERT { page: 'index.html', pageViews: 1 } /* insert part */
+        UPDATE { pageViews: OLD.pageViews + 1 }     /* update part */
+        IN pageViews
+
+  `UPSERT` can be used with an `UPDATE` or `REPLACE` clause. The `UPDATE` clause will perform
+  a partial update of the found document, whereas the `REPLACE` clause will replace the found
+  document entirely. The `UPDATE` or `REPLACE` parts can refer to the pseudo-value `OLD`, which
+  contains all attributes of the found document.
+
+  `UPSERT` statements can optionally return values. In the following query, the return
+  attribute `found` will return the found document before the `UPDATE` was applied. If no
+  document was found, `found` will contain a value of `null`. The `updated` result attribute will
+  contain the inserted / updated document:
+      
+      UPSERT { page: 'index.html' }                 /* search example */
+        INSERT { page: 'index.html', pageViews: 1 } /* insert part */
+        UPDATE { pageViews: OLD.pageViews + 1 }     /* update part */
+        IN pageViews
+        RETURN { found: OLD, updated: NEW }
+
+  A more detailed description of `UPSERT` can be found here:
+  http://jsteemann.github.io/blog/2015/03/27/preview-of-the-upsert-command/
+
+
 * adjusted default configuration value for `--server.backlog-size` from 10 to 64.
 
 * issue #1231: bug xor feature in AQL: LENGTH(null) == 4 
@@ -59,6 +157,9 @@ v2.6.0 (XXXX-XX-XX)
   To support the feature, arangoimp also has a new command line option `--on-duplicate` which can
   have one of the values `error`, `update`, `replace`, `ignore`. The default value is `error`.
 
+  A few examples for using arangoimp with the `--on-duplicate` option can be found here:
+  http://jsteemann.github.io/blog/2015/04/14/updating-documents-with-arangoimp/
+
 * changed behavior of `db._query()` in the ArangoShell:
 
   if the command's result is printed in the shell, the first 10 results will be printed. Previously
diff --git a/Documentation/Books/Users/Aql/Basics.mdpp b/Documentation/Books/Users/Aql/Basics.mdpp
index f94c445535..cae3c8a158 100644
--- a/Documentation/Books/Users/Aql/Basics.mdpp
+++ b/Documentation/Books/Users/Aql/Basics.mdpp
@@ -57,7 +57,7 @@ On the top level, AQL offers the following operations:
 - `UPDATE`: (partial) update of existing documents
 - `REPLACE`: replacement of existing documents
 - `REMOVE`: removal of existing documents
-- `UPSERT`: insertion or (partial) update of existing documents
+- `UPSERT`: insertion of new documents, or update / replacement of existing documents
 
 Each of the above operations can be initiated in a query by using a keyword of
 the same name. An AQL query can (and typically does) consist of multiple of the
diff --git a/Documentation/Books/Users/Aql/Operations.mdpp b/Documentation/Books/Users/Aql/Operations.mdpp
index edfed0cd8d..d5c3e7fc0b 100644
--- a/Documentation/Books/Users/Aql/Operations.mdpp
+++ b/Documentation/Books/Users/Aql/Operations.mdpp
@@ -232,22 +232,14 @@ available.
 The general syntaxes for *COLLECT* are:
 
 ```
-COLLECT variable-name = expression
-COLLECT variable-name = expression INTO groups-variable
-COLLECT variable-name = expression INTO groups-variable = projection-expression
-COLLECT variable-name = expression INTO groups-variable KEEP keep-variable
-COLLECT variable-name = expression WITH COUNT INTO count-variable
-COLLECT WITH COUNT INTO count-variable
+COLLECT variable-name = expression options
+COLLECT variable-name = expression INTO groups-variable options
+COLLECT variable-name = expression INTO groups-variable = projection-expression options
+COLLECT variable-name = expression INTO groups-variable KEEP keep-variable options
+COLLECT variable-name = expression WITH COUNT INTO count-variable options
+COLLECT WITH COUNT INTO count-variable options
 ```
 
-Note: the first four forms of *COLLECT* require their input be sorted by
-the group criteria. To ensure correctness of the result, the AQL optimizer 
-will automatically insert a *SORT* statement into the query in front of the 
-*COLLECT* statement. The optimizer can optimize away the *SORT* statement
-later if a sorted index is present on the group criteria. This will make
-*COLLECT* operations more efficient.
-
-
 !SUBSUBSECTION Grouping syntaxes
 
 The first syntax form of *COLLECT* only groups the result by the defined group 
@@ -410,6 +402,57 @@ FOR u IN users
 Note: the *WITH COUNT* clause can only be used together with an *INTO* clause.
 
 
+!SUBSUBSECTION COLLECT variants
+
+Since ArangoDB 2.6, there are two variants of *COLLECT* that the optimizer can
+choose from: the *sorted* variant and the *hash* variant. The *hash* variant only becomes a
+candidate for *COLLECT* statements that do not use an *INTO* clause.
+
+The optimizer will always generate a plan that employs the *sorted* method. The *sorted* method 
+requires its input to be sorted by the group criteria specified in the *COLLECT* clause. 
+To ensure correctness of the result, the AQL optimizer will automatically insert a *SORT* 
+statement into the query in front of the *COLLECT* statement. The optimizer may be able to 
+optimize away that *SORT* statement later if a sorted index is present on the group criteria. 
+
+In case a *COLLECT* qualifies for using the *hash* variant, the optimizer will create an extra 
+plan for it at the beginning of the planning phase. In this plan, no extra *SORT* statement will be
+added in front of the *COLLECT*. This is because the *hash* variant of *COLLECT* does not require
+sorted input. Instead, a *SORT* statement will be added after the *COLLECT* to sort its output. 
+This *SORT* statement may be optimized away again in later stages. 
+If the sort order of the *COLLECT* result is irrelevant to the user, adding the extra instruction *SORT null* 
+after the *COLLECT* will allow the optimizer to remove the sorts altogether.
+  
+Which *COLLECT* variant is used by the optimizer depends on the optimizer's cost estimations. The 
+created plans with the different *COLLECT* variants will be shipped through the regular optimization 
+pipeline. In the end, the optimizer will pick the plan with the lowest estimated total cost as usual. 
+
+In general, the *sorted* variant of *COLLECT* should be preferred in cases when there is a sorted index
+present on the group criteria. In this case the optimizer can eliminate the *SORT* statement in front
+of the *COLLECT*, so that no *SORT* will be left. 
+
+If there is no sorted index available on the group criteria, the up-front sort required by the *sorted* 
+variant can be expensive. In this case it is likely that the optimizer will prefer the *hash* variant
+of *COLLECT*, which does not require its input to be sorted. 
+
+Which variant of *COLLECT* was actually used can be figured out by looking into the execution plan of
+a query, specifically the *AggregateNode* and its *aggregationOptions* attribute.
+
+
+!SUBSUBSECTION Setting COLLECT options
+
+*options* can be used in a *COLLECT* statement to inform the optimizer about the preferred *COLLECT*
+method. When the following suffix is appended to a *COLLECT* statement, the optimizer will always use
+the *sorted* variant of *COLLECT* and not even create a plan using the *hash* variant:
+
+```
+OPTIONS { method: "sorted" }
+```
+
+Note that specifying *hash* as method will not make the optimizer use the *hash* variant. This is
+because the *hash* variant is not eligible for all queries. Instead, if no options are given or any method
+other than *sorted* is specified in *OPTIONS*, the optimizer will use its regular cost estimations.
+
+
 !SUBSECTION REMOVE
 
 The *REMOVE* keyword can be used to remove documents from a collection. On a
diff --git a/js/server/tests/aql-optimizer-collect-methods.js b/js/server/tests/aql-optimizer-collect-methods.js
index 17951f3d9d..6b2d938596 100644
--- a/js/server/tests/aql-optimizer-collect-methods.js
+++ b/js/server/tests/aql-optimizer-collect-methods.js
@@ -256,6 +256,79 @@ function optimizerCollectMethodsTestSuite () {
         var results = AQL_EXECUTE(query[0]);
         assertEqual(query[1], results.json.length);
       });
+    },
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief test that requesting the 'hash' collect method via OPTIONS is ignored
+////////////////////////////////////////////////////////////////////////////////
+
+    testOverrideMethodWithHashIgnored : function () {
+      c.ensureIndex({ type: "skiplist", fields: [ "group" ] }); 
+      c.ensureIndex({ type: "skiplist", fields: [ "group", "value" ] }); 
+
+      // although every query asks for method 'hash', each plan is expected to contain exactly one
+      // AggregateNode using the 'sorted' method and no SortNode, as there are sorted indexes supporting it
+      var queries = [
+        "FOR j IN " + c.name() + " COLLECT value = j.group INTO g OPTIONS { method: 'hash' } RETURN [ value, g ]",
+        "FOR j IN " + c.name() + " COLLECT value = j.group OPTIONS { method: 'hash' } RETURN value",
+        "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value OPTIONS { method: 'hash' } RETURN [ value1, value2 ]",
+        "FOR j IN " + c.name() + " COLLECT value = j.group WITH COUNT INTO l OPTIONS { method: 'hash' } RETURN [ value, l ]",
+        "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value WITH COUNT INTO l OPTIONS { method: 'hash' } RETURN [ value1, value2, l ]"
+      ];
+
+      queries.forEach(function(query) {
+        var plan = AQL_EXPLAIN(query).plan;
+
+        var aggregateNodes = 0;
+        var sortNodes = 0;
+        plan.nodes.map(function(node) {
+          if (node.type === "AggregateNode") {
+            ++aggregateNodes;
+            assertEqual("sorted", node.aggregationOptions.method);
+          }
+          if (node.type === "SortNode") {
+            ++sortNodes;
+          }
+        });
+        
+        assertEqual(1, aggregateNodes);
+        assertEqual(0, sortNodes);
+      });
+    },
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief test that explicitly requesting the 'sorted' collect method is honored
+////////////////////////////////////////////////////////////////////////////////
+
+    testOverrideMethodSortedUsed : function () {
+      // every query explicitly asks for method 'sorted', so each plan is expected to contain exactly
+      // one AggregateNode using the 'sorted' method, plus exactly one SortNode
+      var queries = [
+        "FOR j IN " + c.name() + " COLLECT value = j.group INTO g OPTIONS { method: 'sorted' } RETURN [ value, g ]",
+        "FOR j IN " + c.name() + " COLLECT value = j.group OPTIONS { method: 'sorted' } RETURN value",
+        "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value OPTIONS { method: 'sorted' } RETURN [ value1, value2 ]",
+        "FOR j IN " + c.name() + " COLLECT value = j.group WITH COUNT INTO l OPTIONS { method: 'sorted' } RETURN [ value, l ]",
+        "FOR j IN " + c.name() + " COLLECT value1 = j.group, value2 = j.value WITH COUNT INTO l OPTIONS { method: 'sorted' } RETURN [ value1, value2, l ]"
+      ];
+
+      queries.forEach(function(query) {
+        var plan = AQL_EXPLAIN(query).plan;
+
+        var aggregateNodes = 0;
+        var sortNodes = 0;
+        plan.nodes.map(function(node) {
+          if (node.type === "AggregateNode") {
+            ++aggregateNodes;
+            assertEqual("sorted", node.aggregationOptions.method);
+          }
+          if (node.type === "SortNode") {
+            ++sortNodes;
+          }
+        });
+        
+        assertEqual(1, aggregateNodes);
+        assertEqual(1, sortNodes);
+      });
     }
 
   };