Background Indexing (#8039)

2019-01-29 09:31:32 +01:00 · 2019-01-29 09:31:32 +01:00 · fd70b6fc34
parent 93ea32a00a
commit fd70b6fc34
29 changed files with 808 additions and 564 deletions
--- a/Documentation/Books/Manual/Indexing/Hash.md
+++ b/Documentation/Books/Manual/Indexing/Hash.md
@ -118,7 +118,6 @@ details, including the index-identifier, is returned.
    @endDocuBlock ensureHashIndexArray


-{###
 Creating Hash Index in Background
 ---------------------------------

@ -142,7 +141,7 @@ db.collection.ensureIndex({ type: "hash", fields: [ "value" ], inBackground: tru
 ```

 For more information see "Creating Indexes in Background" in the [Index basics](IndexBasics.md) page.
-###}
+

 Ensure uniqueness of relations in edge collections
 --------------------------------------------------
--- a/Documentation/Books/Manual/Indexing/IndexBasics.md
+++ b/Documentation/Books/Manual/Indexing/IndexBasics.md
@ -8,7 +8,7 @@ of documents.

 User-defined indexes can be created on collection level. Most user-defined indexes 
 can be created by specifying the names of the index attributes.
-Some index types allow indexing just one attribute (e.g. fulltext index) whereas 
+Some index types allow indexing just one attribute (e.g. *fulltext* index) whereas 
 other index types allow indexing multiple attributes at the same time.

 Learn how to use different indexes efficiently by going through the
@ -22,15 +22,14 @@ are covered by an edge collection's edge index automatically.
 Using the system attribute `_id` in user-defined indexes is not possible, but 
 indexing `_key`, `_rev`, `_from`, and `_to` is.

-{###
-Creating new indexes is usually done under an exclusive collection lock. The collection is not
-available as long as the index is created.  This "foreground" index creation can be undesirable, 
+Creating new indexes is by default done under an exclusive collection lock. The collection is not
+available while the index is being created.  This "foreground" index creation can be undesirable, 
 if you have to perform it on a live system without a dedicated maintenance window.

 For potentially long running index  creation operations the _rocksdb_ storage-engine also supports 
-creating indexes in "background". The collection remains available during the index creation, 
-see the section "Creating Indexes in Background" for more information.
-###}
+creating indexes in "background". The collection remains (mostly) available during the index creation, 
+see the section [Creating Indexes in Background](#creating-indexes-in-background) for more information.
+

 ArangoDB provides the following index types:

@ -551,7 +550,7 @@ based on the costs it estimates, even if a vertex centric index might
 in fact be faster. Vertex centric indexes are more likely to be chosen
 for highly connected graphs and with RocksDB storage engine.

-{###
+
 Creating Indexes in Background
 ------------------------------

@ -610,4 +609,4 @@ if this list grows to tens of millions of entries.

 Building an index is always a write heavy operation (internally), it is always a good idea to build indexes
 during times with less load.
-###}
+
--- a/Documentation/Books/Manual/Indexing/Skiplist.md
+++ b/Documentation/Books/Manual/Indexing/Skiplist.md
@ -187,7 +187,6 @@ and
 will match.


-{###
 Creating Skiplist Index in Background
 -------------------------------------

@ -211,4 +210,4 @@ db.collection.ensureIndex({ type: "skiplist", fields: [ "value" ], inBackground:
 ```

 For more information see "Creating Indexes in Background" in the [Index basics](IndexBasics.md#) page.
-###}
+
--- a/arangod/IResearch/AqlHelper.h
+++ b/arangod/IResearch/AqlHelper.h
@ -491,12 +491,12 @@ aql::AstNode const* checkAttributeAccess(aql::AstNode const* node,
 }  // namespace iresearch
 }  // namespace arangodb

-#endif  // ARANGOD_IRESEARCH__AQL_HELPER_H
-
 #if defined(__GNUC__)
 #pragma GCC diagnostic pop
 #endif

+#endif  // ARANGOD_IRESEARCH__AQL_HELPER_H
+
 // -----------------------------------------------------------------------------
 // --SECTION--                                                       END-OF-FILE
 // -----------------------------------------------------------------------------
--- a/arangod/RestServer/FlushFeature.h
+++ b/arangod/RestServer/FlushFeature.h
@ -26,7 +26,7 @@
 #include "ApplicationFeatures/ApplicationFeature.h"
 #include "Basics/ReadWriteLock.h"

-class TRI_vocbase_t; // forward declaration
+struct TRI_vocbase_t; // forward declaration

 namespace arangodb {

@ -51,7 +51,7 @@ class FlushFeature final : public application_features::ApplicationFeature {
    virtual ~FlushSubscription() = default;
    virtual Result commit(velocypack::Slice const& data) = 0;
  };
-  struct FlushSubscriptionBase; // forward declaration
+  class FlushSubscriptionBase; // forward declaration

  explicit FlushFeature(application_features::ApplicationServer& server);

--- a/arangod/RocksDBEngine/RocksDBBackgroundThread.cpp
+++ b/arangod/RocksDBEngine/RocksDBBackgroundThread.cpp
@ -32,7 +32,8 @@
 using namespace arangodb;

 RocksDBBackgroundThread::RocksDBBackgroundThread(RocksDBEngine* eng, double interval)
-    : Thread("RocksDBThread"), _engine(eng), _interval(interval) {}
+    : Thread("RocksDBThread"), _engine(eng), _interval(interval),
+      _disableWalFilePruning(0) {}

 RocksDBBackgroundThread::~RocksDBBackgroundThread() { shutdown(); }

@ -97,6 +98,7 @@ void RocksDBBackgroundThread::run() {
        });
      }

+      if (!disableWalFilePruning()) {
        // only start pruning of obsolete WAL files a few minutes after
        // server start. if we start pruning too early, replication slaves
        // will not have a chance to reconnect to a restarted master in
@ -108,6 +110,7 @@ void RocksDBBackgroundThread::run() {
          // and then prune them when they expired
          _engine->pruneWalFiles();
        }
+      }
    } catch (std::exception const& ex) {
      LOG_TOPIC(WARN, Logger::ENGINES)
          << "caught exception in rocksdb background thread: " << ex.what();
--- a/arangod/RocksDBEngine/RocksDBBackgroundThread.h
+++ b/arangod/RocksDBEngine/RocksDBBackgroundThread.h
@ -53,8 +53,24 @@ class RocksDBBackgroundThread final : public Thread {

  void beginShutdown() override;

+  /// disable pruning of wal files
+  void disableWalFilePruning(bool disable) {
+    int sub = 0;
+    if (disable) {
+      sub = _disableWalFilePruning.fetch_add(1);
+    } else {
+      sub = _disableWalFilePruning.fetch_sub(1);
+    }
+    TRI_ASSERT(sub >= 0);
+  }
+  
+  bool disableWalFilePruning() const {
+    return _disableWalFilePruning.load(std::memory_order_acquire) > 0;
+  }
+
 protected:
  void run() override;
+  std::atomic<int> _disableWalFilePruning;
 };
 }  // namespace arangodb

--- a/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
+++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.cpp
@ -21,10 +21,13 @@
 ////////////////////////////////////////////////////////////////////////////////

 #include "RocksDBBuilderIndex.h"
+
+#include "Basics/HashSet.h"
 #include "Basics/VelocyPackHelper.h"
 #include "RocksDBEngine/RocksDBCollection.h"
 #include "RocksDBEngine/RocksDBColumnFamily.h"
 #include "RocksDBEngine/RocksDBCommon.h"
+#include "RocksDBEngine/RocksDBLogValue.h"
 #include "RocksDBEngine/RocksDBMethods.h"
 #include "RocksDBEngine/RocksDBTransactionCollection.h"
 #include "RocksDBEngine/RocksDBTransactionState.h"
@ -39,17 +42,43 @@
 #include <rocksdb/utilities/transaction_db.h>
 #include <rocksdb/utilities/write_batch_with_index.h>

+#include <velocypack/Builder.h>
 #include <velocypack/Iterator.h>
+#include <velocypack/velocypack-aliases.h>

 using namespace arangodb;
 using namespace arangodb::rocksutils;

+namespace {
+struct BuilderTrx : public arangodb::transaction::Methods {
+  BuilderTrx(std::shared_ptr<transaction::Context> const& transactionContext,
+             LogicalDataSource const& collection, AccessMode::Type type)
+      : transaction::Methods(transactionContext), _cid(collection.id()) {
+    // add the (sole) data-source
+    addCollection(collection.id(), collection.name(), type);
+    addHint(transaction::Hints::Hint::NO_DLD);
+  }
+
+  /// @brief get the underlying transaction collection
+  RocksDBTransactionCollection* resolveTrxCollection() {
+    return static_cast<RocksDBTransactionCollection*>(trxCollection(_cid));
+  }
+
+ private:
+  TRI_voc_cid_t _cid;
+};
+
+struct BuilderCookie : public arangodb::TransactionState::Cookie {
+  // do not track removed documents twice
+  arangodb::HashSet<LocalDocumentId::BaseType> tracked;
+};
+}  // namespace
+
 RocksDBBuilderIndex::RocksDBBuilderIndex(std::shared_ptr<arangodb::RocksDBIndex> const& wp)
    : RocksDBIndex(wp->id(), wp->collection(), wp->fields(), wp->unique(),
                   wp->sparse(), wp->columnFamily(), wp->objectId(),
                   /*useCache*/ false),
-      _wrapped(wp),
-      _hasError(false) {
+      _wrapped(wp) {
  TRI_ASSERT(_wrapped);
 }

@ -72,18 +101,24 @@ Result RocksDBBuilderIndex::insert(transaction::Methods& trx, RocksDBMethods* mt
                                   LocalDocumentId const& documentId,
                                   arangodb::velocypack::Slice const& slice,
                                   OperationMode mode) {
-  TRI_ASSERT(false);  // not enabled
-  Result r = _wrapped->insert(trx, mthd, documentId, slice, mode);
-  if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) {
-    // these are expected errors; store in builder and suppress
-    bool expected = false;
-    if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) {
-      std::lock_guard<std::mutex> guard(_errorMutex);
-      _errorResult = r;
+#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
+  auto* ctx = dynamic_cast<::BuilderCookie*>(trx.state()->cookie(this));
+#else
+  auto* ctx = static_cast<::BuilderCookie*>(trx.state()->cookie(this));
+#endif
+  if (!ctx) {
+    auto ptr = std::make_unique<::BuilderCookie>();
+    ctx = ptr.get();
+    trx.state()->cookie(this, std::move(ptr));
  }
-    return Result();
+  
+  // do not track document more than once
+  if (ctx->tracked.find(documentId.id()) == ctx->tracked.end()) {
+    ctx->tracked.insert(documentId.id());
+    RocksDBLogValue val = RocksDBLogValue::TrackedDocumentInsert(documentId, slice);
+    mthd->PutLogData(val.slice());
  }
-  return r;
+  return Result();  // do nothing
 }

 /// remove index elements and put it in the specified write batch.
@ -91,221 +126,36 @@ Result RocksDBBuilderIndex::remove(transaction::Methods& trx, RocksDBMethods* mt
                                   LocalDocumentId const& documentId,
                                   arangodb::velocypack::Slice const& slice,
                                   OperationMode mode) {
-  TRI_ASSERT(false);  // not enabled
-  {
-    std::lock_guard<std::mutex> guard(_removedDocsMutex);
-    _removedDocs.insert(documentId.id());
-  }
-  {  // wait for keys do be inserted, so we can remove them again
-    std::unique_lock<std::mutex> guard(_lockedDocsMutex);
-    if (_lockedDocs.find(documentId.id()) != _lockedDocs.end()) {
-      _lockedDocsCond.wait(guard);
-    }
+#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
+  auto* ctx = dynamic_cast<::BuilderCookie*>(trx.state()->cookie(this));
+#else
+  auto* ctx = static_cast<::BuilderCookie*>(trx.state()->cookie(this));
+#endif
+  if (!ctx) {
+    auto ptr = std::make_unique<::BuilderCookie>();
+    ctx = ptr.get();
+    trx.state()->cookie(this, std::move(ptr));
  }

-  Result r = _wrapped->remove(trx, mthd, documentId, slice, mode);
-  if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) {
-    // these are expected errors; store in builder and suppress
-    bool expected = false;
-    if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) {
-      std::lock_guard<std::mutex> guard(_errorMutex);
-      _errorResult = r;
+  // do not track document more than once
+  if (ctx->tracked.find(documentId.id()) == ctx->tracked.end()) {
+    ctx->tracked.insert(documentId.id());
+    RocksDBLogValue val = RocksDBLogValue::TrackedDocumentRemove(documentId, slice);
+    mthd->PutLogData(val.slice());
  }
-    return Result();
-  }
-  return r;
-}
-
-namespace {
-struct BuilderTrx : public arangodb::transaction::Methods {
-  BuilderTrx(std::shared_ptr<transaction::Context> const& transactionContext,
-             LogicalDataSource const& collection, AccessMode::Type type)
-      : transaction::Methods(transactionContext), _cid(collection.id()) {
-    // add the (sole) data-source
-    addCollection(collection.id(), collection.name(), type);
-    addHint(transaction::Hints::Hint::NO_DLD);
-  }
-
-  /// @brief get the underlying transaction collection
-  RocksDBTransactionCollection* resolveTrxCollection() {
-    return static_cast<RocksDBTransactionCollection*>(trxCollection(_cid));
-  }
-
- private:
-  TRI_voc_cid_t _cid;
-};
-
-//struct BuilderCookie {
-//  VPackBuffer<uint8_t>
-//  VPackBuilder _removals;
-//};
-}  // namespace
-
-// Background index filler task
-// FIXME simon: not used right now because rollbacks are not correctly handled
-// yet
-arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()> const& unlock) {
-  arangodb::Result res;
-
-  //  1. Index everything under a snapshot iterator (get snapshot under
-  //  exclusive coll lock)
-  //  2. Track deleted document IDs so we can avoid indexing them
-  //  3. Avoid conflicts on unique index keys by using rocksdb::Transaction
-  //  snapshot conflict checking
-  //  4. Supress unique constraint violations / conflicts or client drivers
-
-  auto lockedDocsGuard = scopeGuard([&] {  // clear all the processed documents
-    std::lock_guard<std::mutex> guard(_lockedDocsMutex);
-    _lockedDocs.clear();
-    _lockedDocsCond.notify_all();
-  });
-
-  // fillindex can be non transactional, we just need to clean up
-  RocksDBEngine* engine = rocksutils::globalRocksEngine();
-  RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(_collection.getPhysical());
-  rocksdb::DB* rootDB = engine->db()->GetRootDB();
-  TRI_ASSERT(rootDB != nullptr);
-
-  uint64_t numDocsWritten = 0;
-
-  auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
-
-  rocksdb::Slice upper(bounds.end());  // exclusive upper bound
-  rocksdb::Status s;
-  rocksdb::WriteOptions wo;
-  wo.disableWAL = false;  // TODO set to true eventually
-
-  // create a read-snapshot under the guard
-  rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
-  auto snapGuard = scopeGuard([&] { rootDB->ReleaseSnapshot(snap); });
-  TRI_ASSERT(snap != nullptr);
-
-  rocksdb::ReadOptions ro;
-  ro.snapshot = snap;
-  ro.prefix_same_as_start = true;
-  ro.iterate_upper_bound = &upper;
-  ro.verify_checksums = false;
-  ro.fill_cache = false;
-
-  rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily();
-  std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
-
-  unlock();  // release indexes write lock
-  // FIXME use buildertrx
-  SingleCollectionTransaction trx(transaction::StandaloneContext::Create(
-                                      _collection.vocbase()),
-                                  _collection, AccessMode::Type::WRITE);
-  res = trx.begin();
-  if (res.fail()) {
-    return res;
-  }
-  auto state = RocksDBTransactionState::toState(&trx);
-
-  // transaction used to perform actual indexing
-  rocksdb::TransactionOptions to;
-  to.lock_timeout = 100;  // 100ms
-  std::unique_ptr<rocksdb::Transaction> rtrx(engine->db()->BeginTransaction(wo, to));
-  if (this->unique()) {
-    rtrx->SetSnapshot();  // needed for unique index conflict detection
-  } else {
-    rtrx->DisableIndexing();  // we never check for existing index keys
-  }
-  RocksDBSideTrxMethods batched(state, rtrx.get());
-
-  RocksDBIndex* internal = _wrapped.get();
-  TRI_ASSERT(internal != nullptr);
-
-  // FIXE make selectivity estimates batch wise
-  it->Seek(bounds.start());
-  while (it->Valid() && it->key().compare(upper) < 0) {
-    if (_hasError.load(std::memory_order_acquire)) {
-      std::lock_guard<std::mutex> guard(_errorMutex);
-      res = _errorResult;  // a Writer got an error
-      break;
-    }
-
-    LocalDocumentId docId = RocksDBKey::documentId(it->key());
-    {
-      // must acquire both locks here to prevent interleaved operations
-      std::lock_guard<std::mutex> guard(_removedDocsMutex);
-      std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
-      if (_removedDocs.find(docId.id()) != _removedDocs.end()) {
-        _removedDocs.erase(_removedDocs.find(docId.id()));
-        it->Next();
-        continue;
-      }
-      _lockedDocs.insert(docId.id());
-    }
-
-    res = internal->insert(trx, &batched, docId, VPackSlice(it->value().data()),
-                           Index::OperationMode::normal);
-    if (res.fail()) {
-      break;
-    }
-    numDocsWritten++;
-
-    if (numDocsWritten % 200 == 0) {  // commit buffered writes
-      s = rtrx->Commit();
-      if (!s.ok()) {
-        res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
-        break;
-      }
-      {  // clear all the processed documents
-        std::lock_guard<std::mutex> guard(_lockedDocsMutex);
-        _lockedDocs.clear();
-        _lockedDocsCond.notify_all();
-      }
-      engine->db()->BeginTransaction(wo, to, rtrx.get());  // reuse transaction
-      if (this->unique()) {
-        rtrx->SetSnapshot();
-      }
-    }
-
-    it->Next();
-  }
-
-  // now actually write all remaining index keys
-  if (res.ok() && rtrx->GetNumPuts() > 0) {
-    s = rtrx->Commit();
-    if (!s.ok()) {
-      res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
-    }
-  }
-
-  if (res.ok()) {
-    res = trx.commit();  // required to commit selectivity estimates
-  }
-
-  return res;
+  return Result();  // do nothing
 }

 // fast mode assuming exclusive access locked from outside
 template <typename WriteBatchType, typename MethodsType>
-static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& coll,
-                                      WriteBatchType& batch) {
-  Result res;
-  ::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll,
-                   AccessMode::Type::EXCLUSIVE);
-  trx.addHint(transaction::Hints::Hint::LOCK_NEVER);  // already locked
-  res = trx.begin();
-  if (!res.ok()) {
-    THROW_ARANGO_EXCEPTION(res);
-  }
-
-  RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(coll.getPhysical());
-  auto state = RocksDBTransactionState::toState(&trx);
-  auto methds = RocksDBTransactionState::toMethods(&trx);
-  RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
-
+static arangodb::Result fillIndex(RocksDBIndex& ridx, WriteBatchType& batch,
+                                  rocksdb::Snapshot const* snap) {
  // fillindex can be non transactional, we just need to clean up
-  RocksDBEngine* engine = rocksutils::globalRocksEngine();
-  rocksdb::DB* rootDB = engine->db()->GetRootDB();
+  rocksdb::DB* rootDB = rocksutils::globalRocksDB()->GetRootDB();
  TRI_ASSERT(rootDB != nullptr);

-  uint64_t numDocsWritten = 0;
-  // write batch will be reset every x documents
-  MethodsType batched(state, &batch);
-
+  RocksDBCollection* rcoll =
+      static_cast<RocksDBCollection*>(ridx.collection().getPhysical());
  auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
  rocksdb::Slice upper(bounds.end());

@ -313,20 +163,32 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
  rocksdb::WriteOptions wo;
  wo.disableWAL = false;  // TODO set to true eventually

-  const rocksdb::Snapshot* snap = rootDB->GetSnapshot();
-  auto snapGuard = scopeGuard([&] { rootDB->ReleaseSnapshot(snap); });
-
-  rocksdb::ReadOptions ro;
+  rocksdb::ReadOptions ro(/*cksum*/ false, /*cache*/ false);
  ro.snapshot = snap;
  ro.prefix_same_as_start = true;
  ro.iterate_upper_bound = &upper;
-  ro.verify_checksums = false;
-  ro.fill_cache = false;

  rocksdb::ColumnFamilyHandle* docCF = RocksDBColumnFamily::documents();
-  std::unique_ptr<rocksdb::Iterator> it = methds->NewIterator(ro, docCF);
+  std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));

-  auto commitLambda = [&] {
+  auto mode = snap == nullptr ? AccessMode::Type::EXCLUSIVE : AccessMode::Type::WRITE;
+  LogicalCollection& coll = ridx.collection();
+  ::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll, mode);
+  if (mode == AccessMode::Type::EXCLUSIVE) {
+    trx.addHint(transaction::Hints::Hint::LOCK_NEVER);
+  }
+  Result res = trx.begin();
+  if (!res.ok()) {
+    THROW_ARANGO_EXCEPTION(res);
+  }
+
+  uint64_t numDocsWritten = 0;
+  auto state = RocksDBTransactionState::toState(&trx);
+  RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
+  // write batch will be reset every x documents
+  MethodsType batched(state, &batch);
+
+  auto commitLambda = [&](rocksdb::SequenceNumber seq) {
    if (batch.GetWriteBatch()->Count() > 0) {
      s = rootDB->Write(wo, batch.GetWriteBatch());
      if (!s.ok()) {
@ -339,14 +201,18 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
    if (!ops.empty()) {
      TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1);
      auto it = ops.begin();
-      ridx.estimator()->bufferUpdates(it->first, std::move(it->second.inserts),
+      TRI_ASSERT(ridx.id() == it->first);
+      ridx.estimator()->bufferUpdates(seq, std::move(it->second.inserts),
                                      std::move(it->second.removals));
    }
  };

-  it->Seek(bounds.start());
-  while (it->Valid()) {
+  for (it->Seek(bounds.start()); it->Valid(); it->Next()) {
    TRI_ASSERT(it->key().compare(upper) < 0);
+    if (application_features::ApplicationServer::isStopping()) {
+      res.reset(TRI_ERROR_SHUTTING_DOWN);
+      break;
+    }

    res = ridx.insert(trx, &batched, RocksDBKey::documentId(it->key()),
                      VPackSlice(it->value().data()), Index::OperationMode::normal);
@ -356,57 +222,398 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
    numDocsWritten++;

    if (numDocsWritten % 200 == 0) {  // commit buffered writes
-      commitLambda();
+      commitLambda(rootDB->GetLatestSequenceNumber());
      if (res.fail()) {
        break;
      }
    }
+  }

-    it->Next();
+  if (!it->status().ok() && res.ok()) {
+    res = rocksutils::convertStatus(it->status(), rocksutils::StatusHint::index);
  }

  if (res.ok()) {
-    commitLambda();
+    commitLambda(rootDB->GetLatestSequenceNumber());
  }
-  batch.Clear();

  if (res.ok()) {  // required so iresearch commits
    res = trx.commit();
  }

-  // we will need to remove index elements created before an error
-  // occurred, this needs to happen since we are non transactional
-  if (res.fail()) {
-    RocksDBKeyBounds bounds = ridx.getBounds();
-    arangodb::Result res2 =
-        rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, true,
-                                     /*useRangeDel*/ numDocsWritten > 25000);
-    if (res2.fail()) {
-      LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back "
-                                       << "index creation: " << res2.errorMessage();
-    }
-  }
+  // if an error occured drop() will be called
+  LOG_TOPIC(DEBUG, Logger::ENGINES) << "SNAPSHOT CAPTURED " << numDocsWritten << " " << res.errorMessage();
  
  return res;
 }

-/// non-transactional: fill index with existing documents
-/// from this collection
-arangodb::Result RocksDBBuilderIndex::fillIndexFast() {
+ arangodb::Result RocksDBBuilderIndex::fillIndexForeground() {
  RocksDBIndex* internal = _wrapped.get();
  TRI_ASSERT(internal != nullptr);
+  std::function<void()> empty;
+
+  const rocksdb::Snapshot* snap = nullptr;
+
+  Result res;
  if (this->unique()) {
    const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
    // unique index. we need to keep track of all our changes because we need to
    // avoid duplicate index keys. must therefore use a WriteBatchWithIndex
    rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
-    return ::fillIndexFast<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
-        *internal, _collection, batch);
+    res = ::fillIndex<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(*internal, batch, snap);
  } else {
    // non-unique index. all index keys will be unique anyway because they
    // contain the document id we can therefore get away with a cheap WriteBatch
    rocksdb::WriteBatch batch(32 * 1024 * 1024);
-    return ::fillIndexFast<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, _collection,
-                                                                       batch);
+    res = ::fillIndex<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch, snap);
+  }
+
+  return res;
+}
+
+namespace {
+
+template <typename MethodsType>
+struct ReplayHandler final : public rocksdb::WriteBatch::Handler {
+  ReplayHandler(uint64_t oid, RocksDBIndex& idx, transaction::Methods& trx, MethodsType* methods)
+      : _objectId(oid), _index(idx), _trx(trx), _methods(methods) {}
+
+  bool Continue() override {
+    if (application_features::ApplicationServer::isStopping()) {
+      tmpRes.reset(TRI_ERROR_SHUTTING_DOWN);
+    }
+    return tmpRes.ok();
+  }
+
+  uint64_t numInserted = 0;
+  uint64_t numRemoved = 0;
+  Result tmpRes;
+
+  void startNewBatch(rocksdb::SequenceNumber startSequence) {
+    // starting new write batch
+    _startSequence = startSequence;
+    _currentSequence = startSequence;
+    _startOfBatch = true;
+    _lastObjectID = 0;
+  }
+
+  uint64_t endBatch() {
+    _lastObjectID = 0;
+    return _currentSequence;
+  }
+
+  // The default implementation of LogData does nothing.
+  void LogData(const rocksdb::Slice& blob) override {
+    
+    switch (RocksDBLogValue::type(blob)) {
+      case RocksDBLogType::TrackedDocumentInsert:
+        if (_lastObjectID == _objectId) {
+          auto pair = RocksDBLogValue::trackedDocument(blob);
+          tmpRes = _index.insert(_trx, _methods, pair.first, pair.second,
+                                 Index::OperationMode::normal);
+          numInserted++;
+        }
+      break;
+        
+      case RocksDBLogType::TrackedDocumentRemove:
+        if (_lastObjectID == _objectId) {
+          auto pair = RocksDBLogValue::trackedDocument(blob);
+          tmpRes = _index.remove(_trx, _methods, pair.first, pair.second,
+                                 Index::OperationMode::normal);
+          numRemoved++;
+        }
+        break;
+      
+      default:  // ignore
+        _lastObjectID = 0;
+        break;
+    }
+  }
+
+  rocksdb::Status PutCF(uint32_t column_family_id, const rocksdb::Slice& key,
+                        rocksdb::Slice const& value) override {
+    incTick();
+    if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
+      _lastObjectID = 0;
+    } else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
+      _lastObjectID = RocksDBKey::objectId(key);
+    }
+
+    return rocksdb::Status();
+  }
+
+  rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
+    incTick();
+    if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
+      _lastObjectID = 0;
+    } else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
+      _lastObjectID = RocksDBKey::objectId(key);
+    }
+    return rocksdb::Status();
+  }
+
+  rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
+    incTick();
+    if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
+      _lastObjectID = 0;
+    } else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
+      _lastObjectID = RocksDBKey::objectId(key);
+    }
+    return rocksdb::Status();
+  }
+
+  rocksdb::Status DeleteRangeCF(uint32_t column_family_id,
+                                const rocksdb::Slice& begin_key,
+                                const rocksdb::Slice& end_key) override {
+    incTick();                 // drop and truncate may use this
+    if (column_family_id == _index.columnFamily()->GetID() &&
+        RocksDBKey::objectId(begin_key) == _objectId &&
+        RocksDBKey::objectId(end_key) == _objectId) {
+      _index.afterTruncate(_currentSequence);
+    }
+    return rocksdb::Status();  // make WAL iterator happy
+  }
+
+ private:
+  // tick function that is called before each new WAL entry
+  void incTick() {
+    if (_startOfBatch) {
+      // we are at the start of a batch. do NOT increase sequence number
+      _startOfBatch = false;
+    } else {
+      // we are inside a batch already. now increase sequence number
+      ++_currentSequence;
+    }
+  }
+
+ private:
+  const uint64_t _objectId;  /// collection objectID
+  RocksDBIndex& _index;      /// the index to use
+  transaction::Methods& _trx;
+  MethodsType* _methods;  /// methods to fill
+
+  rocksdb::SequenceNumber _startSequence;
+  rocksdb::SequenceNumber _currentSequence;
+  rocksdb::SequenceNumber _lastWrittenSequence;
+  bool _startOfBatch = false;
+  uint64_t _lastObjectID = 0;
+};
+
+template <typename WriteBatchType, typename MethodsType>
+Result catchup(RocksDBIndex& ridx, WriteBatchType& wb, AccessMode::Type mode,
+               rocksdb::SequenceNumber startingFrom,
+               rocksdb::SequenceNumber& lastScannedTick, uint64_t& numScanned) {
+  LogicalCollection& coll = ridx.collection();
+  ::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll, mode);
+  if (mode == AccessMode::Type::EXCLUSIVE) {
+    trx.addHint(transaction::Hints::Hint::LOCK_NEVER);
+  }
+  Result res = trx.begin();
+  if (res.fail()) {
+    return res;
+  }
+
+  auto state = RocksDBTransactionState::toState(&trx);
+  RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
+  RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(coll.getPhysical());
+
+  rocksdb::DB* rootDB = rocksutils::globalRocksDB()->GetRootDB();
+  TRI_ASSERT(rootDB != nullptr);
+
+  // write batch will be reset every x documents
+  MethodsType batched(state, &wb);
+
+  ReplayHandler<MethodsType> replay(rcoll->objectId(), ridx, trx, &batched);
+
+  std::unique_ptr<rocksdb::TransactionLogIterator> iterator;  // reader();
+  // no need verifying the WAL contents
+  rocksdb::TransactionLogIterator::ReadOptions ro(false);
+  rocksdb::Status s = rootDB->GetUpdatesSince(startingFrom, &iterator, ro);
+  if (!s.ok()) {
+    return res.reset(convertStatus(s, rocksutils::StatusHint::wal));
+  }
+
+  auto commitLambda = [&](rocksdb::SequenceNumber seq) {
+    if (wb.GetWriteBatch()->Count() > 0) {
+      rocksdb::WriteOptions wo;
+      s = rootDB->Write(wo, wb.GetWriteBatch());
+      if (!s.ok()) {
+        res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
+      }
+    }
+    wb.Clear();
+
+    auto ops = trxColl->stealTrackedOperations();
+    if (!ops.empty()) {
+      TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1);
+      auto it = ops.begin();
+      TRI_ASSERT(ridx.id() == it->first);
+      ridx.estimator()->bufferUpdates(seq, std::move(it->second.inserts),
+                                      std::move(it->second.removals));
+    }
+  };
+
+  LOG_TOPIC(DEBUG, Logger::ENGINES) << "Scanning from " << startingFrom;
+
+  for (; iterator->Valid(); iterator->Next()) {
+    rocksdb::BatchResult batch = iterator->GetBatch();
+    lastScannedTick = batch.sequence;  // start of the batch
+    if (batch.sequence < startingFrom) {
+      continue;  // skip
+    }
+
+    replay.startNewBatch(batch.sequence);
+    s = batch.writeBatchPtr->Iterate(&replay);
+    if (!s.ok()) {
+      res = rocksutils::convertStatus(s);
+      break;
+    }
+    if (replay.tmpRes.fail()) {
+      res = replay.tmpRes;
+      break;
+    }
+    
+    commitLambda(batch.sequence);
+    if (res.fail()) {
+      break;
+    }
+    lastScannedTick = replay.endBatch();
+  }
+
+  if (!iterator->status().ok() && res.ok()) {
+    LOG_TOPIC(ERR, Logger::ENGINES) << "iterator error " << s.ToString();
+    res = rocksutils::convertStatus(iterator->status());
+  }
+
+  if (res.ok()) {
+    numScanned = replay.numInserted + replay.numRemoved;
+    res = trx.commit();  // important for iresearch
+  }
+
+  LOG_TOPIC(DEBUG, Logger::ENGINES) << "WAL REPLAYED insertions: " << replay.numInserted
+            << "; deletions: " << replay.numRemoved << "; lastScannedTick "
+            << lastScannedTick;
+
+  return res;
+}
+}  // namespace
+
+bool RocksDBBuilderIndex::Locker::lock() {
+  if (!_locked) {
+    if (_collection->lockWrite() != TRI_ERROR_NO_ERROR) {
+      return false;
+    }
+    _locked = true;
+  }
+  return true;
+}
+
+void RocksDBBuilderIndex::Locker::unlock() {
+  if (_locked) {
+    _collection->unlockWrite();
+    _locked = false;
  }
 }
+
+// Background index filler task
+arangodb::Result RocksDBBuilderIndex::fillIndexBackground(Locker& locker) {
+  TRI_ASSERT(locker.isLocked());
+  
+  arangodb::Result res;
+  RocksDBIndex* internal = _wrapped.get();
+  TRI_ASSERT(internal != nullptr);
+
+  RocksDBEngine* engine = globalRocksEngine();
+  rocksdb::DB* rootDB = engine->db()->GetRootDB();
+  rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
+  engine->disableWalFilePruning(true);
+    
+  auto scope = scopeGuard([&] {
+    engine->disableWalFilePruning(false);
+    if (snap) {
+      rootDB->ReleaseSnapshot(snap);
+    }
+  });
+  
+  locker.unlock();
+  if (internal->unique()) {
+    const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
+    // unique index. we need to keep track of all our changes because we need to
+    // avoid duplicate index keys. must therefore use a WriteBatchWithIndex
+    rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
+    res = ::fillIndex<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
+        *internal, batch, snap);
+  } else {
+    // non-unique index. all index keys will be unique anyway because they
+    // contain the document id we can therefore get away with a cheap WriteBatch
+    rocksdb::WriteBatch batch(32 * 1024 * 1024);
+    res = ::fillIndex<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
+                                                                  snap);
+  }
+
+  if (res.fail()) {
+    return res;
+  }
+
+  rocksdb::SequenceNumber scanFrom = snap->GetSequenceNumber();
+  rootDB->ReleaseSnapshot(snap);
+  snap = nullptr;
+
+  rocksdb::SequenceNumber lastScanned = 0;
+  uint64_t numScanned = 0;
+  
+  int maxCatchups = 4;
+  while(true) {
+    lastScanned = 0;
+    if (internal->unique()) {
+      const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
+      // unique index. we need to keep track of all our changes because we need to
+      // avoid duplicate index keys. must therefore use a WriteBatchWithIndex
+      rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
+      res = ::catchup<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
+          *internal, batch, AccessMode::Type::WRITE, scanFrom, lastScanned, numScanned);
+    } else {
+      // non-unique index. all index keys will be unique anyway because they
+      // contain the document id we can therefore get away with a cheap WriteBatch
+      rocksdb::WriteBatch batch(32 * 1024 * 1024);
+      res = ::catchup<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
+                                                                  AccessMode::Type::WRITE,
+                                                                  scanFrom, lastScanned,
+                                                                  numScanned);
+    }
+
+    if (res.fail()) {
+      return res;
+    }
+    
+    if (numScanned < 5000 || maxCatchups-- == 0) {
+      TRI_ASSERT(lastScanned > scanFrom);
+      std::this_thread::yield();
+      break;
+    }
+  }
+
+  if (!locker.lock()) {
+    return res.reset(TRI_ERROR_LOCK_TIMEOUT);
+  }
+  
+  scanFrom = lastScanned;
+  if (internal->unique()) {
+    const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
+    // unique index. we need to keep track of all our changes because we need to
+    // avoid duplicate index keys. must therefore use a WriteBatchWithIndex
+    rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
+    res = ::catchup<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
+        *internal, batch, AccessMode::Type::EXCLUSIVE, scanFrom, lastScanned, numScanned);
+  } else {
+    // non-unique index. all index keys will be unique anyway because they
+    // contain the document id we can therefore get away with a cheap WriteBatch
+    rocksdb::WriteBatch batch(32 * 1024 * 1024);
+    res = ::catchup<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
+                                                                AccessMode::Type::EXCLUSIVE,
+                                                                scanFrom, lastScanned,
+                                                                numScanned);
+  }
+  
+  return res;
+}
--- a/arangod/RocksDBEngine/RocksDBBuilderIndex.h
+++ b/arangod/RocksDBEngine/RocksDBBuilderIndex.h
@ -30,6 +30,8 @@

 namespace arangodb {
  
+class RocksDBCollection;
+
 /// Dummy index class that contains the logic to build indexes
 /// without an exclusive lock. It wraps the actual index implementation
 /// and adds some required synchronization logic on top
@ -71,13 +73,11 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
  bool hasSelectivityEstimate() const override { return false; }

  /// insert index elements into the specified write batch.
-  Result insert(transaction::Methods& trx, RocksDBMethods*,
-                LocalDocumentId const& documentId,
+  Result insert(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId,
                arangodb::velocypack::Slice const&, OperationMode mode) override;

  /// remove index elements and put it in the specified write batch.
-  Result remove(transaction::Methods& trx, RocksDBMethods*,
-                LocalDocumentId const& documentId,
+  Result remove(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId,
                arangodb::velocypack::Slice const&, OperationMode mode) override;

  RocksDBBuilderIndex(std::shared_ptr<arangodb::RocksDBIndex> const&);
@ -91,12 +91,23 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
  }
  void recalculateEstimates() override { _wrapped->recalculateEstimates(); }

-  /// @brief fill index, will exclusively lock the collection
-  Result fillIndexFast();
+  /// @brief assumes an exclusive lock on the collection
+  Result fillIndexForeground();
+  
+  struct Locker {
+    Locker(RocksDBCollection* c) : _collection(c), _locked(false) {}
+    ~Locker() { unlock(); }
+    bool lock();
+    void unlock();
+    bool isLocked() const { return _locked; }
+  private:
+    RocksDBCollection* const _collection;
+    bool _locked;
+  };

  /// @brief fill the index, assume already locked exclusively
-  /// @param unlock called when collection lock can be released
-  Result fillIndexBackground(std::function<void()> const& unlock);
+  /// @param locker locks and unlocks the collection
+  Result fillIndexBackground(Locker& locker);

  virtual IndexIterator* iteratorForCondition(transaction::Methods* trx,
                                              ManagedDocumentResult* result,
@ -109,17 +120,6 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {

 private:
  std::shared_ptr<arangodb::RocksDBIndex> _wrapped;
-
-  std::atomic<bool> _hasError;
-  std::mutex _errorMutex;
-  Result _errorResult;
-
-  std::mutex _removedDocsMutex;
-  std::unordered_set<LocalDocumentId::BaseType> _removedDocs;
-
-  std::mutex _lockedDocsMutex;
-  std::condition_variable _lockedDocsCond;
-  std::unordered_set<LocalDocumentId::BaseType> _lockedDocs;
 };
 }  // namespace arangodb

--- a/arangod/RocksDBEngine/RocksDBCollection.cpp
+++ b/arangod/RocksDBEngine/RocksDBCollection.cpp
@ -80,7 +80,8 @@ RocksDBCollection::RocksDBCollection(LogicalCollection& collection,
      _cacheEnabled(
          !collection.system() &&
          basics::VelocyPackHelper::readBooleanValue(info, "cacheEnabled", false) &&
-          CacheManagerFeature::MANAGER != nullptr) {
+          CacheManagerFeature::MANAGER != nullptr),
+      _numIndexCreations(0) {
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  VPackSlice s = info.get("isVolatile");
  if (s.isBoolean() && s.getBoolean()) {
@ -108,7 +109,8 @@ RocksDBCollection::RocksDBCollection(LogicalCollection& collection,
      _cache(nullptr),
      _cachePresent(false),
      _cacheEnabled(static_cast<RocksDBCollection const*>(physical)->_cacheEnabled &&
-                    CacheManagerFeature::MANAGER != nullptr) {
+                    CacheManagerFeature::MANAGER != nullptr),
+      _numIndexCreations(0) {
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  rocksutils::globalRocksEngine()->addCollectionMapping(
      _objectId, _logicalCollection.vocbase().id(), _logicalCollection.id());
@ -264,8 +266,23 @@ void RocksDBCollection::prepareIndexes(arangodb::velocypack::Slice indexesSlice)
  }

  WRITE_LOCKER(guard, _indexesLock);
+  TRI_ASSERT(_indexes.empty());
  for (std::shared_ptr<Index>& idx : indexes) {
-    addIndex(std::move(idx));
+    auto const id = idx->id();
+    for (auto const& it : _indexes) {
+      if (it->id() == id) {  // index is there twice
+        idx.reset();
+      }
+    }
+
+    if (idx) {
+      TRI_UpdateTickServer(static_cast<TRI_voc_tick_t>(id));
+      _indexes.emplace_back(idx);
+      if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) {
+        TRI_ASSERT(idx->id() == 0);
+        _primaryIndex = static_cast<RocksDBPrimaryIndex*>(idx.get());
+      }
+    }
  }

  if (_indexes[0]->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX ||
@ -286,7 +303,7 @@ void RocksDBCollection::prepareIndexes(arangodb::velocypack::Slice indexesSlice)
  TRI_ASSERT(!_indexes.empty());
 }

-std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slice const& info,
+std::shared_ptr<Index> RocksDBCollection::createIndex(VPackSlice const& info,
                                                      bool restore, bool& created) {
  TRI_ASSERT(info.isObject());
  Result res;
@ -298,18 +315,17 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
  if (res.fail()) {
    THROW_ARANGO_EXCEPTION(res);
  }
-  auto releaseGuard =
-      scopeGuard([&] { vocbase.releaseCollection(&_logicalCollection); });
-  res = lockWrite();
-  if (res.fail()) {
-    THROW_ARANGO_EXCEPTION(res);
-  }
-  //  WRITE_LOCKER(indexGuard, _indexesLock);
-  auto unlockGuard = scopeGuard([&] {
-    //    indexGuard.unlock(); // unlock in reverse order
-    this->unlockWrite();
+  _numIndexCreations.fetch_add(1, std::memory_order_release);
+  auto colGuard = scopeGuard([&] {
+    vocbase.releaseCollection(&_logicalCollection);
+    _numIndexCreations.fetch_sub(1, std::memory_order_release);
  });

+  RocksDBBuilderIndex::Locker locker(this);
+  if (!locker.lock()) {
+    THROW_ARANGO_EXCEPTION(TRI_ERROR_LOCK_TIMEOUT);
+  }
+
  std::shared_ptr<Index> idx;
  {  // Step 1. Check for matching index
    WRITE_LOCKER(guard, _indexesLock);
@ -377,23 +393,18 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
    }
  }

-  const bool inBackground = false;  // TODO simon: will be enabled in a seperate PR
-#if 0
-  bool inBackground = basics::VelocyPackHelper::getBooleanValue(
-                        info, StaticStrings::IndexInBackground, false);
-#endif
-
+  const bool inBackground =
+      basics::VelocyPackHelper::getBooleanValue(info, StaticStrings::IndexInBackground, false);
  // Step 4. fill index
  if (res.ok()) {
    if (inBackground) {  // allow concurrent inserts into index
      _indexes.emplace_back(buildIdx);
-      res = buildIdx->fillIndexBackground([&] {
-        unlockGuard.fire();  // will be called at appropriate time
-      });
+      res = buildIdx->fillIndexBackground(locker);
    } else {
-      res = buildIdx->fillIndexFast();  // will lock again internally
+      res = buildIdx->fillIndexForeground();  // will lock again internally
    }
  }
+  locker.lock(); // always lock to avoid inconsistencies

  // Step 5. cleanup
  if (res.ok()) {
@ -411,20 +422,6 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
      }
    }

-    //    // we should sync the selectivity estimates TODO fix
-    //    res = engine->settingsManager()->sync(false);
-    //    if (res.fail()) { // not critical
-    //      LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: "
-    //      << res.errorMessage();
-    //      res.reset();
-    //    }
-    //
-    //    rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true);
-    //    if (!s.ok()) { // not critical
-    //      LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: "
-    //      << s.ToString();
-    //    }
-
 #if USE_PLAN_CACHE
    arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection->vocbase());
 #endif
@ -442,7 +439,6 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
    }
  }

-  unlockGuard.fire();  // may have already been fired
  if (res.fail()) {
    {  // We could not create the index. Better abort
      WRITE_LOCKER(guard, _indexesLock);
@ -550,8 +546,7 @@ Result RocksDBCollection::truncate(transaction::Methods& trx, OperationOptions&

  if (state->isOnlyExclusiveTransaction() &&
      state->hasHint(transaction::Hints::Hint::ALLOW_RANGE_DELETE) &&
-      static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE)->canUseRangeDeleteInWal() &&
-      _numberDocuments >= 32 * 1024) {
+      this->canUseRangeDeleteInWal() && _numberDocuments >= 32 * 1024) {
    // non-transactional truncate optimization. We perform a bunch of
    // range deletes and circumwent the normal rocksdb::Transaction.
    // no savepoint needed here
@ -562,8 +557,6 @@ Result RocksDBCollection::truncate(transaction::Methods& trx, OperationOptions&
    }

    RocksDBEngine* engine = rocksutils::globalRocksEngine();
-    // add the assertion again here, so we are sure we can use RangeDeletes
-    TRI_ASSERT(engine->canUseRangeDeleteInWal());
    rocksdb::DB* db = engine->db()->GetRootDB();

    TRI_IF_FAILURE("RocksDBCollection::truncate::forceSync") {
@ -831,9 +824,10 @@ Result RocksDBCollection::insert(arangodb::transaction::Methods* trx,
      if (documentId.isSet()) {
        if (options.indexOperationMode == Index::OperationMode::internal) {
          // need to return the key of the conflict document
-          return Result(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED, keySlice.copyString());
+          return res.reset(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED,
+                           keySlice.copyString());
        }
-        return Result(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
+        return res.reset(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
      }
    }
  }
@ -910,7 +904,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
    int result = checkRevision(trx, expectedRev, prevRev);

    if (result != TRI_ERROR_NO_ERROR) {
-      return Result(result);
+      return res.reset(result);
    }
  }

@ -920,7 +914,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
    TRI_ASSERT(!mdr.empty());

    trackWaitForSync(trx, options);
-    return Result();
+    return res;
  }

  // merge old and new values
@ -937,7 +931,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
  if (_isDBServer) {
    // Need to check that no sharding keys have changed:
    if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) {
-      return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
+      return res.reset(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
    }
  }

@ -994,13 +988,13 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
  // get the previous revision
  VPackSlice key = newSlice.get(StaticStrings::KeyString);

+  Result res;
  if (key.isNone()) {
-    return Result(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
+    return res.reset(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
  }

  // get the previous revision
-  Result res = this->read(trx, key, previous, /*lock*/ false);
-
+  res = this->read(trx, key, previous, /*lock*/ false);
  if (res.fail()) {
    return res;
  }
@ -1018,10 +1012,10 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
    if (newSlice.isObject()) {
      expectedRev = TRI_ExtractRevisionId(newSlice);
    }
-    int res = checkRevision(trx, expectedRev, prevRev);

-    if (res != TRI_ERROR_NO_ERROR) {
-      return Result(res);
+    res = checkRevision(trx, expectedRev, prevRev);
+    if (res.fail()) {
+      return res;
    }
  }

@ -1038,7 +1032,7 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
  if (_isDBServer) {
    // Need to check that no sharding keys have changed:
    if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) {
-      return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
+      return res.reset(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
    }
  }

@ -1050,9 +1044,9 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
  // add possible log statement under guard
  state->prepareOperation(_logicalCollection.id(), revisionId, TRI_VOC_DOCUMENT_OPERATION_REPLACE);

-  Result opResult = updateDocument(trx, oldDocumentId, oldDoc, documentId, newDoc, options);
+  res = updateDocument(trx, oldDocumentId, oldDoc, documentId, newDoc, options);

-  if (opResult.ok()) {
+  if (res.ok()) {
    trackWaitForSync(trx, options);

    if (options.silent) {
@ -1079,7 +1073,7 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
    guard.finish(hasPerformedIntermediateCommit);
  }

-  return opResult;
+  return res;
 }

 Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice slice,
@ -1103,7 +1097,7 @@ Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice sl
  TRI_ASSERT(!key.isNone());

  // get the previous revision
-  auto res = this->read(&trx, key, previous, /*lock*/ false);
+  Result res = this->read(&trx, key, previous, /*lock*/ false);

  if (res.fail()) {
    return res;
@ -1120,10 +1114,10 @@ Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice sl
  // Check old revision:
  if (!options.ignoreRevs && slice.isObject()) {
    TRI_voc_rid_t expectedRevisionId = TRI_ExtractRevisionId(slice);
-    auto res = checkRevision(&trx, expectedRevisionId, oldRevisionId);
+    res = checkRevision(&trx, expectedRevisionId, oldRevisionId);

-    if (res != TRI_ERROR_NO_ERROR) {
-      return Result(res);
+    if (res.fail()) {
+      return res;
    }
  }

@ -1193,29 +1187,6 @@ void RocksDBCollection::figuresSpecific(std::shared_ptr<arangodb::velocypack::Bu
  }
 }

-void RocksDBCollection::addIndex(std::shared_ptr<arangodb::Index> idx) {
-  // LOCKED from the outside
-  // primary index must be added at position 0
-  TRI_ASSERT(ServerState::instance()->isRunningInCluster() ||
-             idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX ||
-             _indexes.empty());
-
-  auto const id = idx->id();
-  for (auto const& it : _indexes) {
-    if (it->id() == id) {
-      // already have this particular index. do not add it again
-      return;
-    }
-  }
-
-  TRI_UpdateTickServer(static_cast<TRI_voc_tick_t>(id));
-  _indexes.emplace_back(idx);
-  if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) {
-    TRI_ASSERT(idx->id() == 0);
-    _primaryIndex = static_cast<RocksDBPrimaryIndex*>(idx.get());
-  }
-}
-
 Result RocksDBCollection::insertDocument(arangodb::transaction::Methods* trx,
                                         LocalDocumentId const& documentId,
                                         VPackSlice const& doc,
@ -1312,28 +1283,23 @@ Result RocksDBCollection::updateDocument(transaction::Methods* trx,
  Result res;

  RocksDBMethods* mthd = RocksDBTransactionState::toMethods(trx);
-
-  // We NEED to do the PUT first, otherwise WAL tailing breaks
-  RocksDBKeyLeaser newKey(trx);
-  newKey->constructDocument(_objectId, newDocumentId);
-  // simon: we do not need to blacklist the new documentId
-
  // disable indexing in this transaction if we are allowed to
  IndexingDisabler disabler(mthd, trx->isSingleOperationTransaction());

-  rocksdb::Status s =
-      mthd->Put(RocksDBColumnFamily::documents(), newKey.ref(),
-                rocksdb::Slice(reinterpret_cast<char const*>(newDoc.begin()),
-                               static_cast<size_t>(newDoc.byteSize())));
+  RocksDBKeyLeaser key(trx);
+  key->constructDocument(_objectId, oldDocumentId);
+  blackListKey(key->string().data(), static_cast<uint32_t>(key->string().size()));
+
+  rocksdb::Status s = mthd->SingleDelete(RocksDBColumnFamily::documents(), key.ref());
  if (!s.ok()) {
    return res.reset(rocksutils::convertStatus(s, rocksutils::document));
  }

-  RocksDBKeyLeaser oldKey(trx);
-  oldKey->constructDocument(_objectId, oldDocumentId);
-  blackListKey(oldKey->string().data(), static_cast<uint32_t>(oldKey->string().size()));
-
-  s = mthd->SingleDelete(RocksDBColumnFamily::documents(), oldKey.ref());
+  key->constructDocument(_objectId, newDocumentId);
+  // simon: we do not need to blacklist the new documentId
+  s = mthd->Put(RocksDBColumnFamily::documents(), key.ref(),
+                rocksdb::Slice(reinterpret_cast<char const*>(newDoc.begin()),
+                               static_cast<size_t>(newDoc.byteSize())));
  if (!s.ok()) {
    return res.reset(rocksutils::convertStatus(s, rocksutils::document));
  }
@ -1341,8 +1307,8 @@ Result RocksDBCollection::updateDocument(transaction::Methods* trx,
  READ_LOCKER(guard, _indexesLock);
  for (std::shared_ptr<Index> const& idx : _indexes) {
    RocksDBIndex* rIdx = static_cast<RocksDBIndex*>(idx.get());
-    res = rIdx->update(*trx, mthd, oldDocumentId, oldDoc, newDocumentId,
-                       newDoc, options.indexOperationMode);
+    res = rIdx->update(*trx, mthd, oldDocumentId, oldDoc, newDocumentId, newDoc,
+                       options.indexOperationMode);

    if (res.fail()) {
      break;
@ -1411,8 +1377,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(LocalDocumentId const& d
    }
  } else {
    LOG_TOPIC(DEBUG, Logger::ENGINES)
-        << "NOT FOUND rev: " << documentId.id()
-        << " trx: " << trx->state()->id() << " seq: " << mthd->sequenceNumber()
+        << "NOT FOUND rev: " << documentId.id() << " trx: " << trx->state()->id()
        << " objectID " << _objectId << " name: " << _logicalCollection.name();
    mdr.clear();
    res.reset(rocksutils::convertStatus(s, rocksutils::document));
@ -1476,8 +1441,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(
    cb(documentId, VPackSlice(ps.data()));
  } else {
    LOG_TOPIC(DEBUG, Logger::ENGINES)
-        << "NOT FOUND rev: " << documentId.id()
-        << " trx: " << trx->state()->id() << " seq: " << mthd->sequenceNumber()
+        << "NOT FOUND rev: " << documentId.id() << " trx: " << trx->state()->id()
        << " objectID " << _objectId << " name: " << _logicalCollection.name();
    res.reset(rocksutils::convertStatus(s, rocksutils::document));
  }
@ -1663,10 +1627,9 @@ uint64_t RocksDBCollection::recalculateCounts() {
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro, cf));
  std::size_t count = 0;

-  it->Seek(bounds.start());
-  while (it->Valid() && it->key().compare(upper) < 0) {
+  for (it->Seek(bounds.start()); it->Valid(); it->Next()) {
+    TRI_ASSERT(it->key().compare(upper) < 0);
    ++count;
-    it->Next();
  }

  int64_t adjustment = snapNumberOfDocuments - count;
@ -1780,3 +1743,12 @@ void RocksDBCollection::trackWaitForSync(arangodb::transaction::Methods* trx,
    trx->state()->waitForSync(true);
  }
 }
+
+/// @brief can use non transactional range delete in write ahead log
+bool RocksDBCollection::canUseRangeDeleteInWal() const {
+  if (ServerState::instance()->isSingleServer()) {
+    // disableWalFilePruning is used by createIndex
+    return _numIndexCreations.load(std::memory_order_acquire) == 0;
+  }
+  return false;
+}
--- a/arangod/RocksDBEngine/RocksDBCollection.h
+++ b/arangod/RocksDBEngine/RocksDBCollection.h
@ -181,12 +181,8 @@ class RocksDBCollection final : public PhysicalCollection {
  RocksDBCollectionMeta& meta() { return _meta; }

 private:
-  /// @brief track the usage of waitForSync option in an operation
-  void trackWaitForSync(arangodb::transaction::Methods* trx, OperationOptions& options);
-
  /// @brief return engine-specific figures
  void figuresSpecific(std::shared_ptr<velocypack::Builder>&) override;
-  void addIndex(std::shared_ptr<arangodb::Index> idx);

  // @brief return the primary index
  // WARNING: Make sure that this instance
@ -232,8 +228,15 @@ class RocksDBCollection final : public PhysicalCollection {
    return (_cacheEnabled && _cachePresent);
  }

+  /// @brief track key in file
  void blackListKey(char const* data, std::size_t len) const;

+  /// @brief track the usage of waitForSync option in an operation
+  void trackWaitForSync(arangodb::transaction::Methods* trx, OperationOptions& options);
+
+  /// @brief can use non transactional range delete in write ahead log
+  bool canUseRangeDeleteInWal() const;
+
 private:
  uint64_t const _objectId;     // rocksdb-specific object id for collection
  RocksDBCollectionMeta _meta;  /// collection metadata
@ -247,10 +250,13 @@ class RocksDBCollection final : public PhysicalCollection {
  mutable basics::ReadWriteLock _exclusiveLock;
  /// @brief document cache (optional)
  mutable std::shared_ptr<cache::Cache> _cache;
+
  // we use this boolean for testing whether _cache is set.
  // it's quicker than accessing the shared_ptr each time
  mutable bool _cachePresent;
  bool _cacheEnabled;
+  /// @brief number of index creations in progress
+  std::atomic<int> _numIndexCreations;
 };

 inline RocksDBCollection* toRocksDBCollection(PhysicalCollection* physical) {
--- a/arangod/RocksDBEngine/RocksDBCollectionMeta.cpp
+++ b/arangod/RocksDBEngine/RocksDBCollectionMeta.cpp
@ -354,7 +354,7 @@ Result RocksDBCollectionMeta::deserializeMeta(rocksdb::DB* db, LogicalCollection
    if (!s.ok() && !s.IsNotFound()) {
      return rocksutils::convertStatus(s);
    } else if (s.IsNotFound()) {  // expected with nosync recovery tests
-      LOG_TOPIC(WARN, Logger::ROCKSDB)
+      LOG_TOPIC(WARN, Logger::ENGINES)
          << "recalculating index estimate for index "
          << "type '" << idx->typeName() << "' with id '" << idx->id() << "'";
      idx->recalculateEstimates();
--- a/arangod/RocksDBEngine/RocksDBCommon.h
+++ b/arangod/RocksDBEngine/RocksDBCommon.h
@ -27,7 +27,6 @@
 #define ARANGO_ROCKSDB_ROCKSDB_COMMON_H 1

 #include "Basics/Common.h"
-#include "Basics/Endian.h"
 #include "Basics/Result.h"
 #include "Basics/RocksDBUtils.h"
 #include "RocksDBEngine/RocksDBComparator.h"
--- a/arangod/RocksDBEngine/RocksDBEngine.cpp
+++ b/arangod/RocksDBEngine/RocksDBEngine.cpp
@ -902,16 +902,17 @@ int RocksDBEngine::getCollectionsAndIndexes(TRI_vocbase_t& vocbase,
 }

 int RocksDBEngine::getViews(TRI_vocbase_t& vocbase, arangodb::velocypack::Builder& result) {
-  rocksdb::ReadOptions readOptions;
-  std::unique_ptr<rocksdb::Iterator> iter(
-      _db->NewIterator(readOptions, RocksDBColumnFamily::definitions()));
-
-  result.openArray();
-
  auto bounds = RocksDBKeyBounds::DatabaseViews(vocbase.id());
+  rocksdb::Slice upper = bounds.end();
+  rocksdb::ColumnFamilyHandle* cf = RocksDBColumnFamily::definitions();
  
-  for (iter->Seek(bounds.start());
-       iter->Valid() && iter->key().compare(bounds.end()) < 0; iter->Next()) {
+  rocksdb::ReadOptions ro;
+  ro.iterate_upper_bound = &upper;
+  
+  std::unique_ptr<rocksdb::Iterator> iter(_db->NewIterator(ro, cf));
+  result.openArray();
+  for (iter->Seek(bounds.start()); iter->Valid(); iter->Next()) {
+    TRI_ASSERT(iter->key().compare(bounds.end()) < 0);
    auto slice = VPackSlice(iter->value().data());

    LOG_TOPIC(TRACE, Logger::VIEWS) << "got view slice: " << slice.toJson();
@ -1571,6 +1572,14 @@ void RocksDBEngine::waitForEstimatorSync(std::chrono::milliseconds maxWaitTime)
  }
 }
  
+void RocksDBEngine::disableWalFilePruning(bool disable) {
+  _backgroundThread->disableWalFilePruning(disable);
+}
+
+bool RocksDBEngine::disableWalFilePruning() const {
+  return _backgroundThread->disableWalFilePruning();
+}
+
 Result RocksDBEngine::registerRecoveryHelper(std::shared_ptr<RocksDBRecoveryHelper> helper) {
  try {
    _recoveryHelpers.emplace_back(helper);
@ -2177,10 +2186,6 @@ void RocksDBEngine::releaseTick(TRI_voc_tick_t tick) {
  }
 }

-bool RocksDBEngine::canUseRangeDeleteInWal() const {
-  return ServerState::instance()->isSingleServer();
-}
-
 }  // namespace arangodb

 // -----------------------------------------------------------------------------
--- a/arangod/RocksDBEngine/RocksDBEngine.h
+++ b/arangod/RocksDBEngine/RocksDBEngine.h
@ -296,7 +296,9 @@ class RocksDBEngine final : public StorageEngine {
  static std::string const EngineName;
  static std::string const FeatureName;
  
-  bool canUseRangeDeleteInWal() const;
+  /// @brief allow / disbable removal of WAL files
+  void disableWalFilePruning(bool disable);
+  bool disableWalFilePruning() const;

  rocksdb::Options const& rocksDBOptions() const { return _options; }

--- a/arangod/RocksDBEngine/RocksDBFulltextIndex.cpp
+++ b/arangod/RocksDBEngine/RocksDBFulltextIndex.cpp
@ -420,12 +420,13 @@ Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
  rocksdb::ReadOptions ro = mthds->iteratorReadOptions();
  ro.iterate_upper_bound = &end;
  std::unique_ptr<rocksdb::Iterator> iter = mthds->NewIterator(ro, _cf);
-  iter->Seek(bounds.start());

  // set is used to perform an intersection with the result set
  std::set<LocalDocumentId> intersect;
  // apply left to right logic, merging all current results with ALL previous
-  while (iter->Valid() && cmp->Compare(iter->key(), end) < 0) {
+  for (iter->Seek(bounds.start());
+       iter->Valid() && cmp->Compare(iter->key(), end) < 0;
+       iter->Next()) {
    TRI_ASSERT(_objectId == RocksDBKey::objectId(iter->key()));

    rocksdb::Status s = iter->status();
@ -442,7 +443,6 @@ Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
    } else if (token.operation == FulltextQueryToken::EXCLUDE) {
      resultSet.erase(documentId);
    }
-    iter->Next();
  }
  if (token.operation == FulltextQueryToken::AND) {
    if (resultSet.empty() || intersect.empty()) {
--- a/arangod/RocksDBEngine/RocksDBIndex.h
+++ b/arangod/RocksDBEngine/RocksDBIndex.h
@ -42,7 +42,6 @@ namespace cache {
 class Cache;
 }
 class LogicalCollection;
-class RocksDBSettingsManager;
 class RocksDBMethods;

 class RocksDBIndex : public Index {
--- a/arangod/RocksDBEngine/RocksDBIndexFactory.cpp
+++ b/arangod/RocksDBEngine/RocksDBIndexFactory.cpp
@ -447,8 +447,7 @@ struct GeoIndexFactory : public DefaultIndexFactory {

    if (isCreation && !ServerState::instance()->isCoordinator() &&
        !definition.hasKey("objectId")) {
-      normalized.add("objectId",
-                     arangodb::velocypack::Value(std::to_string(TRI_NewTickServer())));
+      normalized.add("objectId", VPackValue(std::to_string(TRI_NewTickServer())));
    }

    return EnhanceJsonIndexGeo(definition, normalized, isCreation);
--- a/arangod/RocksDBEngine/RocksDBLogValue.cpp
+++ b/arangod/RocksDBEngine/RocksDBLogValue.cpp
@ -100,12 +100,33 @@ RocksDBLogValue RocksDBLogValue::DocumentRemoveV2(TRI_voc_rid_t rid) {
 RocksDBLogValue RocksDBLogValue::SinglePut(TRI_voc_tick_t vocbaseId, TRI_voc_cid_t cid) {
  return RocksDBLogValue(RocksDBLogType::SinglePut, vocbaseId, cid);
 }
+
 RocksDBLogValue RocksDBLogValue::SingleRemoveV2(TRI_voc_tick_t vocbaseId,
                                                TRI_voc_cid_t cid, TRI_voc_rid_t rid) {
  return RocksDBLogValue(RocksDBLogType::SingleRemoveV2, vocbaseId, cid, rid);
 }

-/*static*/ RocksDBLogValue RocksDBLogValue::Empty() {
+RocksDBLogValue RocksDBLogValue::TrackedDocumentInsert(LocalDocumentId docId,
+                                                       VPackSlice const& slice) {
+  RocksDBLogValue val{};
+  val._buffer.reserve(sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType) + slice.byteSize());
+  val._buffer.push_back(static_cast<char>(RocksDBLogType::TrackedDocumentInsert));
+  uintToPersistentLittleEndian(val._buffer, docId.id());
+  val._buffer.append(slice.startAs<char>(), slice.byteSize());
+  return val;
+}
+
+RocksDBLogValue RocksDBLogValue::TrackedDocumentRemove(LocalDocumentId docId,
+                                                       VPackSlice const& slice) {
+  RocksDBLogValue val{};
+  val._buffer.reserve(sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType) + slice.byteSize());
+  val._buffer.push_back(static_cast<char>(RocksDBLogType::TrackedDocumentRemove));
+  uintToPersistentLittleEndian(val._buffer, docId.id());
+  val._buffer.append(reinterpret_cast<char const*>(slice.begin()), slice.byteSize());
+  return val;
+}
+
+RocksDBLogValue RocksDBLogValue::Empty() {
  return RocksDBLogValue();
 }

@ -322,6 +343,18 @@ StringRef RocksDBLogValue::oldCollectionName(rocksdb::Slice const& slice) {
  return StringRef(slice.data() + off, slice.size() - off);
 }

+/// @brief get slice from tracked document
+std::pair<LocalDocumentId, VPackSlice> RocksDBLogValue::trackedDocument(rocksdb::Slice const& slice) {
+  TRI_ASSERT(slice.size() >= 2);
+  RocksDBLogType type = static_cast<RocksDBLogType>(slice.data()[0]);
+  TRI_ASSERT(type == RocksDBLogType::TrackedDocumentInsert ||
+             type == RocksDBLogType::TrackedDocumentRemove);
+  
+  LocalDocumentId id(uintFromPersistentLittleEndian<LocalDocumentId::BaseType>(slice.data() + sizeof(RocksDBLogType)));
+  VPackSlice data(slice.data() + sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType));
+  return std::make_pair(id, data);
+}
+
 bool RocksDBLogValue::containsDatabaseId(RocksDBLogType type) {
  return type == RocksDBLogType::DatabaseCreate || type == RocksDBLogType::DatabaseDrop ||
         type == RocksDBLogType::CollectionCreate ||
--- a/arangod/RocksDBEngine/RocksDBLogValue.h
+++ b/arangod/RocksDBEngine/RocksDBLogValue.h
@ -27,6 +27,7 @@
 #include "Basics/Common.h"
 #include "Basics/StringRef.h"
 #include "RocksDBEngine/RocksDBTypes.h"
+#include "VocBase/LocalDocumentId.h"
 #include "VocBase/voc-types.h"

 #include <rocksdb/slice.h>
@ -72,6 +73,9 @@ class RocksDBLogValue {
  static RocksDBLogValue SingleRemoveV2(TRI_voc_tick_t vocbaseId,
                                        TRI_voc_cid_t cid, TRI_voc_rid_t rid);
  
+  static RocksDBLogValue TrackedDocumentInsert(LocalDocumentId, velocypack::Slice const&);
+  static RocksDBLogValue TrackedDocumentRemove(LocalDocumentId, velocypack::Slice const&);
+
  // empty log value
  static RocksDBLogValue Empty();

@ -96,9 +100,12 @@ class RocksDBLogValue {
  /// @brief get UUID from view drop marker
  static arangodb::StringRef viewUUID(rocksdb::Slice const&);

-  // deprecated method for old collection drop marker
+  /// @deprecated method for old collection drop marker
  static arangodb::StringRef oldCollectionName(rocksdb::Slice const&);
  
+  /// @brief get slice from tracked document
+  static std::pair<LocalDocumentId, velocypack::Slice> trackedDocument(rocksdb::Slice const&);
+
  static bool containsDatabaseId(RocksDBLogType type);
  static bool containsCollectionId(RocksDBLogType type);
  static bool containsViewId(RocksDBLogType type);
--- a/arangod/RocksDBEngine/RocksDBMethods.cpp
+++ b/arangod/RocksDBEngine/RocksDBMethods.cpp
@ -98,10 +98,6 @@ void RocksDBSavePoint::rollback() {

 // =================== RocksDBMethods ===================

-rocksdb::SequenceNumber RocksDBMethods::sequenceNumber() {
-  return _state->sequenceNumber();
-}
-
 rocksdb::ReadOptions RocksDBMethods::iteratorReadOptions() {
  if (_state->hasHint(transaction::Hints::Hint::INTERMEDIATE_COMMITS)) {
    rocksdb::ReadOptions ro = _state->_rocksReadOptions;
@ -179,6 +175,10 @@ rocksdb::Status RocksDBReadOnlyMethods::SingleDelete(rocksdb::ColumnFamilyHandle
  THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_READ_ONLY);
 }

+void RocksDBReadOnlyMethods::PutLogData(rocksdb::Slice const& blob) {
+  THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_READ_ONLY);
+}
+
 std::unique_ptr<rocksdb::Iterator> RocksDBReadOnlyMethods::NewIterator(
    rocksdb::ReadOptions const& opts, rocksdb::ColumnFamilyHandle* cf) {
  TRI_ASSERT(cf != nullptr);
@ -241,6 +241,10 @@ rocksdb::Status RocksDBTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf,
  return _state->_rocksTransaction->SingleDelete(cf, key.string());
 }

+void RocksDBTrxMethods::PutLogData(rocksdb::Slice const& blob) {
+  _state->_rocksTransaction->PutLogData(blob);
+}
+
 std::unique_ptr<rocksdb::Iterator> RocksDBTrxMethods::NewIterator(
    rocksdb::ReadOptions const& opts, rocksdb::ColumnFamilyHandle* cf) {
  TRI_ASSERT(cf != nullptr);
@ -326,6 +330,10 @@ rocksdb::Status RocksDBBatchedMethods::SingleDelete(rocksdb::ColumnFamilyHandle*
  return _wb->SingleDelete(cf, key.string());
 }

+void RocksDBBatchedMethods::PutLogData(rocksdb::Slice const& blob) {
+  _wb->PutLogData(blob);
+}
+
 std::unique_ptr<rocksdb::Iterator> RocksDBBatchedMethods::NewIterator(
    rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*) {
  THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL,
@ -375,55 +383,13 @@ rocksdb::Status RocksDBBatchedWithIndexMethods::SingleDelete(rocksdb::ColumnFami
  return _wb->SingleDelete(cf, key.string());
 }

+void RocksDBBatchedWithIndexMethods::PutLogData(rocksdb::Slice const& blob) {
+  _wb->PutLogData(blob);
+}
+
 std::unique_ptr<rocksdb::Iterator> RocksDBBatchedWithIndexMethods::NewIterator(
    rocksdb::ReadOptions const& ro, rocksdb::ColumnFamilyHandle* cf) {
  TRI_ASSERT(cf != nullptr);
  return std::unique_ptr<rocksdb::Iterator>(
      _wb->NewIteratorWithBase(_db->NewIterator(ro, cf)));
 }
-
-// =================== RocksDBSideTrxMethods ====================
-
-/// transaction wrapper, uses the provided rocksdb transaction
-RocksDBSideTrxMethods::RocksDBSideTrxMethods(RocksDBTransactionState* state,
-                                             rocksdb::Transaction* trx)
-    : RocksDBMethods(state), _trx(trx) {
-  _ro.prefix_same_as_start = true;
-  _ro.fill_cache = false;
-}
-
-rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf,
-                                           rocksdb::Slice const& key, std::string* val) {
-  TRI_ASSERT(cf != nullptr);
-  return _trx->Get(_ro, cf, key, val);
-}
-
-rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf,
-                                           rocksdb::Slice const& key,
-                                           rocksdb::PinnableSlice* val) {
-  TRI_ASSERT(cf != nullptr);
-  return _trx->Get(_ro, cf, key, val);
-}
-
-rocksdb::Status RocksDBSideTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf,
-                                           RocksDBKey const& key,
-                                           rocksdb::Slice const& val) {
-  TRI_ASSERT(cf != nullptr);
-  return _trx->Put(cf, key.string(), val);
-}
-rocksdb::Status RocksDBSideTrxMethods::Delete(rocksdb::ColumnFamilyHandle* cf,
-                                              RocksDBKey const& key) {
-  TRI_ASSERT(cf != nullptr);
-  return _trx->Delete(cf, key.string());
-}
-
-rocksdb::Status RocksDBSideTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf,
-                                                    RocksDBKey const& key) {
-  TRI_ASSERT(cf != nullptr);
-  return _trx->SingleDelete(cf, key.string());
-}
-
-bool RocksDBSideTrxMethods::DisableIndexing() {
-  _trx->DisableIndexing();
-  return true;
-}
--- a/arangod/RocksDBEngine/RocksDBMethods.h
+++ b/arangod/RocksDBEngine/RocksDBMethods.h
@ -72,9 +72,6 @@ class RocksDBMethods {
  explicit RocksDBMethods(RocksDBTransactionState* state) : _state(state) {}
  virtual ~RocksDBMethods() {}

-  /// @brief current sequence number
-  rocksdb::SequenceNumber sequenceNumber();
-
  /// @brief read options for use with iterators
  rocksdb::ReadOptions iteratorReadOptions();

@ -97,6 +94,8 @@ class RocksDBMethods {
  /// when keys are inserted exactly once (and never overwritten)
  virtual rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) = 0;
  
+  virtual void PutLogData(rocksdb::Slice const&) = 0;
+
  virtual std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                         rocksdb::ColumnFamilyHandle*) = 0;

@ -125,6 +124,7 @@ class RocksDBReadOnlyMethods final : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
+  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;
@ -157,6 +157,7 @@ class RocksDBTrxMethods : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
+  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;
@ -193,6 +194,8 @@ class RocksDBBatchedMethods final : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
+  void PutLogData(rocksdb::Slice const&) override;
+
  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;

@ -219,6 +222,7 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
+  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;
@ -234,38 +238,6 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods {
  rocksdb::WriteBatchWithIndex* _wb;
 };

-/// transaction wrapper, uses the provided rocksdb transaction
-class RocksDBSideTrxMethods final : public RocksDBMethods {
- public:
-  explicit RocksDBSideTrxMethods(RocksDBTransactionState* state, rocksdb::Transaction* trx);
-
-  rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key,
-                      std::string* val) override;
-  rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key,
-                      rocksdb::PinnableSlice* val) override;
-  rocksdb::Status Put(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key,
-                      rocksdb::Slice const& val) override;
-  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
-  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
-
-  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
-                                                 rocksdb::ColumnFamilyHandle*) override {
-    return nullptr;
-  }
-
-  void SetSavePoint() override {}
-  rocksdb::Status RollbackToSavePoint() override {
-    return rocksdb::Status::OK();
-  }
-  void PopSavePoint() override {}
-
-  bool DisableIndexing() override;
-
- private:
-  rocksdb::Transaction* _trx;
-  rocksdb::ReadOptions _ro;
-};
-
 // INDEXING MAY ONLY BE DISABLED IN TOPLEVEL AQL TRANSACTIONS
 // THIS IS BECAUSE THESE TRANSACTIONS WILL EITHER READ FROM
 // OR (XOR) WRITE TO A COLLECTION. IF THIS PRECONDITION IS
--- a/arangod/RocksDBEngine/RocksDBReplicationTailing.cpp
+++ b/arangod/RocksDBEngine/RocksDBReplicationTailing.cpp
@ -335,6 +335,8 @@ class WALParser final : public rocksdb::WriteBatch::Handler {
      case RocksDBLogType::DocumentOperationsPrologue:
      case RocksDBLogType::DocumentRemove:
      case RocksDBLogType::DocumentRemoveAsPartOfUpdate:
+      case RocksDBLogType::TrackedDocumentInsert:
+      case RocksDBLogType::TrackedDocumentRemove:
        break;  // ignore deprecated && unused markers

      default:
@ -435,7 +437,9 @@ class WALParser final : public rocksdb::WriteBatch::Handler {

    if (cfId != _primaryCF) {
      return;  // ignore all document operations
-    } else if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
+    }
+    
+    if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
      resetTransientState();
      return;
    }
--- a/arangod/RocksDBEngine/RocksDBTransactionCollection.cpp
+++ b/arangod/RocksDBEngine/RocksDBTransactionCollection.cpp
@ -242,7 +242,7 @@ void RocksDBTransactionCollection::commitCounts(uint64_t trxId, uint64_t commitS
  // Update the index estimates.
  for (auto& pair : _trackedIndexOperations) {
    auto idx = _collection->lookupIndex(pair.first);
-    if (idx == nullptr) {
+    if (ADB_UNLIKELY(idx == nullptr)) {
      TRI_ASSERT(false);  // Index reported estimates, but does not exist
      continue;
    }
--- a/arangod/RocksDBEngine/RocksDBTransactionState.h
+++ b/arangod/RocksDBEngine/RocksDBTransactionState.h
@ -26,7 +26,7 @@

 #include "Basics/Common.h"
 #include "Basics/SmallVector.h"
-#include "RocksDBEngine/RocksDBCommon.h"
+#include "RocksDBEngine/RocksDBKey.h"
 #include "StorageEngine/TransactionState.h"
 #include "Transaction/Hints.h"
 #include "Transaction/Methods.h"
--- a/arangod/RocksDBEngine/RocksDBTypes.cpp
+++ b/arangod/RocksDBEngine/RocksDBTypes.cpp
@ -183,6 +183,12 @@ char const* arangodb::rocksDBLogTypeName(arangodb::RocksDBLogType type) {
      return "SingleRemove";
    case arangodb::RocksDBLogType::SingleRemoveV2:
      return "SingleRemoveV2";
+    case arangodb::RocksDBLogType::FlushSync:
+      return "FlushSync";
+    case RocksDBLogType::TrackedDocumentInsert:
+      return "TrackedDocumentInsert";
+    case RocksDBLogType::TrackedDocumentRemove:
+      return "TrackedDocumentRemove";
    case arangodb::RocksDBLogType::Invalid:
      return "Invalid";
    default:
--- a/arangod/RocksDBEngine/RocksDBTypes.h
+++ b/arangod/RocksDBEngine/RocksDBTypes.h
@ -86,6 +86,8 @@ enum class RocksDBLogType : char {
  SingleRemoveV2 = 'F',
  CollectionTruncate = 'G',
  FlushSync = 'H', // @see FlushFeature
+  TrackedDocumentInsert = 'I',
+  TrackedDocumentRemove = 'J',
 };

 /// @brief settings keys
--- a/arangod/RocksDBEngine/RocksDBWalAccess.cpp
+++ b/arangod/RocksDBEngine/RocksDBWalAccess.cpp
@ -390,7 +390,9 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
      case RocksDBLogType::DocumentOperationsPrologue:
      case RocksDBLogType::DocumentRemove:
      case RocksDBLogType::DocumentRemoveAsPartOfUpdate:
-        break;  // ignore deprecated markers
+      case RocksDBLogType::TrackedDocumentInsert:
+      case RocksDBLogType::TrackedDocumentRemove:
+        break;  // ignore deprecated / unused markers

      default:
        LOG_TOPIC(WARN, Logger::REPLICATION)
@ -553,7 +555,9 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC

    if (cfId != _primaryCF) {
      return;  // ignore all document operations
-    } else if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
+    }
+    
+    if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
      resetTransientState();
      return;
    }
@ -617,12 +621,6 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
    return rocksdb::Status();  // make WAL iterator happy
  }

-  rocksdb::Status MergeCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&) override {
-    incTick();
-    // not used for anything in ArangoDB currently
-    return rocksdb::Status();  // make WAL iterator happy
-  }
-
 public:
  /// figures out from which sequence number we need to start scanning
  /// if we just use tickStart rocksdb will skip over batches we might
@ -765,11 +763,6 @@ WalAccessResult RocksDBWalAccess::tail(Filter const& filter, size_t chunkSize,
      << "WAL tailing call. Scan since: " << since << ", tick start: " << filter.tickStart
      << ", tick end: " << filter.tickEnd << ", chunk size: " << chunkSize;
  while (iterator->Valid() && lastScannedTick <= filter.tickEnd) {
-    s = iterator->status();
-    if (!s.ok()) {
-      LOG_TOPIC(ERR, Logger::REPLICATION) << "error during WAL scan: " << s.ToString();
-      break;  // s is considered in the end
-    }

    rocksdb::BatchResult batch = iterator->GetBatch();
    // record the first tick we are actually considering
--- a/tests/js/common/shell/shell-index-rocksdb-disabled.js
+++ b/tests/js/common/shell/shell-index-rocksdb-disabled.js
@ -8,7 +8,7 @@
 ///
 /// DISCLAIMER
 ///
-/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
+/// Copyright 2018-2019 ArangoDB GmbH, Cologne, Germany
 ///
 /// Licensed under the Apache License, Version 2.0 (the "License");
 /// you may not use this file except in compliance with the License.
@ -131,6 +131,64 @@ function backgroundIndexSuite() {
      }
    },

+    testInsertParallelNonUnique2: function () {
+      let c = require("internal").db._collection(cn);
+      // first lets add some initial documents
+      let x = 10; 
+      while(x-- > 0) {
+        let docs = []; 
+        for(let i = 0; i < 1000; i++) {
+          docs.push({value:i});
+        } 
+        c.save(docs);
+      }
+
+      // lets insert the rest via tasks
+      let n = 9;
+      for (let i = 0; i < n; ++i) {
+        if (i === 6) { // create the index in a task
+          let command = `const c = require("internal").db._collection("${cn}"); 
+          c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true});`;
+          tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
+        }
+        let command = `const c = require("internal").db._collection("${cn}"); 
+                       let x = 10;
+                       while(x-- > 0) {
+                         let docs = []; 
+                         for(let i = 0; i < 1000; i++) {
+                           docs.push({value:i})
+                         } 
+                         c.save(docs);
+                       }`;
+        tasks.register({ name: "UnitTestsIndexInsert" + i, command: command });
+      }
+
+      // wait for tasks to complete
+      waitForTasks();
+      
+      // sanity checks
+      assertEqual(c.count(), 100000);
+      for (let i = 0; i < 1000; i++) { // 100 entries of each value [0,999]
+        let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1", 
+                               {'@coll': cn, 'val': i}, {count:true});
+        assertEqual(cursor.count(), 100);
+      }
+
+      internal.waitForEstimatorSync(); // make sure estimates are consistent
+      let indexes = c.getIndexes(true);
+      for (let i of indexes) {
+        switch (i.type) {
+          case 'primary':
+            break;
+          case 'hash':
+            assertEqual(i.selectivityEstimate, 0.01);
+            break;
+          default:
+            fail();
+        }
+      }
+    },
+
    testInsertParallelUnique: function () {
      let c = require("internal").db._collection(cn);
      // first lets add some initial documents
@ -227,17 +285,6 @@ function backgroundIndexSuite() {
      
      // sanity checks
      assertEqual(c.count(), 50001);
-
-      let indexes = c.getIndexes();
-      for (let i of indexes) {
-        switch (i.type) {
-          case 'primary':
-            break;
-          case 'hash':
-          default:
-            fail();
-        }
-      }
    },

    testRemoveParallel: function () {
@ -256,6 +303,12 @@ function backgroundIndexSuite() {

      // lets remove half via tasks
      for (let i = 0; i < 10; ++i) {
+        if (i === 3) { // create the index in a task
+          let command = `const c = require("internal").db._collection("${cn}"); 
+          c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true});`;
+          tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
+        }
+
        let command = `const c = require("internal").db._collection("${cn}"); 
                       if (!c) {
                         throw new Error('could not find collection');
@ -275,9 +328,6 @@ function backgroundIndexSuite() {
        tasks.register({ name: "UnitTestsIndexRemove" + i, command: command });
      }

-      // create the index on the main thread
-      c.ensureIndex({type: 'hash', fields: ['value'], inBackground: true });
-
      // wait for insertion tasks to complete
      waitForTasks();
      
@ -298,12 +348,14 @@ function backgroundIndexSuite() {
        }
      }

+      internal.waitForEstimatorSync(); // make sure estimates are consistent
      let indexes = c.getIndexes(true);
      for (let i of indexes) {
        switch (i.type) {
          case 'primary':
            break;
          case 'hash':
+          assertTrue(Math.abs(i.selectivityEstimate - 1.0) < 0.005, i);
            break;
          default:
            fail();
@ -327,6 +379,11 @@ function backgroundIndexSuite() {

      // lets update all via tasks
      for (let i = 0; i < 10; ++i) {
+        if (i === 5) { // create the index in a task
+          let command = `const c = require("internal").db._collection("${cn}"); 
+          c.ensureIndex({type: 'skiplist', fields: ['value'], unique: false, inBackground: true});`;
+          tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
+        }
        let command = `const c = require("internal").db._collection("${cn}"); 
                       if (!c) {
                         throw new Error('could not find collection');
@ -350,9 +407,6 @@ function backgroundIndexSuite() {
      // wait for insertion tasks to complete
      waitForTasks();
      
-      // create the index on the main thread
-      c.ensureIndex({type: 'skiplist', fields: ['value'], inBackground: true });
-
      // sanity checks
      assertEqual(c.count(), 100000);
      // check for new entries via index
@ -364,12 +418,14 @@ function backgroundIndexSuite() {
                                  {'@coll': cn, 'val': 100000}, {count:true});
      assertEqual(oldCursor.count(), 0);

+      internal.waitForEstimatorSync(); // make sure estimates are consistent
      let indexes = c.getIndexes(true);
      for (let i of indexes) {
        switch (i.type) {
          case 'primary':
            break;
          case 'skiplist':
+          assertTrue(Math.abs(i.selectivityEstimate - 1.0) < 0.005, i);
          break;
          default:
            fail();