From 0a0cb41f04c833d297b1355ce41e129ca7265339 Mon Sep 17 00:00:00 2001 From: Simon Date: Tue, 7 May 2019 09:50:14 +0200 Subject: [PATCH] do not acquire a snapshot for a single read op (#8916) --- arangod/RocksDBEngine/RocksDBCollection.cpp | 36 ++++++++++--------- arangod/RocksDBEngine/RocksDBPrimaryIndex.cpp | 7 ++-- .../RocksDBEngine/RocksDBTransactionState.cpp | 25 ++++++++++--- .../RocksDBEngine/RocksDBTransactionState.h | 3 ++ 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/arangod/RocksDBEngine/RocksDBCollection.cpp b/arangod/RocksDBEngine/RocksDBCollection.cpp index aa19304cc7..9892c7ea09 100644 --- a/arangod/RocksDBEngine/RocksDBCollection.cpp +++ b/arangod/RocksDBEngine/RocksDBCollection.cpp @@ -791,21 +791,25 @@ bool RocksDBCollection::lookupRevision(transaction::Methods* trx, VPackSlice con Result RocksDBCollection::read(transaction::Methods* trx, arangodb::velocypack::StringRef const& key, ManagedDocumentResult& result, bool /*lock*/) { - LocalDocumentId const documentId = primaryIndex()->lookupKey(trx, key); - if (!documentId.isSet()) { - return TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND; - } // found - - std::string* buffer = result.setManaged(); - rocksdb::PinnableSlice ps(buffer); - Result res = lookupDocumentVPack(trx, documentId, ps, /*readCache*/true, /*fillCache*/true); - if (res.ok()) { - if (ps.IsPinned()) { - buffer->assign(ps.data(), ps.size()); - } // else value is already assigned - result.setRevisionId(); // extracts id from buffer - } - + Result res; + do { + LocalDocumentId const documentId = primaryIndex()->lookupKey(trx, key); + if (!documentId.isSet()) { + res.reset(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND); + break; + } // else found + + std::string* buffer = result.setManaged(); + rocksdb::PinnableSlice ps(buffer); + res = lookupDocumentVPack(trx, documentId, ps, /*readCache*/true, /*fillCache*/true); + if (res.ok()) { + if (ps.IsPinned()) { + buffer->assign(ps.data(), ps.size()); + } // else value is already assigned + result.setRevisionId(); // extracts id from buffer + } + } while(res.is(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND) && + RocksDBTransactionState::toState(trx)->setSnapshotOnReadOnly()); return res; } @@ -1416,7 +1420,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(transaction::Methods* tr ps.PinSelf(rocksdb::Slice(reinterpret_cast(f.value()->value()), f.value()->valueSize())); // TODO we could potentially use the PinSlice method ?! - return res; + return res; // all good } if (f.result().errorNumber() == TRI_ERROR_LOCK_TIMEOUT) { // assuming someone is currently holding a write lock, which diff --git a/arangod/RocksDBEngine/RocksDBPrimaryIndex.cpp b/arangod/RocksDBEngine/RocksDBPrimaryIndex.cpp index 0d58327ddd..e76adff16b 100644 --- a/arangod/RocksDBEngine/RocksDBPrimaryIndex.cpp +++ b/arangod/RocksDBEngine/RocksDBPrimaryIndex.cpp @@ -584,8 +584,9 @@ Result RocksDBPrimaryIndex::insert(transaction::Methods& trx, RocksDBMethods* mt RocksDBKeyLeaser key(&trx); key->constructPrimaryIndexValue(_objectId, arangodb::velocypack::StringRef(keySlice)); - rocksdb::PinnableSlice val; - rocksdb::Status s = mthd->Get(_cf, key->string(), &val); + transaction::StringLeaser buffer(&trx); + rocksdb::PinnableSlice ps(buffer.get()); + rocksdb::Status s = mthd->Get(_cf, key->string(), &ps); Result res; if (s.ok()) { // detected conflicting primary key @@ -599,7 +600,7 @@ Result RocksDBPrimaryIndex::insert(transaction::Methods& trx, RocksDBMethods* mt return addErrorMsg(res, existingId); } - val.Reset(); // clear used memory + ps.Reset(); // clear used memory if (trx.state()->hasHint(transaction::Hints::Hint::GLOBAL_MANAGED)) { // blacklist new index entry to avoid caching without committing first diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.cpp b/arangod/RocksDBEngine/RocksDBTransactionState.cpp index e43a235d64..de215723fd 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.cpp +++ b/arangod/RocksDBEngine/RocksDBTransactionState.cpp @@ -118,9 +118,12 @@ Result RocksDBTransactionState::beginTransaction(transaction::Hints hints) { TRI_ASSERT(_readSnapshot == nullptr); if (isReadOnlyTransaction()) { - _readSnapshot = db->GetSnapshot(); // must call ReleaseSnapshot later - TRI_ASSERT(_readSnapshot != nullptr); - _rocksReadOptions.snapshot = _readSnapshot; + // no need to acquire a snapshot for a single op + if (!isSingleOperation()) { + _readSnapshot = db->GetSnapshot(); // must call ReleaseSnapshot later + TRI_ASSERT(_readSnapshot != nullptr); + _rocksReadOptions.snapshot = _readSnapshot; + } _rocksMethods.reset(new RocksDBReadOnlyMethods(this)); } else { createTransaction(); @@ -133,7 +136,6 @@ Result RocksDBTransactionState::beginTransaction(transaction::Hints hints) { } _rocksMethods.reset(new RocksDBTrxMethods(this)); - if (hasHint(transaction::Hints::Hint::NO_INDEXING)) { // do not track our own writes... we can only use this in very // specific scenarios, i.e. when we are sure that we will have a @@ -186,9 +188,9 @@ void RocksDBTransactionState::createTransaction() { rocksdb::TransactionDB* db = rocksutils::globalRocksDB(); rocksdb::TransactionOptions trxOpts; trxOpts.set_snapshot = true; + // unclear performance implications do not use for now // trxOpts.deadlock_detect = !hasHint(transaction::Hints::Hint::NO_DLD); - if (isOnlyExclusiveTransaction()) { // we are exclusively modifying collection data here, so we can turn off // all concurrency control checks to save time @@ -550,11 +552,24 @@ uint64_t RocksDBTransactionState::sequenceNumber() const { return static_cast(_rocksTransaction->GetSnapshot()->GetSequenceNumber()); } else if (_readSnapshot != nullptr) { return static_cast(_readSnapshot->GetSequenceNumber()); + } else if (isReadOnlyTransaction() && isSingleOperation()) { + return rocksutils::latestSequenceNumber(); } TRI_ASSERT(false); THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "No snapshot set"); } +/// @brief acquire a database snapshot +bool RocksDBTransactionState::setSnapshotOnReadOnly() { + if (_readSnapshot == nullptr && isReadOnlyTransaction()) { + TRI_ASSERT(_rocksTransaction == nullptr); + TRI_ASSERT(isSingleOperation()); + _readSnapshot = rocksutils::globalRocksDB()->GetSnapshot(); + return true; + } + return false; +} + Result RocksDBTransactionState::triggerIntermediateCommit(bool& hasPerformedIntermediateCommit) { TRI_ASSERT(!hasPerformedIntermediateCommit); diff --git a/arangod/RocksDBEngine/RocksDBTransactionState.h b/arangod/RocksDBEngine/RocksDBTransactionState.h index a77737d8b4..8d9fcd19a5 100644 --- a/arangod/RocksDBEngine/RocksDBTransactionState.h +++ b/arangod/RocksDBEngine/RocksDBTransactionState.h @@ -115,6 +115,9 @@ class RocksDBTransactionState final : public TransactionState { /// @brief Rocksdb sequence number of snapshot. Works while trx /// has either a snapshot or a transaction rocksdb::SequenceNumber sequenceNumber() const; + + /// @brief acquire a database snapshot + bool setSnapshotOnReadOnly(); static RocksDBTransactionState* toState(transaction::Methods* trx) { TRI_ASSERT(trx != nullptr);