1
0
Fork 0

Background Indexing (#8039)

This commit is contained in:
Simon 2019-01-29 09:31:32 +01:00 committed by Jan
parent 93ea32a00a
commit fd70b6fc34
29 changed files with 808 additions and 564 deletions

View File

@ -118,7 +118,6 @@ details, including the index-identifier, is returned.
@endDocuBlock ensureHashIndexArray
{###
Creating Hash Index in Background
---------------------------------
@ -142,7 +141,7 @@ db.collection.ensureIndex({ type: "hash", fields: [ "value" ], inBackground: tru
```
For more information see "Creating Indexes in Background" in the [Index basics](IndexBasics.md) page.
###}
Ensure uniqueness of relations in edge collections
--------------------------------------------------

View File

@ -8,7 +8,7 @@ of documents.
User-defined indexes can be created on collection level. Most user-defined indexes
can be created by specifying the names of the index attributes.
Some index types allow indexing just one attribute (e.g. fulltext index) whereas
Some index types allow indexing just one attribute (e.g. *fulltext* index) whereas
other index types allow indexing multiple attributes at the same time.
Learn how to use different indexes efficiently by going through the
@ -22,15 +22,14 @@ are covered by an edge collection's edge index automatically.
Using the system attribute `_id` in user-defined indexes is not possible, but
indexing `_key`, `_rev`, `_from`, and `_to` is.
{###
Creating new indexes is usually done under an exclusive collection lock. The collection is not
available as long as the index is created. This "foreground" index creation can be undesirable,
Creating new indexes is by default done under an exclusive collection lock. The collection is not
available while the index is being created. This "foreground" index creation can be undesirable,
if you have to perform it on a live system without a dedicated maintenance window.
For potentially long running index creation operations the _rocksdb_ storage-engine also supports
creating indexes in "background". The collection remains available during the index creation,
see the section "Creating Indexes in Background" for more information.
###}
creating indexes in "background". The collection remains (mostly) available during the index creation,
see the section [Creating Indexes in Background](#creating-indexes-in-background) for more information.
ArangoDB provides the following index types:
@ -551,7 +550,7 @@ based on the costs it estimates, even if a vertex centric index might
in fact be faster. Vertex centric indexes are more likely to be chosen
for highly connected graphs and with RocksDB storage engine.
{###
Creating Indexes in Background
------------------------------
@ -610,4 +609,4 @@ if this list grows to tens of millions of entries.
Building an index is always a write heavy operation (internally), it is always a good idea to build indexes
during times with less load.
###}

View File

@ -187,7 +187,6 @@ and
will match.
{###
Creating Skiplist Index in Background
-------------------------------------
@ -211,4 +210,4 @@ db.collection.ensureIndex({ type: "skiplist", fields: [ "value" ], inBackground:
```
For more information see "Creating Indexes in Background" in the [Index basics](IndexBasics.md#) page.
###}

View File

@ -491,12 +491,12 @@ aql::AstNode const* checkAttributeAccess(aql::AstNode const* node,
} // namespace iresearch
} // namespace arangodb
#endif // ARANGOD_IRESEARCH__AQL_HELPER_H
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
#endif // ARANGOD_IRESEARCH__AQL_HELPER_H
// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------

View File

@ -26,7 +26,7 @@
#include "ApplicationFeatures/ApplicationFeature.h"
#include "Basics/ReadWriteLock.h"
class TRI_vocbase_t; // forward declaration
struct TRI_vocbase_t; // forward declaration
namespace arangodb {
@ -51,7 +51,7 @@ class FlushFeature final : public application_features::ApplicationFeature {
virtual ~FlushSubscription() = default;
virtual Result commit(velocypack::Slice const& data) = 0;
};
struct FlushSubscriptionBase; // forward declaration
class FlushSubscriptionBase; // forward declaration
explicit FlushFeature(application_features::ApplicationServer& server);

View File

@ -32,7 +32,8 @@
using namespace arangodb;
RocksDBBackgroundThread::RocksDBBackgroundThread(RocksDBEngine* eng, double interval)
: Thread("RocksDBThread"), _engine(eng), _interval(interval) {}
: Thread("RocksDBThread"), _engine(eng), _interval(interval),
_disableWalFilePruning(0) {}
RocksDBBackgroundThread::~RocksDBBackgroundThread() { shutdown(); }
@ -97,6 +98,7 @@ void RocksDBBackgroundThread::run() {
});
}
if (!disableWalFilePruning()) {
// only start pruning of obsolete WAL files a few minutes after
// server start. if we start pruning too early, replication slaves
// will not have a chance to reconnect to a restarted master in
@ -108,6 +110,7 @@ void RocksDBBackgroundThread::run() {
// and then prune them when they expired
_engine->pruneWalFiles();
}
}
} catch (std::exception const& ex) {
LOG_TOPIC(WARN, Logger::ENGINES)
<< "caught exception in rocksdb background thread: " << ex.what();

View File

@ -53,8 +53,24 @@ class RocksDBBackgroundThread final : public Thread {
void beginShutdown() override;
/// disable pruning of wal files
void disableWalFilePruning(bool disable) {
int sub = 0;
if (disable) {
sub = _disableWalFilePruning.fetch_add(1);
} else {
sub = _disableWalFilePruning.fetch_sub(1);
}
TRI_ASSERT(sub >= 0);
}
bool disableWalFilePruning() const {
return _disableWalFilePruning.load(std::memory_order_acquire) > 0;
}
protected:
void run() override;
std::atomic<int> _disableWalFilePruning;
};
} // namespace arangodb

View File

@ -21,10 +21,13 @@
////////////////////////////////////////////////////////////////////////////////
#include "RocksDBBuilderIndex.h"
#include "Basics/HashSet.h"
#include "Basics/VelocyPackHelper.h"
#include "RocksDBEngine/RocksDBCollection.h"
#include "RocksDBEngine/RocksDBColumnFamily.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBLogValue.h"
#include "RocksDBEngine/RocksDBMethods.h"
#include "RocksDBEngine/RocksDBTransactionCollection.h"
#include "RocksDBEngine/RocksDBTransactionState.h"
@ -39,17 +42,43 @@
#include <rocksdb/utilities/transaction_db.h>
#include <rocksdb/utilities/write_batch_with_index.h>
#include <velocypack/Builder.h>
#include <velocypack/Iterator.h>
#include <velocypack/velocypack-aliases.h>
using namespace arangodb;
using namespace arangodb::rocksutils;
namespace {
struct BuilderTrx : public arangodb::transaction::Methods {
BuilderTrx(std::shared_ptr<transaction::Context> const& transactionContext,
LogicalDataSource const& collection, AccessMode::Type type)
: transaction::Methods(transactionContext), _cid(collection.id()) {
// add the (sole) data-source
addCollection(collection.id(), collection.name(), type);
addHint(transaction::Hints::Hint::NO_DLD);
}
/// @brief get the underlying transaction collection
RocksDBTransactionCollection* resolveTrxCollection() {
return static_cast<RocksDBTransactionCollection*>(trxCollection(_cid));
}
private:
TRI_voc_cid_t _cid;
};
struct BuilderCookie : public arangodb::TransactionState::Cookie {
// do not track removed documents twice
arangodb::HashSet<LocalDocumentId::BaseType> tracked;
};
} // namespace
RocksDBBuilderIndex::RocksDBBuilderIndex(std::shared_ptr<arangodb::RocksDBIndex> const& wp)
: RocksDBIndex(wp->id(), wp->collection(), wp->fields(), wp->unique(),
wp->sparse(), wp->columnFamily(), wp->objectId(),
/*useCache*/ false),
_wrapped(wp),
_hasError(false) {
_wrapped(wp) {
TRI_ASSERT(_wrapped);
}
@ -72,18 +101,24 @@ Result RocksDBBuilderIndex::insert(transaction::Methods& trx, RocksDBMethods* mt
LocalDocumentId const& documentId,
arangodb::velocypack::Slice const& slice,
OperationMode mode) {
TRI_ASSERT(false); // not enabled
Result r = _wrapped->insert(trx, mthd, documentId, slice, mode);
if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) {
// these are expected errors; store in builder and suppress
bool expected = false;
if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) {
std::lock_guard<std::mutex> guard(_errorMutex);
_errorResult = r;
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
auto* ctx = dynamic_cast<::BuilderCookie*>(trx.state()->cookie(this));
#else
auto* ctx = static_cast<::BuilderCookie*>(trx.state()->cookie(this));
#endif
if (!ctx) {
auto ptr = std::make_unique<::BuilderCookie>();
ctx = ptr.get();
trx.state()->cookie(this, std::move(ptr));
}
return Result();
// do not track document more than once
if (ctx->tracked.find(documentId.id()) == ctx->tracked.end()) {
ctx->tracked.insert(documentId.id());
RocksDBLogValue val = RocksDBLogValue::TrackedDocumentInsert(documentId, slice);
mthd->PutLogData(val.slice());
}
return r;
return Result(); // do nothing
}
/// remove index elements and put it in the specified write batch.
@ -91,221 +126,36 @@ Result RocksDBBuilderIndex::remove(transaction::Methods& trx, RocksDBMethods* mt
LocalDocumentId const& documentId,
arangodb::velocypack::Slice const& slice,
OperationMode mode) {
TRI_ASSERT(false); // not enabled
{
std::lock_guard<std::mutex> guard(_removedDocsMutex);
_removedDocs.insert(documentId.id());
}
{ // wait for keys do be inserted, so we can remove them again
std::unique_lock<std::mutex> guard(_lockedDocsMutex);
if (_lockedDocs.find(documentId.id()) != _lockedDocs.end()) {
_lockedDocsCond.wait(guard);
}
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
auto* ctx = dynamic_cast<::BuilderCookie*>(trx.state()->cookie(this));
#else
auto* ctx = static_cast<::BuilderCookie*>(trx.state()->cookie(this));
#endif
if (!ctx) {
auto ptr = std::make_unique<::BuilderCookie>();
ctx = ptr.get();
trx.state()->cookie(this, std::move(ptr));
}
Result r = _wrapped->remove(trx, mthd, documentId, slice, mode);
if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) {
// these are expected errors; store in builder and suppress
bool expected = false;
if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) {
std::lock_guard<std::mutex> guard(_errorMutex);
_errorResult = r;
// do not track document more than once
if (ctx->tracked.find(documentId.id()) == ctx->tracked.end()) {
ctx->tracked.insert(documentId.id());
RocksDBLogValue val = RocksDBLogValue::TrackedDocumentRemove(documentId, slice);
mthd->PutLogData(val.slice());
}
return Result();
}
return r;
}
namespace {
struct BuilderTrx : public arangodb::transaction::Methods {
BuilderTrx(std::shared_ptr<transaction::Context> const& transactionContext,
LogicalDataSource const& collection, AccessMode::Type type)
: transaction::Methods(transactionContext), _cid(collection.id()) {
// add the (sole) data-source
addCollection(collection.id(), collection.name(), type);
addHint(transaction::Hints::Hint::NO_DLD);
}
/// @brief get the underlying transaction collection
RocksDBTransactionCollection* resolveTrxCollection() {
return static_cast<RocksDBTransactionCollection*>(trxCollection(_cid));
}
private:
TRI_voc_cid_t _cid;
};
//struct BuilderCookie {
// VPackBuffer<uint8_t>
// VPackBuilder _removals;
//};
} // namespace
// Background index filler task
// FIXME simon: not used right now because rollbacks are not correctly handled
// yet
arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()> const& unlock) {
arangodb::Result res;
// 1. Index everything under a snapshot iterator (get snapshot under
// exclusive coll lock)
// 2. Track deleted document IDs so we can avoid indexing them
// 3. Avoid conflicts on unique index keys by using rocksdb::Transaction
// snapshot conflict checking
// 4. Supress unique constraint violations / conflicts or client drivers
auto lockedDocsGuard = scopeGuard([&] { // clear all the processed documents
std::lock_guard<std::mutex> guard(_lockedDocsMutex);
_lockedDocs.clear();
_lockedDocsCond.notify_all();
});
// fillindex can be non transactional, we just need to clean up
RocksDBEngine* engine = rocksutils::globalRocksEngine();
RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(_collection.getPhysical());
rocksdb::DB* rootDB = engine->db()->GetRootDB();
TRI_ASSERT(rootDB != nullptr);
uint64_t numDocsWritten = 0;
auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
rocksdb::Slice upper(bounds.end()); // exclusive upper bound
rocksdb::Status s;
rocksdb::WriteOptions wo;
wo.disableWAL = false; // TODO set to true eventually
// create a read-snapshot under the guard
rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
auto snapGuard = scopeGuard([&] { rootDB->ReleaseSnapshot(snap); });
TRI_ASSERT(snap != nullptr);
rocksdb::ReadOptions ro;
ro.snapshot = snap;
ro.prefix_same_as_start = true;
ro.iterate_upper_bound = &upper;
ro.verify_checksums = false;
ro.fill_cache = false;
rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily();
std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
unlock(); // release indexes write lock
// FIXME use buildertrx
SingleCollectionTransaction trx(transaction::StandaloneContext::Create(
_collection.vocbase()),
_collection, AccessMode::Type::WRITE);
res = trx.begin();
if (res.fail()) {
return res;
}
auto state = RocksDBTransactionState::toState(&trx);
// transaction used to perform actual indexing
rocksdb::TransactionOptions to;
to.lock_timeout = 100; // 100ms
std::unique_ptr<rocksdb::Transaction> rtrx(engine->db()->BeginTransaction(wo, to));
if (this->unique()) {
rtrx->SetSnapshot(); // needed for unique index conflict detection
} else {
rtrx->DisableIndexing(); // we never check for existing index keys
}
RocksDBSideTrxMethods batched(state, rtrx.get());
RocksDBIndex* internal = _wrapped.get();
TRI_ASSERT(internal != nullptr);
// FIXE make selectivity estimates batch wise
it->Seek(bounds.start());
while (it->Valid() && it->key().compare(upper) < 0) {
if (_hasError.load(std::memory_order_acquire)) {
std::lock_guard<std::mutex> guard(_errorMutex);
res = _errorResult; // a Writer got an error
break;
}
LocalDocumentId docId = RocksDBKey::documentId(it->key());
{
// must acquire both locks here to prevent interleaved operations
std::lock_guard<std::mutex> guard(_removedDocsMutex);
std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
if (_removedDocs.find(docId.id()) != _removedDocs.end()) {
_removedDocs.erase(_removedDocs.find(docId.id()));
it->Next();
continue;
}
_lockedDocs.insert(docId.id());
}
res = internal->insert(trx, &batched, docId, VPackSlice(it->value().data()),
Index::OperationMode::normal);
if (res.fail()) {
break;
}
numDocsWritten++;
if (numDocsWritten % 200 == 0) { // commit buffered writes
s = rtrx->Commit();
if (!s.ok()) {
res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
break;
}
{ // clear all the processed documents
std::lock_guard<std::mutex> guard(_lockedDocsMutex);
_lockedDocs.clear();
_lockedDocsCond.notify_all();
}
engine->db()->BeginTransaction(wo, to, rtrx.get()); // reuse transaction
if (this->unique()) {
rtrx->SetSnapshot();
}
}
it->Next();
}
// now actually write all remaining index keys
if (res.ok() && rtrx->GetNumPuts() > 0) {
s = rtrx->Commit();
if (!s.ok()) {
res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
}
}
if (res.ok()) {
res = trx.commit(); // required to commit selectivity estimates
}
return res;
return Result(); // do nothing
}
// fast mode assuming exclusive access locked from outside
template <typename WriteBatchType, typename MethodsType>
static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& coll,
WriteBatchType& batch) {
Result res;
::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll,
AccessMode::Type::EXCLUSIVE);
trx.addHint(transaction::Hints::Hint::LOCK_NEVER); // already locked
res = trx.begin();
if (!res.ok()) {
THROW_ARANGO_EXCEPTION(res);
}
RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(coll.getPhysical());
auto state = RocksDBTransactionState::toState(&trx);
auto methds = RocksDBTransactionState::toMethods(&trx);
RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
static arangodb::Result fillIndex(RocksDBIndex& ridx, WriteBatchType& batch,
rocksdb::Snapshot const* snap) {
// fillindex can be non transactional, we just need to clean up
RocksDBEngine* engine = rocksutils::globalRocksEngine();
rocksdb::DB* rootDB = engine->db()->GetRootDB();
rocksdb::DB* rootDB = rocksutils::globalRocksDB()->GetRootDB();
TRI_ASSERT(rootDB != nullptr);
uint64_t numDocsWritten = 0;
// write batch will be reset every x documents
MethodsType batched(state, &batch);
RocksDBCollection* rcoll =
static_cast<RocksDBCollection*>(ridx.collection().getPhysical());
auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
rocksdb::Slice upper(bounds.end());
@ -313,20 +163,32 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
rocksdb::WriteOptions wo;
wo.disableWAL = false; // TODO set to true eventually
const rocksdb::Snapshot* snap = rootDB->GetSnapshot();
auto snapGuard = scopeGuard([&] { rootDB->ReleaseSnapshot(snap); });
rocksdb::ReadOptions ro;
rocksdb::ReadOptions ro(/*cksum*/ false, /*cache*/ false);
ro.snapshot = snap;
ro.prefix_same_as_start = true;
ro.iterate_upper_bound = &upper;
ro.verify_checksums = false;
ro.fill_cache = false;
rocksdb::ColumnFamilyHandle* docCF = RocksDBColumnFamily::documents();
std::unique_ptr<rocksdb::Iterator> it = methds->NewIterator(ro, docCF);
std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
auto commitLambda = [&] {
auto mode = snap == nullptr ? AccessMode::Type::EXCLUSIVE : AccessMode::Type::WRITE;
LogicalCollection& coll = ridx.collection();
::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll, mode);
if (mode == AccessMode::Type::EXCLUSIVE) {
trx.addHint(transaction::Hints::Hint::LOCK_NEVER);
}
Result res = trx.begin();
if (!res.ok()) {
THROW_ARANGO_EXCEPTION(res);
}
uint64_t numDocsWritten = 0;
auto state = RocksDBTransactionState::toState(&trx);
RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
// write batch will be reset every x documents
MethodsType batched(state, &batch);
auto commitLambda = [&](rocksdb::SequenceNumber seq) {
if (batch.GetWriteBatch()->Count() > 0) {
s = rootDB->Write(wo, batch.GetWriteBatch());
if (!s.ok()) {
@ -339,14 +201,18 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
if (!ops.empty()) {
TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1);
auto it = ops.begin();
ridx.estimator()->bufferUpdates(it->first, std::move(it->second.inserts),
TRI_ASSERT(ridx.id() == it->first);
ridx.estimator()->bufferUpdates(seq, std::move(it->second.inserts),
std::move(it->second.removals));
}
};
it->Seek(bounds.start());
while (it->Valid()) {
for (it->Seek(bounds.start()); it->Valid(); it->Next()) {
TRI_ASSERT(it->key().compare(upper) < 0);
if (application_features::ApplicationServer::isStopping()) {
res.reset(TRI_ERROR_SHUTTING_DOWN);
break;
}
res = ridx.insert(trx, &batched, RocksDBKey::documentId(it->key()),
VPackSlice(it->value().data()), Index::OperationMode::normal);
@ -356,57 +222,398 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
numDocsWritten++;
if (numDocsWritten % 200 == 0) { // commit buffered writes
commitLambda();
commitLambda(rootDB->GetLatestSequenceNumber());
if (res.fail()) {
break;
}
}
}
it->Next();
if (!it->status().ok() && res.ok()) {
res = rocksutils::convertStatus(it->status(), rocksutils::StatusHint::index);
}
if (res.ok()) {
commitLambda();
commitLambda(rootDB->GetLatestSequenceNumber());
}
batch.Clear();
if (res.ok()) { // required so iresearch commits
res = trx.commit();
}
// we will need to remove index elements created before an error
// occurred, this needs to happen since we are non transactional
if (res.fail()) {
RocksDBKeyBounds bounds = ridx.getBounds();
arangodb::Result res2 =
rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, true,
/*useRangeDel*/ numDocsWritten > 25000);
if (res2.fail()) {
LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back "
<< "index creation: " << res2.errorMessage();
}
}
// if an error occured drop() will be called
LOG_TOPIC(DEBUG, Logger::ENGINES) << "SNAPSHOT CAPTURED " << numDocsWritten << " " << res.errorMessage();
return res;
}
/// non-transactional: fill index with existing documents
/// from this collection
arangodb::Result RocksDBBuilderIndex::fillIndexFast() {
arangodb::Result RocksDBBuilderIndex::fillIndexForeground() {
RocksDBIndex* internal = _wrapped.get();
TRI_ASSERT(internal != nullptr);
std::function<void()> empty;
const rocksdb::Snapshot* snap = nullptr;
Result res;
if (this->unique()) {
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
// unique index. we need to keep track of all our changes because we need to
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
return ::fillIndexFast<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
*internal, _collection, batch);
res = ::fillIndex<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(*internal, batch, snap);
} else {
// non-unique index. all index keys will be unique anyway because they
// contain the document id we can therefore get away with a cheap WriteBatch
rocksdb::WriteBatch batch(32 * 1024 * 1024);
return ::fillIndexFast<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, _collection,
batch);
res = ::fillIndex<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch, snap);
}
return res;
}
namespace {
template <typename MethodsType>
struct ReplayHandler final : public rocksdb::WriteBatch::Handler {
ReplayHandler(uint64_t oid, RocksDBIndex& idx, transaction::Methods& trx, MethodsType* methods)
: _objectId(oid), _index(idx), _trx(trx), _methods(methods) {}
bool Continue() override {
if (application_features::ApplicationServer::isStopping()) {
tmpRes.reset(TRI_ERROR_SHUTTING_DOWN);
}
return tmpRes.ok();
}
uint64_t numInserted = 0;
uint64_t numRemoved = 0;
Result tmpRes;
void startNewBatch(rocksdb::SequenceNumber startSequence) {
// starting new write batch
_startSequence = startSequence;
_currentSequence = startSequence;
_startOfBatch = true;
_lastObjectID = 0;
}
uint64_t endBatch() {
_lastObjectID = 0;
return _currentSequence;
}
// The default implementation of LogData does nothing.
void LogData(const rocksdb::Slice& blob) override {
switch (RocksDBLogValue::type(blob)) {
case RocksDBLogType::TrackedDocumentInsert:
if (_lastObjectID == _objectId) {
auto pair = RocksDBLogValue::trackedDocument(blob);
tmpRes = _index.insert(_trx, _methods, pair.first, pair.second,
Index::OperationMode::normal);
numInserted++;
}
break;
case RocksDBLogType::TrackedDocumentRemove:
if (_lastObjectID == _objectId) {
auto pair = RocksDBLogValue::trackedDocument(blob);
tmpRes = _index.remove(_trx, _methods, pair.first, pair.second,
Index::OperationMode::normal);
numRemoved++;
}
break;
default: // ignore
_lastObjectID = 0;
break;
}
}
rocksdb::Status PutCF(uint32_t column_family_id, const rocksdb::Slice& key,
rocksdb::Slice const& value) override {
incTick();
if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
_lastObjectID = 0;
} else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
_lastObjectID = RocksDBKey::objectId(key);
}
return rocksdb::Status();
}
rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
incTick();
if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
_lastObjectID = 0;
} else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
_lastObjectID = RocksDBKey::objectId(key);
}
return rocksdb::Status();
}
rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
incTick();
if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
_lastObjectID = 0;
} else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
_lastObjectID = RocksDBKey::objectId(key);
}
return rocksdb::Status();
}
rocksdb::Status DeleteRangeCF(uint32_t column_family_id,
const rocksdb::Slice& begin_key,
const rocksdb::Slice& end_key) override {
incTick(); // drop and truncate may use this
if (column_family_id == _index.columnFamily()->GetID() &&
RocksDBKey::objectId(begin_key) == _objectId &&
RocksDBKey::objectId(end_key) == _objectId) {
_index.afterTruncate(_currentSequence);
}
return rocksdb::Status(); // make WAL iterator happy
}
private:
// tick function that is called before each new WAL entry
void incTick() {
if (_startOfBatch) {
// we are at the start of a batch. do NOT increase sequence number
_startOfBatch = false;
} else {
// we are inside a batch already. now increase sequence number
++_currentSequence;
}
}
private:
const uint64_t _objectId; /// collection objectID
RocksDBIndex& _index; /// the index to use
transaction::Methods& _trx;
MethodsType* _methods; /// methods to fill
rocksdb::SequenceNumber _startSequence;
rocksdb::SequenceNumber _currentSequence;
rocksdb::SequenceNumber _lastWrittenSequence;
bool _startOfBatch = false;
uint64_t _lastObjectID = 0;
};
template <typename WriteBatchType, typename MethodsType>
Result catchup(RocksDBIndex& ridx, WriteBatchType& wb, AccessMode::Type mode,
rocksdb::SequenceNumber startingFrom,
rocksdb::SequenceNumber& lastScannedTick, uint64_t& numScanned) {
LogicalCollection& coll = ridx.collection();
::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll, mode);
if (mode == AccessMode::Type::EXCLUSIVE) {
trx.addHint(transaction::Hints::Hint::LOCK_NEVER);
}
Result res = trx.begin();
if (res.fail()) {
return res;
}
auto state = RocksDBTransactionState::toState(&trx);
RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(coll.getPhysical());
rocksdb::DB* rootDB = rocksutils::globalRocksDB()->GetRootDB();
TRI_ASSERT(rootDB != nullptr);
// write batch will be reset every x documents
MethodsType batched(state, &wb);
ReplayHandler<MethodsType> replay(rcoll->objectId(), ridx, trx, &batched);
std::unique_ptr<rocksdb::TransactionLogIterator> iterator; // reader();
// no need verifying the WAL contents
rocksdb::TransactionLogIterator::ReadOptions ro(false);
rocksdb::Status s = rootDB->GetUpdatesSince(startingFrom, &iterator, ro);
if (!s.ok()) {
return res.reset(convertStatus(s, rocksutils::StatusHint::wal));
}
auto commitLambda = [&](rocksdb::SequenceNumber seq) {
if (wb.GetWriteBatch()->Count() > 0) {
rocksdb::WriteOptions wo;
s = rootDB->Write(wo, wb.GetWriteBatch());
if (!s.ok()) {
res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
}
}
wb.Clear();
auto ops = trxColl->stealTrackedOperations();
if (!ops.empty()) {
TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1);
auto it = ops.begin();
TRI_ASSERT(ridx.id() == it->first);
ridx.estimator()->bufferUpdates(seq, std::move(it->second.inserts),
std::move(it->second.removals));
}
};
LOG_TOPIC(DEBUG, Logger::ENGINES) << "Scanning from " << startingFrom;
for (; iterator->Valid(); iterator->Next()) {
rocksdb::BatchResult batch = iterator->GetBatch();
lastScannedTick = batch.sequence; // start of the batch
if (batch.sequence < startingFrom) {
continue; // skip
}
replay.startNewBatch(batch.sequence);
s = batch.writeBatchPtr->Iterate(&replay);
if (!s.ok()) {
res = rocksutils::convertStatus(s);
break;
}
if (replay.tmpRes.fail()) {
res = replay.tmpRes;
break;
}
commitLambda(batch.sequence);
if (res.fail()) {
break;
}
lastScannedTick = replay.endBatch();
}
if (!iterator->status().ok() && res.ok()) {
LOG_TOPIC(ERR, Logger::ENGINES) << "iterator error " << s.ToString();
res = rocksutils::convertStatus(iterator->status());
}
if (res.ok()) {
numScanned = replay.numInserted + replay.numRemoved;
res = trx.commit(); // important for iresearch
}
LOG_TOPIC(DEBUG, Logger::ENGINES) << "WAL REPLAYED insertions: " << replay.numInserted
<< "; deletions: " << replay.numRemoved << "; lastScannedTick "
<< lastScannedTick;
return res;
}
} // namespace
bool RocksDBBuilderIndex::Locker::lock() {
if (!_locked) {
if (_collection->lockWrite() != TRI_ERROR_NO_ERROR) {
return false;
}
_locked = true;
}
return true;
}
void RocksDBBuilderIndex::Locker::unlock() {
if (_locked) {
_collection->unlockWrite();
_locked = false;
}
}
// Background index filler task
arangodb::Result RocksDBBuilderIndex::fillIndexBackground(Locker& locker) {
TRI_ASSERT(locker.isLocked());
arangodb::Result res;
RocksDBIndex* internal = _wrapped.get();
TRI_ASSERT(internal != nullptr);
RocksDBEngine* engine = globalRocksEngine();
rocksdb::DB* rootDB = engine->db()->GetRootDB();
rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
engine->disableWalFilePruning(true);
auto scope = scopeGuard([&] {
engine->disableWalFilePruning(false);
if (snap) {
rootDB->ReleaseSnapshot(snap);
}
});
locker.unlock();
if (internal->unique()) {
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
// unique index. we need to keep track of all our changes because we need to
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
res = ::fillIndex<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
*internal, batch, snap);
} else {
// non-unique index. all index keys will be unique anyway because they
// contain the document id we can therefore get away with a cheap WriteBatch
rocksdb::WriteBatch batch(32 * 1024 * 1024);
res = ::fillIndex<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
snap);
}
if (res.fail()) {
return res;
}
rocksdb::SequenceNumber scanFrom = snap->GetSequenceNumber();
rootDB->ReleaseSnapshot(snap);
snap = nullptr;
rocksdb::SequenceNumber lastScanned = 0;
uint64_t numScanned = 0;
int maxCatchups = 4;
while(true) {
lastScanned = 0;
if (internal->unique()) {
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
// unique index. we need to keep track of all our changes because we need to
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
res = ::catchup<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
*internal, batch, AccessMode::Type::WRITE, scanFrom, lastScanned, numScanned);
} else {
// non-unique index. all index keys will be unique anyway because they
// contain the document id we can therefore get away with a cheap WriteBatch
rocksdb::WriteBatch batch(32 * 1024 * 1024);
res = ::catchup<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
AccessMode::Type::WRITE,
scanFrom, lastScanned,
numScanned);
}
if (res.fail()) {
return res;
}
if (numScanned < 5000 || maxCatchups-- == 0) {
TRI_ASSERT(lastScanned > scanFrom);
std::this_thread::yield();
break;
}
}
if (!locker.lock()) {
return res.reset(TRI_ERROR_LOCK_TIMEOUT);
}
scanFrom = lastScanned;
if (internal->unique()) {
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
// unique index. we need to keep track of all our changes because we need to
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
res = ::catchup<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
*internal, batch, AccessMode::Type::EXCLUSIVE, scanFrom, lastScanned, numScanned);
} else {
// non-unique index. all index keys will be unique anyway because they
// contain the document id we can therefore get away with a cheap WriteBatch
rocksdb::WriteBatch batch(32 * 1024 * 1024);
res = ::catchup<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
AccessMode::Type::EXCLUSIVE,
scanFrom, lastScanned,
numScanned);
}
return res;
}

View File

@ -30,6 +30,8 @@
namespace arangodb {
class RocksDBCollection;
/// Dummy index class that contains the logic to build indexes
/// without an exclusive lock. It wraps the actual index implementation
/// and adds some required synchronization logic on top
@ -71,13 +73,11 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
bool hasSelectivityEstimate() const override { return false; }
/// insert index elements into the specified write batch.
Result insert(transaction::Methods& trx, RocksDBMethods*,
LocalDocumentId const& documentId,
Result insert(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId,
arangodb::velocypack::Slice const&, OperationMode mode) override;
/// remove index elements and put it in the specified write batch.
Result remove(transaction::Methods& trx, RocksDBMethods*,
LocalDocumentId const& documentId,
Result remove(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId,
arangodb::velocypack::Slice const&, OperationMode mode) override;
RocksDBBuilderIndex(std::shared_ptr<arangodb::RocksDBIndex> const&);
@ -91,12 +91,23 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
}
void recalculateEstimates() override { _wrapped->recalculateEstimates(); }
/// @brief fill index, will exclusively lock the collection
Result fillIndexFast();
/// @brief assumes an exclusive lock on the collection
Result fillIndexForeground();
struct Locker {
Locker(RocksDBCollection* c) : _collection(c), _locked(false) {}
~Locker() { unlock(); }
bool lock();
void unlock();
bool isLocked() const { return _locked; }
private:
RocksDBCollection* const _collection;
bool _locked;
};
/// @brief fill the index, assume already locked exclusively
/// @param unlock called when collection lock can be released
Result fillIndexBackground(std::function<void()> const& unlock);
/// @param locker locks and unlocks the collection
Result fillIndexBackground(Locker& locker);
virtual IndexIterator* iteratorForCondition(transaction::Methods* trx,
ManagedDocumentResult* result,
@ -109,17 +120,6 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
private:
std::shared_ptr<arangodb::RocksDBIndex> _wrapped;
std::atomic<bool> _hasError;
std::mutex _errorMutex;
Result _errorResult;
std::mutex _removedDocsMutex;
std::unordered_set<LocalDocumentId::BaseType> _removedDocs;
std::mutex _lockedDocsMutex;
std::condition_variable _lockedDocsCond;
std::unordered_set<LocalDocumentId::BaseType> _lockedDocs;
};
} // namespace arangodb

View File

@ -80,7 +80,8 @@ RocksDBCollection::RocksDBCollection(LogicalCollection& collection,
_cacheEnabled(
!collection.system() &&
basics::VelocyPackHelper::readBooleanValue(info, "cacheEnabled", false) &&
CacheManagerFeature::MANAGER != nullptr) {
CacheManagerFeature::MANAGER != nullptr),
_numIndexCreations(0) {
TRI_ASSERT(!ServerState::instance()->isCoordinator());
VPackSlice s = info.get("isVolatile");
if (s.isBoolean() && s.getBoolean()) {
@ -108,7 +109,8 @@ RocksDBCollection::RocksDBCollection(LogicalCollection& collection,
_cache(nullptr),
_cachePresent(false),
_cacheEnabled(static_cast<RocksDBCollection const*>(physical)->_cacheEnabled &&
CacheManagerFeature::MANAGER != nullptr) {
CacheManagerFeature::MANAGER != nullptr),
_numIndexCreations(0) {
TRI_ASSERT(!ServerState::instance()->isCoordinator());
rocksutils::globalRocksEngine()->addCollectionMapping(
_objectId, _logicalCollection.vocbase().id(), _logicalCollection.id());
@ -264,8 +266,23 @@ void RocksDBCollection::prepareIndexes(arangodb::velocypack::Slice indexesSlice)
}
WRITE_LOCKER(guard, _indexesLock);
TRI_ASSERT(_indexes.empty());
for (std::shared_ptr<Index>& idx : indexes) {
addIndex(std::move(idx));
auto const id = idx->id();
for (auto const& it : _indexes) {
if (it->id() == id) { // index is there twice
idx.reset();
}
}
if (idx) {
TRI_UpdateTickServer(static_cast<TRI_voc_tick_t>(id));
_indexes.emplace_back(idx);
if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) {
TRI_ASSERT(idx->id() == 0);
_primaryIndex = static_cast<RocksDBPrimaryIndex*>(idx.get());
}
}
}
if (_indexes[0]->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX ||
@ -286,7 +303,7 @@ void RocksDBCollection::prepareIndexes(arangodb::velocypack::Slice indexesSlice)
TRI_ASSERT(!_indexes.empty());
}
std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slice const& info,
std::shared_ptr<Index> RocksDBCollection::createIndex(VPackSlice const& info,
bool restore, bool& created) {
TRI_ASSERT(info.isObject());
Result res;
@ -298,18 +315,17 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
if (res.fail()) {
THROW_ARANGO_EXCEPTION(res);
}
auto releaseGuard =
scopeGuard([&] { vocbase.releaseCollection(&_logicalCollection); });
res = lockWrite();
if (res.fail()) {
THROW_ARANGO_EXCEPTION(res);
}
// WRITE_LOCKER(indexGuard, _indexesLock);
auto unlockGuard = scopeGuard([&] {
// indexGuard.unlock(); // unlock in reverse order
this->unlockWrite();
_numIndexCreations.fetch_add(1, std::memory_order_release);
auto colGuard = scopeGuard([&] {
vocbase.releaseCollection(&_logicalCollection);
_numIndexCreations.fetch_sub(1, std::memory_order_release);
});
RocksDBBuilderIndex::Locker locker(this);
if (!locker.lock()) {
THROW_ARANGO_EXCEPTION(TRI_ERROR_LOCK_TIMEOUT);
}
std::shared_ptr<Index> idx;
{ // Step 1. Check for matching index
WRITE_LOCKER(guard, _indexesLock);
@ -377,23 +393,18 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
}
}
const bool inBackground = false; // TODO simon: will be enabled in a seperate PR
#if 0
bool inBackground = basics::VelocyPackHelper::getBooleanValue(
info, StaticStrings::IndexInBackground, false);
#endif
const bool inBackground =
basics::VelocyPackHelper::getBooleanValue(info, StaticStrings::IndexInBackground, false);
// Step 4. fill index
if (res.ok()) {
if (inBackground) { // allow concurrent inserts into index
_indexes.emplace_back(buildIdx);
res = buildIdx->fillIndexBackground([&] {
unlockGuard.fire(); // will be called at appropriate time
});
res = buildIdx->fillIndexBackground(locker);
} else {
res = buildIdx->fillIndexFast(); // will lock again internally
res = buildIdx->fillIndexForeground(); // will lock again internally
}
}
locker.lock(); // always lock to avoid inconsistencies
// Step 5. cleanup
if (res.ok()) {
@ -411,20 +422,6 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
}
}
// // we should sync the selectivity estimates TODO fix
// res = engine->settingsManager()->sync(false);
// if (res.fail()) { // not critical
// LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: "
// << res.errorMessage();
// res.reset();
// }
//
// rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true);
// if (!s.ok()) { // not critical
// LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: "
// << s.ToString();
// }
#if USE_PLAN_CACHE
arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection->vocbase());
#endif
@ -442,7 +439,6 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
}
}
unlockGuard.fire(); // may have already been fired
if (res.fail()) {
{ // We could not create the index. Better abort
WRITE_LOCKER(guard, _indexesLock);
@ -550,8 +546,7 @@ Result RocksDBCollection::truncate(transaction::Methods& trx, OperationOptions&
if (state->isOnlyExclusiveTransaction() &&
state->hasHint(transaction::Hints::Hint::ALLOW_RANGE_DELETE) &&
static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE)->canUseRangeDeleteInWal() &&
_numberDocuments >= 32 * 1024) {
this->canUseRangeDeleteInWal() && _numberDocuments >= 32 * 1024) {
// non-transactional truncate optimization. We perform a bunch of
// range deletes and circumwent the normal rocksdb::Transaction.
// no savepoint needed here
@ -562,8 +557,6 @@ Result RocksDBCollection::truncate(transaction::Methods& trx, OperationOptions&
}
RocksDBEngine* engine = rocksutils::globalRocksEngine();
// add the assertion again here, so we are sure we can use RangeDeletes
TRI_ASSERT(engine->canUseRangeDeleteInWal());
rocksdb::DB* db = engine->db()->GetRootDB();
TRI_IF_FAILURE("RocksDBCollection::truncate::forceSync") {
@ -831,9 +824,10 @@ Result RocksDBCollection::insert(arangodb::transaction::Methods* trx,
if (documentId.isSet()) {
if (options.indexOperationMode == Index::OperationMode::internal) {
// need to return the key of the conflict document
return Result(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED, keySlice.copyString());
return res.reset(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED,
keySlice.copyString());
}
return Result(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
return res.reset(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
}
}
}
@ -910,7 +904,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
int result = checkRevision(trx, expectedRev, prevRev);
if (result != TRI_ERROR_NO_ERROR) {
return Result(result);
return res.reset(result);
}
}
@ -920,7 +914,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
TRI_ASSERT(!mdr.empty());
trackWaitForSync(trx, options);
return Result();
return res;
}
// merge old and new values
@ -937,7 +931,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
if (_isDBServer) {
// Need to check that no sharding keys have changed:
if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) {
return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
return res.reset(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
}
}
@ -994,13 +988,13 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
// get the previous revision
VPackSlice key = newSlice.get(StaticStrings::KeyString);
Result res;
if (key.isNone()) {
return Result(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
return res.reset(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
}
// get the previous revision
Result res = this->read(trx, key, previous, /*lock*/ false);
res = this->read(trx, key, previous, /*lock*/ false);
if (res.fail()) {
return res;
}
@ -1018,10 +1012,10 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
if (newSlice.isObject()) {
expectedRev = TRI_ExtractRevisionId(newSlice);
}
int res = checkRevision(trx, expectedRev, prevRev);
if (res != TRI_ERROR_NO_ERROR) {
return Result(res);
res = checkRevision(trx, expectedRev, prevRev);
if (res.fail()) {
return res;
}
}
@ -1038,7 +1032,7 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
if (_isDBServer) {
// Need to check that no sharding keys have changed:
if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) {
return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
return res.reset(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
}
}
@ -1050,9 +1044,9 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
// add possible log statement under guard
state->prepareOperation(_logicalCollection.id(), revisionId, TRI_VOC_DOCUMENT_OPERATION_REPLACE);
Result opResult = updateDocument(trx, oldDocumentId, oldDoc, documentId, newDoc, options);
res = updateDocument(trx, oldDocumentId, oldDoc, documentId, newDoc, options);
if (opResult.ok()) {
if (res.ok()) {
trackWaitForSync(trx, options);
if (options.silent) {
@ -1079,7 +1073,7 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
guard.finish(hasPerformedIntermediateCommit);
}
return opResult;
return res;
}
Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice slice,
@ -1103,7 +1097,7 @@ Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice sl
TRI_ASSERT(!key.isNone());
// get the previous revision
auto res = this->read(&trx, key, previous, /*lock*/ false);
Result res = this->read(&trx, key, previous, /*lock*/ false);
if (res.fail()) {
return res;
@ -1120,10 +1114,10 @@ Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice sl
// Check old revision:
if (!options.ignoreRevs && slice.isObject()) {
TRI_voc_rid_t expectedRevisionId = TRI_ExtractRevisionId(slice);
auto res = checkRevision(&trx, expectedRevisionId, oldRevisionId);
res = checkRevision(&trx, expectedRevisionId, oldRevisionId);
if (res != TRI_ERROR_NO_ERROR) {
return Result(res);
if (res.fail()) {
return res;
}
}
@ -1193,29 +1187,6 @@ void RocksDBCollection::figuresSpecific(std::shared_ptr<arangodb::velocypack::Bu
}
}
void RocksDBCollection::addIndex(std::shared_ptr<arangodb::Index> idx) {
// LOCKED from the outside
// primary index must be added at position 0
TRI_ASSERT(ServerState::instance()->isRunningInCluster() ||
idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX ||
_indexes.empty());
auto const id = idx->id();
for (auto const& it : _indexes) {
if (it->id() == id) {
// already have this particular index. do not add it again
return;
}
}
TRI_UpdateTickServer(static_cast<TRI_voc_tick_t>(id));
_indexes.emplace_back(idx);
if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) {
TRI_ASSERT(idx->id() == 0);
_primaryIndex = static_cast<RocksDBPrimaryIndex*>(idx.get());
}
}
Result RocksDBCollection::insertDocument(arangodb::transaction::Methods* trx,
LocalDocumentId const& documentId,
VPackSlice const& doc,
@ -1312,28 +1283,23 @@ Result RocksDBCollection::updateDocument(transaction::Methods* trx,
Result res;
RocksDBMethods* mthd = RocksDBTransactionState::toMethods(trx);
// We NEED to do the PUT first, otherwise WAL tailing breaks
RocksDBKeyLeaser newKey(trx);
newKey->constructDocument(_objectId, newDocumentId);
// simon: we do not need to blacklist the new documentId
// disable indexing in this transaction if we are allowed to
IndexingDisabler disabler(mthd, trx->isSingleOperationTransaction());
rocksdb::Status s =
mthd->Put(RocksDBColumnFamily::documents(), newKey.ref(),
rocksdb::Slice(reinterpret_cast<char const*>(newDoc.begin()),
static_cast<size_t>(newDoc.byteSize())));
RocksDBKeyLeaser key(trx);
key->constructDocument(_objectId, oldDocumentId);
blackListKey(key->string().data(), static_cast<uint32_t>(key->string().size()));
rocksdb::Status s = mthd->SingleDelete(RocksDBColumnFamily::documents(), key.ref());
if (!s.ok()) {
return res.reset(rocksutils::convertStatus(s, rocksutils::document));
}
RocksDBKeyLeaser oldKey(trx);
oldKey->constructDocument(_objectId, oldDocumentId);
blackListKey(oldKey->string().data(), static_cast<uint32_t>(oldKey->string().size()));
s = mthd->SingleDelete(RocksDBColumnFamily::documents(), oldKey.ref());
key->constructDocument(_objectId, newDocumentId);
// simon: we do not need to blacklist the new documentId
s = mthd->Put(RocksDBColumnFamily::documents(), key.ref(),
rocksdb::Slice(reinterpret_cast<char const*>(newDoc.begin()),
static_cast<size_t>(newDoc.byteSize())));
if (!s.ok()) {
return res.reset(rocksutils::convertStatus(s, rocksutils::document));
}
@ -1341,8 +1307,8 @@ Result RocksDBCollection::updateDocument(transaction::Methods* trx,
READ_LOCKER(guard, _indexesLock);
for (std::shared_ptr<Index> const& idx : _indexes) {
RocksDBIndex* rIdx = static_cast<RocksDBIndex*>(idx.get());
res = rIdx->update(*trx, mthd, oldDocumentId, oldDoc, newDocumentId,
newDoc, options.indexOperationMode);
res = rIdx->update(*trx, mthd, oldDocumentId, oldDoc, newDocumentId, newDoc,
options.indexOperationMode);
if (res.fail()) {
break;
@ -1411,8 +1377,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(LocalDocumentId const& d
}
} else {
LOG_TOPIC(DEBUG, Logger::ENGINES)
<< "NOT FOUND rev: " << documentId.id()
<< " trx: " << trx->state()->id() << " seq: " << mthd->sequenceNumber()
<< "NOT FOUND rev: " << documentId.id() << " trx: " << trx->state()->id()
<< " objectID " << _objectId << " name: " << _logicalCollection.name();
mdr.clear();
res.reset(rocksutils::convertStatus(s, rocksutils::document));
@ -1476,8 +1441,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(
cb(documentId, VPackSlice(ps.data()));
} else {
LOG_TOPIC(DEBUG, Logger::ENGINES)
<< "NOT FOUND rev: " << documentId.id()
<< " trx: " << trx->state()->id() << " seq: " << mthd->sequenceNumber()
<< "NOT FOUND rev: " << documentId.id() << " trx: " << trx->state()->id()
<< " objectID " << _objectId << " name: " << _logicalCollection.name();
res.reset(rocksutils::convertStatus(s, rocksutils::document));
}
@ -1663,10 +1627,9 @@ uint64_t RocksDBCollection::recalculateCounts() {
std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro, cf));
std::size_t count = 0;
it->Seek(bounds.start());
while (it->Valid() && it->key().compare(upper) < 0) {
for (it->Seek(bounds.start()); it->Valid(); it->Next()) {
TRI_ASSERT(it->key().compare(upper) < 0);
++count;
it->Next();
}
int64_t adjustment = snapNumberOfDocuments - count;
@ -1780,3 +1743,12 @@ void RocksDBCollection::trackWaitForSync(arangodb::transaction::Methods* trx,
trx->state()->waitForSync(true);
}
}
/// @brief can use non transactional range delete in write ahead log
bool RocksDBCollection::canUseRangeDeleteInWal() const {
if (ServerState::instance()->isSingleServer()) {
// disableWalFilePruning is used by createIndex
return _numIndexCreations.load(std::memory_order_acquire) == 0;
}
return false;
}

View File

@ -181,12 +181,8 @@ class RocksDBCollection final : public PhysicalCollection {
RocksDBCollectionMeta& meta() { return _meta; }
private:
/// @brief track the usage of waitForSync option in an operation
void trackWaitForSync(arangodb::transaction::Methods* trx, OperationOptions& options);
/// @brief return engine-specific figures
void figuresSpecific(std::shared_ptr<velocypack::Builder>&) override;
void addIndex(std::shared_ptr<arangodb::Index> idx);
// @brief return the primary index
// WARNING: Make sure that this instance
@ -232,8 +228,15 @@ class RocksDBCollection final : public PhysicalCollection {
return (_cacheEnabled && _cachePresent);
}
/// @brief track key in file
void blackListKey(char const* data, std::size_t len) const;
/// @brief track the usage of waitForSync option in an operation
void trackWaitForSync(arangodb::transaction::Methods* trx, OperationOptions& options);
/// @brief can use non transactional range delete in write ahead log
bool canUseRangeDeleteInWal() const;
private:
uint64_t const _objectId; // rocksdb-specific object id for collection
RocksDBCollectionMeta _meta; /// collection metadata
@ -247,10 +250,13 @@ class RocksDBCollection final : public PhysicalCollection {
mutable basics::ReadWriteLock _exclusiveLock;
/// @brief document cache (optional)
mutable std::shared_ptr<cache::Cache> _cache;
// we use this boolean for testing whether _cache is set.
// it's quicker than accessing the shared_ptr each time
mutable bool _cachePresent;
bool _cacheEnabled;
/// @brief number of index creations in progress
std::atomic<int> _numIndexCreations;
};
inline RocksDBCollection* toRocksDBCollection(PhysicalCollection* physical) {

View File

@ -354,7 +354,7 @@ Result RocksDBCollectionMeta::deserializeMeta(rocksdb::DB* db, LogicalCollection
if (!s.ok() && !s.IsNotFound()) {
return rocksutils::convertStatus(s);
} else if (s.IsNotFound()) { // expected with nosync recovery tests
LOG_TOPIC(WARN, Logger::ROCKSDB)
LOG_TOPIC(WARN, Logger::ENGINES)
<< "recalculating index estimate for index "
<< "type '" << idx->typeName() << "' with id '" << idx->id() << "'";
idx->recalculateEstimates();

View File

@ -27,7 +27,6 @@
#define ARANGO_ROCKSDB_ROCKSDB_COMMON_H 1
#include "Basics/Common.h"
#include "Basics/Endian.h"
#include "Basics/Result.h"
#include "Basics/RocksDBUtils.h"
#include "RocksDBEngine/RocksDBComparator.h"

View File

@ -902,16 +902,17 @@ int RocksDBEngine::getCollectionsAndIndexes(TRI_vocbase_t& vocbase,
}
int RocksDBEngine::getViews(TRI_vocbase_t& vocbase, arangodb::velocypack::Builder& result) {
rocksdb::ReadOptions readOptions;
std::unique_ptr<rocksdb::Iterator> iter(
_db->NewIterator(readOptions, RocksDBColumnFamily::definitions()));
result.openArray();
auto bounds = RocksDBKeyBounds::DatabaseViews(vocbase.id());
rocksdb::Slice upper = bounds.end();
rocksdb::ColumnFamilyHandle* cf = RocksDBColumnFamily::definitions();
for (iter->Seek(bounds.start());
iter->Valid() && iter->key().compare(bounds.end()) < 0; iter->Next()) {
rocksdb::ReadOptions ro;
ro.iterate_upper_bound = &upper;
std::unique_ptr<rocksdb::Iterator> iter(_db->NewIterator(ro, cf));
result.openArray();
for (iter->Seek(bounds.start()); iter->Valid(); iter->Next()) {
TRI_ASSERT(iter->key().compare(bounds.end()) < 0);
auto slice = VPackSlice(iter->value().data());
LOG_TOPIC(TRACE, Logger::VIEWS) << "got view slice: " << slice.toJson();
@ -1571,6 +1572,14 @@ void RocksDBEngine::waitForEstimatorSync(std::chrono::milliseconds maxWaitTime)
}
}
void RocksDBEngine::disableWalFilePruning(bool disable) {
_backgroundThread->disableWalFilePruning(disable);
}
bool RocksDBEngine::disableWalFilePruning() const {
return _backgroundThread->disableWalFilePruning();
}
Result RocksDBEngine::registerRecoveryHelper(std::shared_ptr<RocksDBRecoveryHelper> helper) {
try {
_recoveryHelpers.emplace_back(helper);
@ -2177,10 +2186,6 @@ void RocksDBEngine::releaseTick(TRI_voc_tick_t tick) {
}
}
bool RocksDBEngine::canUseRangeDeleteInWal() const {
return ServerState::instance()->isSingleServer();
}
} // namespace arangodb
// -----------------------------------------------------------------------------

View File

@ -296,7 +296,9 @@ class RocksDBEngine final : public StorageEngine {
static std::string const EngineName;
static std::string const FeatureName;
bool canUseRangeDeleteInWal() const;
/// @brief allow / disbable removal of WAL files
void disableWalFilePruning(bool disable);
bool disableWalFilePruning() const;
rocksdb::Options const& rocksDBOptions() const { return _options; }

View File

@ -420,12 +420,13 @@ Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
rocksdb::ReadOptions ro = mthds->iteratorReadOptions();
ro.iterate_upper_bound = &end;
std::unique_ptr<rocksdb::Iterator> iter = mthds->NewIterator(ro, _cf);
iter->Seek(bounds.start());
// set is used to perform an intersection with the result set
std::set<LocalDocumentId> intersect;
// apply left to right logic, merging all current results with ALL previous
while (iter->Valid() && cmp->Compare(iter->key(), end) < 0) {
for (iter->Seek(bounds.start());
iter->Valid() && cmp->Compare(iter->key(), end) < 0;
iter->Next()) {
TRI_ASSERT(_objectId == RocksDBKey::objectId(iter->key()));
rocksdb::Status s = iter->status();
@ -442,7 +443,6 @@ Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
} else if (token.operation == FulltextQueryToken::EXCLUDE) {
resultSet.erase(documentId);
}
iter->Next();
}
if (token.operation == FulltextQueryToken::AND) {
if (resultSet.empty() || intersect.empty()) {

View File

@ -42,7 +42,6 @@ namespace cache {
class Cache;
}
class LogicalCollection;
class RocksDBSettingsManager;
class RocksDBMethods;
class RocksDBIndex : public Index {

View File

@ -447,8 +447,7 @@ struct GeoIndexFactory : public DefaultIndexFactory {
if (isCreation && !ServerState::instance()->isCoordinator() &&
!definition.hasKey("objectId")) {
normalized.add("objectId",
arangodb::velocypack::Value(std::to_string(TRI_NewTickServer())));
normalized.add("objectId", VPackValue(std::to_string(TRI_NewTickServer())));
}
return EnhanceJsonIndexGeo(definition, normalized, isCreation);

View File

@ -100,12 +100,33 @@ RocksDBLogValue RocksDBLogValue::DocumentRemoveV2(TRI_voc_rid_t rid) {
RocksDBLogValue RocksDBLogValue::SinglePut(TRI_voc_tick_t vocbaseId, TRI_voc_cid_t cid) {
return RocksDBLogValue(RocksDBLogType::SinglePut, vocbaseId, cid);
}
RocksDBLogValue RocksDBLogValue::SingleRemoveV2(TRI_voc_tick_t vocbaseId,
TRI_voc_cid_t cid, TRI_voc_rid_t rid) {
return RocksDBLogValue(RocksDBLogType::SingleRemoveV2, vocbaseId, cid, rid);
}
/*static*/ RocksDBLogValue RocksDBLogValue::Empty() {
RocksDBLogValue RocksDBLogValue::TrackedDocumentInsert(LocalDocumentId docId,
VPackSlice const& slice) {
RocksDBLogValue val{};
val._buffer.reserve(sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType) + slice.byteSize());
val._buffer.push_back(static_cast<char>(RocksDBLogType::TrackedDocumentInsert));
uintToPersistentLittleEndian(val._buffer, docId.id());
val._buffer.append(slice.startAs<char>(), slice.byteSize());
return val;
}
RocksDBLogValue RocksDBLogValue::TrackedDocumentRemove(LocalDocumentId docId,
VPackSlice const& slice) {
RocksDBLogValue val{};
val._buffer.reserve(sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType) + slice.byteSize());
val._buffer.push_back(static_cast<char>(RocksDBLogType::TrackedDocumentRemove));
uintToPersistentLittleEndian(val._buffer, docId.id());
val._buffer.append(reinterpret_cast<char const*>(slice.begin()), slice.byteSize());
return val;
}
RocksDBLogValue RocksDBLogValue::Empty() {
return RocksDBLogValue();
}
@ -322,6 +343,18 @@ StringRef RocksDBLogValue::oldCollectionName(rocksdb::Slice const& slice) {
return StringRef(slice.data() + off, slice.size() - off);
}
/// @brief get slice from tracked document
std::pair<LocalDocumentId, VPackSlice> RocksDBLogValue::trackedDocument(rocksdb::Slice const& slice) {
TRI_ASSERT(slice.size() >= 2);
RocksDBLogType type = static_cast<RocksDBLogType>(slice.data()[0]);
TRI_ASSERT(type == RocksDBLogType::TrackedDocumentInsert ||
type == RocksDBLogType::TrackedDocumentRemove);
LocalDocumentId id(uintFromPersistentLittleEndian<LocalDocumentId::BaseType>(slice.data() + sizeof(RocksDBLogType)));
VPackSlice data(slice.data() + sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType));
return std::make_pair(id, data);
}
bool RocksDBLogValue::containsDatabaseId(RocksDBLogType type) {
return type == RocksDBLogType::DatabaseCreate || type == RocksDBLogType::DatabaseDrop ||
type == RocksDBLogType::CollectionCreate ||

View File

@ -27,6 +27,7 @@
#include "Basics/Common.h"
#include "Basics/StringRef.h"
#include "RocksDBEngine/RocksDBTypes.h"
#include "VocBase/LocalDocumentId.h"
#include "VocBase/voc-types.h"
#include <rocksdb/slice.h>
@ -72,6 +73,9 @@ class RocksDBLogValue {
static RocksDBLogValue SingleRemoveV2(TRI_voc_tick_t vocbaseId,
TRI_voc_cid_t cid, TRI_voc_rid_t rid);
static RocksDBLogValue TrackedDocumentInsert(LocalDocumentId, velocypack::Slice const&);
static RocksDBLogValue TrackedDocumentRemove(LocalDocumentId, velocypack::Slice const&);
// empty log value
static RocksDBLogValue Empty();
@ -96,9 +100,12 @@ class RocksDBLogValue {
/// @brief get UUID from view drop marker
static arangodb::StringRef viewUUID(rocksdb::Slice const&);
// deprecated method for old collection drop marker
/// @deprecated method for old collection drop marker
static arangodb::StringRef oldCollectionName(rocksdb::Slice const&);
/// @brief get slice from tracked document
static std::pair<LocalDocumentId, velocypack::Slice> trackedDocument(rocksdb::Slice const&);
static bool containsDatabaseId(RocksDBLogType type);
static bool containsCollectionId(RocksDBLogType type);
static bool containsViewId(RocksDBLogType type);

View File

@ -98,10 +98,6 @@ void RocksDBSavePoint::rollback() {
// =================== RocksDBMethods ===================
rocksdb::SequenceNumber RocksDBMethods::sequenceNumber() {
return _state->sequenceNumber();
}
rocksdb::ReadOptions RocksDBMethods::iteratorReadOptions() {
if (_state->hasHint(transaction::Hints::Hint::INTERMEDIATE_COMMITS)) {
rocksdb::ReadOptions ro = _state->_rocksReadOptions;
@ -179,6 +175,10 @@ rocksdb::Status RocksDBReadOnlyMethods::SingleDelete(rocksdb::ColumnFamilyHandle
THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_READ_ONLY);
}
void RocksDBReadOnlyMethods::PutLogData(rocksdb::Slice const& blob) {
THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_READ_ONLY);
}
std::unique_ptr<rocksdb::Iterator> RocksDBReadOnlyMethods::NewIterator(
rocksdb::ReadOptions const& opts, rocksdb::ColumnFamilyHandle* cf) {
TRI_ASSERT(cf != nullptr);
@ -241,6 +241,10 @@ rocksdb::Status RocksDBTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf,
return _state->_rocksTransaction->SingleDelete(cf, key.string());
}
void RocksDBTrxMethods::PutLogData(rocksdb::Slice const& blob) {
_state->_rocksTransaction->PutLogData(blob);
}
std::unique_ptr<rocksdb::Iterator> RocksDBTrxMethods::NewIterator(
rocksdb::ReadOptions const& opts, rocksdb::ColumnFamilyHandle* cf) {
TRI_ASSERT(cf != nullptr);
@ -326,6 +330,10 @@ rocksdb::Status RocksDBBatchedMethods::SingleDelete(rocksdb::ColumnFamilyHandle*
return _wb->SingleDelete(cf, key.string());
}
void RocksDBBatchedMethods::PutLogData(rocksdb::Slice const& blob) {
_wb->PutLogData(blob);
}
std::unique_ptr<rocksdb::Iterator> RocksDBBatchedMethods::NewIterator(
rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*) {
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL,
@ -375,55 +383,13 @@ rocksdb::Status RocksDBBatchedWithIndexMethods::SingleDelete(rocksdb::ColumnFami
return _wb->SingleDelete(cf, key.string());
}
void RocksDBBatchedWithIndexMethods::PutLogData(rocksdb::Slice const& blob) {
_wb->PutLogData(blob);
}
std::unique_ptr<rocksdb::Iterator> RocksDBBatchedWithIndexMethods::NewIterator(
rocksdb::ReadOptions const& ro, rocksdb::ColumnFamilyHandle* cf) {
TRI_ASSERT(cf != nullptr);
return std::unique_ptr<rocksdb::Iterator>(
_wb->NewIteratorWithBase(_db->NewIterator(ro, cf)));
}
// =================== RocksDBSideTrxMethods ====================
/// transaction wrapper, uses the provided rocksdb transaction
RocksDBSideTrxMethods::RocksDBSideTrxMethods(RocksDBTransactionState* state,
rocksdb::Transaction* trx)
: RocksDBMethods(state), _trx(trx) {
_ro.prefix_same_as_start = true;
_ro.fill_cache = false;
}
rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf,
rocksdb::Slice const& key, std::string* val) {
TRI_ASSERT(cf != nullptr);
return _trx->Get(_ro, cf, key, val);
}
rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf,
rocksdb::Slice const& key,
rocksdb::PinnableSlice* val) {
TRI_ASSERT(cf != nullptr);
return _trx->Get(_ro, cf, key, val);
}
rocksdb::Status RocksDBSideTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf,
RocksDBKey const& key,
rocksdb::Slice const& val) {
TRI_ASSERT(cf != nullptr);
return _trx->Put(cf, key.string(), val);
}
rocksdb::Status RocksDBSideTrxMethods::Delete(rocksdb::ColumnFamilyHandle* cf,
RocksDBKey const& key) {
TRI_ASSERT(cf != nullptr);
return _trx->Delete(cf, key.string());
}
rocksdb::Status RocksDBSideTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf,
RocksDBKey const& key) {
TRI_ASSERT(cf != nullptr);
return _trx->SingleDelete(cf, key.string());
}
bool RocksDBSideTrxMethods::DisableIndexing() {
_trx->DisableIndexing();
return true;
}

View File

@ -72,9 +72,6 @@ class RocksDBMethods {
explicit RocksDBMethods(RocksDBTransactionState* state) : _state(state) {}
virtual ~RocksDBMethods() {}
/// @brief current sequence number
rocksdb::SequenceNumber sequenceNumber();
/// @brief read options for use with iterators
rocksdb::ReadOptions iteratorReadOptions();
@ -97,6 +94,8 @@ class RocksDBMethods {
/// when keys are inserted exactly once (and never overwritten)
virtual rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) = 0;
virtual void PutLogData(rocksdb::Slice const&) = 0;
virtual std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
rocksdb::ColumnFamilyHandle*) = 0;
@ -125,6 +124,7 @@ class RocksDBReadOnlyMethods final : public RocksDBMethods {
rocksdb::Slice const& val) override;
rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
void PutLogData(rocksdb::Slice const&) override;
std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
rocksdb::ColumnFamilyHandle*) override;
@ -157,6 +157,7 @@ class RocksDBTrxMethods : public RocksDBMethods {
rocksdb::Slice const& val) override;
rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
void PutLogData(rocksdb::Slice const&) override;
std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
rocksdb::ColumnFamilyHandle*) override;
@ -193,6 +194,8 @@ class RocksDBBatchedMethods final : public RocksDBMethods {
rocksdb::Slice const& val) override;
rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
void PutLogData(rocksdb::Slice const&) override;
std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
rocksdb::ColumnFamilyHandle*) override;
@ -219,6 +222,7 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods {
rocksdb::Slice const& val) override;
rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
void PutLogData(rocksdb::Slice const&) override;
std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
rocksdb::ColumnFamilyHandle*) override;
@ -234,38 +238,6 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods {
rocksdb::WriteBatchWithIndex* _wb;
};
/// transaction wrapper, uses the provided rocksdb transaction
class RocksDBSideTrxMethods final : public RocksDBMethods {
public:
explicit RocksDBSideTrxMethods(RocksDBTransactionState* state, rocksdb::Transaction* trx);
rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key,
std::string* val) override;
rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key,
rocksdb::PinnableSlice* val) override;
rocksdb::Status Put(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key,
rocksdb::Slice const& val) override;
rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
rocksdb::ColumnFamilyHandle*) override {
return nullptr;
}
void SetSavePoint() override {}
rocksdb::Status RollbackToSavePoint() override {
return rocksdb::Status::OK();
}
void PopSavePoint() override {}
bool DisableIndexing() override;
private:
rocksdb::Transaction* _trx;
rocksdb::ReadOptions _ro;
};
// INDEXING MAY ONLY BE DISABLED IN TOPLEVEL AQL TRANSACTIONS
// THIS IS BECAUSE THESE TRANSACTIONS WILL EITHER READ FROM
// OR (XOR) WRITE TO A COLLECTION. IF THIS PRECONDITION IS

View File

@ -335,6 +335,8 @@ class WALParser final : public rocksdb::WriteBatch::Handler {
case RocksDBLogType::DocumentOperationsPrologue:
case RocksDBLogType::DocumentRemove:
case RocksDBLogType::DocumentRemoveAsPartOfUpdate:
case RocksDBLogType::TrackedDocumentInsert:
case RocksDBLogType::TrackedDocumentRemove:
break; // ignore deprecated && unused markers
default:
@ -435,7 +437,9 @@ class WALParser final : public rocksdb::WriteBatch::Handler {
if (cfId != _primaryCF) {
return; // ignore all document operations
} else if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
}
if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
resetTransientState();
return;
}

View File

@ -242,7 +242,7 @@ void RocksDBTransactionCollection::commitCounts(uint64_t trxId, uint64_t commitS
// Update the index estimates.
for (auto& pair : _trackedIndexOperations) {
auto idx = _collection->lookupIndex(pair.first);
if (idx == nullptr) {
if (ADB_UNLIKELY(idx == nullptr)) {
TRI_ASSERT(false); // Index reported estimates, but does not exist
continue;
}

View File

@ -26,7 +26,7 @@
#include "Basics/Common.h"
#include "Basics/SmallVector.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBKey.h"
#include "StorageEngine/TransactionState.h"
#include "Transaction/Hints.h"
#include "Transaction/Methods.h"

View File

@ -183,6 +183,12 @@ char const* arangodb::rocksDBLogTypeName(arangodb::RocksDBLogType type) {
return "SingleRemove";
case arangodb::RocksDBLogType::SingleRemoveV2:
return "SingleRemoveV2";
case arangodb::RocksDBLogType::FlushSync:
return "FlushSync";
case RocksDBLogType::TrackedDocumentInsert:
return "TrackedDocumentInsert";
case RocksDBLogType::TrackedDocumentRemove:
return "TrackedDocumentRemove";
case arangodb::RocksDBLogType::Invalid:
return "Invalid";
default:

View File

@ -86,6 +86,8 @@ enum class RocksDBLogType : char {
SingleRemoveV2 = 'F',
CollectionTruncate = 'G',
FlushSync = 'H', // @see FlushFeature
TrackedDocumentInsert = 'I',
TrackedDocumentRemove = 'J',
};
/// @brief settings keys

View File

@ -390,7 +390,9 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
case RocksDBLogType::DocumentOperationsPrologue:
case RocksDBLogType::DocumentRemove:
case RocksDBLogType::DocumentRemoveAsPartOfUpdate:
break; // ignore deprecated markers
case RocksDBLogType::TrackedDocumentInsert:
case RocksDBLogType::TrackedDocumentRemove:
break; // ignore deprecated / unused markers
default:
LOG_TOPIC(WARN, Logger::REPLICATION)
@ -553,7 +555,9 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
if (cfId != _primaryCF) {
return; // ignore all document operations
} else if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
}
if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
resetTransientState();
return;
}
@ -617,12 +621,6 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
return rocksdb::Status(); // make WAL iterator happy
}
rocksdb::Status MergeCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&) override {
incTick();
// not used for anything in ArangoDB currently
return rocksdb::Status(); // make WAL iterator happy
}
public:
/// figures out from which sequence number we need to start scanning
/// if we just use tickStart rocksdb will skip over batches we might
@ -765,11 +763,6 @@ WalAccessResult RocksDBWalAccess::tail(Filter const& filter, size_t chunkSize,
<< "WAL tailing call. Scan since: " << since << ", tick start: " << filter.tickStart
<< ", tick end: " << filter.tickEnd << ", chunk size: " << chunkSize;
while (iterator->Valid() && lastScannedTick <= filter.tickEnd) {
s = iterator->status();
if (!s.ok()) {
LOG_TOPIC(ERR, Logger::REPLICATION) << "error during WAL scan: " << s.ToString();
break; // s is considered in the end
}
rocksdb::BatchResult batch = iterator->GetBatch();
// record the first tick we are actually considering

View File

@ -8,7 +8,7 @@
///
/// DISCLAIMER
///
/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
/// Copyright 2018-2019 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
@ -131,6 +131,64 @@ function backgroundIndexSuite() {
}
},
testInsertParallelNonUnique2: function () {
let c = require("internal").db._collection(cn);
// first lets add some initial documents
let x = 10;
while(x-- > 0) {
let docs = [];
for(let i = 0; i < 1000; i++) {
docs.push({value:i});
}
c.save(docs);
}
// lets insert the rest via tasks
let n = 9;
for (let i = 0; i < n; ++i) {
if (i === 6) { // create the index in a task
let command = `const c = require("internal").db._collection("${cn}");
c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true});`;
tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
}
let command = `const c = require("internal").db._collection("${cn}");
let x = 10;
while(x-- > 0) {
let docs = [];
for(let i = 0; i < 1000; i++) {
docs.push({value:i})
}
c.save(docs);
}`;
tasks.register({ name: "UnitTestsIndexInsert" + i, command: command });
}
// wait for tasks to complete
waitForTasks();
// sanity checks
assertEqual(c.count(), 100000);
for (let i = 0; i < 1000; i++) { // 100 entries of each value [0,999]
let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1",
{'@coll': cn, 'val': i}, {count:true});
assertEqual(cursor.count(), 100);
}
internal.waitForEstimatorSync(); // make sure estimates are consistent
let indexes = c.getIndexes(true);
for (let i of indexes) {
switch (i.type) {
case 'primary':
break;
case 'hash':
assertEqual(i.selectivityEstimate, 0.01);
break;
default:
fail();
}
}
},
testInsertParallelUnique: function () {
let c = require("internal").db._collection(cn);
// first lets add some initial documents
@ -227,17 +285,6 @@ function backgroundIndexSuite() {
// sanity checks
assertEqual(c.count(), 50001);
let indexes = c.getIndexes();
for (let i of indexes) {
switch (i.type) {
case 'primary':
break;
case 'hash':
default:
fail();
}
}
},
testRemoveParallel: function () {
@ -256,6 +303,12 @@ function backgroundIndexSuite() {
// lets remove half via tasks
for (let i = 0; i < 10; ++i) {
if (i === 3) { // create the index in a task
let command = `const c = require("internal").db._collection("${cn}");
c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true});`;
tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
}
let command = `const c = require("internal").db._collection("${cn}");
if (!c) {
throw new Error('could not find collection');
@ -275,9 +328,6 @@ function backgroundIndexSuite() {
tasks.register({ name: "UnitTestsIndexRemove" + i, command: command });
}
// create the index on the main thread
c.ensureIndex({type: 'hash', fields: ['value'], inBackground: true });
// wait for insertion tasks to complete
waitForTasks();
@ -298,12 +348,14 @@ function backgroundIndexSuite() {
}
}
internal.waitForEstimatorSync(); // make sure estimates are consistent
let indexes = c.getIndexes(true);
for (let i of indexes) {
switch (i.type) {
case 'primary':
break;
case 'hash':
assertTrue(Math.abs(i.selectivityEstimate - 1.0) < 0.005, i);
break;
default:
fail();
@ -327,6 +379,11 @@ function backgroundIndexSuite() {
// lets update all via tasks
for (let i = 0; i < 10; ++i) {
if (i === 5) { // create the index in a task
let command = `const c = require("internal").db._collection("${cn}");
c.ensureIndex({type: 'skiplist', fields: ['value'], unique: false, inBackground: true});`;
tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
}
let command = `const c = require("internal").db._collection("${cn}");
if (!c) {
throw new Error('could not find collection');
@ -350,9 +407,6 @@ function backgroundIndexSuite() {
// wait for insertion tasks to complete
waitForTasks();
// create the index on the main thread
c.ensureIndex({type: 'skiplist', fields: ['value'], inBackground: true });
// sanity checks
assertEqual(c.count(), 100000);
// check for new entries via index
@ -364,12 +418,14 @@ function backgroundIndexSuite() {
{'@coll': cn, 'val': 100000}, {count:true});
assertEqual(oldCursor.count(), 0);
internal.waitForEstimatorSync(); // make sure estimates are consistent
let indexes = c.getIndexes(true);
for (let i of indexes) {
switch (i.type) {
case 'primary':
break;
case 'skiplist':
assertTrue(Math.abs(i.selectivityEstimate - 1.0) < 0.005, i);
break;
default:
fail();