mirror of https://gitee.com/bigwinds/arangodb
Background Indexing (#8039)
parent 93ea32a00a
commit fd70b6fc34
@@ -118,7 +118,6 @@ details, including the index-identifier, is returned.

@endDocuBlock ensureHashIndexArray

{###
Creating Hash Index in Background
---------------------------------

@@ -142,7 +141,7 @@ db.collection.ensureIndex({ type: "hash", fields: [ "value" ], inBackground: tru
```

For more information see "Creating Indexes in Background" in the [Index basics](IndexBasics.md) page.
###}


Ensure uniqueness of relations in edge collections
--------------------------------------------------

@@ -8,7 +8,7 @@ of documents.

User-defined indexes can be created on collection level. Most user-defined indexes
can be created by specifying the names of the index attributes.
Some index types allow indexing just one attribute (e.g. fulltext index) whereas
Some index types allow indexing just one attribute (e.g. *fulltext* index) whereas
other index types allow indexing multiple attributes at the same time.

Learn how to use different indexes efficiently by going through the

@@ -22,15 +22,14 @@ are covered by an edge collection's edge index automatically.

Using the system attribute `_id` in user-defined indexes is not possible, but
indexing `_key`, `_rev`, `_from`, and `_to` is.

{###
Creating new indexes is usually done under an exclusive collection lock. The collection is not
available as long as the index is created. This "foreground" index creation can be undesirable,
Creating new indexes is by default done under an exclusive collection lock. The collection is not
available while the index is being created. This "foreground" index creation can be undesirable,
if you have to perform it on a live system without a dedicated maintenance window.

For potentially long running index creation operations the _rocksdb_ storage-engine also supports
creating indexes in "background". The collection remains available during the index creation;
see the section "Creating Indexes in Background" for more information.
###}
creating indexes in "background". The collection remains (mostly) available during the index creation;
see the section [Creating Indexes in Background](#creating-indexes-in-background) for more information.


ArangoDB provides the following index types:

@@ -551,7 +550,7 @@ based on the costs it estimates, even if a vertex centric index might

in fact be faster. Vertex centric indexes are more likely to be chosen
for highly connected graphs and with the RocksDB storage engine.

{###

Creating Indexes in Background
------------------------------


@@ -610,4 +609,4 @@ if this list grows to tens of millions of entries.

Building an index is always a write-heavy operation (internally), so it is a good idea to build indexes
during times of lower load.
###}

@@ -187,7 +187,6 @@ and

will match.


{###
Creating Skiplist Index in Background
-------------------------------------


@@ -211,4 +210,4 @@ db.collection.ensureIndex({ type: "skiplist", fields: [ "value" ], inBackground:
```

For more information see "Creating Indexes in Background" in the [Index basics](IndexBasics.md#) page.
###}

@ -491,12 +491,12 @@ aql::AstNode const* checkAttributeAccess(aql::AstNode const* node,
|
|||
} // namespace iresearch
|
||||
} // namespace arangodb
|
||||
|
||||
#endif // ARANGOD_IRESEARCH__AQL_HELPER_H
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
#endif // ARANGOD_IRESEARCH__AQL_HELPER_H
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// --SECTION-- END-OF-FILE
|
||||
// -----------------------------------------------------------------------------
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
#include "ApplicationFeatures/ApplicationFeature.h"
|
||||
#include "Basics/ReadWriteLock.h"
|
||||
|
||||
class TRI_vocbase_t; // forward declaration
|
||||
struct TRI_vocbase_t; // forward declaration
|
||||
|
||||
namespace arangodb {
|
||||
|
||||
|
@ -51,7 +51,7 @@ class FlushFeature final : public application_features::ApplicationFeature {
|
|||
virtual ~FlushSubscription() = default;
|
||||
virtual Result commit(velocypack::Slice const& data) = 0;
|
||||
};
|
||||
struct FlushSubscriptionBase; // forward declaration
|
||||
class FlushSubscriptionBase; // forward declaration
|
||||
|
||||
explicit FlushFeature(application_features::ApplicationServer& server);
|
||||
|
||||
|
|
|
@ -32,7 +32,8 @@
|
|||
using namespace arangodb;
|
||||
|
||||
RocksDBBackgroundThread::RocksDBBackgroundThread(RocksDBEngine* eng, double interval)
|
||||
: Thread("RocksDBThread"), _engine(eng), _interval(interval) {}
|
||||
: Thread("RocksDBThread"), _engine(eng), _interval(interval),
|
||||
_disableWalFilePruning(0) {}
|
||||
|
||||
RocksDBBackgroundThread::~RocksDBBackgroundThread() { shutdown(); }
|
||||
|
||||
|
@ -97,6 +98,7 @@ void RocksDBBackgroundThread::run() {
|
|||
});
|
||||
}
|
||||
|
||||
if (!disableWalFilePruning()) {
|
||||
// only start pruning of obsolete WAL files a few minutes after
|
||||
// server start. if we start pruning too early, replication slaves
|
||||
// will not have a chance to reconnect to a restarted master in
|
||||
|
@ -108,6 +110,7 @@ void RocksDBBackgroundThread::run() {
|
|||
// and then prune them when they expired
|
||||
_engine->pruneWalFiles();
|
||||
}
|
||||
}
|
||||
} catch (std::exception const& ex) {
|
||||
LOG_TOPIC(WARN, Logger::ENGINES)
|
||||
<< "caught exception in rocksdb background thread: " << ex.what();
|
||||
|
|
|
@ -53,8 +53,24 @@ class RocksDBBackgroundThread final : public Thread {
|
|||
|
||||
void beginShutdown() override;
|
||||
|
||||
/// disable pruning of wal files
|
||||
void disableWalFilePruning(bool disable) {
|
||||
int sub = 0;
|
||||
if (disable) {
|
||||
sub = _disableWalFilePruning.fetch_add(1);
|
||||
} else {
|
||||
sub = _disableWalFilePruning.fetch_sub(1);
|
||||
}
|
||||
TRI_ASSERT(sub >= 0);
|
||||
}
|
||||
|
||||
bool disableWalFilePruning() const {
|
||||
return _disableWalFilePruning.load(std::memory_order_acquire) > 0;
|
||||
}
|
||||
|
||||
protected:
|
||||
void run() override;
|
||||
std::atomic<int> _disableWalFilePruning;
|
||||
};
|
||||
} // namespace arangodb
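
The `_disableWalFilePruning` member above is effectively a reference count: every concurrent index build bumps it while it needs old WAL files to stay around, and the background thread only prunes WAL files while the counter reads zero. Below is a minimal standalone sketch of that pattern; the class and variable names are illustrative and not ArangoDB's actual code.

```cpp
#include <atomic>
#include <cassert>

// RAII guard: while any instance is alive, WAL file pruning stays disabled.
class PruningPause {
 public:
  explicit PruningPause(std::atomic<int>& counter) : _counter(counter) {
    _counter.fetch_add(1, std::memory_order_release);
  }
  ~PruningPause() { _counter.fetch_sub(1, std::memory_order_release); }

  PruningPause(const PruningPause&) = delete;
  PruningPause& operator=(const PruningPause&) = delete;

 private:
  std::atomic<int>& _counter;
};

std::atomic<int> walPruningDisabled{0};

bool walFilePruningAllowed() {
  return walPruningDisabled.load(std::memory_order_acquire) == 0;
}

int main() {
  assert(walFilePruningAllowed());
  {
    PruningPause buildA(walPruningDisabled);
    PruningPause buildB(walPruningDisabled);  // two index builds in parallel
    assert(!walFilePruningAllowed());
  }
  assert(walFilePruningAllowed());  // both builds finished, pruning may resume
  return 0;
}
```

An RAII guard makes it harder to forget the matching decrement on error paths, which the raw `fetch_add`/`fetch_sub` pair in the patch has to handle manually.
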
|
||||
|
||||
|
|
|
@ -21,10 +21,13 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "RocksDBBuilderIndex.h"
|
||||
|
||||
#include "Basics/HashSet.h"
|
||||
#include "Basics/VelocyPackHelper.h"
|
||||
#include "RocksDBEngine/RocksDBCollection.h"
|
||||
#include "RocksDBEngine/RocksDBColumnFamily.h"
|
||||
#include "RocksDBEngine/RocksDBCommon.h"
|
||||
#include "RocksDBEngine/RocksDBLogValue.h"
|
||||
#include "RocksDBEngine/RocksDBMethods.h"
|
||||
#include "RocksDBEngine/RocksDBTransactionCollection.h"
|
||||
#include "RocksDBEngine/RocksDBTransactionState.h"
|
||||
|
@ -39,17 +42,43 @@
|
|||
#include <rocksdb/utilities/transaction_db.h>
|
||||
#include <rocksdb/utilities/write_batch_with_index.h>
|
||||
|
||||
#include <velocypack/Builder.h>
|
||||
#include <velocypack/Iterator.h>
|
||||
#include <velocypack/velocypack-aliases.h>
|
||||
|
||||
using namespace arangodb;
|
||||
using namespace arangodb::rocksutils;
|
||||
|
||||
namespace {
|
||||
struct BuilderTrx : public arangodb::transaction::Methods {
|
||||
BuilderTrx(std::shared_ptr<transaction::Context> const& transactionContext,
|
||||
LogicalDataSource const& collection, AccessMode::Type type)
|
||||
: transaction::Methods(transactionContext), _cid(collection.id()) {
|
||||
// add the (sole) data-source
|
||||
addCollection(collection.id(), collection.name(), type);
|
||||
addHint(transaction::Hints::Hint::NO_DLD);
|
||||
}
|
||||
|
||||
/// @brief get the underlying transaction collection
|
||||
RocksDBTransactionCollection* resolveTrxCollection() {
|
||||
return static_cast<RocksDBTransactionCollection*>(trxCollection(_cid));
|
||||
}
|
||||
|
||||
private:
|
||||
TRI_voc_cid_t _cid;
|
||||
};
|
||||
|
||||
struct BuilderCookie : public arangodb::TransactionState::Cookie {
|
||||
// do not track removed documents twice
|
||||
arangodb::HashSet<LocalDocumentId::BaseType> tracked;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
RocksDBBuilderIndex::RocksDBBuilderIndex(std::shared_ptr<arangodb::RocksDBIndex> const& wp)
|
||||
: RocksDBIndex(wp->id(), wp->collection(), wp->fields(), wp->unique(),
|
||||
wp->sparse(), wp->columnFamily(), wp->objectId(),
|
||||
/*useCache*/ false),
|
||||
_wrapped(wp),
|
||||
_hasError(false) {
|
||||
_wrapped(wp) {
|
||||
TRI_ASSERT(_wrapped);
|
||||
}
|
||||
|
||||
|
@ -72,18 +101,24 @@ Result RocksDBBuilderIndex::insert(transaction::Methods& trx, RocksDBMethods* mt
|
|||
LocalDocumentId const& documentId,
|
||||
arangodb::velocypack::Slice const& slice,
|
||||
OperationMode mode) {
|
||||
TRI_ASSERT(false); // not enabled
|
||||
Result r = _wrapped->insert(trx, mthd, documentId, slice, mode);
|
||||
if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) {
|
||||
// these are expected errors; store in builder and suppress
|
||||
bool expected = false;
|
||||
if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) {
|
||||
std::lock_guard<std::mutex> guard(_errorMutex);
|
||||
_errorResult = r;
|
||||
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
|
||||
auto* ctx = dynamic_cast<::BuilderCookie*>(trx.state()->cookie(this));
|
||||
#else
|
||||
auto* ctx = static_cast<::BuilderCookie*>(trx.state()->cookie(this));
|
||||
#endif
|
||||
if (!ctx) {
|
||||
auto ptr = std::make_unique<::BuilderCookie>();
|
||||
ctx = ptr.get();
|
||||
trx.state()->cookie(this, std::move(ptr));
|
||||
}
|
||||
return Result();
|
||||
|
||||
// do not track document more than once
|
||||
if (ctx->tracked.find(documentId.id()) == ctx->tracked.end()) {
|
||||
ctx->tracked.insert(documentId.id());
|
||||
RocksDBLogValue val = RocksDBLogValue::TrackedDocumentInsert(documentId, slice);
|
||||
mthd->PutLogData(val.slice());
|
||||
}
|
||||
return r;
|
||||
return Result(); // do nothing
|
||||
}
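
The notable part of this `insert` override is the `PutLogData` call: besides the regular index write, a `TrackedDocumentInsert` marker is attached to the write batch. Such a blob is carried in the WAL but never stored as a key-value pair, and the catch-up phase can fish it back out with a `rocksdb::WriteBatch::Handler`. A small self-contained sketch of that round trip against plain RocksDB follows; the key and marker contents are made up for illustration.

```cpp
#include <cassert>
#include <iostream>

#include <rocksdb/write_batch.h>

// Handler that receives the batch contents during WriteBatch::Iterate().
struct LogDataPrinter : public rocksdb::WriteBatch::Handler {
  void Put(const rocksdb::Slice& key, const rocksdb::Slice& value) override {
    std::cout << "Put " << key.ToString() << " -> " << value.ToString() << "\n";
  }
  void LogData(const rocksdb::Slice& blob) override {
    std::cout << "LogData marker: " << blob.ToString() << "\n";
  }
};

int main() {
  rocksdb::WriteBatch batch;
  batch.Put("doc/1", "{...}");
  // Attach a side-band marker to the batch; it ends up in the WAL only and
  // can be recovered later by a WAL-tailing handler.
  batch.PutLogData("tracked-document-insert:1");

  LogDataPrinter printer;
  rocksdb::Status s = batch.Iterate(&printer);
  assert(s.ok());
  return 0;
}
```

When the WAL is tailed later, `WriteBatch::Iterate` invokes `LogData` for exactly these blobs, which is how the `ReplayHandler` further down recognizes documents that changed while the index was being built.
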
|
||||
|
||||
/// remove index elements and put it in the specified write batch.
|
||||
|
@ -91,221 +126,36 @@ Result RocksDBBuilderIndex::remove(transaction::Methods& trx, RocksDBMethods* mt
|
|||
LocalDocumentId const& documentId,
|
||||
arangodb::velocypack::Slice const& slice,
|
||||
OperationMode mode) {
|
||||
TRI_ASSERT(false); // not enabled
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(_removedDocsMutex);
|
||||
_removedDocs.insert(documentId.id());
|
||||
}
|
||||
{ // wait for keys to be inserted, so we can remove them again
|
||||
std::unique_lock<std::mutex> guard(_lockedDocsMutex);
|
||||
if (_lockedDocs.find(documentId.id()) != _lockedDocs.end()) {
|
||||
_lockedDocsCond.wait(guard);
|
||||
}
|
||||
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
|
||||
auto* ctx = dynamic_cast<::BuilderCookie*>(trx.state()->cookie(this));
|
||||
#else
|
||||
auto* ctx = static_cast<::BuilderCookie*>(trx.state()->cookie(this));
|
||||
#endif
|
||||
if (!ctx) {
|
||||
auto ptr = std::make_unique<::BuilderCookie>();
|
||||
ctx = ptr.get();
|
||||
trx.state()->cookie(this, std::move(ptr));
|
||||
}
|
||||
|
||||
Result r = _wrapped->remove(trx, mthd, documentId, slice, mode);
|
||||
if (r.is(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED)) {
|
||||
// these are expected errors; store in builder and suppress
|
||||
bool expected = false;
|
||||
if (!r.ok() && _hasError.compare_exchange_strong(expected, true)) {
|
||||
std::lock_guard<std::mutex> guard(_errorMutex);
|
||||
_errorResult = r;
|
||||
// do not track document more than once
|
||||
if (ctx->tracked.find(documentId.id()) == ctx->tracked.end()) {
|
||||
ctx->tracked.insert(documentId.id());
|
||||
RocksDBLogValue val = RocksDBLogValue::TrackedDocumentRemove(documentId, slice);
|
||||
mthd->PutLogData(val.slice());
|
||||
}
|
||||
return Result();
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct BuilderTrx : public arangodb::transaction::Methods {
|
||||
BuilderTrx(std::shared_ptr<transaction::Context> const& transactionContext,
|
||||
LogicalDataSource const& collection, AccessMode::Type type)
|
||||
: transaction::Methods(transactionContext), _cid(collection.id()) {
|
||||
// add the (sole) data-source
|
||||
addCollection(collection.id(), collection.name(), type);
|
||||
addHint(transaction::Hints::Hint::NO_DLD);
|
||||
}
|
||||
|
||||
/// @brief get the underlying transaction collection
|
||||
RocksDBTransactionCollection* resolveTrxCollection() {
|
||||
return static_cast<RocksDBTransactionCollection*>(trxCollection(_cid));
|
||||
}
|
||||
|
||||
private:
|
||||
TRI_voc_cid_t _cid;
|
||||
};
|
||||
|
||||
//struct BuilderCookie {
|
||||
// VPackBuffer<uint8_t>
|
||||
// VPackBuilder _removals;
|
||||
//};
|
||||
} // namespace
|
||||
|
||||
// Background index filler task
|
||||
// FIXME simon: not used right now because rollbacks are not correctly handled
|
||||
// yet
|
||||
arangodb::Result RocksDBBuilderIndex::fillIndexBackground(std::function<void()> const& unlock) {
|
||||
arangodb::Result res;
|
||||
|
||||
// 1. Index everything under a snapshot iterator (get snapshot under
|
||||
// exclusive coll lock)
|
||||
// 2. Track deleted document IDs so we can avoid indexing them
|
||||
// 3. Avoid conflicts on unique index keys by using rocksdb::Transaction
|
||||
// snapshot conflict checking
|
||||
// 4. Suppress unique constraint violations / conflicts from client drivers
|
||||
|
||||
auto lockedDocsGuard = scopeGuard([&] { // clear all the processed documents
|
||||
std::lock_guard<std::mutex> guard(_lockedDocsMutex);
|
||||
_lockedDocs.clear();
|
||||
_lockedDocsCond.notify_all();
|
||||
});
|
||||
|
||||
// fillIndex can be non-transactional, we just need to clean up
|
||||
RocksDBEngine* engine = rocksutils::globalRocksEngine();
|
||||
RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(_collection.getPhysical());
|
||||
rocksdb::DB* rootDB = engine->db()->GetRootDB();
|
||||
TRI_ASSERT(rootDB != nullptr);
|
||||
|
||||
uint64_t numDocsWritten = 0;
|
||||
|
||||
auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
|
||||
|
||||
rocksdb::Slice upper(bounds.end()); // exclusive upper bound
|
||||
rocksdb::Status s;
|
||||
rocksdb::WriteOptions wo;
|
||||
wo.disableWAL = false; // TODO set to true eventually
|
||||
|
||||
// create a read-snapshot under the guard
|
||||
rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
|
||||
auto snapGuard = scopeGuard([&] { rootDB->ReleaseSnapshot(snap); });
|
||||
TRI_ASSERT(snap != nullptr);
|
||||
|
||||
rocksdb::ReadOptions ro;
|
||||
ro.snapshot = snap;
|
||||
ro.prefix_same_as_start = true;
|
||||
ro.iterate_upper_bound = &upper;
|
||||
ro.verify_checksums = false;
|
||||
ro.fill_cache = false;
|
||||
|
||||
rocksdb::ColumnFamilyHandle* docCF = bounds.columnFamily();
|
||||
std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
|
||||
|
||||
unlock(); // release indexes write lock
|
||||
// FIXME use buildertrx
|
||||
SingleCollectionTransaction trx(transaction::StandaloneContext::Create(
|
||||
_collection.vocbase()),
|
||||
_collection, AccessMode::Type::WRITE);
|
||||
res = trx.begin();
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
auto state = RocksDBTransactionState::toState(&trx);
|
||||
|
||||
// transaction used to perform actual indexing
|
||||
rocksdb::TransactionOptions to;
|
||||
to.lock_timeout = 100; // 100ms
|
||||
std::unique_ptr<rocksdb::Transaction> rtrx(engine->db()->BeginTransaction(wo, to));
|
||||
if (this->unique()) {
|
||||
rtrx->SetSnapshot(); // needed for unique index conflict detection
|
||||
} else {
|
||||
rtrx->DisableIndexing(); // we never check for existing index keys
|
||||
}
|
||||
RocksDBSideTrxMethods batched(state, rtrx.get());
|
||||
|
||||
RocksDBIndex* internal = _wrapped.get();
|
||||
TRI_ASSERT(internal != nullptr);
|
||||
|
||||
// FIXME make selectivity estimates batch-wise
|
||||
it->Seek(bounds.start());
|
||||
while (it->Valid() && it->key().compare(upper) < 0) {
|
||||
if (_hasError.load(std::memory_order_acquire)) {
|
||||
std::lock_guard<std::mutex> guard(_errorMutex);
|
||||
res = _errorResult; // a Writer got an error
|
||||
break;
|
||||
}
|
||||
|
||||
LocalDocumentId docId = RocksDBKey::documentId(it->key());
|
||||
{
|
||||
// must acquire both locks here to prevent interleaved operations
|
||||
std::lock_guard<std::mutex> guard(_removedDocsMutex);
|
||||
std::lock_guard<std::mutex> guard2(_lockedDocsMutex);
|
||||
if (_removedDocs.find(docId.id()) != _removedDocs.end()) {
|
||||
_removedDocs.erase(_removedDocs.find(docId.id()));
|
||||
it->Next();
|
||||
continue;
|
||||
}
|
||||
_lockedDocs.insert(docId.id());
|
||||
}
|
||||
|
||||
res = internal->insert(trx, &batched, docId, VPackSlice(it->value().data()),
|
||||
Index::OperationMode::normal);
|
||||
if (res.fail()) {
|
||||
break;
|
||||
}
|
||||
numDocsWritten++;
|
||||
|
||||
if (numDocsWritten % 200 == 0) { // commit buffered writes
|
||||
s = rtrx->Commit();
|
||||
if (!s.ok()) {
|
||||
res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
|
||||
break;
|
||||
}
|
||||
{ // clear all the processed documents
|
||||
std::lock_guard<std::mutex> guard(_lockedDocsMutex);
|
||||
_lockedDocs.clear();
|
||||
_lockedDocsCond.notify_all();
|
||||
}
|
||||
engine->db()->BeginTransaction(wo, to, rtrx.get()); // reuse transaction
|
||||
if (this->unique()) {
|
||||
rtrx->SetSnapshot();
|
||||
}
|
||||
}
|
||||
|
||||
it->Next();
|
||||
}
|
||||
|
||||
// now actually write all remaining index keys
|
||||
if (res.ok() && rtrx->GetNumPuts() > 0) {
|
||||
s = rtrx->Commit();
|
||||
if (!s.ok()) {
|
||||
res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
|
||||
}
|
||||
}
|
||||
|
||||
if (res.ok()) {
|
||||
res = trx.commit(); // required to commit selectivity estimates
|
||||
}
|
||||
|
||||
return res;
|
||||
return Result(); // do nothing
|
||||
}
|
||||
|
||||
// fast mode assuming exclusive access locked from outside
|
||||
template <typename WriteBatchType, typename MethodsType>
|
||||
static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& coll,
|
||||
WriteBatchType& batch) {
|
||||
Result res;
|
||||
::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll,
|
||||
AccessMode::Type::EXCLUSIVE);
|
||||
trx.addHint(transaction::Hints::Hint::LOCK_NEVER); // already locked
|
||||
res = trx.begin();
|
||||
if (!res.ok()) {
|
||||
THROW_ARANGO_EXCEPTION(res);
|
||||
}
|
||||
|
||||
RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(coll.getPhysical());
|
||||
auto state = RocksDBTransactionState::toState(&trx);
|
||||
auto methds = RocksDBTransactionState::toMethods(&trx);
|
||||
RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
|
||||
|
||||
static arangodb::Result fillIndex(RocksDBIndex& ridx, WriteBatchType& batch,
|
||||
rocksdb::Snapshot const* snap) {
|
||||
// fillIndex can be non-transactional, we just need to clean up
|
||||
RocksDBEngine* engine = rocksutils::globalRocksEngine();
|
||||
rocksdb::DB* rootDB = engine->db()->GetRootDB();
|
||||
rocksdb::DB* rootDB = rocksutils::globalRocksDB()->GetRootDB();
|
||||
TRI_ASSERT(rootDB != nullptr);
|
||||
|
||||
uint64_t numDocsWritten = 0;
|
||||
// write batch will be reset every x documents
|
||||
MethodsType batched(state, &batch);
|
||||
|
||||
RocksDBCollection* rcoll =
|
||||
static_cast<RocksDBCollection*>(ridx.collection().getPhysical());
|
||||
auto bounds = RocksDBKeyBounds::CollectionDocuments(rcoll->objectId());
|
||||
rocksdb::Slice upper(bounds.end());
|
||||
|
||||
|
@ -313,20 +163,32 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
|
|||
rocksdb::WriteOptions wo;
|
||||
wo.disableWAL = false; // TODO set to true eventually
|
||||
|
||||
const rocksdb::Snapshot* snap = rootDB->GetSnapshot();
|
||||
auto snapGuard = scopeGuard([&] { rootDB->ReleaseSnapshot(snap); });
|
||||
|
||||
rocksdb::ReadOptions ro;
|
||||
rocksdb::ReadOptions ro(/*cksum*/ false, /*cache*/ false);
|
||||
ro.snapshot = snap;
|
||||
ro.prefix_same_as_start = true;
|
||||
ro.iterate_upper_bound = &upper;
|
||||
ro.verify_checksums = false;
|
||||
ro.fill_cache = false;
|
||||
|
||||
rocksdb::ColumnFamilyHandle* docCF = RocksDBColumnFamily::documents();
|
||||
std::unique_ptr<rocksdb::Iterator> it = methds->NewIterator(ro, docCF);
|
||||
std::unique_ptr<rocksdb::Iterator> it(rootDB->NewIterator(ro, docCF));
|
||||
|
||||
auto commitLambda = [&] {
|
||||
auto mode = snap == nullptr ? AccessMode::Type::EXCLUSIVE : AccessMode::Type::WRITE;
|
||||
LogicalCollection& coll = ridx.collection();
|
||||
::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll, mode);
|
||||
if (mode == AccessMode::Type::EXCLUSIVE) {
|
||||
trx.addHint(transaction::Hints::Hint::LOCK_NEVER);
|
||||
}
|
||||
Result res = trx.begin();
|
||||
if (!res.ok()) {
|
||||
THROW_ARANGO_EXCEPTION(res);
|
||||
}
|
||||
|
||||
uint64_t numDocsWritten = 0;
|
||||
auto state = RocksDBTransactionState::toState(&trx);
|
||||
RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
|
||||
// write batch will be reset every x documents
|
||||
MethodsType batched(state, &batch);
|
||||
|
||||
auto commitLambda = [&](rocksdb::SequenceNumber seq) {
|
||||
if (batch.GetWriteBatch()->Count() > 0) {
|
||||
s = rootDB->Write(wo, batch.GetWriteBatch());
|
||||
if (!s.ok()) {
|
||||
|
@ -339,14 +201,18 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
|
|||
if (!ops.empty()) {
|
||||
TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1);
|
||||
auto it = ops.begin();
|
||||
ridx.estimator()->bufferUpdates(it->first, std::move(it->second.inserts),
|
||||
TRI_ASSERT(ridx.id() == it->first);
|
||||
ridx.estimator()->bufferUpdates(seq, std::move(it->second.inserts),
|
||||
std::move(it->second.removals));
|
||||
}
|
||||
};
|
||||
|
||||
it->Seek(bounds.start());
|
||||
while (it->Valid()) {
|
||||
for (it->Seek(bounds.start()); it->Valid(); it->Next()) {
|
||||
TRI_ASSERT(it->key().compare(upper) < 0);
|
||||
if (application_features::ApplicationServer::isStopping()) {
|
||||
res.reset(TRI_ERROR_SHUTTING_DOWN);
|
||||
break;
|
||||
}
|
||||
|
||||
res = ridx.insert(trx, &batched, RocksDBKey::documentId(it->key()),
|
||||
VPackSlice(it->value().data()), Index::OperationMode::normal);
|
||||
|
@ -356,57 +222,398 @@ static arangodb::Result fillIndexFast(RocksDBIndex& ridx, LogicalCollection& col
|
|||
numDocsWritten++;
|
||||
|
||||
if (numDocsWritten % 200 == 0) { // commit buffered writes
|
||||
commitLambda();
|
||||
commitLambda(rootDB->GetLatestSequenceNumber());
|
||||
if (res.fail()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
it->Next();
|
||||
if (!it->status().ok() && res.ok()) {
|
||||
res = rocksutils::convertStatus(it->status(), rocksutils::StatusHint::index);
|
||||
}
|
||||
|
||||
if (res.ok()) {
|
||||
commitLambda();
|
||||
commitLambda(rootDB->GetLatestSequenceNumber());
|
||||
}
|
||||
batch.Clear();
|
||||
|
||||
if (res.ok()) { // required so iresearch commits
|
||||
res = trx.commit();
|
||||
}
|
||||
|
||||
// we will need to remove index elements created before an error
|
||||
// occurred, this needs to happen since we are non transactional
|
||||
if (res.fail()) {
|
||||
RocksDBKeyBounds bounds = ridx.getBounds();
|
||||
arangodb::Result res2 =
|
||||
rocksutils::removeLargeRange(rocksutils::globalRocksDB(), bounds, true,
|
||||
/*useRangeDel*/ numDocsWritten > 25000);
|
||||
if (res2.fail()) {
|
||||
LOG_TOPIC(WARN, Logger::ENGINES) << "was not able to roll-back "
|
||||
<< "index creation: " << res2.errorMessage();
|
||||
}
|
||||
}
|
||||
// if an error occurred, drop() will be called
|
||||
LOG_TOPIC(DEBUG, Logger::ENGINES) << "SNAPSHOT CAPTURED " << numDocsWritten << " " << res.errorMessage();
|
||||
|
||||
return res;
|
||||
}
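
The scan in `fillIndex` relies on a RocksDB snapshot (when one is passed in) to see the documents column family exactly as it was when the background build started, while regular writes keep flowing in; everything newer than the snapshot is deliberately ignored here and left to the WAL catch-up phase. A stripped-down sketch of that pattern with plain RocksDB follows; the database path and key layout are invented for the example.

```cpp
#include <cassert>
#include <iostream>
#include <memory>

#include <rocksdb/db.h>
#include <rocksdb/options.h>

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/snapshot-scan-demo", &db);
  assert(s.ok());

  db->Put(rocksdb::WriteOptions(), "d/0001", "old");
  db->Put(rocksdb::WriteOptions(), "d/0002", "old");

  // Capture a consistent point in time, as the index builder does.
  const rocksdb::Snapshot* snap = db->GetSnapshot();

  // Writes after the snapshot are invisible to the scan below; the WAL
  // catch-up phase is what would pick them up later.
  db->Put(rocksdb::WriteOptions(), "d/0003", "new");

  rocksdb::Slice upper("d/9999");  // exclusive upper bound of the scan
  rocksdb::ReadOptions ro;
  ro.snapshot = snap;
  ro.iterate_upper_bound = &upper;
  ro.fill_cache = false;  // a bulk scan should not pollute the block cache

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek("d/"); it->Valid(); it->Next()) {
    std::cout << it->key().ToString() << "\n";  // prints d/0001 and d/0002 only
  }
  assert(it->status().ok());

  db->ReleaseSnapshot(snap);
  delete db;
  return 0;
}
```
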
|
||||
|
||||
/// non-transactional: fill index with existing documents
|
||||
/// from this collection
|
||||
arangodb::Result RocksDBBuilderIndex::fillIndexFast() {
|
||||
arangodb::Result RocksDBBuilderIndex::fillIndexForeground() {
|
||||
RocksDBIndex* internal = _wrapped.get();
|
||||
TRI_ASSERT(internal != nullptr);
|
||||
std::function<void()> empty;
|
||||
|
||||
const rocksdb::Snapshot* snap = nullptr;
|
||||
|
||||
Result res;
|
||||
if (this->unique()) {
|
||||
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
|
||||
// unique index. we need to keep track of all our changes because we need to
|
||||
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
|
||||
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
|
||||
return ::fillIndexFast<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
|
||||
*internal, _collection, batch);
|
||||
res = ::fillIndex<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(*internal, batch, snap);
|
||||
} else {
|
||||
// non-unique index. all index keys will be unique anyway because they
|
||||
// contain the document id we can therefore get away with a cheap WriteBatch
|
||||
rocksdb::WriteBatch batch(32 * 1024 * 1024);
|
||||
return ::fillIndexFast<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, _collection,
|
||||
batch);
|
||||
res = ::fillIndex<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch, snap);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
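
The unique/non-unique split in `fillIndexForeground` comes down to which write buffer can answer lookups: a plain `rocksdb::WriteBatch` is append-only, while a `WriteBatchWithIndex` can be queried for keys that are still buffered, which is what a unique index needs to detect duplicates before anything is committed. Here is a hedged sketch of that difference against plain RocksDB; the database path, key scheme, and the `insertUnique` helper are made up for illustration.

```cpp
#include <cassert>
#include <iostream>
#include <string>

#include <rocksdb/comparator.h>
#include <rocksdb/db.h>
#include <rocksdb/utilities/write_batch_with_index.h>

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wbwi-demo", &db);
  assert(s.ok());

  // An indexed write batch can answer "does this key already exist?" against
  // both its own buffered writes and the database; a plain WriteBatch cannot.
  rocksdb::WriteBatchWithIndex batch(rocksdb::BytewiseComparator(), 32 * 1024);

  std::string existing;
  auto insertUnique = [&](const rocksdb::Slice& key, const rocksdb::Slice& value) {
    rocksdb::Status st = batch.GetFromBatchAndDB(db, rocksdb::ReadOptions(), key, &existing);
    if (st.ok()) {
      return false;  // duplicate key: the unique constraint would be violated
    }
    batch.Put(key, value);
    return true;
  };

  bool first = insertUnique("email/alice@example.com", "doc-1");   // accepted
  bool second = insertUnique("email/alice@example.com", "doc-2");  // rejected
  std::cout << first << " " << second << "\n";

  s = db->Write(rocksdb::WriteOptions(), batch.GetWriteBatch());
  assert(s.ok());

  delete db;
  return 0;
}
```

For non-unique indexes every generated key already embeds the document id and is therefore unique by construction, so the cheaper plain `WriteBatch` is sufficient.
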
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename MethodsType>
|
||||
struct ReplayHandler final : public rocksdb::WriteBatch::Handler {
|
||||
ReplayHandler(uint64_t oid, RocksDBIndex& idx, transaction::Methods& trx, MethodsType* methods)
|
||||
: _objectId(oid), _index(idx), _trx(trx), _methods(methods) {}
|
||||
|
||||
bool Continue() override {
|
||||
if (application_features::ApplicationServer::isStopping()) {
|
||||
tmpRes.reset(TRI_ERROR_SHUTTING_DOWN);
|
||||
}
|
||||
return tmpRes.ok();
|
||||
}
|
||||
|
||||
uint64_t numInserted = 0;
|
||||
uint64_t numRemoved = 0;
|
||||
Result tmpRes;
|
||||
|
||||
void startNewBatch(rocksdb::SequenceNumber startSequence) {
|
||||
// starting new write batch
|
||||
_startSequence = startSequence;
|
||||
_currentSequence = startSequence;
|
||||
_startOfBatch = true;
|
||||
_lastObjectID = 0;
|
||||
}
|
||||
|
||||
uint64_t endBatch() {
|
||||
_lastObjectID = 0;
|
||||
return _currentSequence;
|
||||
}
|
||||
|
||||
// The default implementation of LogData does nothing.
|
||||
void LogData(const rocksdb::Slice& blob) override {
|
||||
|
||||
switch (RocksDBLogValue::type(blob)) {
|
||||
case RocksDBLogType::TrackedDocumentInsert:
|
||||
if (_lastObjectID == _objectId) {
|
||||
auto pair = RocksDBLogValue::trackedDocument(blob);
|
||||
tmpRes = _index.insert(_trx, _methods, pair.first, pair.second,
|
||||
Index::OperationMode::normal);
|
||||
numInserted++;
|
||||
}
|
||||
break;
|
||||
|
||||
case RocksDBLogType::TrackedDocumentRemove:
|
||||
if (_lastObjectID == _objectId) {
|
||||
auto pair = RocksDBLogValue::trackedDocument(blob);
|
||||
tmpRes = _index.remove(_trx, _methods, pair.first, pair.second,
|
||||
Index::OperationMode::normal);
|
||||
numRemoved++;
|
||||
}
|
||||
break;
|
||||
|
||||
default: // ignore
|
||||
_lastObjectID = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rocksdb::Status PutCF(uint32_t column_family_id, const rocksdb::Slice& key,
|
||||
rocksdb::Slice const& value) override {
|
||||
incTick();
|
||||
if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
|
||||
_lastObjectID = 0;
|
||||
} else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
|
||||
_lastObjectID = RocksDBKey::objectId(key);
|
||||
}
|
||||
|
||||
return rocksdb::Status();
|
||||
}
|
||||
|
||||
rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
|
||||
incTick();
|
||||
if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
|
||||
_lastObjectID = 0;
|
||||
} else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
|
||||
_lastObjectID = RocksDBKey::objectId(key);
|
||||
}
|
||||
return rocksdb::Status();
|
||||
}
|
||||
|
||||
rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
|
||||
incTick();
|
||||
if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
|
||||
_lastObjectID = 0;
|
||||
} else if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
|
||||
_lastObjectID = RocksDBKey::objectId(key);
|
||||
}
|
||||
return rocksdb::Status();
|
||||
}
|
||||
|
||||
rocksdb::Status DeleteRangeCF(uint32_t column_family_id,
|
||||
const rocksdb::Slice& begin_key,
|
||||
const rocksdb::Slice& end_key) override {
|
||||
incTick(); // drop and truncate may use this
|
||||
if (column_family_id == _index.columnFamily()->GetID() &&
|
||||
RocksDBKey::objectId(begin_key) == _objectId &&
|
||||
RocksDBKey::objectId(end_key) == _objectId) {
|
||||
_index.afterTruncate(_currentSequence);
|
||||
}
|
||||
return rocksdb::Status(); // make WAL iterator happy
|
||||
}
|
||||
|
||||
private:
|
||||
// tick function that is called before each new WAL entry
|
||||
void incTick() {
|
||||
if (_startOfBatch) {
|
||||
// we are at the start of a batch. do NOT increase sequence number
|
||||
_startOfBatch = false;
|
||||
} else {
|
||||
// we are inside a batch already. now increase sequence number
|
||||
++_currentSequence;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const uint64_t _objectId; /// collection objectID
|
||||
RocksDBIndex& _index; /// the index to use
|
||||
transaction::Methods& _trx;
|
||||
MethodsType* _methods; /// methods to fill
|
||||
|
||||
rocksdb::SequenceNumber _startSequence;
|
||||
rocksdb::SequenceNumber _currentSequence;
|
||||
rocksdb::SequenceNumber _lastWrittenSequence;
|
||||
bool _startOfBatch = false;
|
||||
uint64_t _lastObjectID = 0;
|
||||
};
|
||||
|
||||
template <typename WriteBatchType, typename MethodsType>
|
||||
Result catchup(RocksDBIndex& ridx, WriteBatchType& wb, AccessMode::Type mode,
|
||||
rocksdb::SequenceNumber startingFrom,
|
||||
rocksdb::SequenceNumber& lastScannedTick, uint64_t& numScanned) {
|
||||
LogicalCollection& coll = ridx.collection();
|
||||
::BuilderTrx trx(transaction::StandaloneContext::Create(coll.vocbase()), coll, mode);
|
||||
if (mode == AccessMode::Type::EXCLUSIVE) {
|
||||
trx.addHint(transaction::Hints::Hint::LOCK_NEVER);
|
||||
}
|
||||
Result res = trx.begin();
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
|
||||
auto state = RocksDBTransactionState::toState(&trx);
|
||||
RocksDBTransactionCollection* trxColl = trx.resolveTrxCollection();
|
||||
RocksDBCollection* rcoll = static_cast<RocksDBCollection*>(coll.getPhysical());
|
||||
|
||||
rocksdb::DB* rootDB = rocksutils::globalRocksDB()->GetRootDB();
|
||||
TRI_ASSERT(rootDB != nullptr);
|
||||
|
||||
// write batch will be reset every x documents
|
||||
MethodsType batched(state, &wb);
|
||||
|
||||
ReplayHandler<MethodsType> replay(rcoll->objectId(), ridx, trx, &batched);
|
||||
|
||||
std::unique_ptr<rocksdb::TransactionLogIterator> iterator; // reader();
|
||||
// no need to verify the WAL contents
|
||||
rocksdb::TransactionLogIterator::ReadOptions ro(false);
|
||||
rocksdb::Status s = rootDB->GetUpdatesSince(startingFrom, &iterator, ro);
|
||||
if (!s.ok()) {
|
||||
return res.reset(convertStatus(s, rocksutils::StatusHint::wal));
|
||||
}
|
||||
|
||||
auto commitLambda = [&](rocksdb::SequenceNumber seq) {
|
||||
if (wb.GetWriteBatch()->Count() > 0) {
|
||||
rocksdb::WriteOptions wo;
|
||||
s = rootDB->Write(wo, wb.GetWriteBatch());
|
||||
if (!s.ok()) {
|
||||
res = rocksutils::convertStatus(s, rocksutils::StatusHint::index);
|
||||
}
|
||||
}
|
||||
wb.Clear();
|
||||
|
||||
auto ops = trxColl->stealTrackedOperations();
|
||||
if (!ops.empty()) {
|
||||
TRI_ASSERT(ridx.hasSelectivityEstimate() && ops.size() == 1);
|
||||
auto it = ops.begin();
|
||||
TRI_ASSERT(ridx.id() == it->first);
|
||||
ridx.estimator()->bufferUpdates(seq, std::move(it->second.inserts),
|
||||
std::move(it->second.removals));
|
||||
}
|
||||
};
|
||||
|
||||
LOG_TOPIC(DEBUG, Logger::ENGINES) << "Scanning from " << startingFrom;
|
||||
|
||||
for (; iterator->Valid(); iterator->Next()) {
|
||||
rocksdb::BatchResult batch = iterator->GetBatch();
|
||||
lastScannedTick = batch.sequence; // start of the batch
|
||||
if (batch.sequence < startingFrom) {
|
||||
continue; // skip
|
||||
}
|
||||
|
||||
replay.startNewBatch(batch.sequence);
|
||||
s = batch.writeBatchPtr->Iterate(&replay);
|
||||
if (!s.ok()) {
|
||||
res = rocksutils::convertStatus(s);
|
||||
break;
|
||||
}
|
||||
if (replay.tmpRes.fail()) {
|
||||
res = replay.tmpRes;
|
||||
break;
|
||||
}
|
||||
|
||||
commitLambda(batch.sequence);
|
||||
if (res.fail()) {
|
||||
break;
|
||||
}
|
||||
lastScannedTick = replay.endBatch();
|
||||
}
|
||||
|
||||
if (!iterator->status().ok() && res.ok()) {
|
||||
LOG_TOPIC(ERR, Logger::ENGINES) << "iterator error " << s.ToString();
|
||||
res = rocksutils::convertStatus(iterator->status());
|
||||
}
|
||||
|
||||
if (res.ok()) {
|
||||
numScanned = replay.numInserted + replay.numRemoved;
|
||||
res = trx.commit(); // important for iresearch
|
||||
}
|
||||
|
||||
LOG_TOPIC(DEBUG, Logger::ENGINES) << "WAL REPLAYED insertions: " << replay.numInserted
|
||||
<< "; deletions: " << replay.numRemoved << "; lastScannedTick "
|
||||
<< lastScannedTick;
|
||||
|
||||
return res;
|
||||
}
|
||||
} // namespace
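
The `catchup` routine above is essentially a WAL tail: it asks RocksDB for every write batch recorded after the snapshot's sequence number and replays the relevant entries into the new index. The following standalone sketch shows the same `GetUpdatesSince` / `TransactionLogIterator` loop against plain RocksDB; the path, keys, and the counting handler are illustrative only.

```cpp
#include <cassert>
#include <iostream>
#include <memory>

#include <rocksdb/db.h>
#include <rocksdb/transaction_log.h>
#include <rocksdb/write_batch.h>

// Handler that replays the updates found in each WAL batch.
struct CatchupHandler : public rocksdb::WriteBatch::Handler {
  uint64_t numPuts = 0;
  void Put(const rocksdb::Slice& key, const rocksdb::Slice& /*value*/) override {
    ++numPuts;  // a real index builder would insert the key into the new index
    std::cout << "catch up: " << key.ToString() << "\n";
  }
};

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wal-tail-demo", &db);
  assert(s.ok());

  // Pretend the snapshot scan finished here; remember where to resume.
  rocksdb::SequenceNumber scanFrom = db->GetLatestSequenceNumber();

  // Writes that arrive while (or after) the snapshot scan ran.
  db->Put(rocksdb::WriteOptions(), "d/0004", "late");
  db->Put(rocksdb::WriteOptions(), "d/0005", "late");

  std::unique_ptr<rocksdb::TransactionLogIterator> it;
  rocksdb::TransactionLogIterator::ReadOptions ro(false);  // skip checksum verification
  s = db->GetUpdatesSince(scanFrom + 1, &it, ro);
  assert(s.ok());

  CatchupHandler handler;
  for (; it->Valid(); it->Next()) {
    rocksdb::BatchResult batch = it->GetBatch();
    s = batch.writeBatchPtr->Iterate(&handler);
    assert(s.ok());
  }
  std::cout << "replayed " << handler.numPuts << " writes\n";

  delete db;
  return 0;
}
```

In the actual patch this loop is run a bounded number of times against a live collection and once more under an exclusive lock, so that the final pass replays only a small remainder.
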
|
||||
|
||||
bool RocksDBBuilderIndex::Locker::lock() {
|
||||
if (!_locked) {
|
||||
if (_collection->lockWrite() != TRI_ERROR_NO_ERROR) {
|
||||
return false;
|
||||
}
|
||||
_locked = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void RocksDBBuilderIndex::Locker::unlock() {
|
||||
if (_locked) {
|
||||
_collection->unlockWrite();
|
||||
_locked = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Background index filler task
|
||||
arangodb::Result RocksDBBuilderIndex::fillIndexBackground(Locker& locker) {
|
||||
TRI_ASSERT(locker.isLocked());
|
||||
|
||||
arangodb::Result res;
|
||||
RocksDBIndex* internal = _wrapped.get();
|
||||
TRI_ASSERT(internal != nullptr);
|
||||
|
||||
RocksDBEngine* engine = globalRocksEngine();
|
||||
rocksdb::DB* rootDB = engine->db()->GetRootDB();
|
||||
rocksdb::Snapshot const* snap = rootDB->GetSnapshot();
|
||||
engine->disableWalFilePruning(true);
|
||||
|
||||
auto scope = scopeGuard([&] {
|
||||
engine->disableWalFilePruning(false);
|
||||
if (snap) {
|
||||
rootDB->ReleaseSnapshot(snap);
|
||||
}
|
||||
});
|
||||
|
||||
locker.unlock();
|
||||
if (internal->unique()) {
|
||||
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
|
||||
// unique index. we need to keep track of all our changes because we need to
|
||||
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
|
||||
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
|
||||
res = ::fillIndex<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
|
||||
*internal, batch, snap);
|
||||
} else {
|
||||
// non-unique index. all index keys will be unique anyway because they
|
||||
// contain the document id we can therefore get away with a cheap WriteBatch
|
||||
rocksdb::WriteBatch batch(32 * 1024 * 1024);
|
||||
res = ::fillIndex<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
|
||||
snap);
|
||||
}
|
||||
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
|
||||
rocksdb::SequenceNumber scanFrom = snap->GetSequenceNumber();
|
||||
rootDB->ReleaseSnapshot(snap);
|
||||
snap = nullptr;
|
||||
|
||||
rocksdb::SequenceNumber lastScanned = 0;
|
||||
uint64_t numScanned = 0;
|
||||
|
||||
int maxCatchups = 4;
|
||||
while(true) {
|
||||
lastScanned = 0;
|
||||
if (internal->unique()) {
|
||||
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
|
||||
// unique index. we need to keep track of all our changes because we need to
|
||||
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
|
||||
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
|
||||
res = ::catchup<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
|
||||
*internal, batch, AccessMode::Type::WRITE, scanFrom, lastScanned, numScanned);
|
||||
} else {
|
||||
// non-unique index. all index keys will be unique anyway because they
|
||||
// contain the document id we can therefore get away with a cheap WriteBatch
|
||||
rocksdb::WriteBatch batch(32 * 1024 * 1024);
|
||||
res = ::catchup<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
|
||||
AccessMode::Type::WRITE,
|
||||
scanFrom, lastScanned,
|
||||
numScanned);
|
||||
}
|
||||
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
|
||||
if (numScanned < 5000 || maxCatchups-- == 0) {
|
||||
TRI_ASSERT(lastScanned > scanFrom);
|
||||
std::this_thread::yield();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!locker.lock()) {
|
||||
return res.reset(TRI_ERROR_LOCK_TIMEOUT);
|
||||
}
|
||||
|
||||
scanFrom = lastScanned;
|
||||
if (internal->unique()) {
|
||||
const rocksdb::Comparator* cmp = internal->columnFamily()->GetComparator();
|
||||
// unique index. we need to keep track of all our changes because we need to
|
||||
// avoid duplicate index keys. must therefore use a WriteBatchWithIndex
|
||||
rocksdb::WriteBatchWithIndex batch(cmp, 32 * 1024 * 1024);
|
||||
res = ::catchup<rocksdb::WriteBatchWithIndex, RocksDBBatchedWithIndexMethods>(
|
||||
*internal, batch, AccessMode::Type::EXCLUSIVE, scanFrom, lastScanned, numScanned);
|
||||
} else {
|
||||
// non-unique index. all index keys will be unique anyway because they
|
||||
// contain the document id we can therefore get away with a cheap WriteBatch
|
||||
rocksdb::WriteBatch batch(32 * 1024 * 1024);
|
||||
res = ::catchup<rocksdb::WriteBatch, RocksDBBatchedMethods>(*internal, batch,
|
||||
AccessMode::Type::EXCLUSIVE,
|
||||
scanFrom, lastScanned,
|
||||
numScanned);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -30,6 +30,8 @@
|
|||
|
||||
namespace arangodb {
|
||||
|
||||
class RocksDBCollection;
|
||||
|
||||
/// Dummy index class that contains the logic to build indexes
|
||||
/// without an exclusive lock. It wraps the actual index implementation
|
||||
/// and adds some required synchronization logic on top
|
||||
|
@ -71,13 +73,11 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
|
|||
bool hasSelectivityEstimate() const override { return false; }
|
||||
|
||||
/// insert index elements into the specified write batch.
|
||||
Result insert(transaction::Methods& trx, RocksDBMethods*,
|
||||
LocalDocumentId const& documentId,
|
||||
Result insert(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId,
|
||||
arangodb::velocypack::Slice const&, OperationMode mode) override;
|
||||
|
||||
/// remove index elements and put it in the specified write batch.
|
||||
Result remove(transaction::Methods& trx, RocksDBMethods*,
|
||||
LocalDocumentId const& documentId,
|
||||
Result remove(transaction::Methods& trx, RocksDBMethods*, LocalDocumentId const& documentId,
|
||||
arangodb::velocypack::Slice const&, OperationMode mode) override;
|
||||
|
||||
RocksDBBuilderIndex(std::shared_ptr<arangodb::RocksDBIndex> const&);
|
||||
|
@ -91,12 +91,23 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
|
|||
}
|
||||
void recalculateEstimates() override { _wrapped->recalculateEstimates(); }
|
||||
|
||||
/// @brief fill index, will exclusively lock the collection
|
||||
Result fillIndexFast();
|
||||
/// @brief assumes an exclusive lock on the collection
|
||||
Result fillIndexForeground();
|
||||
|
||||
struct Locker {
|
||||
Locker(RocksDBCollection* c) : _collection(c), _locked(false) {}
|
||||
~Locker() { unlock(); }
|
||||
bool lock();
|
||||
void unlock();
|
||||
bool isLocked() const { return _locked; }
|
||||
private:
|
||||
RocksDBCollection* const _collection;
|
||||
bool _locked;
|
||||
};
|
||||
|
||||
/// @brief fill the index, assume already locked exclusively
|
||||
/// @param unlock called when collection lock can be released
|
||||
Result fillIndexBackground(std::function<void()> const& unlock);
|
||||
/// @param locker locks and unlocks the collection
|
||||
Result fillIndexBackground(Locker& locker);
|
||||
|
||||
virtual IndexIterator* iteratorForCondition(transaction::Methods* trx,
|
||||
ManagedDocumentResult* result,
|
||||
|
@ -109,17 +120,6 @@ class RocksDBBuilderIndex final : public arangodb::RocksDBIndex {
|
|||
|
||||
private:
|
||||
std::shared_ptr<arangodb::RocksDBIndex> _wrapped;
|
||||
|
||||
std::atomic<bool> _hasError;
|
||||
std::mutex _errorMutex;
|
||||
Result _errorResult;
|
||||
|
||||
std::mutex _removedDocsMutex;
|
||||
std::unordered_set<LocalDocumentId::BaseType> _removedDocs;
|
||||
|
||||
std::mutex _lockedDocsMutex;
|
||||
std::condition_variable _lockedDocsCond;
|
||||
std::unordered_set<LocalDocumentId::BaseType> _lockedDocs;
|
||||
};
|
||||
} // namespace arangodb
|
||||
|
||||
|
|
|
@ -80,7 +80,8 @@ RocksDBCollection::RocksDBCollection(LogicalCollection& collection,
|
|||
_cacheEnabled(
|
||||
!collection.system() &&
|
||||
basics::VelocyPackHelper::readBooleanValue(info, "cacheEnabled", false) &&
|
||||
CacheManagerFeature::MANAGER != nullptr) {
|
||||
CacheManagerFeature::MANAGER != nullptr),
|
||||
_numIndexCreations(0) {
|
||||
TRI_ASSERT(!ServerState::instance()->isCoordinator());
|
||||
VPackSlice s = info.get("isVolatile");
|
||||
if (s.isBoolean() && s.getBoolean()) {
|
||||
|
@ -108,7 +109,8 @@ RocksDBCollection::RocksDBCollection(LogicalCollection& collection,
|
|||
_cache(nullptr),
|
||||
_cachePresent(false),
|
||||
_cacheEnabled(static_cast<RocksDBCollection const*>(physical)->_cacheEnabled &&
|
||||
CacheManagerFeature::MANAGER != nullptr) {
|
||||
CacheManagerFeature::MANAGER != nullptr),
|
||||
_numIndexCreations(0) {
|
||||
TRI_ASSERT(!ServerState::instance()->isCoordinator());
|
||||
rocksutils::globalRocksEngine()->addCollectionMapping(
|
||||
_objectId, _logicalCollection.vocbase().id(), _logicalCollection.id());
|
||||
|
@ -264,8 +266,23 @@ void RocksDBCollection::prepareIndexes(arangodb::velocypack::Slice indexesSlice)
|
|||
}
|
||||
|
||||
WRITE_LOCKER(guard, _indexesLock);
|
||||
TRI_ASSERT(_indexes.empty());
|
||||
for (std::shared_ptr<Index>& idx : indexes) {
|
||||
addIndex(std::move(idx));
|
||||
auto const id = idx->id();
|
||||
for (auto const& it : _indexes) {
|
||||
if (it->id() == id) { // index is there twice
|
||||
idx.reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (idx) {
|
||||
TRI_UpdateTickServer(static_cast<TRI_voc_tick_t>(id));
|
||||
_indexes.emplace_back(idx);
|
||||
if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) {
|
||||
TRI_ASSERT(idx->id() == 0);
|
||||
_primaryIndex = static_cast<RocksDBPrimaryIndex*>(idx.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (_indexes[0]->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX ||
|
||||
|
@ -286,7 +303,7 @@ void RocksDBCollection::prepareIndexes(arangodb::velocypack::Slice indexesSlice)
|
|||
TRI_ASSERT(!_indexes.empty());
|
||||
}
|
||||
|
||||
std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slice const& info,
|
||||
std::shared_ptr<Index> RocksDBCollection::createIndex(VPackSlice const& info,
|
||||
bool restore, bool& created) {
|
||||
TRI_ASSERT(info.isObject());
|
||||
Result res;
|
||||
|
@ -298,18 +315,17 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
|
|||
if (res.fail()) {
|
||||
THROW_ARANGO_EXCEPTION(res);
|
||||
}
|
||||
auto releaseGuard =
|
||||
scopeGuard([&] { vocbase.releaseCollection(&_logicalCollection); });
|
||||
res = lockWrite();
|
||||
if (res.fail()) {
|
||||
THROW_ARANGO_EXCEPTION(res);
|
||||
}
|
||||
// WRITE_LOCKER(indexGuard, _indexesLock);
|
||||
auto unlockGuard = scopeGuard([&] {
|
||||
// indexGuard.unlock(); // unlock in reverse order
|
||||
this->unlockWrite();
|
||||
_numIndexCreations.fetch_add(1, std::memory_order_release);
|
||||
auto colGuard = scopeGuard([&] {
|
||||
vocbase.releaseCollection(&_logicalCollection);
|
||||
_numIndexCreations.fetch_sub(1, std::memory_order_release);
|
||||
});
|
||||
|
||||
RocksDBBuilderIndex::Locker locker(this);
|
||||
if (!locker.lock()) {
|
||||
THROW_ARANGO_EXCEPTION(TRI_ERROR_LOCK_TIMEOUT);
|
||||
}
|
||||
|
||||
std::shared_ptr<Index> idx;
|
||||
{ // Step 1. Check for matching index
|
||||
WRITE_LOCKER(guard, _indexesLock);
|
||||
|
@ -377,23 +393,18 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
|
|||
}
|
||||
}
|
||||
|
||||
const bool inBackground = false; // TODO simon: will be enabled in a separate PR
|
||||
#if 0
|
||||
bool inBackground = basics::VelocyPackHelper::getBooleanValue(
|
||||
info, StaticStrings::IndexInBackground, false);
|
||||
#endif
|
||||
|
||||
const bool inBackground =
|
||||
basics::VelocyPackHelper::getBooleanValue(info, StaticStrings::IndexInBackground, false);
|
||||
// Step 4. fill index
|
||||
if (res.ok()) {
|
||||
if (inBackground) { // allow concurrent inserts into index
|
||||
_indexes.emplace_back(buildIdx);
|
||||
res = buildIdx->fillIndexBackground([&] {
|
||||
unlockGuard.fire(); // will be called at appropriate time
|
||||
});
|
||||
res = buildIdx->fillIndexBackground(locker);
|
||||
} else {
|
||||
res = buildIdx->fillIndexFast(); // will lock again internally
|
||||
res = buildIdx->fillIndexForeground(); // will lock again internally
|
||||
}
|
||||
}
|
||||
locker.lock(); // always lock to avoid inconsistencies
|
||||
|
||||
// Step 5. cleanup
|
||||
if (res.ok()) {
|
||||
|
@ -411,20 +422,6 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
|
|||
}
|
||||
}
|
||||
|
||||
// // we should sync the selectivity estimates TODO fix
|
||||
// res = engine->settingsManager()->sync(false);
|
||||
// if (res.fail()) { // not critical
|
||||
// LOG_TOPIC(WARN, Logger::ENGINES) << "could not sync settings: "
|
||||
// << res.errorMessage();
|
||||
// res.reset();
|
||||
// }
|
||||
//
|
||||
// rocksdb::Status s = engine->db()->GetRootDB()->FlushWAL(true);
|
||||
// if (!s.ok()) { // not critical
|
||||
// LOG_TOPIC(WARN, Logger::ENGINES) << "could not flush wal: "
|
||||
// << s.ToString();
|
||||
// }
|
||||
|
||||
#if USE_PLAN_CACHE
|
||||
arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection->vocbase());
|
||||
#endif
|
||||
|
@ -442,7 +439,6 @@ std::shared_ptr<Index> RocksDBCollection::createIndex(arangodb::velocypack::Slic
|
|||
}
|
||||
}
|
||||
|
||||
unlockGuard.fire(); // may have already been fired
|
||||
if (res.fail()) {
|
||||
{ // We could not create the index. Better abort
|
||||
WRITE_LOCKER(guard, _indexesLock);
|
||||
|
@ -550,8 +546,7 @@ Result RocksDBCollection::truncate(transaction::Methods& trx, OperationOptions&
|
|||
|
||||
if (state->isOnlyExclusiveTransaction() &&
|
||||
state->hasHint(transaction::Hints::Hint::ALLOW_RANGE_DELETE) &&
|
||||
static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE)->canUseRangeDeleteInWal() &&
|
||||
_numberDocuments >= 32 * 1024) {
|
||||
this->canUseRangeDeleteInWal() && _numberDocuments >= 32 * 1024) {
|
||||
// non-transactional truncate optimization. We perform a bunch of
|
||||
// range deletes and circumvent the normal rocksdb::Transaction.
|
||||
// no savepoint needed here
|
||||
|
@ -562,8 +557,6 @@ Result RocksDBCollection::truncate(transaction::Methods& trx, OperationOptions&
|
|||
}
|
||||
|
||||
RocksDBEngine* engine = rocksutils::globalRocksEngine();
|
||||
// add the assertion again here, so we are sure we can use RangeDeletes
|
||||
TRI_ASSERT(engine->canUseRangeDeleteInWal());
|
||||
rocksdb::DB* db = engine->db()->GetRootDB();
|
||||
|
||||
TRI_IF_FAILURE("RocksDBCollection::truncate::forceSync") {
|
||||
|
@ -831,9 +824,10 @@ Result RocksDBCollection::insert(arangodb::transaction::Methods* trx,
|
|||
if (documentId.isSet()) {
|
||||
if (options.indexOperationMode == Index::OperationMode::internal) {
|
||||
// need to return the key of the conflict document
|
||||
return Result(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED, keySlice.copyString());
|
||||
return res.reset(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED,
|
||||
keySlice.copyString());
|
||||
}
|
||||
return Result(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
|
||||
return res.reset(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -910,7 +904,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
|
|||
int result = checkRevision(trx, expectedRev, prevRev);
|
||||
|
||||
if (result != TRI_ERROR_NO_ERROR) {
|
||||
return Result(result);
|
||||
return res.reset(result);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -920,7 +914,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
|
|||
TRI_ASSERT(!mdr.empty());
|
||||
|
||||
trackWaitForSync(trx, options);
|
||||
return Result();
|
||||
return res;
|
||||
}
|
||||
|
||||
// merge old and new values
|
||||
|
@ -937,7 +931,7 @@ Result RocksDBCollection::update(arangodb::transaction::Methods* trx,
|
|||
if (_isDBServer) {
|
||||
// Need to check that no sharding keys have changed:
|
||||
if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) {
|
||||
return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
|
||||
return res.reset(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -994,13 +988,13 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
|
|||
// get the previous revision
|
||||
VPackSlice key = newSlice.get(StaticStrings::KeyString);
|
||||
|
||||
Result res;
|
||||
if (key.isNone()) {
|
||||
return Result(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
|
||||
return res.reset(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
|
||||
}
|
||||
|
||||
// get the previous revision
|
||||
Result res = this->read(trx, key, previous, /*lock*/ false);
|
||||
|
||||
res = this->read(trx, key, previous, /*lock*/ false);
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
|
@ -1018,10 +1012,10 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
|
|||
if (newSlice.isObject()) {
|
||||
expectedRev = TRI_ExtractRevisionId(newSlice);
|
||||
}
|
||||
int res = checkRevision(trx, expectedRev, prevRev);
|
||||
|
||||
if (res != TRI_ERROR_NO_ERROR) {
|
||||
return Result(res);
|
||||
res = checkRevision(trx, expectedRev, prevRev);
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1038,7 +1032,7 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
|
|||
if (_isDBServer) {
|
||||
// Need to check that no sharding keys have changed:
|
||||
if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) {
|
||||
return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
|
||||
return res.reset(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1050,9 +1044,9 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
|
|||
// add possible log statement under guard
|
||||
state->prepareOperation(_logicalCollection.id(), revisionId, TRI_VOC_DOCUMENT_OPERATION_REPLACE);
|
||||
|
||||
Result opResult = updateDocument(trx, oldDocumentId, oldDoc, documentId, newDoc, options);
|
||||
res = updateDocument(trx, oldDocumentId, oldDoc, documentId, newDoc, options);
|
||||
|
||||
if (opResult.ok()) {
|
||||
if (res.ok()) {
|
||||
trackWaitForSync(trx, options);
|
||||
|
||||
if (options.silent) {
|
||||
|
@ -1079,7 +1073,7 @@ Result RocksDBCollection::replace(transaction::Methods* trx,
|
|||
guard.finish(hasPerformedIntermediateCommit);
|
||||
}
|
||||
|
||||
return opResult;
|
||||
return res;
|
||||
}
|
||||
|
||||
Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice slice,
|
||||
|
@ -1103,7 +1097,7 @@ Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice sl
|
|||
TRI_ASSERT(!key.isNone());
|
||||
|
||||
// get the previous revision
|
||||
auto res = this->read(&trx, key, previous, /*lock*/ false);
|
||||
Result res = this->read(&trx, key, previous, /*lock*/ false);
|
||||
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
|
@ -1120,10 +1114,10 @@ Result RocksDBCollection::remove(transaction::Methods& trx, velocypack::Slice sl
|
|||
// Check old revision:
|
||||
if (!options.ignoreRevs && slice.isObject()) {
|
||||
TRI_voc_rid_t expectedRevisionId = TRI_ExtractRevisionId(slice);
|
||||
auto res = checkRevision(&trx, expectedRevisionId, oldRevisionId);
|
||||
res = checkRevision(&trx, expectedRevisionId, oldRevisionId);
|
||||
|
||||
if (res != TRI_ERROR_NO_ERROR) {
|
||||
return Result(res);
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
}
@@ -1193,29 +1187,6 @@ void RocksDBCollection::figuresSpecific(std::shared_ptr<arangodb::velocypack::Bu
  }
}

void RocksDBCollection::addIndex(std::shared_ptr<arangodb::Index> idx) {
  // LOCKED from the outside
  // primary index must be added at position 0
  TRI_ASSERT(ServerState::instance()->isRunningInCluster() ||
             idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX ||
             _indexes.empty());

  auto const id = idx->id();
  for (auto const& it : _indexes) {
    if (it->id() == id) {
      // already have this particular index. do not add it again
      return;
    }
  }

  TRI_UpdateTickServer(static_cast<TRI_voc_tick_t>(id));
  _indexes.emplace_back(idx);
  if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) {
    TRI_ASSERT(idx->id() == 0);
    _primaryIndex = static_cast<RocksDBPrimaryIndex*>(idx.get());
  }
}

Result RocksDBCollection::insertDocument(arangodb::transaction::Methods* trx,
                                         LocalDocumentId const& documentId,
                                         VPackSlice const& doc,

@@ -1312,28 +1283,23 @@ Result RocksDBCollection::updateDocument(transaction::Methods* trx,
  Result res;

  RocksDBMethods* mthd = RocksDBTransactionState::toMethods(trx);

  // We NEED to do the PUT first, otherwise WAL tailing breaks
  RocksDBKeyLeaser newKey(trx);
  newKey->constructDocument(_objectId, newDocumentId);
  // simon: we do not need to blacklist the new documentId

  // disable indexing in this transaction if we are allowed to
  IndexingDisabler disabler(mthd, trx->isSingleOperationTransaction());

  rocksdb::Status s =
      mthd->Put(RocksDBColumnFamily::documents(), newKey.ref(),
                rocksdb::Slice(reinterpret_cast<char const*>(newDoc.begin()),
                               static_cast<size_t>(newDoc.byteSize())));
  RocksDBKeyLeaser key(trx);
  key->constructDocument(_objectId, oldDocumentId);
  blackListKey(key->string().data(), static_cast<uint32_t>(key->string().size()));

  rocksdb::Status s = mthd->SingleDelete(RocksDBColumnFamily::documents(), key.ref());
  if (!s.ok()) {
    return res.reset(rocksutils::convertStatus(s, rocksutils::document));
  }

  RocksDBKeyLeaser oldKey(trx);
  oldKey->constructDocument(_objectId, oldDocumentId);
  blackListKey(oldKey->string().data(), static_cast<uint32_t>(oldKey->string().size()));

  s = mthd->SingleDelete(RocksDBColumnFamily::documents(), oldKey.ref());
  key->constructDocument(_objectId, newDocumentId);
  // simon: we do not need to blacklist the new documentId
  s = mthd->Put(RocksDBColumnFamily::documents(), key.ref(),
                rocksdb::Slice(reinterpret_cast<char const*>(newDoc.begin()),
                               static_cast<size_t>(newDoc.byteSize())));
  if (!s.ok()) {
    return res.reset(rocksutils::convertStatus(s, rocksutils::document));
  }

@@ -1341,8 +1307,8 @@ Result RocksDBCollection::updateDocument(transaction::Methods* trx,
  READ_LOCKER(guard, _indexesLock);
  for (std::shared_ptr<Index> const& idx : _indexes) {
    RocksDBIndex* rIdx = static_cast<RocksDBIndex*>(idx.get());
    res = rIdx->update(*trx, mthd, oldDocumentId, oldDoc, newDocumentId,
                       newDoc, options.indexOperationMode);
    res = rIdx->update(*trx, mthd, oldDocumentId, oldDoc, newDocumentId, newDoc,
                       options.indexOperationMode);

    if (res.fail()) {
      break;

@@ -1411,8 +1377,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(LocalDocumentId const& d
    }
  } else {
    LOG_TOPIC(DEBUG, Logger::ENGINES)
        << "NOT FOUND rev: " << documentId.id()
        << " trx: " << trx->state()->id() << " seq: " << mthd->sequenceNumber()
        << "NOT FOUND rev: " << documentId.id() << " trx: " << trx->state()->id()
        << " objectID " << _objectId << " name: " << _logicalCollection.name();
    mdr.clear();
    res.reset(rocksutils::convertStatus(s, rocksutils::document));

@@ -1476,8 +1441,7 @@ arangodb::Result RocksDBCollection::lookupDocumentVPack(
    cb(documentId, VPackSlice(ps.data()));
  } else {
    LOG_TOPIC(DEBUG, Logger::ENGINES)
        << "NOT FOUND rev: " << documentId.id()
        << " trx: " << trx->state()->id() << " seq: " << mthd->sequenceNumber()
        << "NOT FOUND rev: " << documentId.id() << " trx: " << trx->state()->id()
        << " objectID " << _objectId << " name: " << _logicalCollection.name();
    res.reset(rocksutils::convertStatus(s, rocksutils::document));
  }

@@ -1663,10 +1627,9 @@ uint64_t RocksDBCollection::recalculateCounts() {
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro, cf));
  std::size_t count = 0;

  it->Seek(bounds.start());
  while (it->Valid() && it->key().compare(upper) < 0) {
  for (it->Seek(bounds.start()); it->Valid(); it->Next()) {
    TRI_ASSERT(it->key().compare(upper) < 0);
    ++count;
    it->Next();
  }

  int64_t adjustment = snapNumberOfDocuments - count;

@@ -1780,3 +1743,12 @@ void RocksDBCollection::trackWaitForSync(arangodb::transaction::Methods* trx,
    trx->state()->waitForSync(true);
  }
}

/// @brief can use non transactional range delete in write ahead log
bool RocksDBCollection::canUseRangeDeleteInWal() const {
  if (ServerState::instance()->isSingleServer()) {
    // disableWalFilePruning is used by createIndex
    return _numIndexCreations.load(std::memory_order_acquire) == 0;
  }
  return false;
}

@@ -181,12 +181,8 @@ class RocksDBCollection final : public PhysicalCollection {
  RocksDBCollectionMeta& meta() { return _meta; }

 private:
  /// @brief track the usage of waitForSync option in an operation
  void trackWaitForSync(arangodb::transaction::Methods* trx, OperationOptions& options);

  /// @brief return engine-specific figures
  void figuresSpecific(std::shared_ptr<velocypack::Builder>&) override;
  void addIndex(std::shared_ptr<arangodb::Index> idx);

  // @brief return the primary index
  // WARNING: Make sure that this instance

@@ -232,8 +228,15 @@ class RocksDBCollection final : public PhysicalCollection {
    return (_cacheEnabled && _cachePresent);
  }

  /// @brief track key in file
  void blackListKey(char const* data, std::size_t len) const;

  /// @brief track the usage of waitForSync option in an operation
  void trackWaitForSync(arangodb::transaction::Methods* trx, OperationOptions& options);

  /// @brief can use non transactional range delete in write ahead log
  bool canUseRangeDeleteInWal() const;

 private:
  uint64_t const _objectId;  // rocksdb-specific object id for collection
  RocksDBCollectionMeta _meta;  /// collection metadata

@@ -247,10 +250,13 @@ class RocksDBCollection final : public PhysicalCollection {
  mutable basics::ReadWriteLock _exclusiveLock;
  /// @brief document cache (optional)
  mutable std::shared_ptr<cache::Cache> _cache;

  // we use this boolean for testing whether _cache is set.
  // it's quicker than accessing the shared_ptr each time
  mutable bool _cachePresent;
  bool _cacheEnabled;
  /// @brief number of index creations in progress
  std::atomic<int> _numIndexCreations;
};

inline RocksDBCollection* toRocksDBCollection(PhysicalCollection* physical) {

@@ -354,7 +354,7 @@ Result RocksDBCollectionMeta::deserializeMeta(rocksdb::DB* db, LogicalCollection
  if (!s.ok() && !s.IsNotFound()) {
    return rocksutils::convertStatus(s);
  } else if (s.IsNotFound()) {  // expected with nosync recovery tests
    LOG_TOPIC(WARN, Logger::ROCKSDB)
    LOG_TOPIC(WARN, Logger::ENGINES)
        << "recalculating index estimate for index "
        << "type '" << idx->typeName() << "' with id '" << idx->id() << "'";
    idx->recalculateEstimates();

@@ -27,7 +27,6 @@
#define ARANGO_ROCKSDB_ROCKSDB_COMMON_H 1

#include "Basics/Common.h"
#include "Basics/Endian.h"
#include "Basics/Result.h"
#include "Basics/RocksDBUtils.h"
#include "RocksDBEngine/RocksDBComparator.h"

@@ -902,16 +902,17 @@ int RocksDBEngine::getCollectionsAndIndexes(TRI_vocbase_t& vocbase,
}

int RocksDBEngine::getViews(TRI_vocbase_t& vocbase, arangodb::velocypack::Builder& result) {
  rocksdb::ReadOptions readOptions;
  std::unique_ptr<rocksdb::Iterator> iter(
      _db->NewIterator(readOptions, RocksDBColumnFamily::definitions()));

  result.openArray();

  auto bounds = RocksDBKeyBounds::DatabaseViews(vocbase.id());
  rocksdb::Slice upper = bounds.end();
  rocksdb::ColumnFamilyHandle* cf = RocksDBColumnFamily::definitions();

  for (iter->Seek(bounds.start());
       iter->Valid() && iter->key().compare(bounds.end()) < 0; iter->Next()) {
  rocksdb::ReadOptions ro;
  ro.iterate_upper_bound = &upper;

  std::unique_ptr<rocksdb::Iterator> iter(_db->NewIterator(ro, cf));
  result.openArray();
  for (iter->Seek(bounds.start()); iter->Valid(); iter->Next()) {
    TRI_ASSERT(iter->key().compare(bounds.end()) < 0);
    auto slice = VPackSlice(iter->value().data());

    LOG_TOPIC(TRACE, Logger::VIEWS) << "got view slice: " << slice.toJson();

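The `getViews` hunk above replaces a manual bound check in the loop condition with RocksDB's `ReadOptions::iterate_upper_bound`; a similar rewrite appears further down for the fulltext index and for `recalculateCounts`. The snippet below is a minimal, stand-alone sketch of that pattern, not ArangoDB code: the function name `countRange` and the bare `rocksdb::DB*`/`ColumnFamilyHandle*` parameters are illustrative stand-ins only.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>

#include <cstdint>
#include <memory>

// Count all keys in [lower, upper) of one column family. The exclusive upper
// bound is handed to RocksDB via iterate_upper_bound, so the iterator becomes
// invalid once it reaches the bound and the loop body needs no key comparison.
// 'upper' must outlive the iterator, since RocksDB only stores a pointer to it.
std::uint64_t countRange(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf,
                         rocksdb::Slice lower, rocksdb::Slice upper) {
  rocksdb::ReadOptions ro;
  ro.iterate_upper_bound = &upper;  // enforced by RocksDB instead of by the caller
  ro.fill_cache = false;            // a one-off scan should not pollute the block cache

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro, cf));
  std::uint64_t count = 0;
  for (it->Seek(lower); it->Valid(); it->Next()) {
    ++count;
  }
  return count;
}
```

With the bound pushed into the read options, the `TRI_ASSERT(iter->key().compare(bounds.end()) < 0)` in the diff acts purely as a sanity check rather than as the loop's terminating condition.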
@@ -1571,6 +1572,14 @@ void RocksDBEngine::waitForEstimatorSync(std::chrono::milliseconds maxWaitTime)
  }
}

void RocksDBEngine::disableWalFilePruning(bool disable) {
  _backgroundThread->disableWalFilePruning(disable);
}

bool RocksDBEngine::disableWalFilePruning() const {
  return _backgroundThread->disableWalFilePruning();
}

Result RocksDBEngine::registerRecoveryHelper(std::shared_ptr<RocksDBRecoveryHelper> helper) {
  try {
    _recoveryHelpers.emplace_back(helper);

@@ -2177,10 +2186,6 @@ void RocksDBEngine::releaseTick(TRI_voc_tick_t tick) {
  }
}

bool RocksDBEngine::canUseRangeDeleteInWal() const {
  return ServerState::instance()->isSingleServer();
}

}  // namespace arangodb

// -----------------------------------------------------------------------------

@@ -296,7 +296,9 @@ class RocksDBEngine final : public StorageEngine {
  static std::string const EngineName;
  static std::string const FeatureName;

  bool canUseRangeDeleteInWal() const;
  /// @brief allow / disable removal of WAL files
  void disableWalFilePruning(bool disable);
  bool disableWalFilePruning() const;

  rocksdb::Options const& rocksDBOptions() const { return _options; }

@@ -420,12 +420,13 @@ Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
  rocksdb::ReadOptions ro = mthds->iteratorReadOptions();
  ro.iterate_upper_bound = &end;
  std::unique_ptr<rocksdb::Iterator> iter = mthds->NewIterator(ro, _cf);
  iter->Seek(bounds.start());

  // set is used to perform an intersection with the result set
  std::set<LocalDocumentId> intersect;
  // apply left to right logic, merging all current results with ALL previous
  while (iter->Valid() && cmp->Compare(iter->key(), end) < 0) {
  for (iter->Seek(bounds.start());
       iter->Valid() && cmp->Compare(iter->key(), end) < 0;
       iter->Next()) {
    TRI_ASSERT(_objectId == RocksDBKey::objectId(iter->key()));

    rocksdb::Status s = iter->status();

@@ -442,7 +443,6 @@ Result RocksDBFulltextIndex::applyQueryToken(transaction::Methods* trx,
    } else if (token.operation == FulltextQueryToken::EXCLUDE) {
      resultSet.erase(documentId);
    }
    iter->Next();
  }
  if (token.operation == FulltextQueryToken::AND) {
    if (resultSet.empty() || intersect.empty()) {

@@ -42,7 +42,6 @@ namespace cache {
class Cache;
}
class LogicalCollection;
class RocksDBSettingsManager;
class RocksDBMethods;

class RocksDBIndex : public Index {

@@ -447,8 +447,7 @@ struct GeoIndexFactory : public DefaultIndexFactory {

    if (isCreation && !ServerState::instance()->isCoordinator() &&
        !definition.hasKey("objectId")) {
      normalized.add("objectId",
                     arangodb::velocypack::Value(std::to_string(TRI_NewTickServer())));
      normalized.add("objectId", VPackValue(std::to_string(TRI_NewTickServer())));
    }

    return EnhanceJsonIndexGeo(definition, normalized, isCreation);

@@ -100,12 +100,33 @@ RocksDBLogValue RocksDBLogValue::DocumentRemoveV2(TRI_voc_rid_t rid) {
RocksDBLogValue RocksDBLogValue::SinglePut(TRI_voc_tick_t vocbaseId, TRI_voc_cid_t cid) {
  return RocksDBLogValue(RocksDBLogType::SinglePut, vocbaseId, cid);
}

RocksDBLogValue RocksDBLogValue::SingleRemoveV2(TRI_voc_tick_t vocbaseId,
                                                TRI_voc_cid_t cid, TRI_voc_rid_t rid) {
  return RocksDBLogValue(RocksDBLogType::SingleRemoveV2, vocbaseId, cid, rid);
}

/*static*/ RocksDBLogValue RocksDBLogValue::Empty() {
RocksDBLogValue RocksDBLogValue::TrackedDocumentInsert(LocalDocumentId docId,
                                                       VPackSlice const& slice) {
  RocksDBLogValue val{};
  val._buffer.reserve(sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType) + slice.byteSize());
  val._buffer.push_back(static_cast<char>(RocksDBLogType::TrackedDocumentInsert));
  uintToPersistentLittleEndian(val._buffer, docId.id());
  val._buffer.append(slice.startAs<char>(), slice.byteSize());
  return val;
}

RocksDBLogValue RocksDBLogValue::TrackedDocumentRemove(LocalDocumentId docId,
                                                       VPackSlice const& slice) {
  RocksDBLogValue val{};
  val._buffer.reserve(sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType) + slice.byteSize());
  val._buffer.push_back(static_cast<char>(RocksDBLogType::TrackedDocumentRemove));
  uintToPersistentLittleEndian(val._buffer, docId.id());
  val._buffer.append(reinterpret_cast<char const*>(slice.begin()), slice.byteSize());
  return val;
}

RocksDBLogValue RocksDBLogValue::Empty() {
  return RocksDBLogValue();
}

@@ -322,6 +343,18 @@ StringRef RocksDBLogValue::oldCollectionName(rocksdb::Slice const& slice) {
  return StringRef(slice.data() + off, slice.size() - off);
}

/// @brief get slice from tracked document
std::pair<LocalDocumentId, VPackSlice> RocksDBLogValue::trackedDocument(rocksdb::Slice const& slice) {
  TRI_ASSERT(slice.size() >= 2);
  RocksDBLogType type = static_cast<RocksDBLogType>(slice.data()[0]);
  TRI_ASSERT(type == RocksDBLogType::TrackedDocumentInsert ||
             type == RocksDBLogType::TrackedDocumentRemove);

  LocalDocumentId id(uintFromPersistentLittleEndian<LocalDocumentId::BaseType>(slice.data() + sizeof(RocksDBLogType)));
  VPackSlice data(slice.data() + sizeof(RocksDBLogType) + sizeof(LocalDocumentId::BaseType));
  return std::make_pair(id, data);
}

bool RocksDBLogValue::containsDatabaseId(RocksDBLogType type) {
  return type == RocksDBLogType::DatabaseCreate || type == RocksDBLogType::DatabaseDrop ||
         type == RocksDBLogType::CollectionCreate ||

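Both `TrackedDocument*` factory methods above write the same marker layout into the log-value buffer: one log-type byte, the `LocalDocumentId` encoded little-endian, then the raw VelocyPack bytes of the document, and `trackedDocument()` reads it back in that order. The following stand-alone sketch re-creates that round trip with standard C++ only; the names `encodeTracked`/`decodeTracked`, the `LogType`/`DocumentId` aliases, and the use of `std::string` as the payload are illustrative stand-ins (assuming `LocalDocumentId::BaseType` is a 64-bit unsigned integer, as the `sizeof` arithmetic in the diff suggests), not ArangoDB APIs.

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

// Illustrative stand-ins for the types used in the diff (not ArangoDB types).
enum class LogType : char { TrackedDocumentInsert = 'I', TrackedDocumentRemove = 'J' };
using DocumentId = std::uint64_t;  // assumed equivalent of LocalDocumentId::BaseType

// Assumed marker layout: [1 byte log type][8 byte id, little-endian][payload bytes].
std::string encodeTracked(LogType type, DocumentId id, std::string const& payload) {
  std::string out;
  out.reserve(1 + sizeof(DocumentId) + payload.size());
  out.push_back(static_cast<char>(type));
  for (unsigned i = 0; i < sizeof(DocumentId); ++i) {  // little-endian, byte by byte
    out.push_back(static_cast<char>((id >> (8 * i)) & 0xFF));
  }
  out.append(payload);
  return out;
}

std::pair<DocumentId, std::string> decodeTracked(std::string const& marker) {
  assert(marker.size() >= 1 + sizeof(DocumentId));
  DocumentId id = 0;
  for (unsigned i = 0; i < sizeof(DocumentId); ++i) {
    id |= static_cast<DocumentId>(static_cast<unsigned char>(marker[1 + i])) << (8 * i);
  }
  return {id, marker.substr(1 + sizeof(DocumentId))};
}
```

Round-tripping `decodeTracked(encodeTracked(LogType::TrackedDocumentInsert, 42, doc))` yields the same id and payload, which mirrors what WAL tailing recovers from such a marker via `RocksDBLogValue::trackedDocument`.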
@@ -27,6 +27,7 @@
#include "Basics/Common.h"
#include "Basics/StringRef.h"
#include "RocksDBEngine/RocksDBTypes.h"
#include "VocBase/LocalDocumentId.h"
#include "VocBase/voc-types.h"

#include <rocksdb/slice.h>

@@ -72,6 +73,9 @@ class RocksDBLogValue {
  static RocksDBLogValue SingleRemoveV2(TRI_voc_tick_t vocbaseId,
                                        TRI_voc_cid_t cid, TRI_voc_rid_t rid);

  static RocksDBLogValue TrackedDocumentInsert(LocalDocumentId, velocypack::Slice const&);
  static RocksDBLogValue TrackedDocumentRemove(LocalDocumentId, velocypack::Slice const&);

  // empty log value
  static RocksDBLogValue Empty();

@@ -96,9 +100,12 @@ class RocksDBLogValue {
  /// @brief get UUID from view drop marker
  static arangodb::StringRef viewUUID(rocksdb::Slice const&);

  // deprecated method for old collection drop marker
  /// @deprecated method for old collection drop marker
  static arangodb::StringRef oldCollectionName(rocksdb::Slice const&);

  /// @brief get slice from tracked document
  static std::pair<LocalDocumentId, velocypack::Slice> trackedDocument(rocksdb::Slice const&);

  static bool containsDatabaseId(RocksDBLogType type);
  static bool containsCollectionId(RocksDBLogType type);
  static bool containsViewId(RocksDBLogType type);

@@ -98,10 +98,6 @@ void RocksDBSavePoint::rollback() {

// =================== RocksDBMethods ===================

rocksdb::SequenceNumber RocksDBMethods::sequenceNumber() {
  return _state->sequenceNumber();
}

rocksdb::ReadOptions RocksDBMethods::iteratorReadOptions() {
  if (_state->hasHint(transaction::Hints::Hint::INTERMEDIATE_COMMITS)) {
    rocksdb::ReadOptions ro = _state->_rocksReadOptions;

@@ -179,6 +175,10 @@ rocksdb::Status RocksDBReadOnlyMethods::SingleDelete(rocksdb::ColumnFamilyHandle
  THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_READ_ONLY);
}

void RocksDBReadOnlyMethods::PutLogData(rocksdb::Slice const& blob) {
  THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_READ_ONLY);
}

std::unique_ptr<rocksdb::Iterator> RocksDBReadOnlyMethods::NewIterator(
    rocksdb::ReadOptions const& opts, rocksdb::ColumnFamilyHandle* cf) {
  TRI_ASSERT(cf != nullptr);

@@ -241,6 +241,10 @@ rocksdb::Status RocksDBTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf,
  return _state->_rocksTransaction->SingleDelete(cf, key.string());
}

void RocksDBTrxMethods::PutLogData(rocksdb::Slice const& blob) {
  _state->_rocksTransaction->PutLogData(blob);
}

std::unique_ptr<rocksdb::Iterator> RocksDBTrxMethods::NewIterator(
    rocksdb::ReadOptions const& opts, rocksdb::ColumnFamilyHandle* cf) {
  TRI_ASSERT(cf != nullptr);

@@ -326,6 +330,10 @@ rocksdb::Status RocksDBBatchedMethods::SingleDelete(rocksdb::ColumnFamilyHandle*
  return _wb->SingleDelete(cf, key.string());
}

void RocksDBBatchedMethods::PutLogData(rocksdb::Slice const& blob) {
  _wb->PutLogData(blob);
}

std::unique_ptr<rocksdb::Iterator> RocksDBBatchedMethods::NewIterator(
    rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*) {
  THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL,

@@ -375,55 +383,13 @@ rocksdb::Status RocksDBBatchedWithIndexMethods::SingleDelete(rocksdb::ColumnFami
  return _wb->SingleDelete(cf, key.string());
}

void RocksDBBatchedWithIndexMethods::PutLogData(rocksdb::Slice const& blob) {
  _wb->PutLogData(blob);
}

std::unique_ptr<rocksdb::Iterator> RocksDBBatchedWithIndexMethods::NewIterator(
    rocksdb::ReadOptions const& ro, rocksdb::ColumnFamilyHandle* cf) {
  TRI_ASSERT(cf != nullptr);
  return std::unique_ptr<rocksdb::Iterator>(
      _wb->NewIteratorWithBase(_db->NewIterator(ro, cf)));
}

// =================== RocksDBSideTrxMethods ====================

/// transaction wrapper, uses the provided rocksdb transaction
RocksDBSideTrxMethods::RocksDBSideTrxMethods(RocksDBTransactionState* state,
                                             rocksdb::Transaction* trx)
    : RocksDBMethods(state), _trx(trx) {
  _ro.prefix_same_as_start = true;
  _ro.fill_cache = false;
}

rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf,
                                           rocksdb::Slice const& key, std::string* val) {
  TRI_ASSERT(cf != nullptr);
  return _trx->Get(_ro, cf, key, val);
}

rocksdb::Status RocksDBSideTrxMethods::Get(rocksdb::ColumnFamilyHandle* cf,
                                           rocksdb::Slice const& key,
                                           rocksdb::PinnableSlice* val) {
  TRI_ASSERT(cf != nullptr);
  return _trx->Get(_ro, cf, key, val);
}

rocksdb::Status RocksDBSideTrxMethods::Put(rocksdb::ColumnFamilyHandle* cf,
                                           RocksDBKey const& key,
                                           rocksdb::Slice const& val) {
  TRI_ASSERT(cf != nullptr);
  return _trx->Put(cf, key.string(), val);
}
rocksdb::Status RocksDBSideTrxMethods::Delete(rocksdb::ColumnFamilyHandle* cf,
                                              RocksDBKey const& key) {
  TRI_ASSERT(cf != nullptr);
  return _trx->Delete(cf, key.string());
}

rocksdb::Status RocksDBSideTrxMethods::SingleDelete(rocksdb::ColumnFamilyHandle* cf,
                                                    RocksDBKey const& key) {
  TRI_ASSERT(cf != nullptr);
  return _trx->SingleDelete(cf, key.string());
}

bool RocksDBSideTrxMethods::DisableIndexing() {
  _trx->DisableIndexing();
  return true;
}

@@ -72,9 +72,6 @@ class RocksDBMethods {
  explicit RocksDBMethods(RocksDBTransactionState* state) : _state(state) {}
  virtual ~RocksDBMethods() {}

  /// @brief current sequence number
  rocksdb::SequenceNumber sequenceNumber();

  /// @brief read options for use with iterators
  rocksdb::ReadOptions iteratorReadOptions();

@@ -97,6 +94,8 @@ class RocksDBMethods {
  /// when keys are inserted exactly once (and never overwritten)
  virtual rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) = 0;

  virtual void PutLogData(rocksdb::Slice const&) = 0;

  virtual std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                         rocksdb::ColumnFamilyHandle*) = 0;

@@ -125,6 +124,7 @@ class RocksDBReadOnlyMethods final : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;

@@ -157,6 +157,7 @@ class RocksDBTrxMethods : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;

@@ -193,6 +194,8 @@ class RocksDBBatchedMethods final : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;

@@ -219,6 +222,7 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods {
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;
  void PutLogData(rocksdb::Slice const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override;

@@ -234,38 +238,6 @@ class RocksDBBatchedWithIndexMethods final : public RocksDBMethods {
  rocksdb::WriteBatchWithIndex* _wb;
};

/// transaction wrapper, uses the provided rocksdb transaction
class RocksDBSideTrxMethods final : public RocksDBMethods {
 public:
  explicit RocksDBSideTrxMethods(RocksDBTransactionState* state, rocksdb::Transaction* trx);

  rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key,
                      std::string* val) override;
  rocksdb::Status Get(rocksdb::ColumnFamilyHandle*, rocksdb::Slice const& key,
                      rocksdb::PinnableSlice* val) override;
  rocksdb::Status Put(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key,
                      rocksdb::Slice const& val) override;
  rocksdb::Status Delete(rocksdb::ColumnFamilyHandle*, RocksDBKey const& key) override;
  rocksdb::Status SingleDelete(rocksdb::ColumnFamilyHandle*, RocksDBKey const&) override;

  std::unique_ptr<rocksdb::Iterator> NewIterator(rocksdb::ReadOptions const&,
                                                 rocksdb::ColumnFamilyHandle*) override {
    return nullptr;
  }

  void SetSavePoint() override {}
  rocksdb::Status RollbackToSavePoint() override {
    return rocksdb::Status::OK();
  }
  void PopSavePoint() override {}

  bool DisableIndexing() override;

 private:
  rocksdb::Transaction* _trx;
  rocksdb::ReadOptions _ro;
};

// INDEXING MAY ONLY BE DISABLED IN TOPLEVEL AQL TRANSACTIONS
// THIS IS BECAUSE THESE TRANSACTIONS WILL EITHER READ FROM
// OR (XOR) WRITE TO A COLLECTION. IF THIS PRECONDITION IS

@@ -335,6 +335,8 @@ class WALParser final : public rocksdb::WriteBatch::Handler {
      case RocksDBLogType::DocumentOperationsPrologue:
      case RocksDBLogType::DocumentRemove:
      case RocksDBLogType::DocumentRemoveAsPartOfUpdate:
      case RocksDBLogType::TrackedDocumentInsert:
      case RocksDBLogType::TrackedDocumentRemove:
        break;  // ignore deprecated && unused markers

      default:

@@ -435,7 +437,9 @@ class WALParser final : public rocksdb::WriteBatch::Handler {

    if (cfId != _primaryCF) {
      return;  // ignore all document operations
    } else if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
    }

    if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
      resetTransientState();
      return;
    }

@@ -242,7 +242,7 @@ void RocksDBTransactionCollection::commitCounts(uint64_t trxId, uint64_t commitS
  // Update the index estimates.
  for (auto& pair : _trackedIndexOperations) {
    auto idx = _collection->lookupIndex(pair.first);
    if (idx == nullptr) {
    if (ADB_UNLIKELY(idx == nullptr)) {
      TRI_ASSERT(false);  // Index reported estimates, but does not exist
      continue;
    }

@@ -26,7 +26,7 @@

#include "Basics/Common.h"
#include "Basics/SmallVector.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBKey.h"
#include "StorageEngine/TransactionState.h"
#include "Transaction/Hints.h"
#include "Transaction/Methods.h"

@@ -183,6 +183,12 @@ char const* arangodb::rocksDBLogTypeName(arangodb::RocksDBLogType type) {
      return "SingleRemove";
    case arangodb::RocksDBLogType::SingleRemoveV2:
      return "SingleRemoveV2";
    case arangodb::RocksDBLogType::FlushSync:
      return "FlushSync";
    case RocksDBLogType::TrackedDocumentInsert:
      return "TrackedDocumentInsert";
    case RocksDBLogType::TrackedDocumentRemove:
      return "TrackedDocumentRemove";
    case arangodb::RocksDBLogType::Invalid:
      return "Invalid";
    default:

@@ -86,6 +86,8 @@ enum class RocksDBLogType : char {
  SingleRemoveV2 = 'F',
  CollectionTruncate = 'G',
  FlushSync = 'H',  // @see FlushFeature
  TrackedDocumentInsert = 'I',
  TrackedDocumentRemove = 'J',
};

/// @brief settings keys

@@ -390,7 +390,9 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
      case RocksDBLogType::DocumentOperationsPrologue:
      case RocksDBLogType::DocumentRemove:
      case RocksDBLogType::DocumentRemoveAsPartOfUpdate:
        break;  // ignore deprecated markers
      case RocksDBLogType::TrackedDocumentInsert:
      case RocksDBLogType::TrackedDocumentRemove:
        break;  // ignore deprecated / unused markers

      default:
        LOG_TOPIC(WARN, Logger::REPLICATION)

@@ -553,7 +555,9 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC

    if (cfId != _primaryCF) {
      return;  // ignore all document operations
    } else if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
    }

    if (_state != TRANSACTION && _state != SINGLE_REMOVE) {
      resetTransientState();
      return;
    }

@@ -617,12 +621,6 @@ class MyWALDumper final : public rocksdb::WriteBatch::Handler, public WalAccessC
    return rocksdb::Status();  // make WAL iterator happy
  }

  rocksdb::Status MergeCF(uint32_t, const rocksdb::Slice&, const rocksdb::Slice&) override {
    incTick();
    // not used for anything in ArangoDB currently
    return rocksdb::Status();  // make WAL iterator happy
  }

 public:
  /// figures out from which sequence number we need to start scanning
  /// if we just use tickStart rocksdb will skip over batches we might

@@ -765,11 +763,6 @@ WalAccessResult RocksDBWalAccess::tail(Filter const& filter, size_t chunkSize,
      << "WAL tailing call. Scan since: " << since << ", tick start: " << filter.tickStart
      << ", tick end: " << filter.tickEnd << ", chunk size: " << chunkSize;
  while (iterator->Valid() && lastScannedTick <= filter.tickEnd) {
    s = iterator->status();
    if (!s.ok()) {
      LOG_TOPIC(ERR, Logger::REPLICATION) << "error during WAL scan: " << s.ToString();
      break;  // s is considered in the end
    }

    rocksdb::BatchResult batch = iterator->GetBatch();
    // record the first tick we are actually considering

@@ -8,7 +8,7 @@
///
/// DISCLAIMER
///
/// Copyright 2018 ArangoDB GmbH, Cologne, Germany
/// Copyright 2018-2019 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.

@@ -131,6 +131,64 @@ function backgroundIndexSuite() {
      }
    },

    testInsertParallelNonUnique2: function () {
      let c = require("internal").db._collection(cn);
      // first lets add some initial documents
      let x = 10;
      while(x-- > 0) {
        let docs = [];
        for(let i = 0; i < 1000; i++) {
          docs.push({value:i});
        }
        c.save(docs);
      }

      // lets insert the rest via tasks
      let n = 9;
      for (let i = 0; i < n; ++i) {
        if (i === 6) { // create the index in a task
          let command = `const c = require("internal").db._collection("${cn}");
                         c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true});`;
          tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
        }
        let command = `const c = require("internal").db._collection("${cn}");
                       let x = 10;
                       while(x-- > 0) {
                         let docs = [];
                         for(let i = 0; i < 1000; i++) {
                           docs.push({value:i})
                         }
                         c.save(docs);
                       }`;
        tasks.register({ name: "UnitTestsIndexInsert" + i, command: command });
      }

      // wait for tasks to complete
      waitForTasks();

      // sanity checks
      assertEqual(c.count(), 100000);
      for (let i = 0; i < 1000; i++) { // 100 entries of each value [0,999]
        let cursor = db._query("FOR doc IN @@coll FILTER doc.value == @val RETURN 1",
                               {'@coll': cn, 'val': i}, {count:true});
        assertEqual(cursor.count(), 100);
      }

      internal.waitForEstimatorSync(); // make sure estimates are consistent
      let indexes = c.getIndexes(true);
      for (let i of indexes) {
        switch (i.type) {
          case 'primary':
            break;
          case 'hash':
            assertEqual(i.selectivityEstimate, 0.01);
            break;
          default:
            fail();
        }
      }
    },

    testInsertParallelUnique: function () {
      let c = require("internal").db._collection(cn);
      // first lets add some initial documents

@@ -227,17 +285,6 @@ function backgroundIndexSuite() {

      // sanity checks
      assertEqual(c.count(), 50001);

      let indexes = c.getIndexes();
      for (let i of indexes) {
        switch (i.type) {
          case 'primary':
            break;
          case 'hash':
          default:
            fail();
        }
      }
    },

    testRemoveParallel: function () {

@@ -256,6 +303,12 @@ function backgroundIndexSuite() {

      // lets remove half via tasks
      for (let i = 0; i < 10; ++i) {
        if (i === 3) { // create the index in a task
          let command = `const c = require("internal").db._collection("${cn}");
                         c.ensureIndex({type: 'hash', fields: ['value'], unique: false, inBackground: true});`;
          tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
        }

        let command = `const c = require("internal").db._collection("${cn}");
                       if (!c) {
                         throw new Error('could not find collection');

@@ -275,9 +328,6 @@ function backgroundIndexSuite() {
        tasks.register({ name: "UnitTestsIndexRemove" + i, command: command });
      }

      // create the index on the main thread
      c.ensureIndex({type: 'hash', fields: ['value'], inBackground: true });

      // wait for insertion tasks to complete
      waitForTasks();

@@ -298,12 +348,14 @@ function backgroundIndexSuite() {
        }
      }

      internal.waitForEstimatorSync(); // make sure estimates are consistent
      let indexes = c.getIndexes(true);
      for (let i of indexes) {
        switch (i.type) {
          case 'primary':
            break;
          case 'hash':
            assertTrue(Math.abs(i.selectivityEstimate - 1.0) < 0.005, i);
            break;
          default:
            fail();

@@ -327,6 +379,11 @@ function backgroundIndexSuite() {

      // lets update all via tasks
      for (let i = 0; i < 10; ++i) {
        if (i === 5) { // create the index in a task
          let command = `const c = require("internal").db._collection("${cn}");
                         c.ensureIndex({type: 'skiplist', fields: ['value'], unique: false, inBackground: true});`;
          tasks.register({ name: "UnitTestsIndexCreateIDX" + i, command: command });
        }
        let command = `const c = require("internal").db._collection("${cn}");
                       if (!c) {
                         throw new Error('could not find collection');

@@ -350,9 +407,6 @@ function backgroundIndexSuite() {
      // wait for insertion tasks to complete
      waitForTasks();

      // create the index on the main thread
      c.ensureIndex({type: 'skiplist', fields: ['value'], inBackground: true });

      // sanity checks
      assertEqual(c.count(), 100000);
      // check for new entries via index

@@ -364,12 +418,14 @@ function backgroundIndexSuite() {
                             {'@coll': cn, 'val': 100000}, {count:true});
      assertEqual(oldCursor.count(), 0);

      internal.waitForEstimatorSync(); // make sure estimates are consistent
      let indexes = c.getIndexes(true);
      for (let i of indexes) {
        switch (i.type) {
          case 'primary':
            break;
          case 'skiplist':
            assertTrue(Math.abs(i.selectivityEstimate - 1.0) < 0.005, i);
            break;
          default:
            fail();