////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2019 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
/// @author Daniel Larkin-York
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBRecoveryManager.h"

#include "ApplicationFeatures/ApplicationServer.h"
#include "Basics/Exceptions.h"
#include "Basics/NumberUtils.h"
#include "Basics/StringUtils.h"
#include "Basics/VelocyPackHelper.h"
#include "Basics/WriteLocker.h"
#include "Basics/exitcodes.h"
#include "Logger/Logger.h"
#include "RestServer/DatabaseFeature.h"
#include "RocksDBEngine/RocksDBCollection.h"
#include "RocksDBEngine/RocksDBColumnFamily.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBCuckooIndexEstimator.h"
#include "RocksDBEngine/RocksDBEdgeIndex.h"
#include "RocksDBEngine/RocksDBKey.h"
#include "RocksDBEngine/RocksDBKeyBounds.h"
#include "RocksDBEngine/RocksDBLogValue.h"
#include "RocksDBEngine/RocksDBRecoveryHelper.h"
#include "RocksDBEngine/RocksDBSettingsManager.h"
#include "RocksDBEngine/RocksDBVPackIndex.h"
#include "RocksDBEngine/RocksDBValue.h"
#include "Transaction/Helpers.h"
#include "VocBase/KeyGenerator.h"
#include "VocBase/ticks.h"

#include <rocksdb/utilities/transaction_db.h>
#include <rocksdb/utilities/write_batch_with_index.h>
#include <rocksdb/write_batch.h>

#include <velocypack/Iterator.h>
#include <velocypack/Parser.h>
#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb::application_features;

namespace arangodb {

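/// returns the recovery manager instance registered with the ApplicationServer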
RocksDBRecoveryManager* RocksDBRecoveryManager::instance() {
  return ApplicationServer::getFeature<RocksDBRecoveryManager>(featureName());
}

/// Constructor needs to be called synchronously,
/// will load counts from the db and scan the WAL
RocksDBRecoveryManager::RocksDBRecoveryManager(application_features::ApplicationServer& server)
    : ApplicationFeature(server, featureName()), _db(nullptr), _inRecovery(true) {
  setOptional(true);
  startsAfter("BasicsPhase");

  startsAfter("Database");
  startsAfter("SystemDatabase");
  startsAfter("RocksDBEngine");
  startsAfter("ServerId");
  startsAfter("StorageEngine");

  onlyEnabledWith("RocksDBEngine");
}
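
/// runs the recovery procedure and notifies the other features once it is done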
void RocksDBRecoveryManager::start() {
  TRI_ASSERT(isEnabled());

  _db = ApplicationServer::getFeature<RocksDBEngine>("RocksDBEngine")->db();
  runRecovery();
  // synchronizes with the acquire load in inRecovery()
  _inRecovery.store(false, std::memory_order_release);

  // notify everyone that recovery is now done
  auto databaseFeature =
      ApplicationServer::getFeature<DatabaseFeature>("Database");
  databaseFeature->recoveryDone();
}

/// parse recent RocksDB WAL entries and notify the
/// DatabaseFeature about the successful recovery
void RocksDBRecoveryManager::runRecovery() {
  auto res = parseRocksWAL();
  if (res.fail()) {
    LOG_TOPIC("be0ce", FATAL, Logger::ENGINES)
        << "failed during rocksdb WAL recovery: " << res.errorMessage();
    FATAL_ERROR_EXIT_CODE(TRI_EXIT_RECOVERY);
  }

  // now restore collection counts into collections
}
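
/// WAL replay handler: while scanning recent WAL entries it tracks the
/// maximum tick and HLC value seen, feeds recovered document keys back
/// into the collections' key generators, and adjusts collection counts
/// and index estimators. A minimal sketch of the driving loop (the real
/// one lives in parseRocksWAL() below):
///
///   WBReader handler;
///   handler.startNewBatch(batch.sequence);    // once per WAL batch
///   batch.writeBatchPtr->Iterate(&handler);   // replays Put/Delete/LogData
///   Result res = handler.shutdownWBReader();  // publishes max tick/HLC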
class WBReader final : public rocksdb::WriteBatch::Handler {
 private:
  // used to track used IDs for key-generators
  std::map<uint64_t, uint64_t> _generators;

  // max tick found
  uint64_t _maxTick;
  uint64_t _maxHLC;
  /// @brief last document removed
  TRI_voc_rid_t _lastRemovedDocRid = 0;

  rocksdb::SequenceNumber _startSequence;   /// start of batch sequence nr
  rocksdb::SequenceNumber _currentSequence; /// current sequence nr
  bool _startOfBatch = false;

 public:
  WBReader()
      : _maxTick(TRI_NewTickServer()),
        _maxHLC(0),
        _lastRemovedDocRid(0),
        _startSequence(0),
        _currentSequence(0) {}

  void startNewBatch(rocksdb::SequenceNumber startSequence) {
    // starting new write batch
    _startSequence = startSequence;
    _currentSequence = startSequence;
    _startOfBatch = true;
    TRI_ASSERT(_maxTick > 0);
  }

  Result shutdownWBReader() {
    Result rv = basics::catchVoidToResult([&]() -> void {
      // update ticks after parsing wal
      LOG_TOPIC("a4ec8", TRACE, Logger::ENGINES)
          << "max tick found in WAL: " << _maxTick << ", last HLC value: " << _maxHLC;

      TRI_UpdateTickServer(_maxTick);
      TRI_HybridLogicalClock(_maxHLC);
    });
    return rv;
  }

 private:
  void storeMaxHLC(uint64_t hlc) {
    if (hlc > _maxHLC) {
      _maxHLC = hlc;
    }
  }

  void storeMaxTick(uint64_t tick) {
    if (tick > _maxTick) {
      _maxTick = tick;
    }
  }

  // find the collection for the given objectId
  RocksDBCollection* findCollection(uint64_t objectId) {
    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    // now adjust the counter in collections which are already loaded
    RocksDBEngine::CollectionPair dbColPair = engine->mapObjectToCollection(objectId);
    if (dbColPair.second == 0 || dbColPair.first == 0) {
      // collection with this objectId not known. skip.
      return nullptr;
    }
    DatabaseFeature* df = DatabaseFeature::DATABASE;
    TRI_vocbase_t* vocbase = df->useDatabase(dbColPair.first);
    if (vocbase == nullptr) {
      return nullptr;
    }
    TRI_DEFER(vocbase->release());
    return static_cast<RocksDBCollection*>(
        vocbase->lookupCollection(dbColPair.second)->getPhysical());
  }
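
  // find the index for the given objectId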
  RocksDBIndex* findIndex(uint64_t objectId) {
    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    RocksDBEngine::IndexTriple triple = engine->mapObjectToIndex(objectId);
    if (std::get<0>(triple) == 0 && std::get<1>(triple) == 0) {
      return nullptr;
    }

    DatabaseFeature* df = DatabaseFeature::DATABASE;
    TRI_vocbase_t* vb = df->useDatabase(std::get<0>(triple));
    if (vb == nullptr) {
      return nullptr;
    }
    TRI_DEFER(vb->release());

    auto coll = vb->lookupCollection(std::get<1>(triple));

    if (coll == nullptr) {
      return nullptr;
    }

    std::shared_ptr<Index> index = coll->lookupIndex(std::get<2>(triple));
    if (index == nullptr) {
      return nullptr;
    }
    return static_cast<RocksDBIndex*>(index.get());
  }

  void updateMaxTick(uint32_t column_family_id, const rocksdb::Slice& key,
                     const rocksdb::Slice& value) {
    // side-effect: updates _maxTick
    //
    // extracts the max tick from markers and stores it in the _maxTick
    // member variable, which is used later (in shutdownWBReader) to call
    // TRI_UpdateTickServer (ticks.h)
    // Markers: - collections (id, objectId) as tick and max tick in the
    //            indexes array
    //          - documents - _rev (revision as max tick)
    //          - databases

    if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
      storeMaxHLC(RocksDBKey::documentId(key).id());
    } else if (column_family_id == RocksDBColumnFamily::primary()->GetID()) {
      // document key
      arangodb::velocypack::StringRef ref = RocksDBKey::primaryKey(key);
      TRI_ASSERT(!ref.empty());
      // check if the key is numeric
      if (ref[0] >= '1' && ref[0] <= '9') {
        // numeric start byte. looks good
        bool valid;
        uint64_t tick =
            NumberUtils::atoi<uint64_t>(ref.data(), ref.data() + ref.size(), valid);
        if (valid) {
          // if no previous _maxTick was set or the numeric value found is
          // "near" our previous _maxTick, then we update it
          if (tick > _maxTick && (tick - _maxTick) < 2048) {
            storeMaxTick(tick);
          }
        }
        // else we got a non-numeric key. simply ignore it
      }

      RocksDBIndex* idx = findIndex(RocksDBKey::objectId(key));
      if (idx) {
        KeyGenerator* keyGen = idx->collection().keyGenerator();
        if (keyGen) {
          keyGen->track(ref.begin(), ref.size());
        }
      }

    } else if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
      auto const type = RocksDBKey::type(key);

      if (type == RocksDBEntryType::Collection) {
        storeMaxTick(RocksDBKey::collectionId(key));
        auto slice = RocksDBValue::data(value);
        storeMaxTick(basics::VelocyPackHelper::stringUInt64(slice, "objectId"));
        VPackSlice indexes = slice.get("indexes");
        for (VPackSlice const& idx : VPackArrayIterator(indexes)) {
          storeMaxTick(
              std::max(basics::VelocyPackHelper::stringUInt64(idx, "objectId"),
                       basics::VelocyPackHelper::stringUInt64(idx, "id")));
        }
      } else if (type == RocksDBEntryType::Database) {
        storeMaxTick(RocksDBKey::databaseId(key));
      } else if (type == RocksDBEntryType::View) {
        storeMaxTick(std::max(RocksDBKey::databaseId(key), RocksDBKey::viewId(key)));
      }
    }
  }

  // tick function that is called before each new WAL entry
  void incTick() {
    if (_startOfBatch) {
      // we are at the start of a batch. do NOT increase the sequence number
      _startOfBatch = false;
    } else {
      // we are inside a batch already. now increase the sequence number
      ++_currentSequence;
    }
  }

 public:
  rocksdb::Status PutCF(uint32_t column_family_id, const rocksdb::Slice& key,
                        const rocksdb::Slice& value) override {
    LOG_TOPIC("3e5c5", TRACE, Logger::ENGINES) << "recovering PUT " << RocksDBKey(key);
    incTick();

    updateMaxTick(column_family_id, key, value);
    if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
      auto coll = findCollection(RocksDBKey::objectId(key));
      if (coll && coll->meta().countUnsafe()._committedSeq < _currentSequence) {
        auto& cc = coll->meta().countUnsafe();
        cc._committedSeq = _currentSequence;
        cc._added++;
        cc._revisionId =
            transaction::helpers::extractRevFromDocument(RocksDBValue::data(value));
        coll->loadInitialNumberDocuments();
      }

    } else {
      // we have to adjust the estimate with an insert
      uint64_t hash = 0;
      if (column_family_id == RocksDBColumnFamily::vpack()->GetID()) {
        hash = RocksDBVPackIndex::HashForKey(key);
      } else if (column_family_id == RocksDBColumnFamily::edge()->GetID()) {
        hash = RocksDBEdgeIndex::HashForKey(key);
      }

      if (hash != 0) {
        auto* idx = findIndex(RocksDBKey::objectId(key));
        if (idx) {
          RocksDBCuckooIndexEstimator<uint64_t>* est = idx->estimator();
          if (est && est->appliedSeq() < _currentSequence) {
            // we track estimates for this index
            est->insert(hash);
          }
        }
      }
    }

    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    for (auto helper : engine->recoveryHelpers()) {
      helper->PutCF(column_family_id, key, value);
    }

    return rocksdb::Status();
  }

  void handleDeleteCF(uint32_t cfId, const rocksdb::Slice& key) {
    incTick();

    if (cfId == RocksDBColumnFamily::documents()->GetID()) {
      uint64_t objectId = RocksDBKey::objectId(key);

      storeMaxHLC(RocksDBKey::documentId(key).id());
      storeMaxTick(objectId);

      auto coll = findCollection(objectId);
      if (coll && coll->meta().countUnsafe()._committedSeq < _currentSequence) {
        auto& cc = coll->meta().countUnsafe();
        cc._committedSeq = _currentSequence;
        cc._removed++;
        if (_lastRemovedDocRid != 0) {
          cc._revisionId = _lastRemovedDocRid;
        }
        coll->loadInitialNumberDocuments();
      }
      _lastRemovedDocRid = 0;  // reset in any case

    } else {
      // we have to adjust the estimate with a removal
      uint64_t hash = 0;
      if (cfId == RocksDBColumnFamily::vpack()->GetID()) {
        hash = RocksDBVPackIndex::HashForKey(key);
      } else if (cfId == RocksDBColumnFamily::edge()->GetID()) {
        hash = RocksDBEdgeIndex::HashForKey(key);
      }

      if (hash != 0) {
        auto* idx = findIndex(RocksDBKey::objectId(key));
        if (idx) {
          RocksDBCuckooIndexEstimator<uint64_t>* est = idx->estimator();
          if (est && est->appliedSeq() < _currentSequence) {
            // we track estimates for this index
            est->remove(hash);
          }
        }
      }
    }
  }

  rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
    LOG_TOPIC("5f341", TRACE, Logger::ENGINES) << "recovering DELETE " << RocksDBKey(key);
    handleDeleteCF(column_family_id, key);
    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    for (auto helper : engine->recoveryHelpers()) {
      helper->DeleteCF(column_family_id, key);
    }

    return rocksdb::Status();
  }

  rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
    LOG_TOPIC("aa997", TRACE, Logger::ENGINES)
        << "recovering SINGLE DELETE " << RocksDBKey(key);
    handleDeleteCF(column_family_id, key);

    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    for (auto helper : engine->recoveryHelpers()) {
      helper->SingleDeleteCF(column_family_id, key);
    }

    return rocksdb::Status();
  }

  rocksdb::Status DeleteRangeCF(uint32_t column_family_id, const rocksdb::Slice& begin_key,
                                const rocksdb::Slice& end_key) override {
    LOG_TOPIC("ed6f5", TRACE, Logger::ENGINES)
        << "recovering DELETE RANGE from " << RocksDBKey(begin_key) << " to "
        << RocksDBKey(end_key);
    incTick();
    // drop and truncate can use this; truncate is handled via a Log marker
    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    for (auto helper : engine->recoveryHelpers()) {
      helper->DeleteRangeCF(column_family_id, begin_key, end_key);
    }

    // check for a range-delete of the documents column family
    if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
      uint64_t objectId = RocksDBKey::objectId(begin_key);
      TRI_ASSERT(objectId == RocksDBKey::objectId(end_key));

      auto coll = findCollection(objectId);
      if (!coll) {
        return rocksdb::Status();
      }

      if (coll->meta().countUnsafe()._committedSeq <= _currentSequence) {
        auto& cc = coll->meta().countUnsafe();
        cc._committedSeq = _currentSequence;
        cc._added = 0;
        cc._removed = 0;
        coll->loadInitialNumberDocuments();

        for (std::shared_ptr<arangodb::Index> const& idx : coll->getIndexes()) {
          RocksDBIndex* ridx = static_cast<RocksDBIndex*>(idx.get());
          RocksDBCuckooIndexEstimator<uint64_t>* est = ridx->estimator();
          TRI_ASSERT(ridx->type() != Index::TRI_IDX_TYPE_EDGE_INDEX || est);
          if (est) {
            est->clear();
            est->setAppliedSeq(_currentSequence);
          }
        }
      }
    }

    return rocksdb::Status();  // make WAL iterator happy
  }

  void LogData(const rocksdb::Slice& blob) override {
    // a delete log message appears directly before a Delete
    RocksDBLogType type = RocksDBLogValue::type(blob);
    switch (type) {
      case RocksDBLogType::DocumentRemoveV2:  // remove within a trx
      case RocksDBLogType::SingleRemoveV2:    // single remove
        TRI_ASSERT(_lastRemovedDocRid == 0);
        _lastRemovedDocRid = RocksDBLogValue::revisionId(blob);
        break;
      default:
        _lastRemovedDocRid = 0;  // reset in any other case
        break;
    }
    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    for (auto helper : engine->recoveryHelpers()) {
      helper->LogData(blob);
    }
  }

  // MergeCF is not used
};

/// parse the WAL with the above handler class
Result RocksDBRecoveryManager::parseRocksWAL() {
  Result shutdownRv;

  Result res = basics::catchToResult([&]() -> Result {
    Result rv;
    RocksDBEngine* engine = rocksutils::globalRocksEngine();
    for (auto& helper : engine->recoveryHelpers()) {
      helper->prepare();
    }

    // the WriteBatch handler that scans and replays the WAL entries
    WBReader handler;
    rocksdb::SequenceNumber earliest = engine->settingsManager()->earliestSeqNeeded();
    auto minTick = std::min(earliest, engine->releasedTick());

    // prevent purging of WAL files while we are in here
    RocksDBFilePurgePreventer purgePreventer(
        rocksutils::globalRocksEngine()->disallowPurging());

    std::unique_ptr<rocksdb::TransactionLogIterator> iterator;
    rocksdb::Status s =
        _db->GetUpdatesSince(minTick, &iterator,
                             rocksdb::TransactionLogIterator::ReadOptions(true));

    rv = rocksutils::convertStatus(s);

    if (rv.ok()) {
      while (iterator->Valid()) {
        s = iterator->status();
        if (s.ok()) {
          rocksdb::BatchResult batch = iterator->GetBatch();
          handler.startNewBatch(batch.sequence);
          s = batch.writeBatchPtr->Iterate(&handler);
        }

        if (!s.ok()) {
          rv = rocksutils::convertStatus(s);
          std::string msg = "error during WAL scan: " + rv.errorMessage();
          LOG_TOPIC("ee333", ERR, Logger::ENGINES) << msg;
          rv.reset(rv.errorNumber(), std::move(msg));  // update message
          break;
        }

        iterator->Next();
      }
    }

    shutdownRv = handler.shutdownWBReader();

    return rv;
  });

  if (res.ok()) {
    res = std::move(shutdownRv);
  } else if (shutdownRv.fail()) {
    res.reset(res.errorNumber(), res.errorMessage() + " - " + shutdownRv.errorMessage());
  }

  return res;
}

}  // namespace arangodb