////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2017 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Simon Grätzer
/// @author Daniel Larkin-York
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBRecoveryManager.h"

#include "ApplicationFeatures/ApplicationServer.h"
#include "Basics/Exceptions.h"
#include "Basics/NumberUtils.h"
#include "Basics/StringUtils.h"
#include "Basics/VelocyPackHelper.h"
#include "Basics/WriteLocker.h"
#include "Basics/exitcodes.h"
#include "Logger/Logger.h"
#include "RestServer/DatabaseFeature.h"
#include "RocksDBEngine/RocksDBCollection.h"
#include "RocksDBEngine/RocksDBColumnFamily.h"
#include "RocksDBEngine/RocksDBCommon.h"
#include "RocksDBEngine/RocksDBCuckooIndexEstimator.h"
#include "RocksDBEngine/RocksDBEdgeIndex.h"
#include "RocksDBEngine/RocksDBKey.h"
#include "RocksDBEngine/RocksDBKeyBounds.h"
#include "RocksDBEngine/RocksDBLogValue.h"
#include "RocksDBEngine/RocksDBRecoveryHelper.h"
#include "RocksDBEngine/RocksDBSettingsManager.h"
#include "RocksDBEngine/RocksDBVPackIndex.h"
#include "RocksDBEngine/RocksDBValue.h"
#include "StorageEngine/EngineSelectorFeature.h"
#include "Transaction/Helpers.h"
#include "VocBase/KeyGenerator.h"
#include "VocBase/ticks.h"

// note: the header names of the system includes were lost in the original;
// the following are the RocksDB and VelocyPack headers needed by the types
// used in this file (assumed reconstruction)
#include <rocksdb/utilities/transaction_db.h>
#include <rocksdb/utilities/write_batch_with_index.h>
#include <rocksdb/write_batch.h>

#include <velocypack/Iterator.h>
#include <velocypack/Parser.h>
#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb::application_features;

namespace arangodb {

RocksDBRecoveryManager* RocksDBRecoveryManager::instance() {
  return ApplicationServer::getFeature<RocksDBRecoveryManager>(featureName());
}

/// Constructor needs to be called synchronously,
/// will load counts from the db and scan the WAL
RocksDBRecoveryManager::RocksDBRecoveryManager(application_features::ApplicationServer& server)
    : ApplicationFeature(server, featureName()), _db(nullptr), _inRecovery(true) {
  setOptional(true);
  startsAfter("BasicsPhase");

  startsAfter("Database");
  startsAfter("SystemDatabase");
  startsAfter("RocksDBEngine");
  startsAfter("ServerId");
  startsAfter("StorageEngine");

  onlyEnabledWith("RocksDBEngine");
}

void RocksDBRecoveryManager::start() {
  if (!isEnabled()) {
    return;
  }

  _db = ApplicationServer::getFeature<RocksDBEngine>("RocksDBEngine")->db();
  runRecovery();

  // synchronizes with acquire inRecovery()
  _inRecovery.store(false, std::memory_order_release);

  // notify everyone that recovery is now done
  auto databaseFeature = ApplicationServer::getFeature<DatabaseFeature>("Database");
  databaseFeature->recoveryDone();
}

/// parse recent RocksDB WAL entries and notify the
/// DatabaseFeature about the successful recovery
void RocksDBRecoveryManager::runRecovery() {
  auto res = parseRocksWAL();
  if (res.fail()) {
    LOG_TOPIC(FATAL, Logger::ENGINES)
        << "failed during rocksdb WAL recovery: " << res.errorMessage();
    FATAL_ERROR_EXIT_CODE(TRI_EXIT_RECOVERY);
  }

  // now restore collection counts into collections
}
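////////////////////////////////////////////////////////////////////////////////
/// @brief WAL replay handler
///
/// WBReader is a rocksdb::WriteBatch::Handler: parseRocksWAL() below feeds it
/// every write batch found in the WAL, and RocksDB calls back into PutCF /
/// DeleteCF / SingleDeleteCF / DeleteRangeCF / LogData once per record. The
/// handler accumulates per-collection document count deltas, the highest key
/// value seen per key generator, and the maximum tick/HLC values, and applies
/// all of that in shutdownWBReader(). A rough sketch of how it is driven (see
/// parseRocksWAL() for the actual code):
///
///   rocksdb::BatchResult batch = iterator->GetBatch();
///   handler.startNewBatch(batch.sequence);
///   batch.writeBatchPtr->Iterate(&handler);  // invokes the *CF callbacks
////////////////////////////////////////////////////////////////////////////////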
class WBReader final : public rocksdb::WriteBatch::Handler {
 public:
  struct Operations {
    Operations(rocksdb::SequenceNumber seq) : startSequenceNumber(seq) {}
    Operations(Operations const&) = delete;
    Operations& operator=(Operations const&) = delete;
    Operations(Operations&&) = default;
    Operations& operator=(Operations&&) = default;

    rocksdb::SequenceNumber startSequenceNumber;
    rocksdb::SequenceNumber lastSequenceNumber = 0;
    uint64_t added = 0;
    uint64_t removed = 0;
    TRI_voc_rid_t lastRevisionId = 0;
    bool mustTruncate = false;
  };

 private:
  // contains the operations we counted, keyed by collection objectId
  std::map<uint64_t, Operations> _deltas;
  // used to track used IDs for key-generators, keyed by collection objectId
  std::map<uint64_t, uint64_t> _generators;

  // max tick found
  uint64_t _maxTick;
  uint64_t _maxHLC;
  /// @brief last document removed
  TRI_voc_rid_t _lastRemovedDocRid;

  rocksdb::SequenceNumber _startSequence;    /// start of batch sequence nr
  rocksdb::SequenceNumber _currentSequence;  /// current sequence nr
  bool _startOfBatch = false;

 public:
  /// @param seqs sequence numbers from which to count operations
  explicit WBReader(std::map<uint64_t, rocksdb::SequenceNumber> const& seqs)
      : _maxTick(TRI_NewTickServer()),
        _maxHLC(0),
        _lastRemovedDocRid(0),
        _startSequence(0),
        _currentSequence(0) {
    for (auto const& pair : seqs) {
      try {
        _deltas.emplace(pair.first, Operations(pair.second));
      } catch (...) {
      }
    }
  }

  void startNewBatch(rocksdb::SequenceNumber startSequence) {
    // starting new write batch
    _startSequence = startSequence;
    _currentSequence = startSequence;
    _startOfBatch = true;
    TRI_ASSERT(_maxTick > 0);
  }
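  /// @brief apply everything gathered during the WAL scan
  ///
  /// Called once after the last batch has been replayed. It first bumps the
  /// server tick and the hybrid logical clock to the maxima found in the WAL,
  /// then, for every collection for which deltas were counted, adjusts the
  /// persisted document counter (resetting it first if a truncate marker was
  /// seen), stores the last revision id, and feeds the highest observed
  /// numeric key into the collection's key generator. Any remaining key
  /// generator values are applied in a second pass.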
  Result shutdownWBReader() {
    Result rv = basics::catchVoidToResult([&]() -> void {
      // update ticks after parsing wal
      LOG_TOPIC(TRACE, Logger::ENGINES) << "max tick found in WAL: " << _maxTick
                                        << ", last HLC value: " << _maxHLC;

      TRI_UpdateTickServer(_maxTick);
      TRI_HybridLogicalClock(_maxHLC);

      auto dbfeature = ApplicationServer::getFeature<DatabaseFeature>("Database");

      LOG_TOPIC(TRACE, Logger::ENGINES)
          << "finished WAL scan with " << _deltas.size() << " entries";

      for (auto& pair : _deltas) {
        WBReader::Operations const& ops = pair.second;

        // now adjust the counter in collections which are already loaded
        auto dbColPair = rocksutils::mapObjectToCollection(pair.first);
        if (dbColPair.second == 0 && dbColPair.first == 0) {
          // collection with this objectID not known. Skip.
          continue;
        }
        auto vocbase = dbfeature->useDatabase(dbColPair.first);
        if (vocbase == nullptr) {
          continue;
        }
        TRI_DEFER(vocbase->release());

        auto collection = vocbase->lookupCollection(dbColPair.second);
        if (collection == nullptr) {
          continue;
        }

        auto* rcoll = static_cast<RocksDBCollection*>(collection->getPhysical());
        if (ops.mustTruncate) {  // first we must reset the counter
          rcoll->meta().countRefUnsafe()._added = 0;
          rcoll->meta().countRefUnsafe()._removed = 0;
        }
        rcoll->meta().countRefUnsafe()._added += ops.added;
        rcoll->meta().countRefUnsafe()._removed += ops.removed;
        if (ops.lastRevisionId) {
          rcoll->meta().countRefUnsafe()._revisionId = ops.lastRevisionId;
        }
        TRI_ASSERT(rcoll->meta().countRefUnsafe()._added >=
                   rcoll->meta().countRefUnsafe()._removed);
        rcoll->loadInitialNumberDocuments();

        auto const& it = _generators.find(rcoll->objectId());
        if (it != _generators.end()) {
          std::string k(basics::StringUtils::itoa(it->second));
          collection->keyGenerator()->track(k.data(), k.size());
          _generators.erase(it);
        }
      }

      // make sure we collect all the other generators
      for (auto gen : _generators) {
        if (gen.second > 0) {
          auto dbColPair = rocksutils::mapObjectToCollection(gen.first);
          if (dbColPair.second == 0 && dbColPair.first == 0) {
            // collection with this objectID not known. Skip.
            continue;
          }
          auto vocbase = dbfeature->useDatabase(dbColPair.first);
          if (vocbase == nullptr) {
            continue;
          }
          TRI_DEFER(vocbase->release());

          auto collection = vocbase->lookupCollection(dbColPair.second);
          if (collection == nullptr) {
            continue;
          }
          std::string k(basics::StringUtils::itoa(gen.second));
          collection->keyGenerator()->track(k.data(), k.size());
        }
      }
    });
    return rv;
  }

 private:
  bool shouldHandleCollection(uint64_t objectId, Operations** ops) {
    auto it = _deltas.find(objectId);
    if (it != _deltas.end()) {
      *ops = &(it->second);
      return it->second.startSequenceNumber < _currentSequence;
    }
    // do not ignore unknown counters
    auto res = _deltas.emplace(objectId, _currentSequence);
    *ops = &(res.first->second);
    return true;
  }

  void storeMaxHLC(uint64_t hlc) {
    if (hlc > _maxHLC) {
      _maxHLC = hlc;
    }
  }

  void storeMaxTick(uint64_t tick) {
    if (tick > _maxTick) {
      _maxTick = tick;
    }
  }

  void storeLastKeyValue(uint64_t objectId, uint64_t keyValue) {
    if (keyValue == 0) {
      return;
    }

    auto it = _generators.find(objectId);
    if (it == _generators.end()) {
      try {
        _generators.emplace(objectId, keyValue);
      } catch (...) {
      }
      return;
    }

    if (keyValue > (*it).second) {
      (*it).second = keyValue;
    }
  }

  /// Truncate indexes of collection with objectId
  bool truncateIndexes(uint64_t objectId) {
    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    RocksDBEngine::CollectionPair pair = engine->mapObjectToCollection(objectId);
    if (pair.first == 0 || pair.second == 0) {
      return false;
    }

    DatabaseFeature* df = DatabaseFeature::DATABASE;
    TRI_vocbase_t* vb = df->useDatabase(pair.first);
    if (vb == nullptr) {
      return false;
    }
    TRI_DEFER(vb->release());

    auto coll = vb->lookupCollection(pair.second);
    if (coll == nullptr) {
      return false;
    }

    for (std::shared_ptr<Index> const& idx : coll->getIndexes()) {
      RocksDBIndex* ridx = static_cast<RocksDBIndex*>(idx.get());
      RocksDBCuckooIndexEstimator<uint64_t>* est = ridx->estimator();
      if (est && est->committedSeq() < _currentSequence) {
        est->clear();
      }
    }

    return true;
  }

  // find estimator for index
  RocksDBCuckooIndexEstimator<uint64_t>* findEstimator(uint64_t objectId) {
    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    RocksDBEngine::IndexTriple triple = engine->mapObjectToIndex(objectId);
    if (std::get<0>(triple) == 0 && std::get<1>(triple) == 0) {
      return nullptr;
    }

    DatabaseFeature* df = DatabaseFeature::DATABASE;
    TRI_vocbase_t* vb = df->useDatabase(std::get<0>(triple));
    if (vb == nullptr) {
      return nullptr;
    }
    TRI_DEFER(vb->release());

    auto coll = vb->lookupCollection(std::get<1>(triple));
    if (coll == nullptr) {
      return nullptr;
    }
    std::shared_ptr<Index> index = coll->lookupIndex(std::get<2>(triple));
    if (index == nullptr) {
      return nullptr;
    }
    return static_cast<RocksDBIndex*>(index.get())->estimator();
  }
  void updateMaxTick(uint32_t column_family_id, const rocksdb::Slice& key,
                     const rocksdb::Slice& value) {
    // RETURN (side-effect): update _maxTick
    //
    // extract max tick from Markers and store them as side-effect in
    // _maxTick member variable that can be used later (dtor) to call
    // TRI_UpdateTickServer (ticks.h)
    // Markers: - collections (id, objectId) as tick and max tick in indexes array
    //          - documents - _rev (revision as max tick)
    //          - databases

    if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
      storeMaxHLC(RocksDBKey::documentId(key).id());
      storeLastKeyValue(RocksDBKey::objectId(key), RocksDBValue::keyValue(value));
    } else if (column_family_id == RocksDBColumnFamily::primary()->GetID()) {
      // document key
      StringRef ref = RocksDBKey::primaryKey(key);
      TRI_ASSERT(!ref.empty());
      // check if the key is numeric
      if (ref[0] >= '1' && ref[0] <= '9') {
        // numeric start byte. looks good
        bool valid;
        uint64_t tick =
            NumberUtils::atoi<uint64_t>(ref.data(), ref.data() + ref.size(), valid);
        if (valid) {
          // if no previous _maxTick set or the numeric value found is
          // "near" our previous _maxTick, then we update it
          if (tick > _maxTick && (tick - _maxTick) < 2048) {
            storeMaxTick(tick);
          }
        }
        // else we got a non-numeric key. simply ignore it
      }
    } else if (column_family_id == RocksDBColumnFamily::definitions()->GetID()) {
      auto const type = RocksDBKey::type(key);

      if (type == RocksDBEntryType::Collection) {
        storeMaxTick(RocksDBKey::collectionId(key));
        auto slice = RocksDBValue::data(value);
        storeMaxTick(basics::VelocyPackHelper::stringUInt64(slice, "objectId"));
        VPackSlice indexes = slice.get("indexes");
        for (VPackSlice const& idx : VPackArrayIterator(indexes)) {
          storeMaxTick(
              std::max(basics::VelocyPackHelper::stringUInt64(idx, "objectId"),
                       basics::VelocyPackHelper::stringUInt64(idx, "id")));
        }
      } else if (type == RocksDBEntryType::Database) {
        storeMaxTick(RocksDBKey::databaseId(key));
      } else if (type == RocksDBEntryType::View) {
        storeMaxTick(std::max(RocksDBKey::databaseId(key), RocksDBKey::viewId(key)));
      }
    }
  }

  // tick function that is called before each new WAL entry
  void incTick() {
    if (_startOfBatch) {
      // we are at the start of a batch. do NOT increase sequence number
      _startOfBatch = false;
    } else {
      // we are inside a batch already. now increase sequence number
      ++_currentSequence;
    }
  }

 public:
  rocksdb::Status PutCF(uint32_t column_family_id, const rocksdb::Slice& key,
                        const rocksdb::Slice& value) override {
    LOG_TOPIC(TRACE, Logger::ENGINES) << "recovering PUT " << RocksDBKey(key);
    incTick();

    updateMaxTick(column_family_id, key, value);
    if (column_family_id == RocksDBColumnFamily::documents()->GetID()) {
      uint64_t objectId = RocksDBKey::objectId(key);

      Operations* ops = nullptr;
      if (shouldHandleCollection(objectId, &ops)) {
        TRI_ASSERT(ops != nullptr);
        ops->lastSequenceNumber = _currentSequence;
        ops->added++;
        ops->lastRevisionId =
            transaction::helpers::extractRevFromDocument(RocksDBValue::data(value));
      }
    } else {
      // We have to adjust the estimate with an insert
      uint64_t hash = 0;
      if (column_family_id == RocksDBColumnFamily::vpack()->GetID()) {
        hash = RocksDBVPackIndex::HashForKey(key);
      } else if (column_family_id == RocksDBColumnFamily::edge()->GetID()) {
        hash = RocksDBEdgeIndex::HashForKey(key);
      }

      if (hash != 0) {
        uint64_t objectId = RocksDBKey::objectId(key);
        auto est = findEstimator(objectId);
        if (est != nullptr && est->committedSeq() < _currentSequence) {
          // We track estimates for this index
          est->insert(hash);
        }
      }
    }

    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    for (auto helper : engine->recoveryHelpers()) {
      helper->PutCF(column_family_id, key, value);
    }

    return rocksdb::Status();
  }
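  /// @brief common code for DeleteCF and SingleDeleteCF
  ///
  /// For the documents column family this counts a removal and consumes the
  /// revision id that the preceding LogData marker (DocumentRemoveV2 /
  /// SingleRemoveV2) stored in _lastRemovedDocRid. For index column families
  /// it removes the key's hash from the matching index estimator, but only if
  /// the estimator's committed sequence number lies before the current one.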
  void handleDeleteCF(uint32_t cfId, const rocksdb::Slice& key) {
    incTick();

    if (cfId == RocksDBColumnFamily::documents()->GetID()) {
      uint64_t objectId = RocksDBKey::objectId(key);

      storeMaxHLC(RocksDBKey::documentId(key).id());
      storeMaxTick(objectId);

      Operations* ops = nullptr;
      if (shouldHandleCollection(objectId, &ops)) {
        TRI_ASSERT(ops != nullptr);
        ops->lastSequenceNumber = _currentSequence;
        ops->removed++;
        if (_lastRemovedDocRid != 0) {
          ops->lastRevisionId = _lastRemovedDocRid;
        }
      }
      _lastRemovedDocRid = 0;  // reset in any case
    } else {
      // We have to adjust the estimate with a removal
      uint64_t hash = 0;
      if (cfId == RocksDBColumnFamily::vpack()->GetID()) {
        hash = RocksDBVPackIndex::HashForKey(key);
      } else if (cfId == RocksDBColumnFamily::edge()->GetID()) {
        hash = RocksDBEdgeIndex::HashForKey(key);
      }

      if (hash != 0) {
        uint64_t objectId = RocksDBKey::objectId(key);
        auto est = findEstimator(objectId);
        if (est != nullptr && est->committedSeq() < _currentSequence) {
          // We track estimates for this index
          est->remove(hash);
        }
      }
    }
  }

  rocksdb::Status DeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
    LOG_TOPIC(TRACE, Logger::ENGINES) << "recovering DELETE " << RocksDBKey(key);
    handleDeleteCF(column_family_id, key);

    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    for (auto helper : engine->recoveryHelpers()) {
      helper->DeleteCF(column_family_id, key);
    }

    return rocksdb::Status();
  }

  rocksdb::Status SingleDeleteCF(uint32_t column_family_id, const rocksdb::Slice& key) override {
    LOG_TOPIC(TRACE, Logger::ENGINES)
        << "recovering SINGLE DELETE " << RocksDBKey(key);
    handleDeleteCF(column_family_id, key);

    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    for (auto helper : engine->recoveryHelpers()) {
      helper->SingleDeleteCF(column_family_id, key);
    }

    return rocksdb::Status();
  }

  rocksdb::Status DeleteRangeCF(uint32_t column_family_id, const rocksdb::Slice& begin_key,
                                const rocksdb::Slice& end_key) override {
    LOG_TOPIC(TRACE, Logger::ENGINES)
        << "recovering DELETE RANGE from " << RocksDBKey(begin_key) << " to "
        << RocksDBKey(end_key);
    incTick();
    // drop and truncate can use this, truncate is handled via a Log marker

    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    for (auto helper : engine->recoveryHelpers()) {
      helper->DeleteRangeCF(column_family_id, begin_key, end_key);
    }

    return rocksdb::Status();  // make WAL iterator happy
  }
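  /// @brief handle a LogData marker
  ///
  /// Log markers carry information that the subsequent Put/Delete entries do
  /// not: the revision id of a removed document (DocumentRemoveV2 /
  /// SingleRemoveV2, consumed by handleDeleteCF), and collection truncation
  /// (CollectionTruncate), which resets the counted deltas and clears the
  /// index estimators via truncateIndexes().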
  void LogData(const rocksdb::Slice& blob) override {
    // a delete log message appears directly before a Delete
    RocksDBLogType type = RocksDBLogValue::type(blob);
    switch (type) {
      case RocksDBLogType::DocumentRemoveV2:  // remove within a trx
        TRI_ASSERT(_lastRemovedDocRid == 0);
        _lastRemovedDocRid = RocksDBLogValue::revisionId(blob);
        break;
      case RocksDBLogType::SingleRemoveV2:  // single remove
        TRI_ASSERT(_lastRemovedDocRid == 0);
        _lastRemovedDocRid = RocksDBLogValue::revisionId(blob);
        break;
      case RocksDBLogType::CollectionTruncate: {
        uint64_t objectId = RocksDBLogValue::objectId(blob);
        Operations* ops = nullptr;
        if (shouldHandleCollection(objectId, &ops)) {
          TRI_ASSERT(ops != nullptr);
          ops->lastSequenceNumber = _currentSequence;
          ops->removed = 0;
          ops->added = 0;
          ops->mustTruncate = true;
        }
        // index estimates have their own commitSeq
        if (!truncateIndexes(objectId)) {
          // unable to truncate indexes of the collection.
          // may be due to collection having been deleted etc.
          LOG_TOPIC(WARN, Logger::ENGINES)
              << "unable to truncate indexes for objectId " << objectId;
        }

        _lastRemovedDocRid = 0;  // reset in any case
        break;
      }
      default:
        _lastRemovedDocRid = 0;  // reset in any other case
        break;
    }

    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    for (auto helper : engine->recoveryHelpers()) {
      helper->LogData(blob);
    }
  }

  // MergeCF is not used
};

/// parse the WAL with the above handler parser class
Result RocksDBRecoveryManager::parseRocksWAL() {
  Result shutdownRv;

  Result res = basics::catchToResult([&]() -> Result {
    Result rv;
    RocksDBEngine* engine = static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
    for (auto& helper : engine->recoveryHelpers()) {
      helper->prepare();
    }

    std::map<uint64_t, rocksdb::SequenceNumber> startSeqs;
    auto dbfeature = arangodb::DatabaseFeature::DATABASE;
    dbfeature->enumerateDatabases([&](TRI_vocbase_t& vocbase) {
      vocbase.processCollections(
          [&](LogicalCollection* coll) {
            RocksDBCollection* rcoll =
                static_cast<RocksDBCollection*>(coll->getPhysical());
            rocksdb::SequenceNumber seq = rcoll->meta().currentCount()._committedSeq;
            startSeqs.emplace(rcoll->objectId(), seq);
          },
          /*includeDeleted*/ false);
    });

    // Tell the WriteBatch reader the transaction markers to look for
    WBReader handler(startSeqs);

    rocksdb::SequenceNumber earliest = engine->settingsManager()->earliestSeqNeeded();
    auto minTick = std::min(earliest, engine->releasedTick());

    // prevent purging of WAL files while we are in here
    RocksDBFilePurgePreventer purgePreventer(
        rocksutils::globalRocksEngine()->disallowPurging());

    std::unique_ptr<rocksdb::TransactionLogIterator> iterator;  // reader();
    rocksdb::Status s =
        _db->GetUpdatesSince(minTick, &iterator,
                             rocksdb::TransactionLogIterator::ReadOptions(true));

    rv = rocksutils::convertStatus(s);

    if (rv.ok()) {
      while (iterator->Valid()) {
        s = iterator->status();
        if (s.ok()) {
          rocksdb::BatchResult batch = iterator->GetBatch();
          handler.startNewBatch(batch.sequence);
          s = batch.writeBatchPtr->Iterate(&handler);
        }

        if (!s.ok()) {
          rv = rocksutils::convertStatus(s);
          std::string msg = "error during WAL scan: " + rv.errorMessage();
          LOG_TOPIC(ERR, Logger::ENGINES) << msg;
          rv.reset(rv.errorNumber(), std::move(msg));  // update message
          break;
        }

        iterator->Next();
      }
    }

    shutdownRv = handler.shutdownWBReader();

    return rv;
  });

  if (res.ok()) {
    res = std::move(shutdownRv);
  } else {
    if (shutdownRv.fail()) {
      res.reset(res.errorNumber(),
                res.errorMessage() + " - " + shutdownRv.errorMessage());
    }
  }

  return res;
}

}  // namespace arangodb