////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Jan Steemann
////////////////////////////////////////////////////////////////////////////////

#include "ApplicationFeatures/ApplicationServer.h"
#include "Aql/PlanCache.h"
#include "Basics/Exceptions.h"
#include "Basics/FileUtils.h"
#include "Basics/PerformanceLogScope.h"
#include "Basics/ReadLocker.h"
#include "Basics/ReadUnlocker.h"
#include "Basics/Result.h"
#include "Basics/StaticStrings.h"
#include "Basics/VelocyPackHelper.h"
#include "Basics/WriteLocker.h"
#include "Basics/WriteUnlocker.h"
#include "Basics/encoding.h"
#include "Basics/process-utils.h"
#include "Cluster/ClusterMethods.h"
#include "Indexes/IndexIterator.h"
#include "Logger/Logger.h"
#include "MMFiles/MMFilesCollectionWriteLocker.h"
#include "MMFiles/MMFilesCompactorThread.h"
#include "MMFiles/MMFilesDatafile.h"
#include "MMFiles/MMFilesDatafileHelper.h"
#include "MMFiles/MMFilesDocumentOperation.h"
#include "MMFiles/MMFilesDocumentPosition.h"
#include "MMFiles/MMFilesEngine.h"
#include "MMFiles/MMFilesIndexElement.h"
#include "MMFiles/MMFilesIndexLookupContext.h"
#include "MMFiles/MMFilesLogfileManager.h"
#include "MMFiles/MMFilesPrimaryIndex.h"
#include "MMFiles/MMFilesTransactionState.h"
#include "MMFilesCollection.h"
#include "RestServer/DatabaseFeature.h"
#include "Scheduler/Scheduler.h"
#include "Scheduler/SchedulerFeature.h"
#include "StorageEngine/EngineSelectorFeature.h"
#include "StorageEngine/StorageEngine.h"
#include "StorageEngine/TransactionState.h"
#include "Transaction/Helpers.h"
#include "Transaction/Hints.h"
#include "Transaction/Methods.h"
#include "Transaction/StandaloneContext.h"
#include "Utils/CollectionNameResolver.h"
#include "Utils/Events.h"
#include "Utils/OperationOptions.h"
#include "Utils/SingleCollectionTransaction.h"
#include "VocBase/KeyGenerator.h"
#include "VocBase/KeyLockInfo.h"
#include "VocBase/LocalDocumentId.h"
#include "VocBase/LogicalCollection.h"
#include "VocBase/ManagedDocumentResult.h"
#include "VocBase/ticks.h"

#include <velocypack/Iterator.h>
#include <velocypack/velocypack-aliases.h>

using namespace arangodb;
using Helper = arangodb::basics::VelocyPackHelper;

/// @brief state during opening of a collection
namespace arangodb {

struct OpenIteratorState {
  LogicalCollection* _collection;
  arangodb::MMFilesPrimaryIndex* _primaryIndex;
  TRI_voc_tid_t _tid{0};
  TRI_voc_fid_t _fid{0};
  std::unordered_map<TRI_voc_fid_t, MMFilesDatafileStatisticsContainer*> _stats;
  MMFilesDatafileStatisticsContainer* _dfi{nullptr};
  transaction::Methods* _trx;
  ManagedDocumentResult _mdr;
  MMFilesIndexLookupContext _context;
  uint64_t _deletions{0};
  uint64_t _documents{0};
  int64_t _initialCount{-1};
  bool _hasAllPersistentLocalIds{true};

  OpenIteratorState(LogicalCollection* collection, transaction::Methods* trx)
      : _collection(collection),
        _primaryIndex(static_cast<MMFilesCollection*>(collection->getPhysical())->primaryIndex()),
        _stats(),
        _trx(trx),
        _context(collection, &_mdr, 1) {
    TRI_ASSERT(collection != nullptr);
    TRI_ASSERT(trx != nullptr);
  }

  ~OpenIteratorState() {
    for (auto& it : _stats) {
      delete it.second;
    }
  }
};

}  // namespace arangodb

namespace {

/// @brief helper class for filling indexes
class MMFilesIndexFillerTask : public basics::LocalTask {
 public:
  MMFilesIndexFillerTask(
      std::shared_ptr<basics::LocalTaskQueue> const& queue,
      transaction::Methods& trx, MMFilesIndex* idx,
      std::shared_ptr<std::vector<std::pair<LocalDocumentId, VPackSlice>>> const& documents)
      : LocalTask(queue), _trx(trx), _idx(idx), _documents(documents) {}

  void run() override {
    TRI_ASSERT(_idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX);

    try {
      _idx->batchInsert(_trx, *_documents.get(), _queue);
    } catch (std::exception const&) {
      _queue->setStatus(TRI_ERROR_INTERNAL);
    }

    _queue->join();
  }

 private:
  transaction::Methods& _trx;
  MMFilesIndex* _idx;
  std::shared_ptr<std::vector<std::pair<LocalDocumentId, VPackSlice>>> _documents;
};

/// @brief find a statistics container for a given file id
static MMFilesDatafileStatisticsContainer* FindDatafileStats(OpenIteratorState* state,
                                                             TRI_voc_fid_t fid) {
  auto it = state->_stats.find(fid);

  if (it != state->_stats.end()) {
    return (*it).second;
  }

  auto stats = std::make_unique<MMFilesDatafileStatisticsContainer>();
  state->_stats.emplace(fid, stats.get());
  return stats.release();
}

bool countDocumentsIterator(MMFilesMarker const* marker, void* counter, MMFilesDatafile*) {
  TRI_ASSERT(nullptr != counter);
  if (marker->getType() == TRI_DF_MARKER_VPACK_DOCUMENT) {
    (*static_cast<uint64_t*>(counter))++;
  }
  return true;
}

Result persistLocalDocumentIdIterator(MMFilesMarker const* marker, void* data,
                                      MMFilesDatafile* inputFile) {
  Result res;
  auto outputFile = static_cast<MMFilesDatafile*>(data);
  switch (marker->getType()) {
    case TRI_DF_MARKER_VPACK_DOCUMENT: {
      auto transactionId = MMFilesDatafileHelper::TransactionId(marker);

      VPackSlice const slice(reinterpret_cast<uint8_t const*>(marker) +
                             MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT));
      uint8_t const* vpack = slice.begin();

      LocalDocumentId localDocumentId;
      if (marker->getSize() ==
          MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT) +
              slice.byteSize() + sizeof(LocalDocumentId::BaseType)) {
        // we do have a LocalDocumentId stored at the end of the marker
        uint8_t const* ptr = vpack + slice.byteSize();
        localDocumentId = LocalDocumentId(
            encoding::readNumber<LocalDocumentId::BaseType>(ptr, sizeof(LocalDocumentId::BaseType)));
      } else {
        localDocumentId = LocalDocumentId::create();
      }

      MMFilesCrudMarker updatedMarker(TRI_DF_MARKER_VPACK_DOCUMENT, transactionId,
                                      localDocumentId, slice);

      std::unique_ptr<char[]> buffer(new char[updatedMarker.size()]);
      MMFilesMarker* outputMarker = reinterpret_cast<MMFilesMarker*>(buffer.get());
      MMFilesDatafileHelper::InitMarker(outputMarker, updatedMarker.type(),
                                        updatedMarker.size(), marker->getTick());
      updatedMarker.store(buffer.get());

      MMFilesMarker* result;
      res = outputFile->reserveElement(outputMarker->getSize(), &result, 0);
      if (res.fail()) {
        return res;
      }
      res = outputFile->writeCrcElement(result, outputMarker);
      if (res.fail()) {
        return res;
      }
      break;
    }
    case TRI_DF_MARKER_HEADER:
    case TRI_DF_MARKER_COL_HEADER:
    case TRI_DF_MARKER_FOOTER: {
      // skip marker, either already written by createCompactor or will be
      // written by closeCompactor
      break;
    }
    default: {
      // direct copy
      MMFilesMarker* result;
      res = outputFile->reserveElement(marker->getSize(), &result, 0);
      if (res.fail()) {
        return res;
      }
      res = outputFile->writeElement(result, marker);
      if (res.fail()) {
        return res;
      }
      break;
    }
  }
  return res;
}

}  // namespace

arangodb::Result MMFilesCollection::updateProperties(VPackSlice const& slice,
                                                     bool doSync) {
  // validation
  uint32_t tmp = arangodb::basics::VelocyPackHelper::getNumericValue(
      slice, "indexBuckets",
      2 /*Just for validation, this default Value passes*/);

  if (tmp == 0 || tmp > 1024) {
    return {TRI_ERROR_BAD_PARAMETER,
            "indexBuckets must be a two-power between 1 and 1024"};
  }

  if (isVolatile() &&
      arangodb::basics::VelocyPackHelper::getBooleanValue(
          slice, "waitForSync", _logicalCollection.waitForSync())) {
    // the combination of waitForSync and isVolatile makes no sense
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_BAD_PARAMETER,
        "volatile collections do not support the waitForSync option");
  }

  if (isVolatile() != arangodb::basics::VelocyPackHelper::getBooleanValue(
                          slice, "isVolatile", isVolatile())) {
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_BAD_PARAMETER,
        "isVolatile option cannot be changed at runtime");
  }

  auto journalSlice = slice.get("journalSize");

  if (journalSlice.isNone()) {
    // In some APIs maximalSize is allowed instead
    journalSlice = slice.get("maximalSize");
  }

  if (!journalSlice.isNone() && journalSlice.isNumber()) {
    uint32_t toUpdate = journalSlice.getNumericValue<uint32_t>();
    if (toUpdate < TRI_JOURNAL_MINIMAL_SIZE) {
      return {TRI_ERROR_BAD_PARAMETER, "<properties>.journalSize too small"};
    }
  }

  _indexBuckets = Helper::getNumericValue(slice, "indexBuckets", _indexBuckets);  // MMFiles

  if (slice.hasKey("journalSize")) {
    _journalSize = Helper::getNumericValue(slice, "journalSize", _journalSize);
  } else {
    _journalSize = Helper::getNumericValue(slice, "maximalSize", _journalSize);
  }

  _doCompact = Helper::getBooleanValue(slice, "doCompact", _doCompact);

  int64_t count =
      arangodb::basics::VelocyPackHelper::getNumericValue(slice, "count", _initialCount);
  if (count != _initialCount) {
    _initialCount = count;
  }

  return {};
}

arangodb::Result MMFilesCollection::persistProperties() {
  Result res;

  try {
    auto infoBuilder =
        _logicalCollection.toVelocyPackIgnore({"path", "statusString"}, true, true);
    MMFilesCollectionMarker marker(TRI_DF_MARKER_VPACK_CHANGE_COLLECTION,
                                   _logicalCollection.vocbase().id(),
                                   _logicalCollection.id(), infoBuilder.slice());
    MMFilesWalSlotInfoCopy slotInfo =
        MMFilesLogfileManager::instance()->allocateAndWrite(marker, false);
    res = slotInfo.errorCode;
  } catch (arangodb::basics::Exception const& ex) {
    res = ex.code();
  } catch (...) {
    res = TRI_ERROR_INTERNAL;
  }

  if (res.fail()) {
    // TODO: what to do here
    LOG_TOPIC("151a6", WARN, arangodb::Logger::ENGINES)
        << "could not save collection change marker in log: " << res.errorMessage();
  }

  return res;
}

PhysicalCollection* MMFilesCollection::clone(LogicalCollection& logical) const {
  return new MMFilesCollection(logical, this);
}

/// @brief process a document (or edge) marker when opening a collection
int MMFilesCollection::OpenIteratorHandleDocumentMarker(MMFilesMarker const* marker,
                                                        MMFilesDatafile* datafile,
                                                        OpenIteratorState* state) {
  LogicalCollection* collection = state->_collection;
  TRI_ASSERT(collection != nullptr);
  auto physical = static_cast<MMFilesCollection*>(collection->getPhysical());
  TRI_ASSERT(physical != nullptr);
  transaction::Methods* trx = state->_trx;
  TRI_ASSERT(trx != nullptr);

  VPackSlice const slice(reinterpret_cast<uint8_t const*>(marker) +
                         MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT));
  uint8_t const* vpack = slice.begin();

  LocalDocumentId localDocumentId;
  if (marker->getSize() ==
      MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT) +
          slice.byteSize() + sizeof(LocalDocumentId::BaseType)) {
    // we do have a LocalDocumentId stored at the end of the marker
    uint8_t const* ptr = vpack + slice.byteSize();
    localDocumentId = LocalDocumentId(
        encoding::readNumber<LocalDocumentId::BaseType>(ptr, sizeof(LocalDocumentId::BaseType)));
  } else {
    localDocumentId = LocalDocumentId::create();
    state->_hasAllPersistentLocalIds = false;
  }

  VPackSlice keySlice;
  TRI_voc_rid_t revisionId;

  transaction::helpers::extractKeyAndRevFromDocument(slice, keySlice, revisionId);

  physical->setRevision(revisionId, false);

  {
    // track keys
    VPackValueLength length;
    char const* p = keySlice.getString(length);
    collection->keyGenerator()->track(p, static_cast<size_t>(length));
  }

  ++state->_documents;

  TRI_voc_fid_t const fid = datafile->fid();
  if (state->_fid != fid) {
    // update the state
    state->_fid = fid;  // when we're here, we're looking at a datafile
    state->_dfi = FindDatafileStats(state, fid);
  }

  // no primary index lock required here because we are the only ones reading
  // from the index ATM
  MMFilesSimpleIndexElement* found = state->_primaryIndex->lookupKeyRef(trx, keySlice);

  // it is a new entry
  if (found == nullptr || !found->isSet()) {
    physical->insertLocalDocumentId(localDocumentId, vpack, fid, false, false);

    // insert into primary index
    Result res = state->_primaryIndex->insertKey(trx, localDocumentId,
                                                 VPackSlice(vpack), state->_mdr,
                                                 Index::OperationMode::normal);

    if (res.fail()) {
      physical->removeLocalDocumentId(localDocumentId, false);
      LOG_TOPIC("a85d4", ERR, arangodb::Logger::ENGINES)
          << "inserting document into primary index failed with error: "
          << res.errorMessage();

      return res.errorNumber();
    }

    // update the datafile info
    state->_dfi->numberAlive++;
    state->_dfi->sizeAlive += MMFilesDatafileHelper::AlignedMarkerSize(marker);
  }

  // it is an update
  else {
    LocalDocumentId const oldLocalDocumentId = found->localDocumentId();
    // update the revision id in primary index
    found->updateLocalDocumentId(localDocumentId,
                                 static_cast<uint32_t>(keySlice.begin() - vpack));

    MMFilesDocumentPosition const old = physical->lookupDocument(oldLocalDocumentId);

    // remove old revision
    physical->removeLocalDocumentId(oldLocalDocumentId, false);

    // insert new revision
    physical->insertLocalDocumentId(localDocumentId, vpack, fid, false, false);

    // update the datafile info
    MMFilesDatafileStatisticsContainer* dfi;
    if (old.fid() == state->_fid) {
      dfi = state->_dfi;
    } else {
      dfi = FindDatafileStats(state, old.fid());
    }

    if (old.dataptr() != nullptr) {
      uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
      MMFilesMarker const* oldMarker = reinterpret_cast<MMFilesMarker const*>(
          vpack - MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT));
      int64_t size = MMFilesDatafileHelper::AlignedMarkerSize(oldMarker);

      dfi->numberAlive--;
      dfi->sizeAlive -= size;
      dfi->numberDead++;
      dfi->sizeDead += size;
    }

    state->_dfi->numberAlive++;
    state->_dfi->sizeAlive += MMFilesDatafileHelper::AlignedMarkerSize(marker);
  }

  return TRI_ERROR_NO_ERROR;
}

/// @brief process a deletion marker when opening a collection
int MMFilesCollection::OpenIteratorHandleDeletionMarker(MMFilesMarker const* marker,
                                                        MMFilesDatafile* datafile,
                                                        OpenIteratorState* state) {
  LogicalCollection* collection = state->_collection;
  TRI_ASSERT(collection != nullptr);
  auto physical = static_cast<MMFilesCollection*>(collection->getPhysical());
  TRI_ASSERT(physical != nullptr);
  transaction::Methods* trx = state->_trx;

  VPackSlice const slice(reinterpret_cast<uint8_t const*>(marker) +
                         MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_REMOVE));

  VPackSlice keySlice;
  TRI_voc_rid_t revisionId;

  transaction::helpers::extractKeyAndRevFromDocument(slice, keySlice, revisionId);

  physical->setRevision(revisionId, false);

  {
    // track keys
    VPackValueLength length;
    char const* p = keySlice.getString(length);
    collection->keyGenerator()->track(p, length);
  }

  ++state->_deletions;

  if (state->_fid != datafile->fid()) {
    // update the state
    state->_fid = datafile->fid();
    state->_dfi = FindDatafileStats(state, datafile->fid());
  }

  // no primary index lock required here because we are the only ones reading
  // from the index ATM
  MMFilesSimpleIndexElement found = state->_primaryIndex->lookupKey(trx, keySlice);

  // it is a new entry, so we missed the create
  if (!found) {
    // update the datafile info
    state->_dfi->numberDeletions++;
  }

  // it is a real delete
  else {
    LocalDocumentId const oldLocalDocumentId = found.localDocumentId();

    MMFilesDocumentPosition const old = physical->lookupDocument(oldLocalDocumentId);

    // update the datafile info
    MMFilesDatafileStatisticsContainer* dfi;

    if (old.fid() == state->_fid) {
      dfi = state->_dfi;
    } else {
      dfi = FindDatafileStats(state, old.fid());
    }

    TRI_ASSERT(old.dataptr() != nullptr);

    uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
    MMFilesMarker const* oldMarker = reinterpret_cast<MMFilesMarker const*>(
        vpack - MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT));
    int64_t size = MMFilesDatafileHelper::AlignedMarkerSize(oldMarker);

    dfi->numberAlive--;
    dfi->sizeAlive -= size;
    dfi->numberDead++;
    dfi->sizeDead += size;
    state->_dfi->numberDeletions++;

    state->_primaryIndex->removeKey(trx, oldLocalDocumentId, VPackSlice(vpack),
                                    state->_mdr, Index::OperationMode::normal);

    physical->removeLocalDocumentId(oldLocalDocumentId, true);
  }

  return TRI_ERROR_NO_ERROR;
}

/// @brief iterator for open
bool MMFilesCollection::OpenIterator(MMFilesMarker const* marker,
                                     OpenIteratorState* data, MMFilesDatafile* datafile) {
  TRI_voc_tick_t const tick = marker->getTick();
  MMFilesMarkerType const type = marker->getType();

  int res;

  if (type == TRI_DF_MARKER_VPACK_DOCUMENT) {
    res = OpenIteratorHandleDocumentMarker(marker, datafile, data);

    if (datafile->_dataMin == 0) {
      datafile->_dataMin = tick;
    }

    if (tick > datafile->_dataMax) {
      datafile->_dataMax = tick;
    }
  } else if (type == TRI_DF_MARKER_VPACK_REMOVE) {
    res = OpenIteratorHandleDeletionMarker(marker, datafile, data);
  } else {
    if (type == TRI_DF_MARKER_HEADER) {
      // ensure there is a datafile info entry for each datafile of the
      // collection
      FindDatafileStats(data, datafile->fid());
    }

    LOG_TOPIC("8f99d", TRACE, arangodb::Logger::ENGINES)
"skipping marker type " << TRI_NameMarkerDatafile(marker); res = TRI_ERROR_NO_ERROR; } if (datafile->_tickMin == 0) { datafile->_tickMin = tick; } if (tick > datafile->_tickMax) { datafile->_tickMax = tick; } auto physical = data->_collection->getPhysical(); auto mmfiles = static_cast(physical); TRI_ASSERT(mmfiles); if (tick > mmfiles->maxTick()) { if (type != TRI_DF_MARKER_HEADER && type != TRI_DF_MARKER_FOOTER && type != TRI_DF_MARKER_COL_HEADER && type != TRI_DF_MARKER_PROLOGUE) { mmfiles->maxTick(tick); } } return (res == TRI_ERROR_NO_ERROR); } MMFilesCollection::MMFilesCollection(LogicalCollection& collection, VPackSlice const& info) : PhysicalCollection(collection, info), _ditches(&collection), _initialCount(0), _lastRevision(0), _uncollectedLogfileEntries(0), _nextCompactionStartIndex(0), _lastCompactionStatus(nullptr), _lastCompactionStamp(0.0), _journalSize(Helper::readNumericValue( info, "maximalSize", // Backwards compatibility. Agency uses // journalSize. paramters.json uses maximalSize Helper::readNumericValue(info, "journalSize", TRI_JOURNAL_DEFAULT_SIZE))), _isVolatile( arangodb::basics::VelocyPackHelper::readBooleanValue(info, "isVolatile", false)), _persistentIndexes(0), _primaryIndex(nullptr), _indexBuckets(Helper::readNumericValue(info, "indexBuckets", defaultIndexBuckets)), _useSecondaryIndexes(true), _doCompact(Helper::readBooleanValue(info, "doCompact", true)), _maxTick(0) { TRI_ASSERT(!ServerState::instance()->isCoordinator()); if (_isVolatile && _logicalCollection.waitForSync()) { // Illegal collection configuration THROW_ARANGO_EXCEPTION_MESSAGE( TRI_ERROR_BAD_PARAMETER, "volatile collections do not support the waitForSync option"); } if (_journalSize < TRI_JOURNAL_MINIMAL_SIZE) { THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_BAD_PARAMETER, ".journalSize too small"); } auto pathSlice = info.get("path"); if (pathSlice.isString()) { _path = pathSlice.copyString(); } setCompactionStatus("compaction not yet started"); } MMFilesCollection::MMFilesCollection(LogicalCollection& logical, PhysicalCollection const* physical) : PhysicalCollection(logical, VPackSlice::emptyObjectSlice()), _ditches(&logical), _isVolatile(static_cast(physical)->isVolatile()) { MMFilesCollection const& mmfiles = *static_cast(physical); _persistentIndexes = mmfiles._persistentIndexes; _useSecondaryIndexes = mmfiles._useSecondaryIndexes; _initialCount = mmfiles._initialCount; _lastRevision = mmfiles._lastRevision; _nextCompactionStartIndex = mmfiles._nextCompactionStartIndex; _lastCompactionStatus = mmfiles._lastCompactionStatus; _lastCompactionStamp = mmfiles._lastCompactionStamp; _journalSize = mmfiles._journalSize; _indexBuckets = mmfiles._indexBuckets; _primaryIndex = mmfiles._primaryIndex; _path = mmfiles._path; _doCompact = mmfiles._doCompact; _maxTick = mmfiles._maxTick; TRI_ASSERT(!ServerState::instance()->isCoordinator()); setCompactionStatus("compaction not yet started"); // not copied // _datafiles; // all datafiles // _journals; // all journals // _compactors; // all compactor files // _uncollectedLogfileEntries = mmfiles.uncollectedLogfileEntries; //TODO // FIXME // _datafileStatistics; // _revisionsCache; } MMFilesCollection::~MMFilesCollection() {} TRI_voc_rid_t MMFilesCollection::revision(arangodb::transaction::Methods*) const { return _lastRevision; } TRI_voc_rid_t MMFilesCollection::revision() const { return _lastRevision; } /// @brief update statistics for a collection void MMFilesCollection::setRevision(TRI_voc_rid_t revision, bool force) { if (revision > 0 && (force || revision > 
    _lastRevision = revision;
  }
}

size_t MMFilesCollection::journalSize() const { return _journalSize; }

bool MMFilesCollection::isVolatile() const { return _isVolatile; }

/// @brief closes an open collection
int MMFilesCollection::close() {
  LOG_TOPIC("2408b", DEBUG, Logger::ENGINES)
      << "closing '" << _logicalCollection.name() << "'";

  if (!_logicalCollection.deleted() && !_logicalCollection.vocbase().isDropped()) {
    auto primIdx = primaryIndex();
    auto idxSize = primIdx->size();

    if (_initialCount != static_cast<int64_t>(idxSize)) {
      _initialCount = idxSize;

      // save new "count" value
      StorageEngine* engine = EngineSelectorFeature::ENGINE;
      bool const doSync =
          application_features::ApplicationServer::getFeature<DatabaseFeature>("Database")
              ->forceSyncProperties();

      engine->changeCollection(_logicalCollection.vocbase(), _logicalCollection, doSync);
    }
  }

  {
    // We also have to unload the indexes.
    WRITE_LOCKER(writeLocker, _dataLock);
    READ_LOCKER_EVENTUAL(guard, _indexesLock);

    for (auto& idx : _indexes) {
      idx->unload();
    }
  }

  // wait until ditches have been processed fully
  int tries = 0;

  while (true) {
    bool hasDocumentDitch = _ditches.contains(MMFilesDitch::TRI_DITCH_DOCUMENT);
    bool hasOtherDitch = (_ditches.contains(MMFilesDitch::TRI_DITCH_DATAFILE_DROP) ||
                          _ditches.contains(MMFilesDitch::TRI_DITCH_DATAFILE_RENAME) ||
                          _ditches.contains(MMFilesDitch::TRI_DITCH_REPLICATION) ||
                          _ditches.contains(MMFilesDitch::TRI_DITCH_COMPACTION));

    if (!hasDocumentDitch && !hasOtherDitch) {
      // we can abort now
      break;
    }

    // give the cleanup thread more time to clean up
    {
      WRITE_UNLOCKER(unlocker, _logicalCollection.lock());
      std::this_thread::sleep_for(std::chrono::milliseconds(20));
    }

    if ((++tries % 50) == 0) {
      if (hasDocumentDitch) {
        LOG_TOPIC("51984", INFO, Logger::ENGINES)
            << "waiting for cleanup of document ditches for collection '"
            << _logicalCollection.name()
has other: " << hasOtherDitch; } else { LOG_TOPIC("16847", INFO, Logger::ENGINES) << "waiting for cleanup of ditches for collection '" << _logicalCollection.name() << "'"; } } if (tries == 10 * 50 && !hasOtherDitch) { // give it up after 10 seconds - this will close the collection at all cost break; } } { WRITE_LOCKER(writeLocker, _filesLock); // close compactor files closeDatafiles(_compactors); for (auto& it : _compactors) { delete it; } _compactors.clear(); // close journal files closeDatafiles(_journals); for (auto& it : _journals) { delete it; } _journals.clear(); // close datafiles closeDatafiles(_datafiles); for (auto& it : _datafiles) { delete it; } _datafiles.clear(); } _lastRevision = 0; // clear revisions lookup table _revisionsCache.clear(); return TRI_ERROR_NO_ERROR; } /// @brief seal a datafile int MMFilesCollection::sealDatafile(MMFilesDatafile* datafile, bool isCompactor) { int res = datafile->seal(); if (res != TRI_ERROR_NO_ERROR) { LOG_TOPIC("73f8b", ERR, arangodb::Logger::DATAFILES) << "failed to seal journal '" << datafile->getName() << "': " << TRI_errno_string(res); return res; } if (!isCompactor && datafile->isPhysical()) { // rename the file std::string dname("datafile-" + std::to_string(datafile->fid()) + ".db"); std::string filename = arangodb::basics::FileUtils::buildFilename(path(), dname); LOG_TOPIC("cfb3f", TRACE, arangodb::Logger::DATAFILES) << "closing and renaming journal file '" << datafile->getName() << "'"; res = datafile->rename(filename); if (res == TRI_ERROR_NO_ERROR) { LOG_TOPIC("9d9f5", TRACE, arangodb::Logger::DATAFILES) << "closed and renamed journal file '" << datafile->getName() << "'"; } else { LOG_TOPIC("24db4", ERR, arangodb::Logger::DATAFILES) << "failed to rename datafile '" << datafile->getName() << "' to '" << filename << "': " << TRI_errno_string(res); } } return res; } /// @brief set the initial datafiles for the collection void MMFilesCollection::setInitialFiles(std::vector&& datafiles, std::vector&& journals, std::vector&& compactors) { WRITE_LOCKER(writeLocker, _filesLock); _datafiles = std::move(datafiles); _journals = std::move(journals); _compactors = std::move(compactors); TRI_ASSERT(_journals.size() <= 1); } /// @brief rotate the active journal - will do nothing if there is no journal int MMFilesCollection::rotateActiveJournal() { WRITE_LOCKER(writeLocker, _filesLock); // note: only journals need to be handled here as the journal is the // only place that's ever written to. if a journal is full, it will have been // sealed and synced already if (_journals.empty()) { return TRI_ERROR_ARANGO_NO_JOURNAL; } if (_journals.size() > 1) { // we should never have more than a single journal at a time return TRI_ERROR_INTERNAL; } MMFilesDatafile* datafile = _journals.back(); TRI_ASSERT(datafile != nullptr); TRI_IF_FAILURE("CreateMultipleJournals") { // create an additional journal now, without sealing and renaming the old // one! 
    _datafiles.emplace_back(datafile);
    _journals.pop_back();
    return TRI_ERROR_NO_ERROR;
  }

  // make sure we have enough room in the target vector before we go on
  _datafiles.reserve(_datafiles.size() + 1);

  int res = sealDatafile(datafile, false);
  if (res != TRI_ERROR_NO_ERROR) {
    return res;
  }

  // shouldn't throw as we reserved enough space before
  _datafiles.emplace_back(datafile);

  TRI_ASSERT(!_journals.empty());
  TRI_ASSERT(_journals.back() == datafile);
  _journals.pop_back();
  TRI_ASSERT(_journals.empty());

  return res;
}

/// @brief sync the active journal - will do nothing if there is no journal
/// or if the journal is volatile
int MMFilesCollection::syncActiveJournal() {
  WRITE_LOCKER(writeLocker, _filesLock);

  // note: only journals need to be handled here as the journal is the
  // only place that's ever written to. if a journal is full, it will have been
  // sealed and synced already
  if (_journals.empty()) {
    // nothing to do
    return TRI_ERROR_NO_ERROR;
  }

  TRI_ASSERT(_journals.size() == 1);

  MMFilesDatafile* datafile = _journals.back();
  TRI_ASSERT(datafile != nullptr);

  return datafile->sync();
}

/// @brief reserve space in the current journal. if no journal exists or the
/// current journal cannot provide enough space, close the old journal and
/// create a new one
int MMFilesCollection::reserveJournalSpace(TRI_voc_tick_t tick, uint32_t size,
                                           char*& resultPosition,
                                           MMFilesDatafile*& resultDatafile) {
  // reset results
  resultPosition = nullptr;
  resultDatafile = nullptr;

  // start with configured journal size
  uint32_t targetSize = static_cast<uint32_t>(_journalSize);

  // make sure that the document fits
  while (targetSize - 256 < size) {
    targetSize *= 2;
  }

  WRITE_LOCKER(writeLocker, _filesLock);
  TRI_ASSERT(_journals.size() <= 1);

  while (true) {
    // no need to go on if the collection is already deleted
    if (TRI_VOC_COL_STATUS_DELETED == _logicalCollection.status()) {
      return TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND;
    }

    MMFilesDatafile* datafile = nullptr;

    if (_journals.empty()) {
      // create enough room in the journals vector
      _journals.reserve(_journals.size() + 1);

      try {
        std::unique_ptr<MMFilesDatafile> df(createDatafile(tick, targetSize, false));

        // shouldn't throw as we reserved enough space before
        _journals.emplace_back(df.get());
        df.release();
        TRI_ASSERT(_journals.size() == 1);
      } catch (basics::Exception const& ex) {
        LOG_TOPIC("dfb34", ERR, Logger::COLLECTOR)
            << "cannot select journal: " << ex.what();
        return ex.code();
      } catch (std::exception const& ex) {
        LOG_TOPIC("79da1", ERR, Logger::COLLECTOR)
            << "cannot select journal: " << ex.what();
        return TRI_ERROR_INTERNAL;
      } catch (...) {
        LOG_TOPIC("167e7", ERR, Logger::COLLECTOR)
            << "cannot select journal: unknown exception";
        return TRI_ERROR_INTERNAL;
      }
    }

    // select datafile
    TRI_ASSERT(!_journals.empty());
    TRI_ASSERT(_journals.size() == 1);

    datafile = _journals.back();
    TRI_ASSERT(datafile != nullptr);

    // try to reserve space in the datafile
    MMFilesMarker* position = nullptr;
    int res = datafile->reserveElement(size, &position, targetSize);

    // found a datafile with enough space left
    if (res == TRI_ERROR_NO_ERROR) {
      // set result
      resultPosition = reinterpret_cast<char*>(position);
      resultDatafile = datafile;
      return TRI_ERROR_NO_ERROR;
    }

    if (res != TRI_ERROR_ARANGO_DATAFILE_FULL) {
      // some other error
      LOG_TOPIC("037f8", ERR, Logger::COLLECTOR)
          << "cannot select journal: '" << TRI_last_error() << "'";
      return res;
    }

    // TRI_ERROR_ARANGO_DATAFILE_FULL...
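    // Worked example of the rotation below (numbers assumed for
    // illustration): with a configured journal size of 32 MB, a 100 MB
    // document makes the doubling loop above grow targetSize to 128 MB
    // (the loop keeps 256 bytes of headroom for marker overhead). The full
    // journal is then sealed, moved into _datafiles, and the next loop
    // iteration creates a fresh journal of targetSize.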
    // journal is full, close it and sync
    LOG_TOPIC("22602", DEBUG, Logger::COLLECTOR)
        << "closing full journal '" << datafile->getName() << "'";

    // make sure we have enough room in the target vector before we go on
    _datafiles.reserve(_datafiles.size() + 1);

    res = sealDatafile(datafile, false);

    // move journal into _datafiles vector
    // this shouldn't fail, as we have reserved space before already
    _datafiles.emplace_back(datafile);

    // and finally erase it from _journals vector
    TRI_ASSERT(!_journals.empty());
    TRI_ASSERT(_journals.back() == datafile);
    _journals.pop_back();
    TRI_ASSERT(_journals.empty());

    if (res != TRI_ERROR_NO_ERROR) {
      // an error occurred, we must stop here
      return res;
    }
  }  // otherwise, next iteration!

  return TRI_ERROR_ARANGO_NO_JOURNAL;
}

/// @brief create compactor file
MMFilesDatafile* MMFilesCollection::createCompactor(TRI_voc_fid_t fid, size_t maximalSize) {
  WRITE_LOCKER(writeLocker, _filesLock);

  TRI_ASSERT(_compactors.empty());
  // reserve enough space for the later addition
  _compactors.reserve(_compactors.size() + 1);

  std::unique_ptr<MMFilesDatafile> compactor(
      createDatafile(fid, static_cast<uint32_t>(maximalSize), true));

  // should not throw, as we've reserved enough space before
  _compactors.emplace_back(compactor.get());
  TRI_ASSERT(_compactors.size() == 1);

  return compactor.release();
}

/// @brief close an existing compactor
int MMFilesCollection::closeCompactor(MMFilesDatafile* datafile) {
  WRITE_LOCKER(writeLocker, _filesLock);

  if (_compactors.size() != 1) {
    return TRI_ERROR_ARANGO_NO_JOURNAL;
  }

  MMFilesDatafile* compactor = _compactors[0];

  if (datafile != compactor) {
    // wrong compactor file specified... should not happen
    return TRI_ERROR_INTERNAL;
  }

  return sealDatafile(datafile, true);
}

/// @brief replace a datafile with a compactor
int MMFilesCollection::replaceDatafileWithCompactor(MMFilesDatafile* datafile,
                                                    MMFilesDatafile* compactor) {
  TRI_ASSERT(datafile != nullptr);
  TRI_ASSERT(compactor != nullptr);

  WRITE_LOCKER(writeLocker, _filesLock);

  TRI_ASSERT(!_compactors.empty());

  for (size_t i = 0; i < _datafiles.size(); ++i) {
    if (_datafiles[i]->fid() == datafile->fid()) {
      // found!
      // now put the compactor in place of the datafile
      _datafiles[i] = compactor;

      // remove the compactor file from the list of compactors
      TRI_ASSERT(_compactors[0] != nullptr);
      TRI_ASSERT(_compactors[0]->fid() == compactor->fid());

      _compactors.erase(_compactors.begin());
      TRI_ASSERT(_compactors.empty());

      return TRI_ERROR_NO_ERROR;
    }
  }

  return TRI_ERROR_INTERNAL;
}

/// @brief creates a datafile
MMFilesDatafile* MMFilesCollection::createDatafile(TRI_voc_fid_t fid,
                                                   uint32_t journalSize, bool isCompactor) {
  TRI_ASSERT(fid > 0);

  // create an entry for the new datafile
  try {
    _datafileStatistics.create(fid);
  } catch (basics::Exception const& ex) {
    THROW_ARANGO_EXCEPTION_MESSAGE(ex.code(), ex.what());
  } catch (std::exception const& ex) {
    // rethrow but do not change error code
    THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, ex.what());
  } catch (...) {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY);
  }

  std::unique_ptr<MMFilesDatafile> datafile;

  if (isVolatile()) {
    // in-memory collection
    datafile.reset(MMFilesDatafile::create(StaticStrings::Empty, fid, journalSize, true));
  } else {
    // construct a suitable filename (which may be temporary at the beginning)
    std::string jname;
    if (isCompactor) {
      jname = "compaction-";
    } else {
      jname = "temp-";
    }

    jname.append(std::to_string(fid) + ".db");
    std::string filename = arangodb::basics::FileUtils::buildFilename(path(), jname);

    TRI_IF_FAILURE("CreateJournalDocumentCollection") {
      // simulate disk full
      THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_FILESYSTEM_FULL);
    }

    // remove an existing temporary file first
    if (TRI_ExistsFile(filename.c_str())) {
      // remove an existing file first
      TRI_UnlinkFile(filename.c_str());
    }

    datafile.reset(MMFilesDatafile::create(filename, fid, journalSize, true));
  }

  if (datafile == nullptr) {
    if (TRI_errno() == TRI_ERROR_OUT_OF_MEMORY_MMAP) {
      THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY_MMAP);
    }
    THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_NO_JOURNAL);
  }

  // datafile is there now
  TRI_ASSERT(datafile != nullptr);

  if (isCompactor) {
    LOG_TOPIC("dff57", TRACE, arangodb::Logger::DATAFILES)
        << "created new compactor '" << datafile->getName() << "'";
  } else {
    LOG_TOPIC("3eda1", TRACE, arangodb::Logger::DATAFILES)
        << "created new journal '" << datafile->getName() << "'";
  }

  // create a collection header, still in the temporary file
  MMFilesMarker* position;
  int res = datafile->reserveElement(sizeof(MMFilesCollectionHeaderMarker), &position, journalSize);

  TRI_IF_FAILURE("CreateJournalDocumentCollectionReserve1") {
    res = TRI_ERROR_DEBUG;
  }

  if (res != TRI_ERROR_NO_ERROR) {
    LOG_TOPIC("b905e", ERR, arangodb::Logger::DATAFILES)
        << "cannot create collection header in file '" << datafile->getName()
        << "': " << TRI_errno_string(res);

    // close the journal and remove it
    std::string temp(datafile->getName());
    datafile.reset();
    TRI_UnlinkFile(temp.c_str());

    THROW_ARANGO_EXCEPTION(res);
  }

  MMFilesCollectionHeaderMarker cm;
  MMFilesDatafileHelper::InitMarker(reinterpret_cast<MMFilesMarker*>(&cm),
                                    TRI_DF_MARKER_COL_HEADER,
                                    sizeof(MMFilesCollectionHeaderMarker),
                                    static_cast<TRI_voc_tick_t>(fid));
  cm._cid = _logicalCollection.id();

  res = datafile->writeCrcElement(position, &cm.base);

  TRI_IF_FAILURE("CreateJournalDocumentCollectionReserve2") {
    res = TRI_ERROR_DEBUG;
  }

  if (res != TRI_ERROR_NO_ERROR) {
    res = datafile->_lastError;
    LOG_TOPIC("3ef4d", ERR, arangodb::Logger::DATAFILES)
        << "cannot create collection header in file '" << datafile->getName()
        << "': " << TRI_last_error();

    // close the datafile and remove it
    std::string temp(datafile->getName());
    datafile.reset();
    TRI_UnlinkFile(temp.c_str());

    THROW_ARANGO_EXCEPTION(res);
  }

  TRI_ASSERT(fid == datafile->fid());

  // if a physical file, we can rename it from the temporary name to the correct
  // name
  if (!isCompactor && datafile->isPhysical()) {
    // and use the correct name
    std::string oldName = datafile->getName();
    std::string jname("journal-" + std::to_string(datafile->fid()) + ".db");
    std::string filename = arangodb::basics::FileUtils::buildFilename(path(), jname);

    LOG_TOPIC("3e87e", TRACE, arangodb::Logger::DATAFILES)
        << "renaming journal '" << datafile->getName() << "' to '" << filename << "'";

    int res = datafile->rename(filename);

    if (res != TRI_ERROR_NO_ERROR) {
      LOG_TOPIC("4ddf6", ERR, arangodb::Logger::DATAFILES)
          << "failed to rename journal '" << datafile->getName() << "' to '"
          << filename << "': " << TRI_errno_string(res);

      std::string temp(datafile->getName());
      datafile.reset();
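      // note: resetting the unique_ptr closes the datafile first; its name
      // was captured into 'temp' above so the on-disk temporary file can
      // still be unlinked afterwards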
      TRI_UnlinkFile(temp.c_str());

      THROW_ARANGO_EXCEPTION(res);
    }

    LOG_TOPIC("cf30c", TRACE, arangodb::Logger::DATAFILES)
        << "renamed journal from '" << oldName << "' to '" << filename << "'";
  }

  return datafile.release();
}

/// @brief remove a compactor file from the list of compactors
bool MMFilesCollection::removeCompactor(MMFilesDatafile* df) {
  TRI_ASSERT(df != nullptr);

  WRITE_LOCKER(writeLocker, _filesLock);

  for (auto it = _compactors.begin(); it != _compactors.end(); ++it) {
    if ((*it) == df) {
      // and finally remove the file from the _compactors vector
      _compactors.erase(it);
      return true;
    }
  }

  // not found
  return false;
}

/// @brief remove a datafile from the list of datafiles
bool MMFilesCollection::removeDatafile(MMFilesDatafile* df) {
  TRI_ASSERT(df != nullptr);

  WRITE_LOCKER(writeLocker, _filesLock);

  for (auto it = _datafiles.begin(); it != _datafiles.end(); ++it) {
    if ((*it) == df) {
      // and finally remove the file from the _datafiles vector
      _datafiles.erase(it);
      return true;
    }
  }

  // not found
  return false;
}

/// @brief iterates over a collection
bool MMFilesCollection::iterateDatafiles(
    std::function<bool(MMFilesMarker const*, MMFilesDatafile*)> const& cb) {
  READ_LOCKER(readLocker, _filesLock);

  if (!iterateDatafilesVector(_datafiles, cb) ||
      !iterateDatafilesVector(_compactors, cb) ||
      !iterateDatafilesVector(_journals, cb)) {
    return false;
  }
  return true;
}

/// @brief iterate over all datafiles in a vector
/// the caller must hold the _filesLock
bool MMFilesCollection::iterateDatafilesVector(
    std::vector<MMFilesDatafile*> const& files,
    std::function<bool(MMFilesMarker const*, MMFilesDatafile*)> const& cb) {
  for (auto const& datafile : files) {
    datafile->sequentialAccess();
    datafile->willNeed();

    if (!TRI_IterateDatafile(datafile, cb)) {
      return false;
    }

    if (datafile->isPhysical() && datafile->isSealed()) {
      datafile->randomAccess();
    }
  }

  return true;
}

/// @brief closes the datafiles passed in the vector
bool MMFilesCollection::closeDatafiles(std::vector<MMFilesDatafile*> const& files) {
  bool result = true;

  for (auto const& datafile : files) {
    TRI_ASSERT(datafile != nullptr);
    if (datafile->state() == TRI_DF_STATE_CLOSED) {
      continue;
    }

    int res = datafile->close();

    if (res != TRI_ERROR_NO_ERROR) {
      result = false;
    }
  }

  return result;
}

/// @brief export properties
void MMFilesCollection::getPropertiesVPack(velocypack::Builder& result) const {
  TRI_ASSERT(result.isOpenObject());
  result.add("count", VPackValue(_initialCount));
  result.add("doCompact", VPackValue(_doCompact));
  result.add("indexBuckets", VPackValue(_indexBuckets));
  result.add("isVolatile", VPackValue(_isVolatile));
  result.add("journalSize", VPackValue(_journalSize));
  result.add("path", VPackValue(_path));
  TRI_ASSERT(result.isOpenObject());
}

void MMFilesCollection::figuresSpecific(std::shared_ptr<VPackBuilder>& builder) {
  // fills in compaction status
  char const* lastCompactionStatus = "-";
  char lastCompactionStampString[21];
  lastCompactionStampString[0] = '-';
  lastCompactionStampString[1] = '\0';

  double lastCompactionStamp;
  {
    MUTEX_LOCKER(mutexLocker, _compactionStatusLock);
    lastCompactionStatus = _lastCompactionStatus;
    lastCompactionStamp = _lastCompactionStamp;
  }

  if (lastCompactionStatus != nullptr) {
    if (lastCompactionStamp == 0.0) {
      lastCompactionStamp = TRI_microtime();
    }

    struct tm tb;
    time_t tt = static_cast<time_t>(lastCompactionStamp);
    TRI_gmtime(tt, &tb);

    strftime(&lastCompactionStampString[0], sizeof(lastCompactionStampString),
             "%Y-%m-%dT%H:%M:%SZ", &tb);
  }

  builder->add("documentReferences",
               VPackValue(_ditches.numMMFilesDocumentMMFilesDitches()));

  char const* waitingForDitch = _ditches.head();
  builder->add("waitingFor",
"-" : waitingForDitch)); // add datafile statistics MMFilesDatafileStatisticsContainer dfi = _datafileStatistics.all(); MMFilesDatafileStatistics::CompactionStats stats = _datafileStatistics.getStats(); builder->add("alive", VPackValue(VPackValueType::Object)); { builder->add("count", VPackValue(dfi.numberAlive)); builder->add("size", VPackValue(dfi.sizeAlive)); builder->close(); // alive } builder->add("dead", VPackValue(VPackValueType::Object)); { builder->add("count", VPackValue(dfi.numberDead)); builder->add("size", VPackValue(dfi.sizeDead)); builder->add("deletion", VPackValue(dfi.numberDeletions)); builder->close(); // dead } builder->add("compactionStatus", VPackValue(VPackValueType::Object)); { builder->add("message", VPackValue(lastCompactionStatus)); builder->add("time", VPackValue(&lastCompactionStampString[0])); builder->add("count", VPackValue(stats._compactionCount)); builder->add("filesCombined", VPackValue(stats._filesCombined)); builder->add("bytesRead", VPackValue(stats._compactionBytesRead)); builder->add("bytesWritten", VPackValue(stats._compactionBytesWritten)); builder->close(); // compactionStatus } // add file statistics READ_LOCKER(readLocker, _filesLock); size_t sizeDatafiles = 0; builder->add("datafiles", VPackValue(VPackValueType::Object)); for (auto const& it : _datafiles) { sizeDatafiles += it->initSize(); } builder->add("count", VPackValue(_datafiles.size())); builder->add("fileSize", VPackValue(sizeDatafiles)); builder->close(); // datafiles size_t sizeJournals = 0; for (auto const& it : _journals) { sizeJournals += it->initSize(); } builder->add("journals", VPackValue(VPackValueType::Object)); builder->add("count", VPackValue(_journals.size())); builder->add("fileSize", VPackValue(sizeJournals)); builder->close(); // journals size_t sizeCompactors = 0; for (auto const& it : _compactors) { sizeCompactors += it->initSize(); } builder->add("compactors", VPackValue(VPackValueType::Object)); builder->add("count", VPackValue(_compactors.size())); builder->add("fileSize", VPackValue(sizeCompactors)); builder->close(); // compactors builder->add("revisions", VPackValue(VPackValueType::Object)); builder->add("count", VPackValue(_revisionsCache.size())); builder->add("size", VPackValue(_revisionsCache.memoryUsage())); builder->close(); // revisions builder->add("lastTick", VPackValue(_maxTick)); builder->add("uncollectedLogfileEntries", VPackValue(uncollectedLogfileEntries())); } /// @brief iterate over a vector of datafiles and pick those with a specific /// data range std::vector MMFilesCollection::datafilesInRange( TRI_voc_tick_t dataMin, TRI_voc_tick_t dataMax) { std::vector result; auto apply = [&dataMin, &dataMax, &result](MMFilesDatafile const* datafile, bool isJournal) { DatafileDescription entry = {datafile, datafile->_dataMin, datafile->_dataMax, datafile->_tickMax, isJournal}; LOG_TOPIC("2e4d2", TRACE, arangodb::Logger::DATAFILES) << "checking datafile " << datafile->fid() << " with data range " << datafile->_dataMin << " - " << datafile->_dataMax << ", tick max: " << datafile->_tickMax; LOG_TOPIC("730fa", TRACE, arangodb::Logger::REPLICATION) << "checking datafile " << datafile->fid() << " with data range " << datafile->_dataMin << " - " << datafile->_dataMax << ", tick max: " << datafile->_tickMax; if (datafile->_dataMin == 0 || datafile->_dataMax == 0) { // datafile doesn't have any data return; } TRI_ASSERT(datafile->_tickMin <= datafile->_tickMax); TRI_ASSERT(datafile->_dataMin <= datafile->_dataMax); if (dataMax < datafile->_dataMin) { // datafile is 
      // datafile is newer than requested range
      return;
    }

    if (dataMin > datafile->_dataMax) {
      // datafile is older than requested range
      return;
    }

    result.emplace_back(entry);
  };

  READ_LOCKER(readLocker, _filesLock);

  for (auto& it : _datafiles) {
    apply(it, false);
  }
  for (auto& it : _journals) {
    apply(it, true);
  }

  return result;
}

bool MMFilesCollection::applyForTickRange(
    TRI_voc_tick_t dataMin, TRI_voc_tick_t dataMax,
    std::function<bool(TRI_voc_tick_t, MMFilesMarker const*)> const& callback) {
  LOG_TOPIC("9ec80", TRACE, arangodb::Logger::DATAFILES)
      << "getting datafiles in data range " << dataMin << " - " << dataMax;

  std::vector<DatafileDescription> datafiles = datafilesInRange(dataMin, dataMax);
  // now we have a list of datafiles...

  size_t const n = datafiles.size();

  LOG_TOPIC("b72ce", TRACE, arangodb::Logger::REPLICATION)
      << "getting datafiles in data range " << dataMin << " - " << dataMax
      << " produced " << n << " datafile(s)";

  for (size_t i = 0; i < n; ++i) {
    auto const& e = datafiles[i];
    MMFilesDatafile const* datafile = e._data;

    // we are reading from a journal that might be modified in parallel
    // so we must read-lock it
    CONDITIONAL_READ_LOCKER(readLocker, _filesLock, e._isJournal);

    if (!e._isJournal) {
      TRI_ASSERT(datafile->isSealed());
    }

    char const* ptr = datafile->_data;
    char const* end = ptr + datafile->currentSize();

    while (ptr < end) {
      auto const* marker = reinterpret_cast<MMFilesMarker const*>(ptr);

      if (marker->getSize() == 0) {
        // end of datafile
        break;
      }

      MMFilesMarkerType type = marker->getType();

      if (type <= TRI_DF_MARKER_MIN) {
        break;
      }

      ptr += MMFilesDatafileHelper::AlignedMarkerSize(marker);

      if (type == TRI_DF_MARKER_BLANK) {
        // fully ignore these marker types. they don't need to be replicated,
        // but we also cannot stop iteration if we find one of these
        continue;
      }

      // get the marker's tick and check whether we should include it
      TRI_voc_tick_t foundTick = marker->getTick();

      if (foundTick <= dataMin) {
        // marker too old
        continue;
      }

      if (foundTick > dataMax) {
        // marker too new
        return false;  // hasMore = false
      }

      if (type != TRI_DF_MARKER_VPACK_DOCUMENT && type != TRI_DF_MARKER_VPACK_REMOVE) {
        // found a non-data marker...

        // check if we can abort searching
        if (foundTick >= dataMax || (foundTick > e._tickMax && i == (n - 1))) {
          // fetched the last available marker
          return false;  // hasMore = false
        }

        continue;
      }

      // note the last tick we processed
      bool doAbort = false;
      if (!callback(foundTick, marker)) {
        doAbort = true;
      }

      if (foundTick >= dataMax || (foundTick >= e._tickMax && i == (n - 1))) {
        // fetched the last available marker
        return false;  // hasMore = false
      }

      if (doAbort) {
        return true;  // hasMore = true
      }
    }  // next marker in datafile
  }    // next datafile

  return false;  // hasMore = false
}

// @brief Return the number of documents in this collection
uint64_t MMFilesCollection::numberDocuments(transaction::Methods* trx) const {
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  return primaryIndex()->size();
}

void MMFilesCollection::sizeHint(transaction::Methods* trx, int64_t hint) {
  if (hint <= 0) {
    return;
  }
  primaryIndex()->resize(trx, static_cast<size_t>(hint * 1.1));
}

/// @brief report extra memory used by indexes etc.
size_t MMFilesCollection::memory() const {
  return 0;  // TODO
}

/// @brief disallow compaction of the collection
void MMFilesCollection::preventCompaction() { _compactionLock.readLock(); }

/// @brief try disallowing compaction of the collection
bool MMFilesCollection::tryPreventCompaction() {
  return _compactionLock.tryReadLock();
}

/// @brief re-allow compaction of the collection
void MMFilesCollection::allowCompaction() { _compactionLock.unlock(); }

/// @brief exclusively lock the collection for compaction
void MMFilesCollection::lockForCompaction() { _compactionLock.writeLock(); }

/// @brief try to exclusively lock the collection for compaction
bool MMFilesCollection::tryLockForCompaction() {
  return _compactionLock.tryWriteLock();
}

/// @brief signal that compaction is finished
void MMFilesCollection::finishCompaction() { _compactionLock.unlock(); }

/// @brief iterator for index open
bool MMFilesCollection::openIndex(velocypack::Slice const& description,
                                  transaction::Methods& trx) {
  // VelocyPack must be an index description
  if (!description.isObject()) {
    return false;
  }

  bool unused = false;
  auto idx = createIndex(trx, description, /*restore*/ false, unused);

  if (idx == nullptr) {
    // error was already printed if we get here
    return false;
  }

  return true;
}

/// @brief initializes an index with a set of existing documents
void MMFilesCollection::fillIndex(
    std::shared_ptr<basics::LocalTaskQueue> queue, transaction::Methods& trx, Index* idx,
    std::shared_ptr<std::vector<std::pair<LocalDocumentId, VPackSlice>>> documents,
    bool skipPersistent) {
  TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX);
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  if (!useSecondaryIndexes()) {
    return;
  }

  MMFilesIndex* midx = static_cast<MMFilesIndex*>(idx);
  if (midx->isPersistent() && skipPersistent) {
    return;
  }

  try {
    // move task into thread pool
    std::shared_ptr<::MMFilesIndexFillerTask> worker;
    worker.reset(new ::MMFilesIndexFillerTask(queue, trx, midx, documents));
    queue->enqueue(worker);
  } catch (...) {
    // set error code
    queue->setStatus(TRI_ERROR_INTERNAL);
  }
}

uint32_t MMFilesCollection::indexBuckets() const { return _indexBuckets; }

int MMFilesCollection::fillAllIndexes(transaction::Methods& trx) {
  READ_LOCKER(guard, _indexesLock);
  return fillIndexes(trx, _indexes);
}

/// @brief Fill the given list of Indexes
int MMFilesCollection::fillIndexes(transaction::Methods& trx,
                                   std::vector<std::shared_ptr<Index>> const& indexes,
                                   bool skipPersistent) {
  // distribute the work to index threads plus this thread
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  size_t const n = indexes.size();

  if (n == 0 || (n == 1 && indexes[0].get()->type() ==
                               Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX)) {
    return TRI_ERROR_NO_ERROR;
  }

  bool rolledBack = false;
  auto rollbackAll = [&]() -> void {
    for (size_t i = 0; i < n; i++) {
      auto idx = indexes[i].get();
      if (idx->type() == Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX) {
        continue;
      }
      MMFilesIndex* midx = static_cast<MMFilesIndex*>(idx);
      if (midx->isPersistent()) {
        continue;
      }
      idx->unload();  // TODO: check is this safe?
                      // truncate not necessarily feasible
    }
  };

  TRI_ASSERT(n > 0);

  PerformanceLogScope logScope(
      std::string("fill-indexes-document-collection { collection: ") +
      _logicalCollection.vocbase().name() + "/" + _logicalCollection.name() +
      " }, indexes: " + std::to_string(n - 1));

  auto poster = [](std::function<void()> fn) -> void {
    SchedulerFeature::SCHEDULER->queue(RequestLane::INTERNAL_LOW, fn);
  };

  auto queue = std::make_shared<basics::LocalTaskQueue>(poster);

  try {
    TRI_ASSERT(!ServerState::instance()->isCoordinator());

    // give the index a size hint
    auto primaryIdx = primaryIndex();
    auto nrUsed = primaryIdx->size();

    for (size_t i = 0; i < n; i++) {
      auto idx = indexes[i];
      if (idx->type() == Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX) {
        continue;
      }
      static_cast<MMFilesIndex*>(idx.get())->sizeHint(trx, nrUsed);
    }

    // process documents a million at a time
    size_t blockSize = 1024 * 1024 * 1;

    if (nrUsed < blockSize) {
      blockSize = nrUsed;
    }
    if (blockSize == 0) {
      blockSize = 1;
    }

    auto documentsPtr = std::make_shared<std::vector<std::pair<LocalDocumentId, VPackSlice>>>();
    std::vector<std::pair<LocalDocumentId, VPackSlice>>& documents = *documentsPtr.get();
    documents.reserve(blockSize);

    auto insertInAllIndexes = [&]() -> void {
      for (size_t i = 0; i < n; ++i) {
        auto idx = indexes[i];
        if (idx->type() == Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX) {
          continue;
        }
        fillIndex(queue, trx, idx.get(), documentsPtr, skipPersistent);
      }

      queue->dispatchAndWait();

      if (queue->status() != TRI_ERROR_NO_ERROR) {
        rollbackAll();
        rolledBack = true;
      }
    };

    if (nrUsed > 0) {
      arangodb::basics::BucketPosition position;
      uint64_t total = 0;

      while (true) {
        auto element = primaryIdx->lookupSequential(&trx, position, total);

        if (!element) {
          break;
        }

        LocalDocumentId const documentId = element.localDocumentId();
        uint8_t const* vpack = lookupDocumentVPack(documentId);

        if (vpack != nullptr) {
          documents.emplace_back(std::make_pair(documentId, VPackSlice(vpack)));

          if (documents.size() == blockSize) {
            // now actually fill the secondary indexes
            insertInAllIndexes();
            if (queue->status() != TRI_ERROR_NO_ERROR) {
              break;
            }
            documents.clear();
          }
        }
      }
    }

    // process the remainder of the documents
    if (queue->status() == TRI_ERROR_NO_ERROR && !documents.empty()) {
      insertInAllIndexes();
    }
  } catch (arangodb::basics::Exception const& ex) {
    queue->setStatus(ex.code());
    LOG_TOPIC("ff805", WARN, arangodb::Logger::ENGINES)
        << "caught exception while filling indexes: " << ex.what();
  } catch (std::bad_alloc const&) {
    queue->setStatus(TRI_ERROR_OUT_OF_MEMORY);
  } catch (std::exception const& ex) {
    LOG_TOPIC("0644a", WARN, arangodb::Logger::ENGINES)
        << "caught exception while filling indexes: " << ex.what();
    queue->setStatus(TRI_ERROR_INTERNAL);
  } catch (...) {
    LOG_TOPIC("2ea3f", WARN, arangodb::Logger::ENGINES)
        << "caught unknown exception while filling indexes";
    queue->setStatus(TRI_ERROR_INTERNAL);
  }

  if (queue->status() != TRI_ERROR_NO_ERROR && !rolledBack) {
    try {
      rollbackAll();
    } catch (...) {
    }
  }

  return queue->status();
}

/// @brief opens an existing collection
int MMFilesCollection::openWorker(bool ignoreErrors) {
  auto& vocbase = _logicalCollection.vocbase();
  PerformanceLogScope logScope(std::string("open-collection { collection: ") +
                               vocbase.name() + "/" + _logicalCollection.name() + " }");

  try {
    // check for journals and datafiles
    MMFilesEngine* engine = static_cast<MMFilesEngine*>(EngineSelectorFeature::ENGINE);
    auto res = engine->openCollection(&vocbase, &_logicalCollection, ignoreErrors);

    if (res != TRI_ERROR_NO_ERROR) {
      LOG_TOPIC("06d43", DEBUG, arangodb::Logger::ENGINES)
          << "cannot open '" << path() << "', check failed";
      return res;
    }

    return TRI_ERROR_NO_ERROR;
  } catch (basics::Exception const& ex) {
    LOG_TOPIC("0f765", ERR, arangodb::Logger::ENGINES)
        << "cannot load collection parameter file '" << path() << "': " << ex.what();
    return ex.code();
  } catch (std::exception const& ex) {
    LOG_TOPIC("b32db", ERR, arangodb::Logger::ENGINES)
        << "cannot load collection parameter file '" << path() << "': " << ex.what();
    return TRI_ERROR_INTERNAL;
  }
}

void MMFilesCollection::open(bool ignoreErrors) {
  VPackBuilder builder;
  auto engine = static_cast<MMFilesEngine*>(EngineSelectorFeature::ENGINE);
  auto& vocbase = _logicalCollection.vocbase();
  auto cid = _logicalCollection.id();

  engine->getCollectionInfo(vocbase, cid, builder, true, 0);

  VPackSlice initialCount =
      builder.slice().get(std::vector<std::string>({"parameters", "count"}));

  if (initialCount.isNumber()) {
    int64_t count = initialCount.getNumber<int64_t>();
    if (count > 0) {
      _initialCount = count;
    }
  }

  PerformanceLogScope logScope(
      std::string("open-document-collection { collection: ") + vocbase.name() +
      "/" + _logicalCollection.name() + " }");

  int res = openWorker(ignoreErrors);

  if (res != TRI_ERROR_NO_ERROR) {
    THROW_ARANGO_EXCEPTION_MESSAGE(
        res, std::string("cannot open document collection from path '") +
                 path() + "': " + TRI_errno_string(res));
  }

  arangodb::SingleCollectionTransaction trx(
      arangodb::transaction::StandaloneContext::Create(vocbase),
      _logicalCollection, AccessMode::Type::READ);
  // the underlying collections must not be locked here because the "load"
  // routine can be invoked from any other place, e.g. from an AQL query
  trx.addHint(transaction::Hints::Hint::LOCK_NEVER);

  {
    PerformanceLogScope logScope(std::string("iterate-markers { collection: ") +
                                 vocbase.name() + "/" + _logicalCollection.name() + " }");

    // iterate over all markers of the collection
    res = iterateMarkersOnLoad(&trx);

    if (res != TRI_ERROR_NO_ERROR) {
      THROW_ARANGO_EXCEPTION_MESSAGE(
          res, std::string("cannot iterate data of document collection: ") +
                   TRI_errno_string(res));
    }
  }

  // build the indexes meta-data, but do not fill the indexes yet
  {
    auto old = useSecondaryIndexes();

    // turn filling of secondary indexes off. we're now only interested in
    // getting the indexes' definition. we'll fill them below ourselves.
    useSecondaryIndexes(false);

    try {
      detectIndexes(trx);
      useSecondaryIndexes(old);
    } catch (basics::Exception const& ex) {
      useSecondaryIndexes(old);
      THROW_ARANGO_EXCEPTION_MESSAGE(
          ex.code(), std::string("cannot initialize collection indexes: ") + ex.what());
    } catch (std::exception const& ex) {
      useSecondaryIndexes(old);
      THROW_ARANGO_EXCEPTION_MESSAGE(
          TRI_ERROR_INTERNAL,
          std::string("cannot initialize collection indexes: ") + ex.what());
    } catch (...) {
      useSecondaryIndexes(old);
      THROW_ARANGO_EXCEPTION_MESSAGE(
          TRI_ERROR_INTERNAL,
          "cannot initialize collection indexes: unknown exception");
    }
  }

  if (!engine->inRecovery() && !engine->upgrading()) {
    // build the index structures, and fill the indexes
    fillAllIndexes(trx);
  }

  // successfully opened collection. now adjust version number
  if (LogicalCollection::currentVersion() != _logicalCollection.version() &&
      !engine->upgrading()) {
    setCurrentVersion();
    // updates have already happened elsewhere, it is safe to bump the number
  }
}

/// @brief iterate all markers of the collection
int MMFilesCollection::iterateMarkersOnLoad(transaction::Methods* trx) {
  // initialize state for iteration
  OpenIteratorState openState(&_logicalCollection, trx);

  if (_initialCount != -1) {
    _revisionsCache.sizeHint(_initialCount);
    sizeHint(trx, _initialCount);
    openState._initialCount = _initialCount;
  }

  // read all documents and fill primary index
  auto cb = [&openState](MMFilesMarker const* marker, MMFilesDatafile* datafile) -> bool {
    return OpenIterator(marker, &openState, datafile);
  };

  iterateDatafiles(cb);

  LOG_TOPIC("fc80d", TRACE, arangodb::Logger::ENGINES)
      << "found " << openState._documents << " document markers, "
      << openState._deletions << " deletion markers for collection '"
      << _logicalCollection.name() << "'";

  // pick up persistent id flag from state
  _hasAllPersistentLocalIds.store(openState._hasAllPersistentLocalIds);

  auto engine = static_cast<MMFilesEngine*>(EngineSelectorFeature::ENGINE);

  LOG_TOPIC_IF("40333", WARN, arangodb::Logger::ENGINES,
               !openState._hasAllPersistentLocalIds && !engine->upgrading() &&
                   !engine->inRecovery())
      << "collection '" << _logicalCollection.name() << "' does not have all "
      << "persistent LocalDocumentIds; cannot be linked to an arangosearch view";

  // update the real statistics for the collection
  try {
    for (auto& it : openState._stats) {
      createStats(it.first, *(it.second));
    }
  } catch (basics::Exception const& ex) {
    return ex.code();
  } catch (...) {
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}

LocalDocumentId MMFilesCollection::lookupKey(transaction::Methods* trx,
                                             VPackSlice const& key) const {
  MMFilesPrimaryIndex* index = primaryIndex();
  MMFilesSimpleIndexElement element = index->lookupKey(trx, key);
  return element ? LocalDocumentId(element.localDocumentId()) : LocalDocumentId();
}

Result MMFilesCollection::read(transaction::Methods* trx, VPackSlice const& key,
                               ManagedDocumentResult& result, bool lock) {
  TRI_IF_FAILURE("ReadDocumentNoLock") {
    // test what happens if no lock can be acquired
    return Result(TRI_ERROR_DEBUG);
  }

  TRI_IF_FAILURE("ReadDocumentNoLockExcept") {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
  }

  bool const useDeadlockDetector =
      (lock && !trx->isSingleOperationTransaction() &&
       !trx->state()->hasHint(transaction::Hints::Hint::NO_DLD));

  if (lock) {
    int res = lockRead(useDeadlockDetector, trx->state());
    if (res != TRI_ERROR_NO_ERROR) {
      THROW_ARANGO_EXCEPTION(res);
    }
  }

  TRI_DEFER(if (lock) { unlockRead(useDeadlockDetector, trx->state()); });

  LocalDocumentId const documentId = lookupDocument(trx, key, result);

  if (documentId.empty()) {
    return Result(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND);
  }

  // we found a document
  return Result(TRI_ERROR_NO_ERROR);
}

Result MMFilesCollection::read(transaction::Methods* trx,
                               arangodb::velocypack::StringRef const& key,
                               ManagedDocumentResult& result, bool lock) {
  // copy string into a vpack string
  transaction::BuilderLeaser builder(trx);
  builder->add(VPackValuePair(key.data(), key.size(), VPackValueType::String));
  return read(trx, builder->slice(), result, lock);
}

bool MMFilesCollection::readDocument(transaction::Methods* trx,
                                     LocalDocumentId const& documentId,
                                     ManagedDocumentResult& result) const {
  uint8_t const* vpack = lookupDocumentVPack(documentId);

  if (vpack != nullptr) {
    result.setUnmanaged(vpack);
    return true;
  }
  return false;
}

bool MMFilesCollection::readDocumentWithCallback(
    transaction::Methods* trx, LocalDocumentId const& documentId,
    IndexIterator::DocumentCallback const& cb) const {
  uint8_t const* vpack = lookupDocumentVPack(documentId);

  if (vpack != nullptr) {
    cb(documentId, VPackSlice(vpack));
    return true;
  }
  return false;
}

size_t MMFilesCollection::readDocumentWithCallback(
    transaction::Methods* trx,
    std::vector<std::pair<LocalDocumentId, uint8_t const*>>& documentIds,
    IndexIterator::DocumentCallback const& cb) {
  size_t count = 0;

  batchLookupRevisionVPack(documentIds);

  for (auto const& it : documentIds) {
    if (it.second) {
      cb(it.first, VPackSlice(it.second));
      ++count;
    }
  }
  return count;
}

bool MMFilesCollection::readDocumentConditional(transaction::Methods* trx,
                                                LocalDocumentId const& documentId,
                                                TRI_voc_tick_t maxTick,
                                                ManagedDocumentResult& result) {
  TRI_ASSERT(documentId.isSet());

  uint8_t const* vpack = lookupDocumentVPackConditional(documentId, maxTick, true);

  if (vpack != nullptr) {
    result.setUnmanaged(vpack);
    return true;
  }
  return false;
}

void MMFilesCollection::prepareIndexes(VPackSlice indexesSlice) {
  TRI_ASSERT(indexesSlice.isArray());

  bool foundPrimary = false;
  bool foundEdge = false;

  for (auto const& it : VPackArrayIterator(indexesSlice)) {
    auto const& s = it.get(arangodb::StaticStrings::IndexType);

    if (s.isString()) {
      if (s.isEqualString("primary")) {
        foundPrimary = true;
      } else if (s.isEqualString("edge")) {
        foundEdge = true;
      }
    }
  }

  {
    READ_LOCKER(guard, _indexesLock);

    for (auto const& idx : _indexes) {
      if (idx->type() == Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX) {
        foundPrimary = true;
      } else if (TRI_COL_TYPE_EDGE == _logicalCollection.type() &&
                 idx->type() == Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX) {
        foundEdge = true;
      }
    }
  }

  std::vector<std::shared_ptr<Index>> indexes;
  StorageEngine* engine = EngineSelectorFeature::ENGINE;

  if (!foundPrimary || (!foundEdge && TRI_COL_TYPE_EDGE == _logicalCollection.type())) {
    // we still do not have any of the default indexes, so create them now
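    // sketch of the assumed outcome: fillSystemIndexes is expected to add
    // the primary index on _key and, for edge collections, the edge index
    // on _from/_to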
engine->indexFactory().fillSystemIndexes(_logicalCollection, indexes); } engine->indexFactory().prepareIndexes(_logicalCollection, indexesSlice, indexes); for (std::shared_ptr& idx : indexes) { if (ServerState::instance()->isRunningInCluster()) { addIndex(std::move(idx)); } else { addIndexLocal(std::move(idx)); } } { READ_LOCKER(guard, _indexesLock); TRI_ASSERT(!_indexes.empty()); if (_indexes[0]->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX || (TRI_COL_TYPE_EDGE == _logicalCollection.type() && (_indexes.size() < 2 || _indexes[1]->type() != Index::IndexType::TRI_IDX_TYPE_EDGE_INDEX))) { #ifdef ARANGODB_ENABLE_MAINTAINER_MODE for (auto const& it : _indexes) { LOG_TOPIC("5e00b", ERR, arangodb::Logger::ENGINES) << "- " << it.get(); } #endif std::string errorMsg("got invalid indexes for collection '"); errorMsg.append(_logicalCollection.name()); errorMsg.push_back('\''); THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, errorMsg); } } #ifdef ARANGODB_ENABLE_MAINTAINER_MODE { READ_LOCKER(guard, _indexesLock); bool foundPrimary = false; for (auto const& it : _indexes) { if (it->type() == Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX) { if (foundPrimary) { std::string errorMsg( "found multiple primary indexes for collection '"); errorMsg.append(_logicalCollection.name()); errorMsg.push_back('\''); THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, errorMsg); } foundPrimary = true; } } } #endif TRI_ASSERT(!_indexes.empty()); } std::shared_ptr MMFilesCollection::createIndex(arangodb::velocypack::Slice const& info, bool restore, bool& created) { SingleCollectionTransaction trx(transaction::StandaloneContext::Create( _logicalCollection.vocbase()), _logicalCollection, AccessMode::Type::EXCLUSIVE); Result res = trx.begin(); if (!res.ok()) { THROW_ARANGO_EXCEPTION(res); } auto idx = createIndex(trx, info, restore, created); if (idx) { res = trx.commit(); } return idx; } std::shared_ptr MMFilesCollection::createIndex(transaction::Methods& trx, velocypack::Slice const& info, bool restore, bool& created) { // prevent concurrent dropping // TRI_ASSERT(trx->isLocked(&_logicalCollection, AccessMode::Type::READ)); TRI_ASSERT(!ServerState::instance()->isCoordinator()); TRI_ASSERT(info.isObject()); std::shared_ptr idx = PhysicalCollection::lookupIndex(info); if (idx != nullptr) { // We already have this index. if (idx->type() == arangodb::Index::TRI_IDX_TYPE_TTL_INDEX) { THROW_ARANGO_EXCEPTION_MESSAGE( TRI_ERROR_BAD_PARAMETER, "there can only be one ttl index per collection"); } created = false; return idx; } StorageEngine* engine = EngineSelectorFeature::ENGINE; // We are sure that we do not have an index of this type. // We also hold the lock. 
// Create it
  bool generateKey = !restore;  // Restore is not allowed to generate an id
  idx = engine->indexFactory().prepareIndexFromSlice(info, generateKey,
                                                     _logicalCollection, false);
  if (!idx) {
    LOG_TOPIC("baec2", ERR, Logger::ENGINES)
        << "index creation failed while restoring";
    THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_INDEX_CREATION_FAILED);
  }

  if (!restore) {
    TRI_UpdateTickServer(idx->id());
  }

  auto other = PhysicalCollection::lookupIndex(idx->id());
  if (!other) {
    other = PhysicalCollection::lookupIndex(idx->name());
  }

  if (other) {
    // definition shares an identifier with an existing index with a
    // different definition
#ifdef ARANGODB_ENABLE_MAINTAINER_MODE
    VPackBuilder builder;
    other->toVelocyPack(builder,
                        static_cast<std::underlying_type<Index::Serialize>::type>(
                            Index::Serialize::Basics));
    LOG_TOPIC("928ae", WARN, Logger::ENGINES)
        << "attempted to create index '" << info.toJson()
        << "' but found conflicting index '" << builder.slice().toJson() << "'";
#endif
    THROW_ARANGO_EXCEPTION_MESSAGE(
        TRI_ERROR_ARANGO_DUPLICATE_IDENTIFIER,
        "duplicate value for `" + arangodb::StaticStrings::IndexId + "` or `" +
            arangodb::StaticStrings::IndexName + "`");
  }

  TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX);

  int res = saveIndex(trx, idx);

  if (res != TRI_ERROR_NO_ERROR) {
    THROW_ARANGO_EXCEPTION(res);
  }

#if USE_PLAN_CACHE
  arangodb::aql::PlanCache::instance()->invalidate(_logicalCollection.vocbase());
#endif

  // Until here no harm is done if something fails. The shared_ptr will clean
  // up if we leave before the following addIndexLocal() call.
  addIndexLocal(idx);

  // trigger a rewrite
  if (!engine->inRecovery()) {
    auto builder =
        _logicalCollection.toVelocyPackIgnore({"path", "statusString"}, true, true);
    _logicalCollection.properties(builder.slice(), false);  // always a full-update
  }

  created = true;
  return idx;
}

/// @brief persists index information to file
int MMFilesCollection::saveIndex(transaction::Methods& trx,
                                 std::shared_ptr<arangodb::Index> idx) {
  TRI_ASSERT(!ServerState::instance()->isCoordinator());
  // we cannot persist the primary index
  TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX);

  std::vector<std::shared_ptr<arangodb::Index>> indexListLocal;
  indexListLocal.emplace_back(idx);

  int res = fillIndexes(trx, indexListLocal, false);

  if (res != TRI_ERROR_NO_ERROR) {
    return res;
  }

  std::shared_ptr<VPackBuilder> builder;
  try {
    builder = idx->toVelocyPack(Index::makeFlags(Index::Serialize::Internals));
  } catch (arangodb::basics::Exception const& ex) {
    LOG_TOPIC("90f7e", ERR, arangodb::Logger::ENGINES)
        << "cannot save index definition: " << ex.what();
    return ex.code();
  } catch (std::exception const& ex) {
    LOG_TOPIC("e24da", ERR, arangodb::Logger::ENGINES)
        << "cannot save index definition: " << ex.what();
    return TRI_ERROR_INTERNAL;
  } catch (...)
{ LOG_TOPIC("a05ba", ERR, arangodb::Logger::ENGINES) << "cannot save index definition"; return TRI_ERROR_INTERNAL; } if (builder == nullptr) { LOG_TOPIC("63a71", ERR, arangodb::Logger::ENGINES) << "cannot save index definition"; return TRI_ERROR_OUT_OF_MEMORY; } auto& vocbase = _logicalCollection.vocbase(); auto collectionId = _logicalCollection.id(); VPackSlice data = builder->slice(); StorageEngine* engine = EngineSelectorFeature::ENGINE; static_cast(engine)->createIndex(vocbase, collectionId, idx->id(), data); if (!engine->inRecovery()) { // We need to write an index marker try { MMFilesCollectionMarker marker(TRI_DF_MARKER_VPACK_CREATE_INDEX, vocbase.id(), collectionId, data); MMFilesWalSlotInfoCopy slotInfo = MMFilesLogfileManager::instance()->allocateAndWrite(marker, false); res = slotInfo.errorCode; } catch (arangodb::basics::Exception const& ex) { LOG_TOPIC("9b65d", ERR, arangodb::Logger::ENGINES) << "cannot save index definition: " << ex.what(); res = ex.code(); } catch (std::exception const& ex) { LOG_TOPIC("dc491", ERR, arangodb::Logger::ENGINES) << "cannot save index definition: " << ex.what(); return TRI_ERROR_INTERNAL; } catch (...) { res = TRI_ERROR_INTERNAL; } } return res; } bool MMFilesCollection::addIndex(std::shared_ptr idx) { WRITE_LOCKER(guard, _indexesLock); auto const id = idx->id(); for (auto const& it : _indexes) { if (it->id() == id) { // already have this particular index. do not add it again return false; } } TRI_UpdateTickServer(static_cast(id)); _indexes.emplace_back(idx); if (idx->type() == Index::TRI_IDX_TYPE_PRIMARY_INDEX) { TRI_ASSERT(idx->id() == 0); _primaryIndex = static_cast(idx.get()); } return true; } void MMFilesCollection::addIndexLocal(std::shared_ptr idx) { // primary index must be added at position 0 TRI_ASSERT(idx->type() != arangodb::Index::TRI_IDX_TYPE_PRIMARY_INDEX || _indexes.empty()); if (!addIndex(idx)) { return; } // update statistics MMFilesIndex* midx = static_cast(idx.get()); if (midx->isPersistent()) { ++_persistentIndexes; } } bool MMFilesCollection::dropIndex(TRI_idx_iid_t iid) { if (iid == 0) { // invalid index id or primary index events::DropIndex(_logicalCollection.vocbase().name(), _logicalCollection.name(), std::to_string(iid), TRI_ERROR_NO_ERROR); return true; } auto& vocbase = _logicalCollection.vocbase(); if (!removeIndex(iid)) { // We tried to remove an index that does not exist events::DropIndex(_logicalCollection.vocbase().name(), _logicalCollection.name(), std::to_string(iid), TRI_ERROR_ARANGO_INDEX_NOT_FOUND); return false; } auto cid = _logicalCollection.id(); MMFilesEngine* engine = static_cast(EngineSelectorFeature::ENGINE); engine->dropIndex(&vocbase, cid, iid); { auto builder = _logicalCollection.toVelocyPackIgnore({"path", "statusString"}, true, true); _logicalCollection.properties(builder.slice(), false); // always a full-update } if (!engine->inRecovery()) { int res = TRI_ERROR_NO_ERROR; VPackBuilder markerBuilder; markerBuilder.openObject(); markerBuilder.add("id", VPackValue(std::to_string(iid))); markerBuilder.close(); engine->dropIndexWalMarker(&vocbase, cid, markerBuilder.slice(), true, res); if (res == TRI_ERROR_NO_ERROR) { events::DropIndex(_logicalCollection.vocbase().name(), _logicalCollection.name(), std::to_string(iid), TRI_ERROR_NO_ERROR); } else { LOG_TOPIC("96677", WARN, arangodb::Logger::ENGINES) << "could not save index drop marker in log: " << TRI_errno_string(res); events::DropIndex(_logicalCollection.vocbase().name(), _logicalCollection.name(), std::to_string(iid), res); } } return true; } 
/// @brief removes an index by id
bool MMFilesCollection::removeIndex(TRI_idx_iid_t iid) {
  WRITE_LOCKER(guard, _indexesLock);

  size_t const n = _indexes.size();

  for (size_t i = 0; i < n; ++i) {
    auto idx = _indexes[i];

    if (!idx->canBeDropped()) {
      continue;
    }

    if (idx->id() == iid) {
      // found!
      idx->drop();
      _indexes.erase(_indexes.begin() + i);

      // update statistics
      MMFilesIndex* midx = static_cast<MMFilesIndex*>(idx.get());
      if (midx->isPersistent()) {
        --_persistentIndexes;
      }

      return true;
    }
  }

  // not found
  return false;
}

std::unique_ptr<IndexIterator> MMFilesCollection::getAllIterator(
    transaction::Methods* trx) const {
  return std::unique_ptr<IndexIterator>(primaryIndex()->allIterator(trx));
}

std::unique_ptr<IndexIterator> MMFilesCollection::getAnyIterator(
    transaction::Methods* trx) const {
  return std::unique_ptr<IndexIterator>(primaryIndex()->anyIterator(trx));
}

void MMFilesCollection::invokeOnAllElements(
    transaction::Methods* trx,
    std::function<bool(LocalDocumentId const&)> callback) {
  primaryIndex()->invokeOnAllElements(callback);
}

/// @brief read locks a collection, with a timeout (in seconds)
int MMFilesCollection::lockRead(bool useDeadlockDetector,
                                TransactionState const* state, double timeout) {
  TRI_ASSERT(state != nullptr);
  TRI_voc_tid_t tid = state->id();

  // LOCKING-DEBUG
  // std::cout << "BeginReadTimed: " << _name << std::endl;
  int iterations = 0;
  bool wasBlocked = false;
  uint64_t waitTime = 0;  // zero means the wait times are not yet initialized
  double startTime = 0.0;

  while (true) {
    TRY_READ_LOCKER(locker, _dataLock);

    if (locker.isLocked()) {
      // when we are here, we've got the read lock
      if (useDeadlockDetector) {
        _logicalCollection.vocbase()._deadlockDetector.addReader(
            tid, &_logicalCollection, wasBlocked);
      }

      // keep lock and exit loop
      locker.steal();
      return TRI_ERROR_NO_ERROR;
    }

    if (useDeadlockDetector) {
      try {
        if (!wasBlocked) {
          // insert reader
          wasBlocked = true;
          if (TRI_ERROR_DEADLOCK ==
              _logicalCollection.vocbase()._deadlockDetector.setReaderBlocked(
                  tid, &_logicalCollection)) {
            // deadlock
            LOG_TOPIC("17c2c", TRACE, arangodb::Logger::ENGINES)
                << "deadlock detected while trying to acquire read-lock "
                << "on collection '" << _logicalCollection.name() << "'";
            return TRI_ERROR_DEADLOCK;
          }
          LOG_TOPIC("29637", TRACE, arangodb::Logger::ENGINES)
              << "waiting for read-lock on collection '"
              << _logicalCollection.name() << "'";
          // intentionally falls through
        } else if (++iterations >= 5) {
          // periodically check for deadlocks
          TRI_ASSERT(wasBlocked);
          iterations = 0;
          if (TRI_ERROR_DEADLOCK ==
              _logicalCollection.vocbase()._deadlockDetector.detectDeadlock(
                  tid, &_logicalCollection, false)) {
            // deadlock
            _logicalCollection.vocbase()._deadlockDetector.unsetReaderBlocked(
                tid, &_logicalCollection);
            LOG_TOPIC("b228b", TRACE, arangodb::Logger::ENGINES)
                << "deadlock detected while trying to acquire read-lock "
                << "on collection '" << _logicalCollection.name() << "'";
            return TRI_ERROR_DEADLOCK;
          }
        }
      } catch (...) {
        // clean up!
if (wasBlocked) { _logicalCollection.vocbase()._deadlockDetector.unsetReaderBlocked(tid, &_logicalCollection); } // always exit return TRI_ERROR_OUT_OF_MEMORY; } } double now = TRI_microtime(); if (waitTime == 0) { // initialize times // set end time for lock waiting if (timeout <= 0.0) { timeout = defaultLockTimeout; } startTime = now; waitTime = 1; } if (now > startTime + timeout) { if (useDeadlockDetector) { _logicalCollection.vocbase()._deadlockDetector.unsetReaderBlocked(tid, &_logicalCollection); } LOG_TOPIC("aeef3", TRACE, arangodb::Logger::ENGINES) << "timed out after " << timeout << " s waiting for read-lock on collection '" << _logicalCollection.name() << "'"; return TRI_ERROR_LOCK_TIMEOUT; } if (now - startTime < 0.001) { std::this_thread::yield(); } else { if (_logicalCollection.vocbase().isDropped()) { return TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND; } std::this_thread::sleep_for(std::chrono::microseconds(waitTime)); if (waitTime < 32) { waitTime *= 2; } } } } /// @brief write locks a collection, with a timeout int MMFilesCollection::lockWrite(bool useDeadlockDetector, TransactionState const* state, double timeout) { TRI_ASSERT(state != nullptr); TRI_voc_tid_t tid = state->id(); // LOCKING-DEBUG // std::cout << "BeginWriteTimed: " << document->_info._name << std::endl; int iterations = 0; bool wasBlocked = false; uint64_t waitTime = 0; // indicate that times uninitialized double startTime = 0.0; while (true) { TRY_WRITE_LOCKER(locker, _dataLock); if (locker.isLocked()) { // register writer if (useDeadlockDetector) { _logicalCollection.vocbase()._deadlockDetector.addWriter(tid, &_logicalCollection, wasBlocked); } // keep lock and exit loop locker.steal(); return TRI_ERROR_NO_ERROR; } if (useDeadlockDetector) { try { if (!wasBlocked) { // insert writer wasBlocked = true; if (TRI_ERROR_DEADLOCK == _logicalCollection.vocbase()._deadlockDetector.setWriterBlocked(tid, &_logicalCollection)) { // deadlock LOG_TOPIC("55051", TRACE, arangodb::Logger::ENGINES) << "deadlock detected while trying to acquire " << "write-lock on collection '" << _logicalCollection.name() << "'"; return TRI_ERROR_DEADLOCK; } LOG_TOPIC("22d7e", TRACE, arangodb::Logger::ENGINES) << "waiting for write-lock on collection '" << _logicalCollection.name() << "'"; } else if (++iterations >= 5) { // periodically check for deadlocks TRI_ASSERT(wasBlocked); iterations = 0; if (TRI_ERROR_DEADLOCK == _logicalCollection.vocbase()._deadlockDetector.detectDeadlock(tid, &_logicalCollection, true)) { // deadlock _logicalCollection.vocbase()._deadlockDetector.unsetWriterBlocked(tid, &_logicalCollection); LOG_TOPIC("a13a3", TRACE, arangodb::Logger::ENGINES) << "deadlock detected while trying to acquire " "write-lock on collection '" << _logicalCollection.name() << "'"; return TRI_ERROR_DEADLOCK; } } } catch (...) { // clean up! 
if (wasBlocked) { _logicalCollection.vocbase()._deadlockDetector.unsetWriterBlocked(tid, &_logicalCollection); } // always exit return TRI_ERROR_OUT_OF_MEMORY; } } double now = TRI_microtime(); if (waitTime == 0) { // initialize times // set end time for lock waiting if (timeout <= 0.0) { timeout = defaultLockTimeout; } startTime = now; waitTime = 1; } if (now > startTime + timeout) { if (useDeadlockDetector) { _logicalCollection.vocbase()._deadlockDetector.unsetWriterBlocked(tid, &_logicalCollection); } LOG_TOPIC("34494", TRACE, arangodb::Logger::ENGINES) << "timed out after " << timeout << " s waiting for write-lock on collection '" << _logicalCollection.name() << "'"; return TRI_ERROR_LOCK_TIMEOUT; } if (now - startTime < 0.001) { std::this_thread::yield(); } else { if (_logicalCollection.vocbase().isDropped()) { return TRI_ERROR_ARANGO_DATA_SOURCE_NOT_FOUND; } std::this_thread::sleep_for(std::chrono::microseconds(waitTime)); if (waitTime < 32) { waitTime *= 2; } } } } /// @brief read unlocks a collection int MMFilesCollection::unlockRead(bool useDeadlockDetector, TransactionState const* state) { TRI_ASSERT(state != nullptr); TRI_voc_tid_t tid = state->id(); if (useDeadlockDetector) { // unregister reader try { _logicalCollection.vocbase()._deadlockDetector.unsetReader(tid, &_logicalCollection); } catch (...) { } } // LOCKING-DEBUG // std::cout << "EndRead: " << _name << std::endl; _dataLock.unlockRead(); return TRI_ERROR_NO_ERROR; } /// @brief write unlocks a collection int MMFilesCollection::unlockWrite(bool useDeadlockDetector, TransactionState const* state) { TRI_ASSERT(state != nullptr); TRI_voc_tid_t tid = state->id(); if (useDeadlockDetector) { // unregister writer try { _logicalCollection.vocbase()._deadlockDetector.unsetWriter(tid, &_logicalCollection); } catch (...) { // must go on here to unlock the lock } } // LOCKING-DEBUG // std::cout << "EndWrite: " << _name << std::endl; _dataLock.unlockWrite(); return TRI_ERROR_NO_ERROR; } Result MMFilesCollection::truncate(transaction::Methods& trx, OperationOptions& options) { auto primaryIdx = primaryIndex(); options.ignoreRevs = true; // create remove marker transaction::BuilderLeaser builder(&trx); auto callback = [&](MMFilesSimpleIndexElement const& element) { LocalDocumentId const oldDocumentId = element.localDocumentId(); uint8_t const* vpack = lookupDocumentVPack(oldDocumentId); if (vpack != nullptr) { builder->clear(); VPackSlice oldDoc(vpack); LocalDocumentId const documentId = LocalDocumentId::create(); TRI_voc_rid_t revisionId; newObjectForRemove(&trx, oldDoc, *builder.get(), options.isRestore, revisionId); Result res = removeFastPath(trx, revisionId, oldDocumentId, VPackSlice(vpack), options, documentId, builder->slice()); if (res.fail()) { THROW_ARANGO_EXCEPTION(res); } } return true; }; try { primaryIdx->invokeOnAllElementsForRemoval(callback); } catch (basics::Exception const& e) { return Result(e.code(), e.message()); } catch (std::exception const& e) { return Result(TRI_ERROR_INTERNAL, e.what()); } catch (...) 
{ return Result(TRI_ERROR_INTERNAL, "unknown error during truncate"); } READ_LOCKER(guard, _indexesLock); auto indexes = _indexes; size_t const n = indexes.size(); TRI_voc_tick_t tick = TRI_NewTickServer(); for (size_t i = 1; i < n; ++i) { auto idx = indexes[i]; TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX); idx->afterTruncate(tick); } return Result(); } /// @brief compact-data operation Result MMFilesCollection::compact() { // this will only reset the last compaction timestamp, // so the collection becomes eligible for the next round of compactions MUTEX_LOCKER(mutexLocker, _compactionStatusLock); _lastCompactionStamp = 0.0; return {}; } LocalDocumentId MMFilesCollection::reuseOrCreateLocalDocumentId(OperationOptions const& options) const { if (options.recoveryData != nullptr) { auto marker = static_cast(options.recoveryData); if (marker->hasLocalDocumentId()) { return marker->getLocalDocumentId(); } // falls through intentionally } // new operation, no recovery -> generate a new LocalDocumentId return LocalDocumentId::create(); } Result MMFilesCollection::insert(arangodb::transaction::Methods* trx, VPackSlice const slice, arangodb::ManagedDocumentResult& resultMdr, OperationOptions& options, bool lock, KeyLockInfo* keyLockInfo, std::function const& callbackDuringLock) { LocalDocumentId const documentId = reuseOrCreateLocalDocumentId(options); auto isEdgeCollection = (TRI_COL_TYPE_EDGE == _logicalCollection.type()); transaction::BuilderLeaser builder(trx); VPackSlice newSlice; Result res(TRI_ERROR_NO_ERROR); TRI_voc_rid_t revisionId = 0; if (options.recoveryData == nullptr) { res = newObjectForInsert(trx, slice, isEdgeCollection, *builder.get(), options.isRestore, revisionId); if (res.fail()) { return res; } newSlice = builder->slice(); } else { TRI_ASSERT(slice.isObject()); // we can get away with the fast hash function here, as key values are // restricted to strings newSlice = slice; VPackSlice keySlice = newSlice.get(StaticStrings::KeyString); if (keySlice.isString()) { VPackValueLength l; char const* p = keySlice.getString(l); TRI_ASSERT(p != nullptr); _logicalCollection.keyGenerator()->track(p, l); } VPackSlice revSlice = newSlice.get(StaticStrings::RevString); if (revSlice.isString()) { VPackValueLength l; char const* p = revSlice.getStringUnchecked(l); TRI_ASSERT(p != nullptr); revisionId = TRI_StringToRid(p, l, false); } } TRI_ASSERT(revisionId != 0); // create marker MMFilesCrudMarker insertMarker( TRI_DF_MARKER_VPACK_DOCUMENT, static_cast(trx->state())->idForMarker(), documentId, newSlice); MMFilesWalMarker const* marker; if (options.recoveryData == nullptr) { marker = &insertMarker; } else { marker = static_cast(options.recoveryData); } // now insert into indexes TRI_IF_FAILURE("InsertDocumentNoLock") { // test what happens if no lock can be acquired return Result(TRI_ERROR_DEBUG); } // TODO Do we need a LogicalCollection here? MMFilesDocumentOperation operation(&_logicalCollection, TRI_VOC_DOCUMENT_OPERATION_INSERT); TRI_IF_FAILURE("InsertDocumentNoHeader") { // test what happens if no header can be acquired return Result(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("InsertDocumentNoHeaderExcept") { // test what happens if no header can be acquired THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } VPackSlice const doc(marker->vpack()); operation.setDocumentIds(MMFilesDocumentDescriptor(), MMFilesDocumentDescriptor(documentId, doc.begin())); try { insertLocalDocumentId(documentId, marker->vpack(), 0, true, true); // and go on with the insertion... 
} catch (basics::Exception const& ex) { return Result(ex.code()); } catch (std::bad_alloc const&) { return Result(TRI_ERROR_OUT_OF_MEMORY); } catch (std::exception const& ex) { return Result(TRI_ERROR_INTERNAL, ex.what()); } catch (...) { return Result(TRI_ERROR_INTERNAL); } TRI_ASSERT(keyLockInfo != nullptr); if (keyLockInfo->shouldLock) { TRI_ASSERT(lock); lockKey(*keyLockInfo, newSlice.get(StaticStrings::KeyString)); } res = Result(); { // use lock? bool const useDeadlockDetector = (lock && !trx->isSingleOperationTransaction() && !trx->state()->hasHint(transaction::Hints::Hint::NO_DLD)); try { arangodb::MMFilesCollectionWriteLocker collectionLocker(this, useDeadlockDetector, trx->state(), lock); try { // insert into indexes res = insertDocument(*trx, documentId, revisionId, doc, operation, marker, options, options.waitForSync); } catch (basics::Exception const& ex) { res = Result(ex.code()); } catch (std::bad_alloc const&) { res = Result(TRI_ERROR_OUT_OF_MEMORY); } catch (std::exception const& ex) { res = Result(TRI_ERROR_INTERNAL, ex.what()); } catch (...) { res = Result(TRI_ERROR_INTERNAL); } if (res.ok() && callbackDuringLock != nullptr) { callbackDuringLock(); } } catch (...) { // the collectionLocker may have thrown in its constructor... // if it did, then we need to manually remove the revision id // from the list of revisions try { removeLocalDocumentId(documentId, false); } catch (...) { } throw; } if (res.fail()) { operation.revert(trx); } } if (res.ok()) { if (options.returnNew) { resultMdr.setManaged(doc.begin()); TRI_ASSERT(resultMdr.revisionId() == revisionId); } else if (!options.silent) { // need to pass up revision and key transaction::BuilderLeaser keyBuilder(trx); keyBuilder->openObject(/*unindexed*/true); keyBuilder->add(StaticStrings::KeyString, transaction::helpers::extractKeyFromDocument(doc)); keyBuilder->close(); resultMdr.setManaged()->assign(reinterpret_cast(keyBuilder->start()), keyBuilder->size()); resultMdr.setRevisionId(revisionId); } } return res; } bool MMFilesCollection::isFullyCollected() const { int64_t uncollected = _uncollectedLogfileEntries.load(); return (uncollected == 0); } MMFilesDocumentPosition MMFilesCollection::lookupDocument(LocalDocumentId const& documentId) const { TRI_ASSERT(documentId.isSet()); MMFilesDocumentPosition const old = _revisionsCache.lookup(documentId); if (old) { return old; } TRI_ASSERT(false); THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "got invalid revision value on lookup"); } uint8_t const* MMFilesCollection::lookupDocumentVPack(LocalDocumentId const& documentId) const { TRI_ASSERT(documentId.isSet()); MMFilesDocumentPosition const old = _revisionsCache.lookup(documentId); if (old) { uint8_t const* vpack = static_cast(old.dataptr()); TRI_ASSERT(VPackSlice(vpack).isObject()); return vpack; } TRI_ASSERT(false); THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "got invalid vpack value on lookup"); } uint8_t const* MMFilesCollection::lookupDocumentVPackConditional( LocalDocumentId const& documentId, TRI_voc_tick_t maxTick, bool excludeWal) const { TRI_ASSERT(documentId.isSet()); MMFilesDocumentPosition const old = _revisionsCache.lookup(documentId); if (!old) { return nullptr; } if (excludeWal && old.pointsToWal()) { return nullptr; } uint8_t const* vpack = static_cast(old.dataptr()); if (maxTick > 0) { MMFilesMarker const* marker = reinterpret_cast( vpack - MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT)); if (marker->getTick() > maxTick) { return nullptr; } } return vpack; } void 
MMFilesCollection::batchLookupRevisionVPack( std::vector>& documentIds) const { _revisionsCache.batchLookup(documentIds); } MMFilesDocumentPosition MMFilesCollection::insertLocalDocumentId( LocalDocumentId const& documentId, uint8_t const* dataptr, TRI_voc_fid_t fid, bool isInWal, bool shouldLock) { TRI_ASSERT(documentId.isSet()); TRI_ASSERT(dataptr != nullptr); return _revisionsCache.insert(documentId, dataptr, fid, isInWal, shouldLock); } void MMFilesCollection::insertLocalDocumentId(MMFilesDocumentPosition const& position, bool shouldLock) { return _revisionsCache.insert(position, shouldLock); } void MMFilesCollection::updateLocalDocumentId(LocalDocumentId const& documentId, uint8_t const* dataptr, TRI_voc_fid_t fid, bool isInWal) { TRI_ASSERT(documentId.isSet()); TRI_ASSERT(dataptr != nullptr); _revisionsCache.update(documentId, dataptr, fid, isInWal); } bool MMFilesCollection::updateLocalDocumentIdConditional( LocalDocumentId const& documentId, MMFilesMarker const* oldPosition, MMFilesMarker const* newPosition, TRI_voc_fid_t newFid, bool isInWal) { TRI_ASSERT(documentId.isSet()); TRI_ASSERT(newPosition != nullptr); return _revisionsCache.updateConditional(documentId, oldPosition, newPosition, newFid, isInWal); } void MMFilesCollection::removeLocalDocumentId(LocalDocumentId const& documentId, bool updateStats) { TRI_ASSERT(documentId.isSet()); if (updateStats) { MMFilesDocumentPosition const old = _revisionsCache.fetchAndRemove(documentId); if (old && !old.pointsToWal() && old.fid() != 0) { TRI_ASSERT(old.dataptr() != nullptr); uint8_t const* vpack = static_cast(old.dataptr()); auto oldMarker = reinterpret_cast( vpack - MMFilesDatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT)); int64_t size = MMFilesDatafileHelper::AlignedMarkerSize(oldMarker); _datafileStatistics.increaseDead(old.fid(), 1, size); } } else { _revisionsCache.remove(documentId); } } bool MMFilesCollection::hasAllPersistentLocalIds() const { return _hasAllPersistentLocalIds.load(); } Result MMFilesCollection::persistLocalDocumentIdsForDatafile(MMFilesCollection& collection, MMFilesDatafile& file) { Result res; // make a first pass to count documents and determine output size size_t numDocuments = 0; bool ok = TRI_IterateDatafile(&file, ::countDocumentsIterator, &numDocuments); if (!ok) { res.reset(TRI_ERROR_INTERNAL, "could not count documents"); return res; } size_t outputSizeLimit = file.currentSize() + (numDocuments * sizeof(LocalDocumentId)); MMFilesDatafile* outputFile = nullptr; { READ_UNLOCKER(unlocker, collection._filesLock); outputFile = collection.createCompactor(file.fid(), outputSizeLimit); } if (nullptr == outputFile) { return Result(TRI_ERROR_INTERNAL); } res = TRI_IterateDatafile(&file, ::persistLocalDocumentIdIterator, outputFile); if (res.fail()) { return res; } { READ_UNLOCKER(unlocker, collection._filesLock); res = collection.closeCompactor(outputFile); if (res.fail()) { return res; } // TODO detect error in replacement? 
MMFilesCompactorThread::RenameDatafileCallback(&file, outputFile, &collection._logicalCollection); } return res; } Result MMFilesCollection::persistLocalDocumentIds() { if (_logicalCollection.version() >= LogicalCollection::CollectionVersions::VERSION_34) { // already good, just continue return Result(); } WRITE_LOCKER(dataLocker, _dataLock); TRI_ASSERT(_compactors.empty()); // convert journal to datafile first int res = rotateActiveJournal(); if (TRI_ERROR_NO_ERROR != res && TRI_ERROR_ARANGO_NO_JOURNAL != res) { return Result(res); } // now handle datafiles { READ_LOCKER(locker, _filesLock); for (auto file : _datafiles) { Result result = persistLocalDocumentIdsForDatafile(*this, *file); if (result.fail()) { return result; } } } _hasAllPersistentLocalIds.store(true); TRI_ASSERT(_compactors.empty()); TRI_ASSERT(_journals.empty()); // mark collection as upgraded so we can avoid re-checking setCurrentVersion(); return Result(); } void MMFilesCollection::setCurrentVersion() { _logicalCollection.setVersion(static_cast( LogicalCollection::currentVersion())); bool const doSync = application_features::ApplicationServer::getFeature( "Database") ->forceSyncProperties(); StorageEngine* engine = EngineSelectorFeature::ENGINE; engine->changeCollection(_logicalCollection.vocbase(), _logicalCollection, doSync); } /// @brief creates a new entry in the primary index Result MMFilesCollection::insertPrimaryIndex(transaction::Methods* trx, LocalDocumentId const& documentId, VPackSlice const& doc, OperationOptions& options) { TRI_IF_FAILURE("InsertPrimaryIndex") { return Result(TRI_ERROR_DEBUG); } // insert into primary index return primaryIndex()->insertKey(trx, documentId, doc, options.indexOperationMode); } /// @brief deletes an entry from the primary index Result MMFilesCollection::deletePrimaryIndex(arangodb::transaction::Methods* trx, LocalDocumentId const& documentId, VPackSlice const& doc, OperationOptions& options) { TRI_IF_FAILURE("DeletePrimaryIndex") { return Result(TRI_ERROR_DEBUG); } return primaryIndex()->removeKey(trx, documentId, doc, options.indexOperationMode); } /// @brief creates a new entry in the secondary indexes Result MMFilesCollection::insertSecondaryIndexes(arangodb::transaction::Methods& trx, LocalDocumentId const& documentId, velocypack::Slice const& doc, Index::OperationMode mode) { // Coordinator doesn't know index internals TRI_ASSERT(!ServerState::instance()->isCoordinator()); TRI_IF_FAILURE("InsertSecondaryIndexes") { return Result(TRI_ERROR_DEBUG); } bool const useSecondary = useSecondaryIndexes(); if (!useSecondary && _persistentIndexes == 0) { return TRI_ERROR_NO_ERROR; } Result result; READ_LOCKER(guard, _indexesLock); auto indexes = _indexes; size_t const n = indexes.size(); for (size_t i = 1; i < n; ++i) { auto idx = indexes[i]; TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX); MMFilesIndex* midx = static_cast(idx.get()); if (!useSecondary && !midx->isPersistent()) { continue; } Result res = midx->insert(trx, documentId, doc, mode); // in case of no-memory, return immediately if (res.errorNumber() == TRI_ERROR_OUT_OF_MEMORY) { return res; } if (!res.ok()) { if (res.errorNumber() == TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED || result.ok()) { // "prefer" unique constraint violated result = res; } } } return result; } /// @brief deletes an entry from the secondary indexes Result MMFilesCollection::deleteSecondaryIndexes(transaction::Methods& trx, LocalDocumentId const& documentId, velocypack::Slice const& doc, Index::OperationMode mode) { // 
// Coordinator doesn't know index internals
  TRI_ASSERT(!ServerState::instance()->isCoordinator());

  bool const useSecondary = useSecondaryIndexes();
  if (!useSecondary && _persistentIndexes == 0) {
    return Result(TRI_ERROR_NO_ERROR);
  }

  TRI_IF_FAILURE("DeleteSecondaryIndexes") { return Result(TRI_ERROR_DEBUG); }

  Result result;

  READ_LOCKER(guard, _indexesLock);
  auto indexes = _indexes;
  size_t const n = indexes.size();

  for (size_t i = 1; i < n; ++i) {
    auto idx = indexes[i];
    TRI_ASSERT(idx->type() != Index::IndexType::TRI_IDX_TYPE_PRIMARY_INDEX);

    MMFilesIndex* midx = static_cast<MMFilesIndex*>(idx.get());
    if (!useSecondary && !midx->isPersistent()) {
      continue;
    }

    Result res = midx->remove(trx, documentId, doc, mode);

    if (res.fail()) {
      // an error occurred
      result = res;
    }
  }

  return result;
}

/// @brief enumerate all indexes of the collection, but don't fill them yet
int MMFilesCollection::detectIndexes(transaction::Methods& trx) {
  StorageEngine* engine = EngineSelectorFeature::ENGINE;
  VPackBuilder builder;

  engine->getCollectionInfo(_logicalCollection.vocbase(),
                            _logicalCollection.id(), builder, true, UINT64_MAX);

  // iterate over all index files
  for (auto const& it : VPackArrayIterator(builder.slice().get("indexes"))) {
    bool ok = openIndex(it, trx);

    if (!ok) {
      LOG_TOPIC("8c1c4", ERR, arangodb::Logger::ENGINES)
          << "cannot load index for collection '"
          << _logicalCollection.name() << "'";
    }
  }

  return TRI_ERROR_NO_ERROR;
}

/// @brief insert a document into all indexes known to this collection.
/// This function guarantees all or nothing: if it returns NO_ERROR, all
/// indexes are filled; if it returns an error, no document is inserted.
Result MMFilesCollection::insertIndexes(transaction::Methods& trx,
                                        LocalDocumentId const& documentId,
                                        velocypack::Slice const& doc,
                                        OperationOptions& options) {
  // insert into primary index first
  auto res = insertPrimaryIndex(&trx, documentId, doc, options);

  if (res.fail()) {
    // insert has failed
    return res;
  }

  // insert into secondary indexes
  res = insertSecondaryIndexes(trx, documentId, doc, options.indexOperationMode);

  if (res.fail()) {
    deleteSecondaryIndexes(trx, documentId, doc, Index::OperationMode::rollback);
    deletePrimaryIndex(&trx, documentId, doc, options);
  }

  return res;
}

/// @brief insert a document, low level worker
/// the caller must make sure the write lock on the collection is held
Result MMFilesCollection::insertDocument(arangodb::transaction::Methods& trx,
                                         LocalDocumentId const& documentId,
                                         TRI_voc_rid_t revisionId,
                                         velocypack::Slice const& doc,
                                         MMFilesDocumentOperation& operation,
                                         MMFilesWalMarker const* marker,
                                         OperationOptions& options,
                                         bool& waitForSync) {
  Result res = insertIndexes(trx, documentId, doc, options);
  if (res.fail()) {
    return res;
  }
  operation.indexed();

  TRI_IF_FAILURE("InsertDocumentNoOperation") { return Result(TRI_ERROR_DEBUG); }

  TRI_IF_FAILURE("InsertDocumentNoOperationExcept") {
    THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG);
  }

  return Result(static_cast<MMFilesTransactionState*>(trx.state())
                    ->addOperation(documentId, revisionId, operation, marker,
                                   waitForSync));
}

Result MMFilesCollection::update(arangodb::transaction::Methods* trx,
                                 VPackSlice const newSlice,
                                 ManagedDocumentResult& resultMdr,
                                 OperationOptions& options, bool lock,
                                 ManagedDocumentResult& previousMdr) {
  VPackSlice key = newSlice.get(StaticStrings::KeyString);
  if (key.isNone()) {
    return Result(TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD);
  } else if (!key.isString()) {
    return Result(TRI_ERROR_ARANGO_DOCUMENT_KEY_BAD);
  }

  LocalDocumentId const documentId = reuseOrCreateLocalDocumentId(options);
  auto isEdgeCollection =
(TRI_COL_TYPE_EDGE == _logicalCollection.type()); TRI_IF_FAILURE("UpdateDocumentNoLock") { return Result(TRI_ERROR_DEBUG); } bool const useDeadlockDetector = (lock && !trx->isSingleOperationTransaction() && !trx->state()->hasHint(transaction::Hints::Hint::NO_DLD)); arangodb::MMFilesCollectionWriteLocker collectionLocker(this, useDeadlockDetector, trx->state(), lock); // get the previous revision LocalDocumentId const oldDocumentId = lookupDocument(trx, key, previousMdr); if (oldDocumentId.empty()) { return Result(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND); } VPackSlice const oldDoc(previousMdr.vpack()); TRI_ASSERT(previousMdr.revisionId() != 0); TRI_IF_FAILURE("UpdateDocumentNoMarker") { // test what happens when no marker can be created return Result(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("UpdateDocumentNoMarkerExcept") { // test what happens when no marker can be created THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } // Check old revision: if (!options.ignoreRevs) { TRI_voc_rid_t expectedRev = 0; if (newSlice.isObject()) { expectedRev = TRI_ExtractRevisionId(newSlice); } int res = checkRevision(trx, expectedRev, previousMdr.revisionId()); if (res != TRI_ERROR_NO_ERROR) { return Result(res); } } if (newSlice.length() <= 1) { // no need to do anything resultMdr = previousMdr; if (_logicalCollection.waitForSync()) { options.waitForSync = true; } return Result(); } Result res; // merge old and new values TRI_voc_rid_t revisionId; transaction::BuilderLeaser builder(trx); if (options.recoveryData == nullptr) { res = mergeObjectsForUpdate(trx, oldDoc, newSlice, isEdgeCollection, options.mergeObjects, options.keepNull, *builder.get(), options.isRestore, revisionId); if (res.fail()) { return res; } if (_isDBServer) { // Need to check that no sharding keys have changed: if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), true)) { return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES); } if (arangodb::smartJoinAttributeChanged(_logicalCollection, oldDoc, builder->slice(), true)) { return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SMART_JOIN_ATTRIBUTE); } } } else { revisionId = TRI_ExtractRevisionId(VPackSlice( static_cast(options.recoveryData)->vpack())); } // create marker MMFilesCrudMarker updateMarker( TRI_DF_MARKER_VPACK_DOCUMENT, static_cast(trx->state())->idForMarker(), documentId, builder->slice()); MMFilesWalMarker const* marker; if (options.recoveryData == nullptr) { marker = &updateMarker; } else { marker = static_cast(options.recoveryData); } VPackSlice const newDoc(marker->vpack()); MMFilesDocumentOperation operation(&_logicalCollection, TRI_VOC_DOCUMENT_OPERATION_UPDATE); try { insertLocalDocumentId(documentId, marker->vpack(), 0, true, true); operation.setDocumentIds(MMFilesDocumentDescriptor(oldDocumentId, oldDoc.begin()), MMFilesDocumentDescriptor(documentId, newDoc.begin())); res = updateDocument(*trx, revisionId, oldDocumentId, oldDoc, documentId, newDoc, operation, marker, options, options.waitForSync); } catch (basics::Exception const& ex) { res = Result(ex.code()); } catch (std::bad_alloc const&) { res = Result(TRI_ERROR_OUT_OF_MEMORY); } catch (std::exception const& ex) { res = Result(TRI_ERROR_INTERNAL, ex.what()); } catch (...) 
{ res = Result(TRI_ERROR_INTERNAL); } if (res.fail()) { operation.revert(trx); } else { if (options.returnNew) { resultMdr.setManaged(newDoc.begin()); } else { // need to pass revId manually resultMdr.setRevisionId(revisionId); } } return res; } Result MMFilesCollection::replace(transaction::Methods* trx, VPackSlice const newSlice, ManagedDocumentResult& resultMdr, OperationOptions& options, bool lock, ManagedDocumentResult& previousMdr) { LocalDocumentId const documentId = reuseOrCreateLocalDocumentId(options); auto isEdgeCollection = (TRI_COL_TYPE_EDGE == _logicalCollection.type()); TRI_IF_FAILURE("ReplaceDocumentNoLock") { return Result(TRI_ERROR_DEBUG); } // get the previous revision VPackSlice key = newSlice.get(StaticStrings::KeyString); if (key.isNone()) { return TRI_ERROR_ARANGO_DOCUMENT_HANDLE_BAD; } else if (!key.isString()) { return Result(TRI_ERROR_ARANGO_DOCUMENT_KEY_BAD); } bool const useDeadlockDetector = (lock && !trx->isSingleOperationTransaction() && !trx->state()->hasHint(transaction::Hints::Hint::NO_DLD)); arangodb::MMFilesCollectionWriteLocker collectionLocker(this, useDeadlockDetector, trx->state(), lock); // get the previous revision LocalDocumentId const oldDocumentId = lookupDocument(trx, key, previousMdr); if (oldDocumentId.empty()) { return Result(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND); } TRI_IF_FAILURE("ReplaceDocumentNoMarker") { // test what happens when no marker can be created return Result(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("ReplaceDocumentNoMarkerExcept") { // test what happens when no marker can be created THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } uint8_t const* vpack = previousMdr.vpack(); TRI_ASSERT(previousMdr.revisionId() != 0); VPackSlice const oldDoc(vpack); // Check old revision: if (!options.ignoreRevs) { TRI_voc_rid_t expectedRev = 0; if (newSlice.isObject()) { expectedRev = TRI_ExtractRevisionId(newSlice); } int res = checkRevision(trx, expectedRev, previousMdr.revisionId()); if (res != TRI_ERROR_NO_ERROR) { return Result(res); } } Result res; // merge old and new values TRI_voc_rid_t revisionId; transaction::BuilderLeaser builder(trx); res = newObjectForReplace(trx, oldDoc, newSlice, isEdgeCollection, *builder.get(), options.isRestore, revisionId); if (res.fail()) { return res; } if (options.recoveryData == nullptr) { if (_isDBServer) { // Need to check that no sharding keys have changed: if (arangodb::shardKeysChanged(_logicalCollection, oldDoc, builder->slice(), false)) { return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SHARDING_ATTRIBUTES); } if (arangodb::smartJoinAttributeChanged(_logicalCollection, oldDoc, builder->slice(), false)) { return Result(TRI_ERROR_CLUSTER_MUST_NOT_CHANGE_SMART_JOIN_ATTRIBUTE); } } } // create marker MMFilesCrudMarker replaceMarker( TRI_DF_MARKER_VPACK_DOCUMENT, static_cast(trx->state())->idForMarker(), documentId, builder->slice()); MMFilesWalMarker const* marker; if (options.recoveryData == nullptr) { marker = &replaceMarker; } else { marker = static_cast(options.recoveryData); } VPackSlice const newDoc(marker->vpack()); MMFilesDocumentOperation operation(&_logicalCollection, TRI_VOC_DOCUMENT_OPERATION_REPLACE); try { insertLocalDocumentId(documentId, marker->vpack(), 0, true, true); operation.setDocumentIds(MMFilesDocumentDescriptor(oldDocumentId, oldDoc.begin()), MMFilesDocumentDescriptor(documentId, newDoc.begin())); res = updateDocument(*trx, revisionId, oldDocumentId, oldDoc, documentId, newDoc, operation, marker, options, options.waitForSync); } catch (basics::Exception const& ex) { res = Result(ex.code()); 
} catch (std::bad_alloc const&) { res = Result(TRI_ERROR_OUT_OF_MEMORY); } catch (std::exception const& ex) { res = Result(TRI_ERROR_INTERNAL, ex.what()); } catch (...) { res = Result(TRI_ERROR_INTERNAL); } if (res.fail()) { operation.revert(trx); } else { if (options.returnNew) { resultMdr.setManaged(newDoc.begin()); } else { // need to pass revId manually resultMdr.setRevisionId(revisionId); } } return res; } Result MMFilesCollection::remove(transaction::Methods& trx, velocypack::Slice slice, ManagedDocumentResult& previousMdr, OperationOptions& options, bool lock, KeyLockInfo* keyLockInfo, std::function const& callbackDuringLock) { LocalDocumentId const documentId = LocalDocumentId::create(); transaction::BuilderLeaser builder(&trx); TRI_voc_rid_t revisionId; newObjectForRemove(&trx, slice, *builder.get(), options.isRestore, revisionId); TRI_IF_FAILURE("RemoveDocumentNoMarker") { // test what happens when no marker can be created return Result(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("RemoveDocumentNoMarkerExcept") { // test what happens if no marker can be created THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } // create marker MMFilesCrudMarker removeMarker( TRI_DF_MARKER_VPACK_REMOVE, static_cast(trx.state())->idForMarker(), documentId, builder->slice()); MMFilesWalMarker const* marker; if (options.recoveryData == nullptr) { marker = &removeMarker; } else { marker = static_cast(options.recoveryData); } TRI_IF_FAILURE("RemoveDocumentNoLock") { // test what happens if no lock can be acquired return Result(TRI_ERROR_DEBUG); } VPackSlice key; if (slice.isString()) { key = slice; } else { key = slice.get(StaticStrings::KeyString); } TRI_ASSERT(!key.isNone()); if (!key.isString()) { return Result(TRI_ERROR_ARANGO_DOCUMENT_KEY_BAD); } TRI_ASSERT(keyLockInfo != nullptr); if (keyLockInfo->shouldLock) { lockKey(*keyLockInfo, key); } MMFilesDocumentOperation operation(&_logicalCollection, TRI_VOC_DOCUMENT_OPERATION_REMOVE); bool const useDeadlockDetector = (lock && !trx.isSingleOperationTransaction() && !trx.state()->hasHint(transaction::Hints::Hint::NO_DLD)); arangodb::MMFilesCollectionWriteLocker collectionLocker(this, useDeadlockDetector, trx.state(), lock); // get the previous revision LocalDocumentId const oldDocumentId = lookupDocument(&trx, key, previousMdr); if (oldDocumentId.empty()) { return Result(TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND); } VPackSlice const oldDoc(previousMdr.vpack()); TRI_ASSERT(previousMdr.revisionId() != 0); Result res; // Check old revision: if (!options.ignoreRevs && slice.isObject()) { TRI_voc_rid_t expectedRevisionId = TRI_ExtractRevisionId(slice); res = checkRevision(&trx, expectedRevisionId, previousMdr.revisionId()); if (res.fail()) { return res; } } // we found a document to remove try { operation.setDocumentIds(MMFilesDocumentDescriptor(oldDocumentId, oldDoc.begin()), MMFilesDocumentDescriptor()); // delete from indexes res = deleteSecondaryIndexes(trx, oldDocumentId, oldDoc, options.indexOperationMode); if (res.fail()) { insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); THROW_ARANGO_EXCEPTION(res); } res = deletePrimaryIndex(&trx, oldDocumentId, oldDoc, options); if (res.fail()) { insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); THROW_ARANGO_EXCEPTION(res); } operation.indexed(); TRI_IF_FAILURE("RemoveDocumentNoOperation") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } try { removeLocalDocumentId(oldDocumentId, true); } catch (...) 
{ } TRI_IF_FAILURE("RemoveDocumentNoOperationExcept") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } res = static_cast(trx.state()) ->addOperation(documentId, revisionId, operation, marker, options.waitForSync); if (res.ok() && callbackDuringLock != nullptr) { callbackDuringLock(); } } catch (basics::Exception const& ex) { res = Result(ex.code(), ex.what()); } catch (std::bad_alloc const&) { res = Result(TRI_ERROR_OUT_OF_MEMORY); } catch (std::exception const& ex) { res = Result(TRI_ERROR_INTERNAL, ex.what()); } catch (...) { res = Result(TRI_ERROR_INTERNAL); } if (res.fail()) { operation.revert(&trx); } return res; } /// @brief Defer a callback to be executed when the collection /// can be dropped. The callback is supposed to drop /// the collection and it is guaranteed that no one is using /// it at that moment. void MMFilesCollection::deferDropCollection(std::function const& callback) { // add callback for dropping ditches()->createMMFilesDropCollectionDitch(_logicalCollection, callback, __FILE__, __LINE__); } /// @brief rolls back a document operation Result MMFilesCollection::rollbackOperation(transaction::Methods& trx, TRI_voc_document_operation_e type, LocalDocumentId const& oldDocumentId, velocypack::Slice const& oldDoc, LocalDocumentId const& newDocumentId, velocypack::Slice const& newDoc) { OperationOptions options; options.indexOperationMode = Index::OperationMode::rollback; if (type == TRI_VOC_DOCUMENT_OPERATION_INSERT) { TRI_ASSERT(oldDocumentId.empty()); TRI_ASSERT(oldDoc.isNone()); TRI_ASSERT(!newDocumentId.empty()); TRI_ASSERT(!newDoc.isNone()); // ignore any errors we're getting from this deletePrimaryIndex(&trx, newDocumentId, newDoc, options); deleteSecondaryIndexes(trx, newDocumentId, newDoc, Index::OperationMode::rollback); return TRI_ERROR_NO_ERROR; } if (type == TRI_VOC_DOCUMENT_OPERATION_UPDATE || type == TRI_VOC_DOCUMENT_OPERATION_REPLACE) { TRI_ASSERT(!oldDocumentId.empty()); TRI_ASSERT(!oldDoc.isNone()); TRI_ASSERT(!newDocumentId.empty()); TRI_ASSERT(!newDoc.isNone()); // remove the current values from the indexes deleteSecondaryIndexes(trx, newDocumentId, newDoc, Index::OperationMode::rollback); // re-insert old state return insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); } if (type == TRI_VOC_DOCUMENT_OPERATION_REMOVE) { // re-insert old document TRI_ASSERT(!oldDocumentId.empty()); TRI_ASSERT(!oldDoc.isNone()); TRI_ASSERT(newDocumentId.empty()); TRI_ASSERT(newDoc.isNone()); auto res = insertPrimaryIndex(&trx, oldDocumentId, oldDoc, options); if (res.ok()) { res = insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); } else { LOG_TOPIC("fccac", ERR, arangodb::Logger::ENGINES) << "error rolling back remove operation"; } return res; } #ifdef ARANGODB_ENABLE_MAINTAINER_MODE LOG_TOPIC("c5060", ERR, arangodb::Logger::ENGINES) << "logic error. 
invalid operation type on rollback"; #endif return TRI_ERROR_INTERNAL; } /// @brief removes a document or edge, fast path function for database documents Result MMFilesCollection::removeFastPath(transaction::Methods& trx, TRI_voc_rid_t revisionId, LocalDocumentId const& oldDocumentId, velocypack::Slice const oldDoc, OperationOptions& options, LocalDocumentId const& documentId, velocypack::Slice const toRemove) { TRI_IF_FAILURE("RemoveDocumentNoMarker") { // test what happens when no marker can be created return Result(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("RemoveDocumentNoMarkerExcept") { // test what happens if no marker can be created THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } // create marker MMFilesCrudMarker removeMarker( TRI_DF_MARKER_VPACK_REMOVE, static_cast(trx.state())->idForMarker(), documentId, toRemove); MMFilesWalMarker const* marker = &removeMarker; TRI_IF_FAILURE("RemoveDocumentNoLock") { // test what happens if no lock can be acquired return Result(TRI_ERROR_DEBUG); } VPackSlice key = arangodb::transaction::helpers::extractKeyFromDocument(oldDoc); TRI_ASSERT(!key.isNone()); MMFilesDocumentOperation operation(&_logicalCollection, TRI_VOC_DOCUMENT_OPERATION_REMOVE); operation.setDocumentIds(MMFilesDocumentDescriptor(oldDocumentId, oldDoc.begin()), MMFilesDocumentDescriptor()); // delete from indexes Result res; try { res = deleteSecondaryIndexes(trx, oldDocumentId, oldDoc, options.indexOperationMode); if (res.fail()) { insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); THROW_ARANGO_EXCEPTION(res); } res = deletePrimaryIndex(&trx, oldDocumentId, oldDoc, options); if (res.fail()) { insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); THROW_ARANGO_EXCEPTION(res); } operation.indexed(); try { removeLocalDocumentId(oldDocumentId, true); } catch (...) { } TRI_IF_FAILURE("RemoveDocumentNoOperation") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("RemoveDocumentNoOperationExcept") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } res = static_cast(trx.state()) ->addOperation(documentId, revisionId, operation, marker, options.waitForSync); } catch (basics::Exception const& ex) { res = Result(ex.code()); } catch (std::bad_alloc const&) { res = Result(TRI_ERROR_OUT_OF_MEMORY); } catch (std::exception const& ex) { res = Result(TRI_ERROR_INTERNAL, ex.what()); } catch (...) 
{ res = Result(TRI_ERROR_INTERNAL); } if (res.fail()) { operation.revert(&trx); } return res; } /// @brief looks up a document by key, low level worker /// the caller must make sure the read lock on the collection is held /// the key must be a string slice, no revision check is performed LocalDocumentId MMFilesCollection::lookupDocument(transaction::Methods* trx, VPackSlice key, ManagedDocumentResult& result) { TRI_ASSERT(key.isString()); MMFilesSimpleIndexElement element = primaryIndex()->lookupKey(trx, key, result); if (element) { LocalDocumentId const documentId = element.localDocumentId(); uint8_t const* vpack = lookupDocumentVPack(documentId); if (vpack != nullptr) { result.setUnmanaged(vpack); } return documentId; } return LocalDocumentId(); } /// @brief updates an existing document, low level worker /// the caller must make sure the write lock on the collection is held Result MMFilesCollection::updateDocument( transaction::Methods& trx, TRI_voc_rid_t revisionId, LocalDocumentId const& oldDocumentId, velocypack::Slice const& oldDoc, LocalDocumentId const& newDocumentId, velocypack::Slice const& newDoc, MMFilesDocumentOperation& operation, MMFilesWalMarker const* marker, OperationOptions& options, bool& waitForSync) { // remove old document from secondary indexes // (it will stay in the primary index as the key won't change) Result res = deleteSecondaryIndexes(trx, oldDocumentId, oldDoc, options.indexOperationMode); if (res.fail()) { // re-insert the document in case of failure, ignore errors during rollback insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); return res; } // insert new document into secondary indexes res = insertSecondaryIndexes(trx, newDocumentId, newDoc, options.indexOperationMode); if (res.fail()) { // rollback deleteSecondaryIndexes(trx, newDocumentId, newDoc, Index::OperationMode::rollback); insertSecondaryIndexes(trx, oldDocumentId, oldDoc, Index::OperationMode::rollback); return res; } // update the index element (primary index only - other indexes have been // adjusted) // TODO: pass key into this function so it does not have to be looked up again VPackSlice keySlice(transaction::helpers::extractKeyFromDocument(newDoc)); auto* element = primaryIndex()->lookupKeyRef(&trx, keySlice); if (element != nullptr && element->isSet()) { element->updateLocalDocumentId(newDocumentId, static_cast( keySlice.begin() - newDoc.begin())); } operation.indexed(); try { removeLocalDocumentId(oldDocumentId, true); } catch (...) { } TRI_IF_FAILURE("UpdateDocumentNoOperation") { return Result(TRI_ERROR_DEBUG); } TRI_IF_FAILURE("UpdateDocumentNoOperationExcept") { THROW_ARANGO_EXCEPTION(TRI_ERROR_DEBUG); } return Result(static_cast(trx.state()) ->addOperation(newDocumentId, revisionId, operation, marker, waitForSync)); } void MMFilesCollection::lockKey(KeyLockInfo& keyLockInfo, VPackSlice const& key) { TRI_ASSERT(keyLockInfo.key.empty()); // copy out the key we need to lock TRI_ASSERT(key.isString()); std::string k = key.copyString(); MMFilesCollection::KeyLockShard& shard = getShardForKey(k); // register key unlock function keyLockInfo.unlocker = [this](KeyLockInfo& keyLock) { unlockKey(keyLock); }; do { { MUTEX_LOCKER(locker, shard._mutex); // if the insert fails because the key is already in the set, // we carry on trying until the previous value is gone from the set. // if the insert fails because of an out-of-memory exception, // we can let the exception escape from here. 
// no harm will be done
      if (shard._keys.insert(k).second) {
        // if insertion into the lock set succeeded, we can
        // go on without the lock. otherwise we just need to carry on trying
        locker.unlock();
        // store key to unlock later. the move assignment will not fail
        keyLockInfo.key = std::move(k);
        return;
      }
    }
    std::this_thread::yield();
  } while (!application_features::ApplicationServer::isStopping());

  // we can only get here on shutdown
  THROW_ARANGO_EXCEPTION(TRI_ERROR_SHUTTING_DOWN);
}

void MMFilesCollection::unlockKey(KeyLockInfo& keyLockInfo) noexcept {
  TRI_ASSERT(keyLockInfo.shouldLock);
  if (!keyLockInfo.key.empty()) {
    MMFilesCollection::KeyLockShard& shard = getShardForKey(keyLockInfo.key);
    MUTEX_LOCKER(locker, shard._mutex);
    shard._keys.erase(keyLockInfo.key);
  }
}

MMFilesCollection::KeyLockShard& MMFilesCollection::getShardForKey(
    std::string const& key) noexcept {
  size_t hash = std::hash<std::string>()(key);
  return _keyLockShards[hash % numKeyLockShards];
}
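// Design note on the key-lock shards above: getShardForKey() hashes each key
// into one of `numKeyLockShards` shards, so concurrent lockKey() calls on
// different keys usually contend on different mutexes. lockKey() spins
// (yielding between attempts) until the key can be inserted into its shard's
// set, and registers an unlocker that erases it again. Intended call pattern
// (a sketch; the surrounding caller code is an assumption, not taken from
// this file):
//
//   KeyLockInfo keyLock;
//   keyLock.shouldLock = true;
//   lockKey(keyLock, keySlice);    // returns with keyLock.key set,
//                                  // or throws TRI_ERROR_SHUTTING_DOWN
//   // ... perform the keyed operation ...
//   keyLock.unlocker(keyLock);     // set by lockKey(); calls unlockKey()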