////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Jan Steemann
////////////////////////////////////////////////////////////////////////////////
#include "MMFilesCollection.h"
#include "ApplicationFeatures/ApplicationServer.h"
#include "Basics/FileUtils.h"
#include "Basics/ReadLocker.h"
#include "Basics/StaticStrings.h"
#include "Basics/VelocyPackHelper.h"
#include "Basics/WriteLocker.h"
#include "Indexes/PrimaryIndex.h"
#include "Logger/Logger.h"
#include "RestServer/DatabaseFeature.h"
#include "StorageEngine/EngineSelectorFeature.h"
#include "StorageEngine/MMFilesDocumentPosition.h"
#include "StorageEngine/StorageEngine.h"
#include "Utils/SingleCollectionTransaction.h"
#include "Utils/StandaloneTransactionContext.h"
#include "Utils/Transaction.h"
#include "VocBase/DatafileHelper.h"
#include "VocBase/KeyGenerator.h"
#include "VocBase/LogicalCollection.h"
#include "VocBase/datafile.h"
#include "VocBase/ticks.h"
#include "Wal/LogfileManager.h"
using namespace arangodb;
namespace {
/// @brief find a statistics container for a given file id
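/// note: the container is registered in the state's _stats map but returned
/// as a raw pointer; ownership is assumed to remain with OpenIteratorState,
/// which cleans up all registered containers on destruction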
static DatafileStatisticsContainer* FindDatafileStats(
MMFilesCollection::OpenIteratorState* state, TRI_voc_fid_t fid) {
auto it = state->_stats.find(fid);
if (it != state->_stats.end()) {
return (*it).second;
}
auto stats = std::make_unique<DatafileStatisticsContainer>();
state->_stats.emplace(fid, stats.get());
return stats.release();
}
} // namespace
/// @brief process a document (or edge) marker when opening a collection
int MMFilesCollection::OpenIteratorHandleDocumentMarker(TRI_df_marker_t const* marker,
TRI_datafile_t* datafile,
MMFilesCollection::OpenIteratorState* state) {
LogicalCollection* collection = state->_collection;
MMFilesCollection* c = static_cast<MMFilesCollection*>(collection->getPhysical());
arangodb::Transaction* trx = state->_trx;
TRI_ASSERT(trx != nullptr);
VPackSlice const slice(reinterpret_cast<char const*>(marker) + DatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT));
uint8_t const* vpack = slice.begin();
VPackSlice keySlice;
TRI_voc_rid_t revisionId;
Transaction::extractKeyAndRevFromDocument(slice, keySlice, revisionId);
c->setRevision(revisionId, false);
if (state->_trackKeys) {
VPackValueLength length;
char const* p = keySlice.getString(length);
collection->keyGenerator()->track(p, length);
}
++state->_documents;
TRI_voc_fid_t const fid = datafile->fid();
if (state->_fid != fid) {
// update the state
state->_fid = fid; // when we're here, we're looking at a datafile
state->_dfi = FindDatafileStats(state, fid);
}
// no primary index lock required here because we are the only ones reading
// from the index ATM
SimpleIndexElement* found = state->_primaryIndex->lookupKeyRef(trx, keySlice, state->_mmdr);
// it is a new entry
if (found == nullptr || found->revisionId() == 0) {
c->insertRevision(revisionId, vpack, fid, false, false);
// insert into primary index
int res = state->_primaryIndex->insertKey(trx, revisionId, VPackSlice(vpack), state->_mmdr);
if (res != TRI_ERROR_NO_ERROR) {
c->removeRevision(revisionId, false);
LOG(ERR) << "inserting document into primary index failed with error: " << TRI_errno_string(res);
return res;
}
// update the datafile info
state->_dfi->numberAlive++;
state->_dfi->sizeAlive += DatafileHelper::AlignedMarkerSize<int64_t>(marker);
}
// it is an update
else {
TRI_voc_rid_t const oldRevisionId = found->revisionId();
// update the revision id in primary index
found->updateRevisionId(revisionId, static_cast<uint32_t>(keySlice.begin() - vpack));
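    // the second argument is the byte offset of the _key attribute inside the
    // vpack data; it is cached in the index element, presumably so later
    // lookups can locate the key without re-parsing the document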
MMFilesDocumentPosition const old = c->lookupRevision(oldRevisionId);
// remove old revision
c->removeRevision(oldRevisionId, false);
// insert new revision
c->insertRevision(revisionId, vpack, fid, false, false);
// update the datafile info
DatafileStatisticsContainer* dfi;
if (old.fid() == state->_fid) {
dfi = state->_dfi;
} else {
dfi = FindDatafileStats(state, old.fid());
}
if (old.dataptr() != nullptr) {
uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
int64_t size = static_cast<int64_t>(arangodb::DatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT) + VPackSlice(vpack).byteSize());
dfi->numberAlive--;
dfi->sizeAlive -= DatafileHelper::AlignedSize<int64_t>(size);
dfi->numberDead++;
dfi->sizeDead += DatafileHelper::AlignedSize<int64_t>(size);
}
state->_dfi->numberAlive++;
state->_dfi->sizeAlive += DatafileHelper::AlignedMarkerSize<int64_t>(marker);
}
return TRI_ERROR_NO_ERROR;
}
/// @brief process a deletion marker when opening a collection
int MMFilesCollection::OpenIteratorHandleDeletionMarker(TRI_df_marker_t const* marker,
TRI_datafile_t* datafile,
MMFilesCollection::OpenIteratorState* state) {
LogicalCollection* collection = state->_collection;
MMFilesCollection* c = static_cast<MMFilesCollection*>(collection->getPhysical());
arangodb::Transaction* trx = state->_trx;
VPackSlice const slice(reinterpret_cast<char const*>(marker) + DatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_REMOVE));
VPackSlice keySlice;
TRI_voc_rid_t revisionId;
Transaction::extractKeyAndRevFromDocument(slice, keySlice, revisionId);
c->setRevision(revisionId, false);
if (state->_trackKeys) {
VPackValueLength length;
char const* p = keySlice.getString(length);
collection->keyGenerator()->track(p, length);
}
++state->_deletions;
if (state->_fid != datafile->fid()) {
// update the state
state->_fid = datafile->fid();
state->_dfi = FindDatafileStats(state, datafile->fid());
}
// no primary index lock required here because we are the only ones reading
// from the index ATM
SimpleIndexElement found = state->_primaryIndex->lookupKey(trx, keySlice, state->_mmdr);
// it is a new entry, so we missed the create
if (!found) {
// update the datafile info
state->_dfi->numberDeletions++;
}
// it is a real delete
else {
TRI_voc_rid_t oldRevisionId = found.revisionId();
MMFilesDocumentPosition const old = c->lookupRevision(oldRevisionId);
// update the datafile info
DatafileStatisticsContainer* dfi;
if (old.fid() == state->_fid) {
dfi = state->_dfi;
} else {
dfi = FindDatafileStats(state, old.fid());
}
TRI_ASSERT(old.dataptr() != nullptr);
uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
int64_t size = DatafileHelper::AlignedSize<int64_t>(arangodb::DatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT) + VPackSlice(vpack).byteSize());
dfi->numberAlive--;
dfi->sizeAlive -= DatafileHelper::AlignedSize<int64_t>(size);
dfi->numberDead++;
dfi->sizeDead += DatafileHelper::AlignedSize<int64_t>(size);
state->_dfi->numberDeletions++;
state->_primaryIndex->removeKey(trx, oldRevisionId, VPackSlice(vpack), state->_mmdr);
c->removeRevision(oldRevisionId, true);
}
return TRI_ERROR_NO_ERROR;
}
/// @brief marker iterator invoked when opening a collection
bool MMFilesCollection::OpenIterator(TRI_df_marker_t const* marker, MMFilesCollection::OpenIteratorState* data,
TRI_datafile_t* datafile) {
TRI_voc_tick_t const tick = marker->getTick();
TRI_df_marker_type_t const type = marker->getType();
int res;
if (type == TRI_DF_MARKER_VPACK_DOCUMENT) {
res = OpenIteratorHandleDocumentMarker(marker, datafile, data);
if (datafile->_dataMin == 0) {
datafile->_dataMin = tick;
}
if (tick > datafile->_dataMax) {
datafile->_dataMax = tick;
}
} else if (type == TRI_DF_MARKER_VPACK_REMOVE) {
res = OpenIteratorHandleDeletionMarker(marker, datafile, data);
} else {
if (type == TRI_DF_MARKER_HEADER) {
// ensure there is a datafile info entry for each datafile of the
// collection
FindDatafileStats(data, datafile->fid());
}
LOG(TRACE) << "skipping marker type " << TRI_NameMarkerDatafile(marker);
res = TRI_ERROR_NO_ERROR;
}
if (datafile->_tickMin == 0) {
datafile->_tickMin = tick;
}
if (tick > datafile->_tickMax) {
datafile->_tickMax = tick;
}
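  // advance the collection-wide maximum tick, but ignore pure bookkeeping
  // markers (headers, footers, prologues) that do not correspond to user data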
if (tick > data->_collection->maxTick()) {
if (type != TRI_DF_MARKER_HEADER &&
type != TRI_DF_MARKER_FOOTER &&
type != TRI_DF_MARKER_COL_HEADER &&
type != TRI_DF_MARKER_PROLOGUE) {
data->_collection->maxTick(tick);
}
}
return (res == TRI_ERROR_NO_ERROR);
}
MMFilesCollection::MMFilesCollection(LogicalCollection* collection)
: PhysicalCollection(collection), _ditches(collection), _initialCount(0), _lastRevision(0) {}
MMFilesCollection::~MMFilesCollection() {
try {
close();
} catch (...) {
// dtor must not propagate exceptions
}
}
TRI_voc_rid_t MMFilesCollection::revision() const {
return _lastRevision;
}
void MMFilesCollection::setRevision(TRI_voc_rid_t revision, bool force) {
if (force || revision > _lastRevision) {
_lastRevision = revision;
}
}
int64_t MMFilesCollection::initialCount() const {
return _initialCount;
}
void MMFilesCollection::updateCount(int64_t count) {
_initialCount = count;
_revisionsCache.sizeHint(count);
}
/// @brief closes an open collection
int MMFilesCollection::close() {
{
WRITE_LOCKER(writeLocker, _filesLock);
// close compactor files
closeDatafiles(_compactors);
for (auto& it : _compactors) {
delete it;
}
_compactors.clear();
// close journal files
closeDatafiles(_journals);
for (auto& it : _journals) {
delete it;
}
_journals.clear();
// close datafiles
closeDatafiles(_datafiles);
for (auto& it : _datafiles) {
delete it;
}
_datafiles.clear();
}
_lastRevision = 0;
// clear revisions lookup table
_revisionsCache.clear();
return TRI_ERROR_NO_ERROR;
}
/// @brief seal a datafile
int MMFilesCollection::sealDatafile(TRI_datafile_t* datafile, bool isCompactor) {
int res = datafile->seal();
if (res != TRI_ERROR_NO_ERROR) {
LOG(ERR) << "failed to seal journal '" << datafile->getName()
<< "': " << TRI_errno_string(res);
return res;
}
if (!isCompactor && datafile->isPhysical()) {
// rename the file
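    // a sealed journal moves from the 'journal-<fid>.db' naming scheme to
    // 'datafile-<fid>.db', marking it as immutable from now on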
std::string dname("datafile-" + std::to_string(datafile->fid()) + ".db");
std::string filename = arangodb::basics::FileUtils::buildFilename(_logicalCollection->path(), dname);
res = datafile->rename(filename);
if (res == TRI_ERROR_NO_ERROR) {
LOG(TRACE) << "closed file '" << datafile->getName() << "'";
} else {
LOG(ERR) << "failed to rename datafile '" << datafile->getName()
<< "' to '" << filename << "': " << TRI_errno_string(res);
}
}
return res;
}
/// @brief rotate the active journal - will do nothing if there is no journal
int MMFilesCollection::rotateActiveJournal() {
WRITE_LOCKER(writeLocker, _filesLock);
// note: only journals need to be handled here as the journal is the
// only place that's ever written to. if a journal is full, it will have been
// sealed and synced already
if (_journals.empty()) {
return TRI_ERROR_ARANGO_NO_JOURNAL;
}
TRI_datafile_t* datafile = _journals[0];
TRI_ASSERT(datafile != nullptr);
// make sure we have enough room in the target vector before we go on
_datafiles.reserve(_datafiles.size() + 1);
int res = sealDatafile(datafile, false);
if (res != TRI_ERROR_NO_ERROR) {
return res;
}
// shouldn't throw as we reserved enough space before
_datafiles.emplace_back(datafile);
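  // there is at most one active journal at any point in time, so after
  // removing it the _journals vector must be empty again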
TRI_ASSERT(!_journals.empty());
TRI_ASSERT(_journals.back() == datafile);
_journals.erase(_journals.begin());
TRI_ASSERT(_journals.empty());
return res;
}
/// @brief sync the active journal - will do nothing if there is no journal
/// or if the journal is volatile
int MMFilesCollection::syncActiveJournal() {
WRITE_LOCKER(writeLocker, _filesLock);
// note: only journals need to be handled here as the journal is the
// only place that's ever written to. if a journal is full, it will have been
// sealed and synced already
if (_journals.empty()) {
// nothing to do
return TRI_ERROR_NO_ERROR;
}
TRI_datafile_t* datafile = _journals[0];
TRI_ASSERT(datafile != nullptr);
int res = TRI_ERROR_NO_ERROR;
// we only need to care about physical datafiles
// anonymous regions do not need to be synced
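  // _synced points to the end of the region already msync()ed to disk,
  // _written to the current write head; only the delta needs to be synced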
if (datafile->isPhysical()) {
char const* synced = datafile->_synced;
char* written = datafile->_written;
if (synced < written) {
bool ok = datafile->sync(synced, written);
if (ok) {
LOG_TOPIC(TRACE, Logger::COLLECTOR) << "msync succeeded "
<< (void*)synced << ", size "
<< (written - synced);
datafile->_synced = written;
} else {
res = TRI_errno();
if (res == TRI_ERROR_NO_ERROR) {
// oops, error code got lost
res = TRI_ERROR_INTERNAL;
}
LOG_TOPIC(ERR, Logger::COLLECTOR)
<< "msync failed with: " << TRI_last_error();
datafile->setState(TRI_DF_STATE_WRITE_ERROR);
}
}
}
return res;
}
/// @brief reserve space in the current journal. if no journal exists or the
/// current journal cannot provide enough space, close the old journal and
/// create a new one
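/// a usage sketch (hypothetical caller; locking and interaction with the WAL
/// are omitted):
///   char* position = nullptr;
///   TRI_datafile_t* datafile = nullptr;
///   int res = reserveJournalSpace(tick, markerSize, position, datafile);
///   if (res == TRI_ERROR_NO_ERROR) {
///     // copy a fully prepared marker of markerSize bytes to 'position';
///     // the datafile's write pointer has already been advanced
///   }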
int MMFilesCollection::reserveJournalSpace(TRI_voc_tick_t tick,
TRI_voc_size_t size,
char*& resultPosition,
TRI_datafile_t*& resultDatafile) {
// reset results
resultPosition = nullptr;
resultDatafile = nullptr;
WRITE_LOCKER(writeLocker, _filesLock);
// start with configured journal size
TRI_voc_size_t targetSize = static_cast<TRI_voc_size_t>(_logicalCollection->journalSize());
// make sure that the document fits
while (targetSize - 256 < size) {
targetSize *= 2;
}
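  // note: 256 bytes of slack are kept, presumably for datafile overhead such
  // as header and footer markers. overall strategy of the loop below: make
  // sure an active journal exists, try to reserve the requested space in it,
  // and if it is full, seal and rotate it and retry with a fresh journal in
  // the next iteration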
while (true) {
TRI_datafile_t* datafile = nullptr;
if (_journals.empty()) {
// create enough room in the journals vector
_journals.reserve(_journals.size() + 1);
try {
std::unique_ptr<TRI_datafile_t> df(createDatafile(tick, targetSize, false));
// shouldn't throw as we reserved enough space before
_journals.emplace_back(df.get());
df.release();
} catch (basics::Exception const& ex) {
LOG_TOPIC(ERR, Logger::COLLECTOR) << "cannot select journal: " << ex.what();
return ex.code();
} catch (std::exception const& ex) {
LOG_TOPIC(ERR, Logger::COLLECTOR) << "cannot select journal: " << ex.what();
return TRI_ERROR_INTERNAL;
} catch (...) {
LOG_TOPIC(ERR, Logger::COLLECTOR) << "cannot select journal: unknown exception";
return TRI_ERROR_INTERNAL;
}
}
// select datafile
TRI_ASSERT(!_journals.empty());
datafile = _journals[0];
TRI_ASSERT(datafile != nullptr);
// try to reserve space in the datafile
TRI_df_marker_t* position = nullptr;
int res = datafile->reserveElement(size, &position, targetSize);
// found a datafile with enough space left
if (res == TRI_ERROR_NO_ERROR) {
datafile->_written = ((char*)position) + size;
// set result
resultPosition = reinterpret_cast<char*>(position);
resultDatafile = datafile;
return TRI_ERROR_NO_ERROR;
}
if (res != TRI_ERROR_ARANGO_DATAFILE_FULL) {
// some other error
LOG_TOPIC(ERR, Logger::COLLECTOR) << "cannot select journal: '"
<< TRI_last_error() << "'";
return res;
}
// TRI_ERROR_ARANGO_DATAFILE_FULL...
// journal is full, close it and sync
LOG_TOPIC(DEBUG, Logger::COLLECTOR) << "closing full journal '"
<< datafile->getName() << "'";
// make sure we have enough room in the target vector before we go on
_datafiles.reserve(_datafiles.size() + 1);
res = sealDatafile(datafile, false);
// move journal into _datafiles vector
// this shouldn't fail, as we have reserved space before already
_datafiles.emplace_back(datafile);
// and finally erase it from _journals vector
TRI_ASSERT(!_journals.empty());
TRI_ASSERT(_journals.back() == datafile);
_journals.erase(_journals.begin());
TRI_ASSERT(_journals.empty());
if (res != TRI_ERROR_NO_ERROR) {
// an error occurred, we must stop here
return res;
}
} // otherwise, next iteration!
return TRI_ERROR_ARANGO_NO_JOURNAL;
}
/// @brief create compactor file
TRI_datafile_t* MMFilesCollection::createCompactor(TRI_voc_fid_t fid,
TRI_voc_size_t maximalSize) {
WRITE_LOCKER(writeLocker, _filesLock);
TRI_ASSERT(_compactors.empty());
// reserve enough space for the later addition
_compactors.reserve(_compactors.size() + 1);
std::unique_ptr<TRI_datafile_t> compactor(createDatafile(fid, static_cast<TRI_voc_size_t>(maximalSize), true));
// should not throw, as we've reserved enough space before
_compactors.emplace_back(compactor.get());
return compactor.release();
}
/// @brief close an existing compactor
int MMFilesCollection::closeCompactor(TRI_datafile_t* datafile) {
WRITE_LOCKER(writeLocker, _filesLock);
if (_compactors.size() != 1) {
return TRI_ERROR_ARANGO_NO_JOURNAL;
}
TRI_datafile_t* compactor = _compactors[0];
if (datafile != compactor) {
// wrong compactor file specified... should not happen
return TRI_ERROR_INTERNAL;
}
return sealDatafile(datafile, true);
}
/// @brief replace a datafile with a compactor
int MMFilesCollection::replaceDatafileWithCompactor(TRI_datafile_t* datafile,
TRI_datafile_t* compactor) {
TRI_ASSERT(datafile != nullptr);
TRI_ASSERT(compactor != nullptr);
WRITE_LOCKER(writeLocker, _filesLock);
TRI_ASSERT(!_compactors.empty());
for (size_t i = 0; i < _datafiles.size(); ++i) {
if (_datafiles[i]->fid() == datafile->fid()) {
// found!
// now put the compactor in place of the datafile
_datafiles[i] = compactor;
// remove the compactor file from the list of compactors
TRI_ASSERT(_compactors[0] != nullptr);
TRI_ASSERT(_compactors[0]->fid() == compactor->fid());
_compactors.erase(_compactors.begin());
TRI_ASSERT(_compactors.empty());
return TRI_ERROR_NO_ERROR;
}
}
return TRI_ERROR_INTERNAL;
}
/// @brief creates a datafile
TRI_datafile_t* MMFilesCollection::createDatafile(TRI_voc_fid_t fid,
TRI_voc_size_t journalSize,
bool isCompactor) {
TRI_ASSERT(fid > 0);
// create an entry for the new datafile
try {
_datafileStatistics.create(fid);
} catch (...) {
THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY);
}
std::unique_ptr<TRI_datafile_t> datafile;
if (_logicalCollection->isVolatile()) {
// in-memory collection
datafile.reset(TRI_datafile_t::create(StaticStrings::Empty, fid, journalSize, true));
} else {
// construct a suitable filename (which may be temporary at the beginning)
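    // regular journals are created under a 'temp-<fid>.db' name first and are
    // only renamed to 'journal-<fid>.db' after the collection header has been
    // written successfully; this way, half-initialized files are easy to
    // recognize (compactors keep their 'compaction-' name until sealed)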
std::string jname;
if (isCompactor) {
jname = "compaction-";
} else {
jname = "temp-";
}
jname.append(std::to_string(fid) + ".db");
std::string filename = arangodb::basics::FileUtils::buildFilename(_logicalCollection->path(), jname);
TRI_IF_FAILURE("CreateJournalDocumentCollection") {
// simulate disk full
THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_FILESYSTEM_FULL);
}
// remove an existing temporary file first
if (TRI_ExistsFile(filename.c_str())) {
// remove an existing file first
TRI_UnlinkFile(filename.c_str());
}
datafile.reset(TRI_datafile_t::create(filename, fid, journalSize, true));
}
if (datafile == nullptr) {
if (TRI_errno() == TRI_ERROR_OUT_OF_MEMORY_MMAP) {
THROW_ARANGO_EXCEPTION(TRI_ERROR_OUT_OF_MEMORY_MMAP);
}
THROW_ARANGO_EXCEPTION(TRI_ERROR_ARANGO_NO_JOURNAL);
}
// datafile is there now
TRI_ASSERT(datafile != nullptr);
if (isCompactor) {
LOG(TRACE) << "created new compactor '" << datafile->getName()
<< "'";
} else {
LOG(TRACE) << "created new journal '" << datafile->getName() << "'";
}
// create a collection header, still in the temporary file
TRI_df_marker_t* position;
int res = datafile->reserveElement(sizeof(TRI_col_header_marker_t), &position, journalSize);
TRI_IF_FAILURE("CreateJournalDocumentCollectionReserve1") {
res = TRI_ERROR_DEBUG;
}
if (res != TRI_ERROR_NO_ERROR) {
LOG(ERR) << "cannot create collection header in file '"
<< datafile->getName() << "': " << TRI_errno_string(res);
// close the journal and remove it
std::string temp(datafile->getName());
datafile.reset();
TRI_UnlinkFile(temp.c_str());
THROW_ARANGO_EXCEPTION(res);
}
TRI_col_header_marker_t cm;
DatafileHelper::InitMarker(
reinterpret_cast<TRI_df_marker_t*>(&cm), TRI_DF_MARKER_COL_HEADER,
sizeof(TRI_col_header_marker_t), static_cast<TRI_voc_tick_t>(fid));
cm._cid = _logicalCollection->cid();
res = datafile->writeCrcElement(position, &cm.base, false);
TRI_IF_FAILURE("CreateJournalDocumentCollectionReserve2") {
res = TRI_ERROR_DEBUG;
}
if (res != TRI_ERROR_NO_ERROR) {
// use the error code recorded by the datafile itself
res = datafile->_lastError;
LOG(ERR) << "cannot create collection header in file '"
<< datafile->getName() << "': " << TRI_last_error();
// close the datafile and remove it
std::string temp(datafile->getName());
datafile.reset();
TRI_UnlinkFile(temp.c_str());
THROW_ARANGO_EXCEPTION(res);
}
TRI_ASSERT(fid == datafile->fid());
// if a physical file, we can rename it from the temporary name to the correct
// name
if (!isCompactor && datafile->isPhysical()) {
// and use the correct name
std::string jname("journal-" + std::to_string(datafile->fid()) + ".db");
std::string filename = arangodb::basics::FileUtils::buildFilename(_logicalCollection->path(), jname);
int res = datafile->rename(filename);
if (res != TRI_ERROR_NO_ERROR) {
LOG(ERR) << "failed to rename journal '" << datafile->getName()
<< "' to '" << filename << "': " << TRI_errno_string(res);
std::string temp(datafile->getName());
datafile.reset();
TRI_UnlinkFile(temp.c_str());
THROW_ARANGO_EXCEPTION(res);
}
LOG(TRACE) << "renamed journal from '" << datafile->getName()
<< "' to '" << filename << "'";
}
return datafile.release();
}
/// @brief remove a compactor file from the list of compactors
bool MMFilesCollection::removeCompactor(TRI_datafile_t* df) {
TRI_ASSERT(df != nullptr);
WRITE_LOCKER(writeLocker, _filesLock);
for (auto it = _compactors.begin(); it != _compactors.end(); ++it) {
if ((*it) == df) {
// and finally remove the file from the _compactors vector
_compactors.erase(it);
return true;
}
}
// not found
return false;
}
/// @brief remove a datafile from the list of datafiles
bool MMFilesCollection::removeDatafile(TRI_datafile_t* df) {
TRI_ASSERT(df != nullptr);
WRITE_LOCKER(writeLocker, _filesLock);
for (auto it = _datafiles.begin(); it != _datafiles.end(); ++it) {
if ((*it) == df) {
// and finally remove the file from the _datafiles vector
_datafiles.erase(it);
return true;
}
}
// not found
return false;
}
/// @brief iterates over all markers in the collection's datafiles, compactors
/// and journals
bool MMFilesCollection::iterateDatafiles(std::function<bool(TRI_df_marker_t const*, TRI_datafile_t*)> const& cb) {
if (!iterateDatafilesVector(_datafiles, cb) ||
!iterateDatafilesVector(_compactors, cb) ||
!iterateDatafilesVector(_journals, cb)) {
return false;
}
return true;
}
/// @brief iterate over all datafiles in a vector
bool MMFilesCollection::iterateDatafilesVector(std::vector<TRI_datafile_t*> const& files,
std::function<bool(TRI_df_marker_t const*, TRI_datafile_t*)> const& cb) {
for (auto const& datafile : files) {
if (!TRI_IterateDatafile(datafile, cb)) {
return false;
}
if (datafile->isPhysical() && datafile->_isSealed) {
TRI_MMFileAdvise(datafile->_data, datafile->maximalSize(),
TRI_MADVISE_RANDOM);
}
}
return true;
}
/// @brief closes the datafiles passed in the vector
bool MMFilesCollection::closeDatafiles(std::vector<TRI_datafile_t*> const& files) {
bool result = true;
for (auto const& datafile : files) {
TRI_ASSERT(datafile != nullptr);
if (datafile->state() == TRI_DF_STATE_CLOSED) {
continue;
}
int res = datafile->close();
if (res != TRI_ERROR_NO_ERROR) {
result = false;
}
}
return result;
}
void MMFilesCollection::figures(std::shared_ptr<arangodb::velocypack::Builder>& builder) {
builder->add("documentReferences", VPackValue(_ditches.numDocumentDitches()));
char const* waitingForDitch = _ditches.head();
builder->add("waitingFor", VPackValue(waitingForDitch == nullptr ? "-" : waitingForDitch));
// add datafile statistics
DatafileStatisticsContainer dfi = _datafileStatistics.all();
builder->add("alive", VPackValue(VPackValueType::Object));
builder->add("count", VPackValue(dfi.numberAlive));
builder->add("size", VPackValue(dfi.sizeAlive));
builder->close(); // alive
builder->add("dead", VPackValue(VPackValueType::Object));
builder->add("count", VPackValue(dfi.numberDead));
builder->add("size", VPackValue(dfi.sizeDead));
builder->add("deletion", VPackValue(dfi.numberDeletions));
builder->close(); // dead
// add file statistics
READ_LOCKER(readLocker, _filesLock);
size_t sizeDatafiles = 0;
builder->add("datafiles", VPackValue(VPackValueType::Object));
for (auto const& it : _datafiles) {
sizeDatafiles += it->_initSize;
}
builder->add("count", VPackValue(_datafiles.size()));
builder->add("fileSize", VPackValue(sizeDatafiles));
builder->close(); // datafiles
size_t sizeJournals = 0;
for (auto const& it : _journals) {
sizeJournals += it->_initSize;
}
builder->add("journals", VPackValue(VPackValueType::Object));
builder->add("count", VPackValue(_journals.size()));
builder->add("fileSize", VPackValue(sizeJournals));
builder->close(); // journals
size_t sizeCompactors = 0;
for (auto const& it : _compactors) {
sizeCompactors += it->_initSize;
}
builder->add("compactors", VPackValue(VPackValueType::Object));
builder->add("count", VPackValue(_compactors.size()));
builder->add("fileSize", VPackValue(sizeCompactors));
builder->close(); // compactors
}
/// @brief iterate over the collection's datafiles and journals and pick those
/// whose data range overlaps the requested range
std::vector<MMFilesCollection::DatafileDescription> MMFilesCollection::datafilesInRange(TRI_voc_tick_t dataMin, TRI_voc_tick_t dataMax) {
std::vector<DatafileDescription> result;
auto apply = [&dataMin, &dataMax, &result](TRI_datafile_t const* datafile, bool isJournal) {
DatafileDescription entry = {datafile, datafile->_dataMin, datafile->_dataMax, datafile->_tickMax, isJournal};
LOG(TRACE) << "checking datafile " << datafile->fid() << " with data range " << datafile->_dataMin << " - " << datafile->_dataMax << ", tick max: " << datafile->_tickMax;
if (datafile->_dataMin == 0 || datafile->_dataMax == 0) {
// datafile doesn't have any data
return;
}
TRI_ASSERT(datafile->_tickMin <= datafile->_tickMax);
TRI_ASSERT(datafile->_dataMin <= datafile->_dataMax);
if (dataMax < datafile->_dataMin) {
// datafile is newer than requested range
return;
}
if (dataMin > datafile->_dataMax) {
// datafile is older than requested range
return;
}
result.emplace_back(entry);
};
READ_LOCKER(readLocker, _filesLock);
for (auto& it : _datafiles) {
apply(it, false);
}
for (auto& it : _journals) {
apply(it, true);
}
return result;
}
bool MMFilesCollection::applyForTickRange(TRI_voc_tick_t dataMin, TRI_voc_tick_t dataMax,
std::function<bool(TRI_voc_tick_t foundTick, TRI_df_marker_t const* marker)> const& callback) {
LOG(TRACE) << "getting datafiles in data range " << dataMin << " - " << dataMax;
std::vector<DatafileDescription> datafiles = datafilesInRange(dataMin, dataMax);
// now we have a list of datafiles...
size_t const n = datafiles.size();
for (size_t i = 0; i < n; ++i) {
auto const& e = datafiles[i];
TRI_datafile_t const* datafile = e._data;
// we are reading from a journal that might be modified in parallel
// so we must read-lock it
CONDITIONAL_READ_LOCKER(readLocker, _filesLock, e._isJournal);
if (!e._isJournal) {
TRI_ASSERT(datafile->_isSealed);
}
char const* ptr = datafile->_data;
char const* end = ptr + datafile->_currentSize;
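    // walk the datafile marker by marker; a marker size of 0 indicates the
    // end of the used part of the datafile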
while (ptr < end) {
auto const* marker = reinterpret_cast<TRI_df_marker_t const*>(ptr);
if (marker->getSize() == 0) {
// end of datafile
break;
}
TRI_df_marker_type_t type = marker->getType();
if (type <= TRI_DF_MARKER_MIN) {
break;
}
ptr += DatafileHelper::AlignedMarkerSize<size_t>(marker);
if (type == TRI_DF_MARKER_BLANK) {
// fully ignore these marker types. they don't need to be replicated,
// but we also cannot stop iteration if we find one of these
continue;
}
// get the marker's tick and check whether we should include it
TRI_voc_tick_t foundTick = marker->getTick();
if (foundTick <= dataMin) {
// marker too old
continue;
}
if (foundTick > dataMax) {
// marker too new
return false; // hasMore = false
}
if (type != TRI_DF_MARKER_VPACK_DOCUMENT &&
type != TRI_DF_MARKER_VPACK_REMOVE) {
// found a non-data marker...
// check if we can abort searching
if (foundTick >= dataMax || (foundTick > e._tickMax && i == (n - 1))) {
// fetched the last available marker
return false; // hasMore = false
}
continue;
}
// note the last tick we processed
bool doAbort = false;
if (!callback(foundTick, marker)) {
doAbort = true;
}
if (foundTick >= dataMax || (foundTick >= e._tickMax && i == (n - 1))) {
// fetched the last available marker
return false; // hasMore = false
}
if (doAbort) {
return true; // hasMore = true
}
} // next marker in datafile
} // next datafile
return false; // hasMore = false
}
/// @brief report extra memory used by indexes etc.
size_t MMFilesCollection::memory() const {
return 0; // TODO
}
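// the compaction lock is a read-write lock: operations that must not run
// concurrently with compaction acquire it in read mode (so many of them may
// run in parallel), while the compactor itself acquires it in write mode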
/// @brief disallow compaction of the collection
void MMFilesCollection::preventCompaction() {
_compactionLock.readLock();
}
/// @brief try disallowing compaction of the collection
bool MMFilesCollection::tryPreventCompaction() {
return _compactionLock.tryReadLock();
}
/// @brief re-allow compaction of the collection
void MMFilesCollection::allowCompaction() {
_compactionLock.unlock();
}
/// @brief exclusively lock the collection for compaction
void MMFilesCollection::lockForCompaction() {
_compactionLock.writeLock();
}
/// @brief try to exclusively lock the collection for compaction
bool MMFilesCollection::tryLockForCompaction() {
return _compactionLock.tryWriteLock();
}
/// @brief signal that compaction is finished
void MMFilesCollection::finishCompaction() {
_compactionLock.unlock();
}
/// @brief iterate all markers of the collection
int MMFilesCollection::iterateMarkersOnLoad(arangodb::Transaction* trx) {
// initialize state for iteration
OpenIteratorState openState(_logicalCollection, trx);
if (_initialCount != -1) {
_logicalCollection->sizeHint(trx, _initialCount);
openState._initialCount = _initialCount;
}
// read all documents and fill primary index
auto cb = [&openState](TRI_df_marker_t const* marker, TRI_datafile_t* datafile) -> bool {
return OpenIterator(marker, &openState, datafile);
};
iterateDatafiles(cb);
LOG(TRACE) << "found " << openState._documents << " document markers, "
<< openState._deletions << " deletion markers for collection '" << _logicalCollection->name() << "'";
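  // the magic constant below is roughly the number of milliseconds between
  // January 1970 and January 2016; _rev values above it are assumed to be
  // timestamp-derived revision ids as generated by ArangoDB 3.1, which a
  // collection created with 3.0 or earlier should not contain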
if (_logicalCollection->version() <= LogicalCollection::VERSION_30 &&
_lastRevision >= static_cast<TRI_voc_rid_t>(2016 - 1970) * 1000 * 60 * 60 * 24 * 365 &&
application_features::ApplicationServer::server->getFeature<DatabaseFeature>("Database")->check30Revisions()) {
// a collection from 3.0 or earlier with a _rev value that is higher than we can handle safely
_logicalCollection->setRevisionError();
LOG(WARN) << "collection '" << _logicalCollection->name() << "' contains _rev values that are higher than expected for an ArangoDB 3.1 database. If this collection was created or used with a pre-release or development version of ArangoDB 3.1, please restart the server with option '--database.check-30-revisions false' to suppress this warning. If this collection was created with an ArangoDB 3.0, please dump the 3.0 database with arangodump and restore it in 3.1 with arangorestore.";
if (application_features::ApplicationServer::server->getFeature<DatabaseFeature>("Database")->fail30Revisions()) {
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE, std::string("collection '") + _logicalCollection->name() + "' contains _rev values from 3.0 and needs to be migrated using dump/restore");
}
}
// update the real statistics for the collection
try {
for (auto& it : openState._stats) {
createStats(it.first, *(it.second));
}
} catch (basics::Exception const& ex) {
return ex.code();
} catch (...) {
return TRI_ERROR_INTERNAL;
}
return TRI_ERROR_NO_ERROR;
}
MMFilesDocumentPosition MMFilesCollection::lookupRevision(TRI_voc_rid_t revisionId) const {
TRI_ASSERT(revisionId != 0);
MMFilesDocumentPosition const old = _revisionsCache.lookup(revisionId);
if (old) {
return old;
}
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "got invalid revision value on lookup");
}
uint8_t const* MMFilesCollection::lookupRevisionVPack(TRI_voc_rid_t revisionId) const {
TRI_ASSERT(revisionId != 0);
MMFilesDocumentPosition const old = _revisionsCache.lookup(revisionId);
if (old) {
uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
TRI_ASSERT(VPackSlice(vpack).isObject());
return vpack;
}
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "got invalid vpack value on lookup");
}
uint8_t const* MMFilesCollection::lookupRevisionVPackConditional(TRI_voc_rid_t revisionId, TRI_voc_tick_t maxTick, bool excludeWal) const {
TRI_ASSERT(revisionId != 0);
MMFilesDocumentPosition const old = _revisionsCache.lookup(revisionId);
if (!old) {
return nullptr;
}
if (excludeWal && old.pointsToWal()) {
return nullptr;
}
uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
if (maxTick > 0) {
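    // the datafile marker header immediately precedes the vpack payload, so
    // it can be recovered by subtracting the fixed vpack offset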
TRI_df_marker_t const* marker = reinterpret_cast<TRI_df_marker_t const*>(vpack - arangodb::DatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT));
if (marker->getTick() > maxTick) {
return nullptr;
}
}
return vpack;
}
void MMFilesCollection::insertRevision(TRI_voc_rid_t revisionId, uint8_t const* dataptr, TRI_voc_fid_t fid, bool isInWal, bool shouldLock) {
TRI_ASSERT(revisionId != 0);
TRI_ASSERT(dataptr != nullptr);
_revisionsCache.insert(revisionId, dataptr, fid, isInWal, shouldLock);
}
void MMFilesCollection::updateRevision(TRI_voc_rid_t revisionId, uint8_t const* dataptr, TRI_voc_fid_t fid, bool isInWal) {
TRI_ASSERT(revisionId != 0);
TRI_ASSERT(dataptr != nullptr);
_revisionsCache.update(revisionId, dataptr, fid, isInWal);
}
bool MMFilesCollection::updateRevisionConditional(TRI_voc_rid_t revisionId, TRI_df_marker_t const* oldPosition, TRI_df_marker_t const* newPosition, TRI_voc_fid_t newFid, bool isInWal) {
TRI_ASSERT(revisionId != 0);
TRI_ASSERT(newPosition != nullptr);
return _revisionsCache.updateConditional(revisionId, oldPosition, newPosition, newFid, isInWal);
}
void MMFilesCollection::removeRevision(TRI_voc_rid_t revisionId, bool updateStats) {
TRI_ASSERT(revisionId != 0);
if (updateStats) {
MMFilesDocumentPosition const old = _revisionsCache.fetchAndRemove(revisionId);
if (old && !old.pointsToWal()) {
TRI_ASSERT(old.dataptr() != nullptr);
uint8_t const* vpack = static_cast<uint8_t const*>(old.dataptr());
int64_t size = DatafileHelper::AlignedSize<int64_t>(arangodb::DatafileHelper::VPackOffset(TRI_DF_MARKER_VPACK_DOCUMENT) + VPackSlice(vpack).byteSize());
_datafileStatistics.increaseDead(old.fid(), 1, size);
}
} else {
_revisionsCache.remove(revisionId);
}
}