1
0
Fork 0
arangodb/arangod/VocBase/compactor.cpp

1424 lines
49 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
////////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32
#include "Basics/win-utils.h"
#endif
#include "compactor.h"
#include "Basics/conversions.h"
#include "Basics/files.h"
#include "Basics/FileUtils.h"
#include "Basics/memory-map.h"
#include "Basics/Logger.h"
#include "Basics/tri-strings.h"
#include "Basics/WriteLocker.h"
#include "Indexes/PrimaryIndex.h"
#include "Utils/SingleCollectionTransaction.h"
#include "Utils/StandaloneTransactionContext.h"
#include "VocBase/DatafileHelper.h"
#include "VocBase/DatafileStatistics.h"
#include "VocBase/document-collection.h"
#include "VocBase/server.h"
#include "VocBase/vocbase.h"
using namespace arangodb;
// Human-readable status messages describing why a compaction run was skipped
// or why a datafile was selected for compaction.
//
// The pointers themselves are const: the datafile selection logic compares
// reasons by pointer identity (e.g. `reason != ReasonOnlyDeletions`), so they
// must never be re-pointed at runtime.

// compaction was skipped: the collection has no datafiles
static char const* const ReasonNoDatafiles =
    "skipped compaction because collection has no datafiles";
// compaction was skipped: the collection still has a registered compactor file
static char const* const ReasonCompactionBlocked =
    "skipped compaction because existing compactor file is in the way and "
    "waits to be processed";
// datafile is smaller than COMPACTOR_MIN_SIZE and is not the last datafile
static char const* const ReasonDatafileSmall =
    "compacting datafile because it's small and will be merged with next";
// collection has no documents, but the datafile statistics are not empty
static char const* const ReasonEmpty =
    "compacting datafile because collection is empty";
// datafile has no alive documents but does have deletion markers
static char const* const ReasonOnlyDeletions =
    "compacting datafile because it contains only deletion markers";
// dead data size reached COMPACTOR_DEAD_SIZE_THRESHOLD
static char const* const ReasonDeadSize =
    "compacting datafile because it contains much dead object space";
// dead data share reached COMPACTOR_DEAD_SIZE_SHARE
static char const* const ReasonDeadSizeShare =
    "compacting datafile because it contains high share of dead objects";
// number of dead documents reached COMPACTOR_DEAD_THRESHOLD
static char const* const ReasonDeadCount =
    "compacting datafile because it contains many dead objects";
// no datafile qualified for compaction
static char const* const ReasonNothingToCompact =
    "checked datafiles, but no compaction opportunity found";
////////////////////////////////////////////////////////////////////////////////
/// @brief minimum size of dead data (in bytes) in a datafile that will make
/// the datafile eligible for compaction.
///
/// A datafile whose dead data size reaches this threshold is selected for
/// compaction. Datafiles below the threshold can still be selected by one of
/// the other criteria (share of dead data, number of dead documents, ...).
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_DEAD_SIZE_THRESHOLD (1024 * 128)
////////////////////////////////////////////////////////////////////////////////
/// @brief share of dead data in a datafile that will trigger the compaction,
/// expressed as a fraction (0.1 = 10 %)
///
/// for example, if the datafile contains 800 bytes of alive and 400 bytes of
/// dead documents, the share of the dead documents is 400 / (400 + 800) = 33 %.
/// if this value is higher than the threshold, the datafile will be compacted.
/// the same threshold is also applied to the ratio of dead bytes to the
/// maximal size of the datafile
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_DEAD_SIZE_SHARE (0.1)
////////////////////////////////////////////////////////////////////////////////
/// @brief minimum number of dead documents in a datafile from which on we will
/// compact it if nothing else qualifies the file for compaction
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_DEAD_THRESHOLD (16384)
////////////////////////////////////////////////////////////////////////////////
/// @brief maximum number of datafiles to join together in one compaction run
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_MAX_FILES 3
////////////////////////////////////////////////////////////////////////////////
/// @brief maximum multiple of journal filesize of a compacted file
/// a value of 3 means that the maximum filesize of the compacted file is
/// 3 x (the collection's configured maximal journal size).
/// the resulting limit is raised to at least 8 MB and capped at
/// COMPACTOR_MAX_RESULT_FILESIZE
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_MAX_SIZE_FACTOR (3)
////////////////////////////////////////////////////////////////////////////////
/// @brief maximum filesize (in bytes) of resulting compacted file
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_MAX_RESULT_FILESIZE (128 * 1024 * 1024)
////////////////////////////////////////////////////////////////////////////////
/// @brief datafiles smaller than the following value (in bytes) will be merged
/// with others, unless they are the last datafile of the collection
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_MIN_SIZE (128 * 1024)
////////////////////////////////////////////////////////////////////////////////
/// @brief re-try compaction of a specific collection in this interval (in s)
////////////////////////////////////////////////////////////////////////////////
#define COMPACTOR_COLLECTION_INTERVAL (10.0)
////////////////////////////////////////////////////////////////////////////////
/// @brief compactify interval in microseconds (1 * 1000 * 1000 = one second)
////////////////////////////////////////////////////////////////////////////////
static int const COMPACTOR_INTERVAL = (1 * 1000 * 1000);
////////////////////////////////////////////////////////////////////////////////
/// @brief a single compaction blocker entry
///
/// Pairs the id of a blocker with its expiry value.
/// NOTE(review): `_expires` is presumably a timestamp in seconds - confirm
/// against the code that creates and checks blockers.
////////////////////////////////////////////////////////////////////////////////
struct compaction_blocker_s {
  TRI_voc_tick_t _id;  // identifier of the blocker
  double _expires;     // expiry value of the blocker
};
using compaction_blocker_t = compaction_blocker_s;
////////////////////////////////////////////////////////////////////////////////
/// @brief auxiliary struct used when initializing compaction
///
/// It is filled by InitCompaction() and passed as the opaque `data` argument to
/// the CalculateSize() datafile iterator, which accumulates the size the
/// compactor file will need.
////////////////////////////////////////////////////////////////////////////////
struct compaction_initial_context_t {
// transaction handed to the primary index lookups
arangodb::Transaction* _trx;
// collection whose datafiles are inspected
TRI_document_collection_t* _document;
// required size of the compactor file in bytes (fixed overhead plus the
// aligned sizes of all markers that will be copied)
int64_t _targetSize;
// fid of the first datafile to compact; re-used as the fid of the compactor
TRI_voc_fid_t _fid;
// whether deletion markers are counted; initialized per datafile and switched
// on as soon as a still-alive document is encountered
bool _keepDeletions;
// set to true if iterating over one of the datafiles failed
bool _failed;
};
////////////////////////////////////////////////////////////////////////////////
/// @brief compaction state
///
/// Passed as the opaque `data` argument to the Compactifier() datafile
/// iterator. A heap copy of it is also handed to RenameDatafileCallback(),
/// which frees that copy.
////////////////////////////////////////////////////////////////////////////////
struct compaction_context_t {
// transaction handed to the primary index lookups
arangodb::Transaction* _trx;
// collection that is being compacted
TRI_document_collection_t* _document;
// compactor file that receives the copies of the surviving markers
TRI_datafile_t* _compactor;
// statistics collected while copying (alive/dead/deletion counters and sizes)
DatafileStatisticsContainer _dfi;
// whether deletion markers are copied; initialized per datafile and switched
// on as soon as a still-alive document is encountered
bool _keepDeletions;
};
////////////////////////////////////////////////////////////////////////////////
/// @brief compaction instruction for a single datafile
////////////////////////////////////////////////////////////////////////////////
struct compaction_info_t {
// the datafile to compact
TRI_datafile_t* _datafile;
// initial "keep deletion markers" setting used when this datafile is processed
bool _keepDeletions;
};
////////////////////////////////////////////////////////////////////////////////
/// @brief creates a compactor file and registers it with the collection
///
/// The file is created via TRI_CreateDatafileDocumentCollection() with the
/// given fid and size while the journal-entries lock of the collection is
/// held. On success it is appended to the collection's list of compactors.
///
/// @param document     collection to create the compactor for
/// @param fid          file id the compactor will carry
/// @param maximalSize  size of the compactor file in bytes
/// @return the new compactor, or nullptr if it could not be created
////////////////////////////////////////////////////////////////////////////////
static TRI_datafile_t* CreateCompactor(TRI_document_collection_t* document,
                                       TRI_voc_fid_t fid, int64_t maximalSize) {
  TRI_collection_t* base = document;
  auto& compactors = base->_compactors;

  // make room for one more entry before taking the lock
  compactors.reserve(compactors.size() + 1);

  TRI_LOCK_JOURNAL_ENTRIES_DOC_COLLECTION(document);

  TRI_datafile_t* created = TRI_CreateDatafileDocumentCollection(
      document, fid, static_cast<TRI_voc_size_t>(maximalSize), true);

  bool const success = (created != nullptr);
  if (success) {
    compactors.emplace_back(created);
  }

  TRI_UNLOCK_JOURNAL_ENTRIES_DOC_COLLECTION(document);

  return created;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief writes a copy of the marker into the compactor datafile
///
/// Reserves room for the marker in `compactor` and, if that worked, writes the
/// marker to the reserved position.
///
/// @param document   collection; its _lastError is set if reserving fails
/// @param compactor  datafile to copy into
/// @param marker     marker to copy
/// @param result     receives the reserved position inside the compactor
/// @return TRI_ERROR_ARANGO_NO_JOURNAL if no room could be reserved, otherwise
///         the result of TRI_WriteElementDatafile()
////////////////////////////////////////////////////////////////////////////////
static int CopyMarker(TRI_document_collection_t* document,
                      TRI_datafile_t* compactor, TRI_df_marker_t const* marker,
                      TRI_df_marker_t** result) {
  int const reserved =
      TRI_ReserveElementDatafile(compactor, marker->getSize(), result, 0);

  if (reserved == TRI_ERROR_NO_ERROR) {
    return TRI_WriteElementDatafile(compactor, *result, marker, false);
  }

  // any reservation failure is reported as "no journal"
  document->_lastError = TRI_set_errno(TRI_ERROR_ARANGO_NO_JOURNAL);
  return TRI_ERROR_ARANGO_NO_JOURNAL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief locates a datafile, identified by fid, in a vector of datafiles
///
/// @param vector    datafiles to search (linear scan, first match wins)
/// @param fid       file id to look for
/// @param position  receives the index of the match; left untouched if there
///                  is no match
/// @return true if a datafile with the given fid was found
////////////////////////////////////////////////////////////////////////////////
static bool LocateDatafile(std::vector<TRI_datafile_t*> const& vector,
                           TRI_voc_fid_t fid, size_t* position) {
  size_t index = 0;

  for (auto const* candidate : vector) {
    if (candidate->_fid == fid) {
      *position = index;
      return true;
    }
    ++index;
  }

  return false;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief callback to drop a datafile
///
/// A physical datafile is first renamed to "deleted-<fid>.db" inside the
/// collection directory. The datafile is then closed and, if closing worked
/// and the file is physical, the renamed file is unlinked together with a
/// "<original name>.dead" file, should one exist. Finally the datafile object
/// is freed.
///
/// @param datafile  the datafile to drop; it is freed by this function
/// @param data      the owning TRI_document_collection_t, passed as void*
////////////////////////////////////////////////////////////////////////////////
static void DropDatafileCallback(TRI_datafile_t* datafile, void* data) {
  auto* collection = static_cast<TRI_document_collection_t*>(data);

  // name under which the file is parked until it is unlinked
  std::string const shortName("deleted-" + std::to_string(datafile->_fid) + ".db");
  std::string const parkedName = arangodb::basics::FileUtils::buildFilename(collection->_directory, shortName);

  // original filename; stays empty for non-physical datafiles
  std::string originalName;

  if (datafile->isPhysical(datafile)) {
    originalName = datafile->_filename;

    if (!TRI_RenameDatafile(datafile, parkedName.c_str())) {
      LOG_TOPIC(ERR, Logger::COMPACTOR) << "cannot rename obsolete datafile '" << originalName << "' to '" << parkedName << "': " << TRI_last_error();
    }
  }

  LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "finished compacting datafile '" << datafile->getName(datafile) << "'";

  if (!TRI_CloseDatafile(datafile)) {
    LOG_TOPIC(ERR, Logger::COMPACTOR) << "cannot close obsolete datafile '" << datafile->getName(datafile) << "': " << TRI_last_error();
  } else if (datafile->isPhysical(datafile)) {
    LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "wiping compacted datafile from disk";

    if (TRI_UnlinkFile(parkedName.c_str()) != TRI_ERROR_NO_ERROR) {
      LOG_TOPIC(ERR, Logger::COMPACTOR) << "cannot wipe obsolete datafile '" << datafile->getName(datafile) << "': " << TRI_last_error();
    }

    // a "<original name>.dead" file may accompany the datafile; remove it too
    if (!originalName.empty()) {
      std::string const deadMarker = originalName + ".dead";

      if (TRI_ExistsFile(deadMarker.c_str())) {
        TRI_UnlinkFile(deadMarker.c_str());
      }
    }
  }

  TRI_FreeDatafile(datafile);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief callback to rename a datafile
///
/// The datafile will be renamed to "temp-abc.db" (where "abc" is the fid of
/// the datafile) first. If this rename operation fails, there will be a
/// compactor file and a datafile. On startup, the datafile will be preferred
/// in this case.
/// If renaming succeeds, the compactor will be renamed to the original name of
/// the datafile. If that does not succeed, there is a compactor file and a
/// renamed datafile. On startup, the compactor file will be used, and the
/// renamed datafile will be treated as a temporary file and dropped.
///
/// Once the files are in place (or if the datafile is not physical), the
/// compactor replaces the datafile in the collection's list of datafiles, is
/// removed from the list of compactors, and the old datafile is dropped via
/// DropDatafileCallback().
///
/// @param datafile  the datafile that is replaced by the compactor
/// @param data      heap-allocated compaction_context_t; always freed here
////////////////////////////////////////////////////////////////////////////////
static void RenameDatafileCallback(TRI_datafile_t* datafile, void* data) {
  auto* ctx = static_cast<compaction_context_t*>(data);
  TRI_datafile_t* compactor = ctx->_compactor;
  TRI_document_collection_t* document = ctx->_document;

  TRI_ASSERT(datafile->_fid == compactor->_fid);

  // non-physical datafiles have no files to rename
  bool filesInPlace = true;

  if (datafile->isPhysical(datafile)) {
    filesInPlace = false;

    // construct a suitable temporary name and remember the current one
    std::string const shortName("temp-" + std::to_string(datafile->_fid) + ".db");
    std::string const tempName = arangodb::basics::FileUtils::buildFilename(document->_directory, shortName);
    std::string const originalName = datafile->_filename;

    if (!TRI_RenameDatafile(datafile, tempName.c_str())) {
      LOG_TOPIC(ERR, Logger::COMPACTOR) << "unable to rename datafile '" << datafile->getName(datafile) << "' to '" << tempName << "'";
    } else if (!TRI_RenameDatafile(compactor, originalName.c_str())) {
      LOG_TOPIC(ERR, Logger::COMPACTOR) << "unable to rename compaction file '" << compactor->getName(compactor) << "' to '" << originalName << "'";
    } else {
      filesInPlace = true;
    }
  }

  if (!filesInPlace) {
    TRI_Free(TRI_CORE_MEM_ZONE, ctx);
    return;
  }

  size_t position;

  // the vectors of datafiles and compactors are modified below, so the
  // write-lock is required
  TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

  if (!LocateDatafile(document->_datafiles, datafile->_fid, &position)) {
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);
    LOG_TOPIC(ERR, Logger::COMPACTOR) << "logic error: could not locate datafile";
    TRI_Free(TRI_CORE_MEM_ZONE, ctx);
    return;
  }

  // the compactor takes the slot of the datafile
  document->_datafiles[position] = compactor;

  if (!LocateDatafile(document->_compactors, compactor->_fid, &position)) {
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);
    LOG_TOPIC(ERR, Logger::COMPACTOR) << "logic error: could not locate compactor";
    TRI_Free(TRI_CORE_MEM_ZONE, ctx);
    return;
  }

  // it is a regular datafile now, so it leaves the list of compactors
  document->_compactors.erase(document->_compactors.begin() + position);

  TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  DropDatafileCallback(datafile, document);

  TRI_Free(TRI_CORE_MEM_ZONE, ctx);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief datafile iterator, copies "live" data from datafile into compactor
///
/// this function is called for all markers in the collected datafiles. Its
/// purpose is to find the still-alive markers and copy them into the compactor
/// file.
///
/// A document marker is alive if the primary index still contains its key and
/// the revision stored in the index is not newer than the marker's revision.
/// Alive documents are copied and their primary index entry is re-pointed to
/// the copy. Deletion markers are copied only while `_keepDeletions` is set.
/// All other marker types are ignored. A failed write terminates the process.
///
/// IMPORTANT: if the logic inside this function is adjusted, the total size
/// calculated by function CalculateSize might need adjustment, too!!
///
/// @param marker    the marker currently visited
/// @param data      the compaction_context_t of the running compaction
/// @param datafile  the datafile being iterated (unused)
/// @return always true, so that iteration continues
////////////////////////////////////////////////////////////////////////////////
static bool Compactifier(TRI_df_marker_t const* marker, void* data,
                         TRI_datafile_t* datafile) {
  auto* ctx = static_cast<compaction_context_t*>(data);
  TRI_document_collection_t* document = ctx->_document;
  TRI_voc_fid_t const targetFid = ctx->_compactor->_fid;
  TRI_df_marker_type_t const type = marker->getType();

  // deletions
  if (type == TRI_DF_MARKER_VPACK_REMOVE) {
    if (!ctx->_keepDeletions) {
      return true;
    }

    // write to compactor files
    TRI_df_marker_t* copied;
    if (CopyMarker(document, ctx->_compactor, marker, &copied) != TRI_ERROR_NO_ERROR) {
      // TODO: dont fail but recover from this state
      LOG_TOPIC(FATAL, Logger::COMPACTOR) << "cannot write document marker to compactor file: " << TRI_last_error(); FATAL_ERROR_EXIT();
    }

    // update datafile info
    ctx->_dfi.numberDeletions++;
    return true;
  }

  // everything that is neither a deletion nor a document is skipped
  if (type != TRI_DF_MARKER_VPACK_DOCUMENT) {
    return true;
  }

  // new or updated document
  VPackSlice const slice(reinterpret_cast<char const*>(marker) + DatafileHelper::VPackOffset(type));
  TRI_ASSERT(slice.isObject());

  VPackSlice const keySlice(slice.get(TRI_VOC_ATTRIBUTE_KEY));
  TRI_voc_rid_t const rid = std::stoull(slice.get(TRI_VOC_ATTRIBUTE_REV).copyString());

  // check if the document is still active
  auto primaryIndex = document->primaryIndex();
  auto found = primaryIndex->lookupKey(ctx->_trx, keySlice);
  bool const stillCurrent = (found != nullptr && !(found->revisionId() > rid));

  if (!stillCurrent) {
    // found a dead document
    ctx->_dfi.numberDead++;
    ctx->_dfi.sizeDead += DatafileHelper::AlignedMarkerSize<int64_t>(marker);
    return true;
  }

  // from the first alive document on, deletion markers are copied as well
  ctx->_keepDeletions = true;

  // write to compactor files
  TRI_df_marker_t* copied;
  if (CopyMarker(document, ctx->_compactor, marker, &copied) != TRI_ERROR_NO_ERROR) {
    // TODO: dont fail but recover from this state
    LOG_TOPIC(FATAL, Logger::COMPACTOR) << "cannot write compactor file: " << TRI_last_error(); FATAL_ERROR_EXIT();
  }

  TRI_doc_mptr_t* entry = const_cast<TRI_doc_mptr_t*>(found);
  TRI_ASSERT(entry->getDataPtr() != nullptr);
  TRI_ASSERT(entry->getMarkerPtr()->getSize() > 0);

  // let the index entry point to the new position
  entry->setDataPtr(copied);

  // update fid in case it changes
  if (entry->getFid() != targetFid) {
    entry->setFid(targetFid, false);
  }

  ctx->_dfi.numberAlive++;
  ctx->_dfi.sizeAlive += DatafileHelper::AlignedMarkerSize<int64_t>(marker);

  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief removes an empty compactor file
///
/// Unregisters the compactor from the collection's list of compactors (under
/// the datafiles write-lock), closes and frees it and, if it is a physical
/// file, unlinks it from disk.
///
/// @param document   collection the compactor belongs to
/// @param compactor  the compactor to remove; freed by this function unless
///                   TRI_ERROR_INTERNAL is returned
/// @return TRI_ERROR_INTERNAL if the compactor is not registered with the
///         collection, TRI_ERROR_NO_ERROR otherwise
////////////////////////////////////////////////////////////////////////////////
static int RemoveCompactor(TRI_document_collection_t* document,
                           TRI_datafile_t* compactor) {
  LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "removing empty compaction file '" << compactor->getName(compactor) << "'";

  // remove the compactor from the list of compactors
  size_t position = 0;

  TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

  if (!LocateDatafile(document->_compactors, compactor->_fid, &position)) {
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);
    LOG_TOPIC(ERR, Logger::COMPACTOR) << "logic error: could not locate compactor";
    return TRI_ERROR_INTERNAL;
  }

  document->_compactors.erase(document->_compactors.begin() + position);

  TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  // the name must be copied before the datafile object is freed, because the
  // file is unlinked only afterwards
  bool const isPhysical = compactor->isPhysical(compactor);
  std::string filename;
  if (isPhysical) {
    filename = compactor->getName(compactor);
  }

  // close the file & remove it
  TRI_CloseDatafile(compactor);
  TRI_FreeDatafile(compactor);

  if (isPhysical) {
    TRI_UnlinkFile(filename.c_str());
  }

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief removes an empty datafile from the collection's bookkeeping
///
/// Takes the datafile out of the collection's list of datafiles (under the
/// datafiles write-lock) and discards its statistics. The datafile itself is
/// neither closed nor freed here.
///
/// @param document  collection the datafile belongs to
/// @param df        the datafile to unregister
/// @return TRI_ERROR_INTERNAL if the datafile is not registered with the
///         collection, TRI_ERROR_NO_ERROR otherwise
////////////////////////////////////////////////////////////////////////////////
static int RemoveDatafile(TRI_document_collection_t* document,
                          TRI_datafile_t* df) {
  LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "removing empty datafile '" << df->getName(df) << "'";

  size_t position = 0;

  // remove the datafile from the list of datafiles
  TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

  bool const known = LocateDatafile(document->_datafiles, df->_fid, &position);
  if (known) {
    document->_datafiles.erase(document->_datafiles.begin() + position);
  }

  TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  if (!known) {
    LOG_TOPIC(ERR, Logger::COMPACTOR) << "logic error: could not locate datafile";
    return TRI_ERROR_INTERNAL;
  }

  // update dfi
  document->_datafileStatistics.remove(df->_fid);

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief datafile iterator, calculates necessary total size
///
/// Adds the aligned size of every marker that the Compactifier iterator would
/// copy to the context's `_targetSize`: document markers that are still alive
/// according to the primary index, and deletion markers while `_keepDeletions`
/// is set. All other markers are ignored.
///
/// @param marker    the marker currently visited
/// @param data      the compaction_initial_context_t to update
/// @param datafile  the datafile being iterated (unused)
/// @return always true, so that iteration continues
////////////////////////////////////////////////////////////////////////////////
static bool CalculateSize(TRI_df_marker_t const* marker, void* data,
                          TRI_datafile_t* datafile) {
  auto* ctx = static_cast<compaction_initial_context_t*>(data);
  TRI_df_marker_type_t const type = marker->getType();

  // deletions
  if (type == TRI_DF_MARKER_VPACK_REMOVE) {
    if (ctx->_keepDeletions) {
      ctx->_targetSize += DatafileHelper::AlignedMarkerSize<int64_t>(marker);
    }
    return true;
  }

  // everything that is neither a deletion nor a document is skipped
  if (type != TRI_DF_MARKER_VPACK_DOCUMENT) {
    return true;
  }

  // new or updated document
  TRI_document_collection_t* document = ctx->_document;

  VPackSlice const slice(reinterpret_cast<char const*>(marker) + DatafileHelper::VPackOffset(type));
  TRI_ASSERT(slice.isObject());

  VPackSlice const keySlice(slice.get(TRI_VOC_ATTRIBUTE_KEY));
  TRI_voc_rid_t const rid = std::stoull(slice.get(TRI_VOC_ATTRIBUTE_REV).copyString());

  // check if the document is still active
  auto primaryIndex = document->primaryIndex();
  auto found = primaryIndex->lookupKey(ctx->_trx, keySlice);
  bool const stillCurrent = (found != nullptr && !(found->revisionId() > rid));

  if (stillCurrent) {
    // from the first alive document on, deletion markers count as well
    ctx->_keepDeletions = true;
    ctx->_targetSize += DatafileHelper::AlignedMarkerSize<int64_t>(marker);
  }

  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief calculate the target size for the compactor to be created
///
/// Iterates over all datafiles to compact with the CalculateSize iterator,
/// holding the collection's documents/indexes read-lock for each datafile.
/// Physical datafiles get sequential/will-need memory advice before the scan
/// and random-access advice afterwards.
///
/// @param trx        transaction used for the primary index lookups
/// @param document   collection that is about to be compacted
/// @param toCompact  the datafiles to compact, in order
/// @return a context holding the required compactor size and the fid of the
///         first datafile; `_failed` is set if a datafile could not be
///         iterated (including when the iteration threw an exception)
////////////////////////////////////////////////////////////////////////////////
static compaction_initial_context_t InitCompaction(
    arangodb::Transaction* trx, TRI_document_collection_t* document,
    std::vector<compaction_info_t> const& toCompact) {
  compaction_initial_context_t context;
  memset(&context, 0, sizeof(context));

  context._trx = trx;
  context._document = document;
  context._failed = false;

  // this is the minimum required size
  context._targetSize =
      sizeof(TRI_df_header_marker_t) + sizeof(TRI_col_header_marker_t) +
      sizeof(TRI_df_footer_marker_t) + 256;  // allow for some overhead

  bool isFirst = true;

  for (auto const& entry : toCompact) {
    TRI_datafile_t* df = entry._datafile;

    // We will sequentially scan the logfile for collection:
    if (df->isPhysical(df)) {
      TRI_MMFileAdvise(df->_data, df->_maximalSize, TRI_MADVISE_SEQUENTIAL);
      TRI_MMFileAdvise(df->_data, df->_maximalSize, TRI_MADVISE_WILLNEED);
    }

    if (isFirst) {
      // extract and store fid
      context._fid = df->_fid;
      isFirst = false;
    }

    context._keepDeletions = entry._keepDeletions;

    bool iterated = false;

    TRI_READ_LOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(document);
    try {
      iterated = TRI_IterateDatafile(df, CalculateSize, &context);
    } catch (...) {
      // an exception counts as a failed iteration; the lock is released below
      iterated = false;
    }
    TRI_READ_UNLOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(document);

    if (df->isPhysical(df)) {
      TRI_MMFileAdvise(df->_data, df->_maximalSize, TRI_MADVISE_RANDOM);
    }

    if (!iterated) {
      context._failed = true;
      break;
    }
  }

  return context;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief compact a list of datafiles
///
/// Copies all still-alive markers of the given datafiles into one newly
/// created compactor file, which re-uses the fid of the first datafile.
/// Afterwards the first datafile is scheduled (via a ditch) to be replaced by
/// the compactor, and all other datafiles are scheduled to be dropped. If the
/// compactor ended up empty, it is removed right away and all datafiles are
/// scheduled to be dropped.
///
/// @param document   the collection to compact
/// @param toCompact  the datafiles to compact, in order; must not be empty
///
/// NOTE(review): the early returns after CreateCompactor() succeeded leave the
/// compactor registered in the collection's list of compactors;
/// CompactifyDocumentCollection() refuses to compact a collection while that
/// list is non-empty.
////////////////////////////////////////////////////////////////////////////////
static void CompactifyDatafiles(
TRI_document_collection_t* document,
std::vector<compaction_info_t> const& toCompact) {
TRI_datafile_t* compactor;
compaction_context_t context;
size_t i, j;
size_t const n = toCompact.size();
TRI_ASSERT(n > 0);
// write transaction on the collection, used for the primary index lookups
arangodb::SingleCollectionTransaction trx(arangodb::StandaloneTransactionContext::Create(document->_vocbase),
document->_info.id(), TRI_TRANSACTION_WRITE);
trx.addHint(TRI_TRANSACTION_HINT_NO_BEGIN_MARKER, true);
trx.addHint(TRI_TRANSACTION_HINT_NO_ABORT_MARKER, true);
trx.addHint(TRI_TRANSACTION_HINT_NO_COMPACTION_LOCK, true);
// first pass over the datafiles: determine how big the compactor must be
compaction_initial_context_t initial =
InitCompaction(&trx, document, toCompact);
if (initial._failed) {
LOG_TOPIC(ERR, Logger::COMPACTOR) << "could not create initialize compaction";
return;
}
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "compactify called for collection '" << document->_info.id() << "' for " << n << " datafiles of total size " << initial._targetSize;
// now create a new compactor file
// we are re-using the _fid of the first original datafile!
compactor = CreateCompactor(document, initial._fid, initial._targetSize);
if (compactor == nullptr) {
// some error occurred
LOG_TOPIC(ERR, Logger::COMPACTOR) << "could not create compactor file";
return;
}
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "created new compactor file '" << compactor->getName(compactor) << "'";
// these attributes remain the same for all datafiles we collect
context._document = document;
context._compactor = compactor;
context._trx = &trx;
int res = trx.begin();
if (res != TRI_ERROR_NO_ERROR) {
LOG_TOPIC(ERR, Logger::COMPACTOR) << "error during compaction: " << TRI_errno_string(res);
return;
}
// now compact all datafiles
// second pass: copy the surviving markers of each datafile into the compactor
for (i = 0; i < n; ++i) {
auto compaction = toCompact[i];
TRI_datafile_t* df = compaction._datafile;
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "compacting datafile '" << df->getName(df) << "' into '" << compactor->getName(compactor) << "', number: " << i << ", keep deletions: " << compaction._keepDeletions;
// start each datafile with its own "keep deletions" setting; the iterator
// switches it on once it has seen an alive document
context._keepDeletions = compaction._keepDeletions;
// run the actual compaction of a single datafile
bool ok = TRI_IterateDatafile(df, Compactifier, &context);
if (!ok) {
LOG_TOPIC(WARN, Logger::COMPACTOR) << "failed to compact datafile '" << df->getName(df) << "'";
// compactor file does not need to be removed now. will be removed on next
// startup
// TODO: Remove file
return;
}
} // next file
// the statistics gathered while copying become the statistics of the
// compactor's fid (which is the fid of the first datafile)
document->_datafileStatistics.replace(compactor->_fid, context._dfi);
// NOTE(review): the result of commit() is not checked
trx.commit();
// remove all datafile statistics that we don't need anymore
for (i = 1; i < n; ++i) {
auto compaction = toCompact[i];
document->_datafileStatistics.remove(compaction._datafile->_fid);
}
// locate the compactor
// must acquire a write-lock as we're about to change the datafiles vector
TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);
if (!LocateDatafile(document->_compactors, compactor->_fid, &j)) {
// not found
TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);
LOG_TOPIC(ERR, Logger::COMPACTOR) << "logic error in CompactifyDatafiles: could not find compactor";
return;
}
// close the compactor, addressed by its position in the list of compactors
if (!TRI_CloseDatafileDocumentCollection(document, j, true)) {
TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);
LOG_TOPIC(ERR, Logger::COMPACTOR) << "could not close compactor file";
// TODO: how do we recover from this state?
return;
}
TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);
// case 1: nothing at all was copied or counted, i.e. the compactor is empty
if (context._dfi.numberAlive == 0 && context._dfi.numberDead == 0 &&
context._dfi.numberDeletions == 0 && context._dfi.numberShapes == 0 &&
context._dfi.numberAttributes == 0) {
if (n > 1) {
// create .dead files for all collected files
// (an empty file named "<datafile name>.dead" next to each physical datafile)
for (i = 0; i < n; ++i) {
auto compaction = toCompact[i];
TRI_datafile_t* datafile = compaction._datafile;
if (datafile->isPhysical(datafile)) {
char* filename =
TRI_Concatenate2String(datafile->getName(datafile), ".dead");
if (filename != nullptr) {
TRI_WriteFile(filename, "", 0);
TRI_FreeString(TRI_CORE_MEM_ZONE, filename);
}
}
}
}
// compactor is fully empty. remove it
RemoveCompactor(document, compactor);
for (i = 0; i < n; ++i) {
auto compaction = toCompact[i];
// datafile is also empty after compaction and thus useless
RemoveDatafile(document, compaction._datafile);
// add a deletion ditch to the collection
// (the ditch runs DropDatafileCallback for the datafile)
auto b = document->ditches()->createDropDatafileDitch(
compaction._datafile, document, DropDatafileCallback, __FILE__,
__LINE__);
if (b == nullptr) {
LOG_TOPIC(ERR, Logger::COMPACTOR) << "out of memory when creating datafile-drop ditch";
}
}
} else {
// case 2: the compactor has content and takes the place of the first datafile
if (n > 1) {
// create .dead files for all collected files but the first
for (i = 1; i < n; ++i) {
auto compaction = toCompact[i];
TRI_datafile_t* datafile = compaction._datafile;
if (datafile->isPhysical(datafile)) {
char* filename =
TRI_Concatenate2String(datafile->getName(datafile), ".dead");
if (filename != nullptr) {
TRI_WriteFile(filename, "", 0);
TRI_FreeString(TRI_CORE_MEM_ZONE, filename);
}
}
}
}
for (i = 0; i < n; ++i) {
auto compaction = toCompact[i];
if (i == 0) {
// add a rename marker
// the callback receives a heap copy of the context, because the local
// `context` goes out of scope when this function returns;
// RenameDatafileCallback frees the copy
void* copy = TRI_Allocate(TRI_CORE_MEM_ZONE,
sizeof(compaction_context_t), false);
memcpy(copy, &context, sizeof(compaction_context_t));
auto b = document->ditches()->createRenameDatafileDitch(
compaction._datafile, copy, RenameDatafileCallback, __FILE__,
__LINE__);
if (b == nullptr) {
LOG_TOPIC(ERR, Logger::COMPACTOR) << "out of memory when creating datafile-rename ditch";
// no ditch, no callback: release the copy here
TRI_Free(TRI_CORE_MEM_ZONE, copy);
}
} else {
// datafile is empty after compaction and thus useless
RemoveDatafile(document, compaction._datafile);
// add a drop datafile marker
auto b = document->ditches()->createDropDatafileDitch(
compaction._datafile, document, DropDatafileCallback, __FILE__,
__LINE__);
if (b == nullptr) {
LOG_TOPIC(ERR, Logger::COMPACTOR) << "out of memory when creating datafile-drop ditch";
}
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief checks all datafiles of a collection
////////////////////////////////////////////////////////////////////////////////
static bool CompactifyDocumentCollection(TRI_document_collection_t* document) {
// we can hopefully get away without the lock here...
// if (! document->isFullyCollected()) {
// return false;
// }
std::vector<compaction_info_t> toCompact;
toCompact.reserve(COMPACTOR_MAX_FILES);
// if we cannot acquire the read lock instantly, we will exit directly.
// otherwise we'll risk a multi-thread deadlock between synchronizer,
// compactor and data-modification threads (e.g. POST /_api/document)
if (!TRI_TRY_READ_LOCK_DATAFILES_DOC_COLLECTION(document)) {
return false;
}
size_t const n = document->_datafiles.size();
if (n == 0 || document->_compactors.size() > 0) {
// we already have created a compactor file in progress.
// if this happens, then a previous compaction attempt for this collection
// failed
// additionally, if there are no datafiles, then there's no need to compact
TRI_READ_UNLOCK_DATAFILES_DOC_COLLECTION(document);
if (n == 0) {
document->setCompactionStatus(ReasonNoDatafiles);
} else {
document->setCompactionStatus(ReasonCompactionBlocked);
}
return false;
}
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "inspecting datafiles of collection '" << document->_info.namec_str() << "' for compaction opportunities";
size_t start = document->getNextCompactionStartIndex();
// number of documents is protected by the same lock
uint64_t const numDocuments = document->size();
// get maximum size of result file
uint64_t maxSize = (uint64_t)COMPACTOR_MAX_SIZE_FACTOR *
(uint64_t)document->_info.maximalSize();
if (maxSize < 8 * 1024 * 1024) {
maxSize = 8 * 1024 * 1024;
}
if (maxSize >= COMPACTOR_MAX_RESULT_FILESIZE) {
maxSize = COMPACTOR_MAX_RESULT_FILESIZE;
}
if (start >= n || numDocuments == 0) {
start = 0;
}
int64_t numAlive = 0;
if (start > 0) {
// we don't know for sure if there are alive documents in the first
// datafile,
// so let's assume there are some
numAlive = 16384;
}
bool doCompact = false;
uint64_t totalSize = 0;
char const* reason = nullptr;
char const* firstReason = nullptr;
for (size_t i = start; i < n; ++i) {
TRI_datafile_t* df =
static_cast<TRI_datafile_t*>(document->_datafiles[i]);
TRI_ASSERT(df != nullptr);
DatafileStatisticsContainer dfi =
document->_datafileStatistics.get(df->_fid);
if (dfi.numberUncollected > 0) {
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "cannot compact datafile " << df->_fid << " of collection '" << document->_info.namec_str() << "' because it still has uncollected entries";
start = i + 1;
break;
}
if (!doCompact && df->_maximalSize < COMPACTOR_MIN_SIZE && i < n - 1) {
// very small datafile and not the last one. let's compact it so it's
// merged with others
doCompact = true;
reason = ReasonDatafileSmall;
} else if (numDocuments == 0 &&
(dfi.numberAlive > 0 || dfi.numberDead > 0 ||
dfi.numberDeletions > 0)) {
// collection is empty, but datafile statistics indicate there is
// something in this datafile
doCompact = true;
reason = ReasonEmpty;
} else if (numAlive == 0 && dfi.numberAlive == 0 &&
dfi.numberDeletions > 0) {
// compact first datafile(s) if they contain only deletions
doCompact = true;
reason = ReasonOnlyDeletions;
} else if (dfi.sizeDead >= (int64_t)COMPACTOR_DEAD_SIZE_THRESHOLD) {
// the size of dead objects is above some threshold
doCompact = true;
reason = ReasonDeadSize;
} else if (dfi.sizeDead > 0 &&
(((double)dfi.sizeDead /
((double)dfi.sizeDead + (double)dfi.sizeAlive) >=
COMPACTOR_DEAD_SIZE_SHARE) ||
((double)dfi.sizeDead / (double)df->_maximalSize >=
COMPACTOR_DEAD_SIZE_SHARE))) {
// the size of dead objects is above some share
doCompact = true;
reason = ReasonDeadSizeShare;
} else if (dfi.numberDead >= (int64_t)COMPACTOR_DEAD_THRESHOLD) {
// the number of dead objects is above some threshold
doCompact = true;
reason = ReasonDeadCount;
}
if (!doCompact) {
numAlive += static_cast<int64_t>(dfi.numberAlive);
continue;
}
TRI_ASSERT(doCompact);
if (firstReason == nullptr) {
firstReason = reason;
}
// remember for next compaction
start = i + 1;
// if we got only deletions then it's safe to continue compaction, regardless of
// the size of the resulting file. this is because deletions will reduce the
// size of the resulting file
if (reason != ReasonOnlyDeletions) {
if (!toCompact.empty() &&
totalSize + (uint64_t)df->_maximalSize >= maxSize &&
(toCompact.size() != 1 || reason != ReasonDatafileSmall)) {
// found enough files to compact (in terms of cumulated size)
// there's one exception to this: if we're merging multiple datafiles,
// then we don't stop at the first one even if the merge of file #1 and #2
// would be too big. if we wouldn't stop in this case, then file #1 would
// be selected for compaction over and over
// normally this case won't happen at all, it can occur however if one
// decreases the journalSize configuration for the collection afterwards, and
// there are already datafiles which are more than 3 times bigger than the
// new (smaller) journalSize value
break;
}
}
TRI_ASSERT(reason != nullptr);
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "found datafile #" << i << " eligible for compaction. fid: " << df->_fid << ", size: " << df->_maximalSize << ", reason: " << reason << ", numberDead: " << dfi.numberDead << ", numberAlive: " << dfi.numberAlive << ", numberDeletions: " << dfi.numberDeletions << ", numberShapes: " << dfi.numberShapes << ", numberAttributes: " << dfi.numberAttributes << ", numberUncollected: " << dfi.numberUncollected << ", sizeDead: " << dfi.sizeDead << ", sizeAlive: " << dfi.sizeAlive << ", sizeShapes " << dfi.sizeShapes << ", sizeAttributes: " << dfi.sizeAttributes;
totalSize += static_cast<uint64_t>(df->_maximalSize);
compaction_info_t compaction;
compaction._datafile = df;
compaction._keepDeletions = (numAlive > 0 && i > 0);
// TODO: verify that keepDeletions actually works with wrong numAlive stats
try {
toCompact.push_back(compaction);
} catch (...) {
// silently fail. either we had found something to compact or not
// if not, then we can try again next time. if yes, then we'll simply
// forget
// about it and also try again next time
break;
}
// we stop at the first few datafiles.
// this is better than going over all datafiles in a collection in one go
// because the compactor is single-threaded, and collecting all datafiles
// might take a long time (it might even be that there is a request to
// delete the collection in the middle of compaction, but the compactor
// will not pick this up as it is read-locking the collection status)
if (totalSize >= maxSize) {
// result file will be big enough
break;
}
if (totalSize >= COMPACTOR_MIN_SIZE &&
toCompact.size() >= COMPACTOR_MAX_FILES) {
// found enough files to compact
break;
}
numAlive += static_cast<int64_t>(dfi.numberAlive);
}
// can now continue without the lock
TRI_READ_UNLOCK_DATAFILES_DOC_COLLECTION(document);
if (toCompact.empty()) {
// nothing to compact. now reset start index
document->setNextCompactionStartIndex(0);
// cleanup local variables
document->setCompactionStatus(ReasonNothingToCompact);
LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "inspecting datafiles of collection yielded: " << ReasonNothingToCompact;
return false;
}
// handle datafiles with dead objects
TRI_ASSERT(toCompact.size() >= 1);
TRI_ASSERT(reason != nullptr);
document->setCompactionStatus(reason);
document->setNextCompactionStartIndex(start);
CompactifyDatafiles(document, toCompact);
return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief make a single attempt to write-lock the compaction blockers
/// returns true if the write-lock was obtained. in that case the caller owns
/// the lock and has to release it again later
////////////////////////////////////////////////////////////////////////////////
static bool TryLockCompaction(TRI_vocbase_t* vocbase) {
  auto* blockersLock = &vocbase->_compactionBlockers._lock;
  return TRI_TryWriteLockReadWriteLock(blockersLock);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief write-lock the compaction blockers, retrying until it succeeds
/// on return the caller holds the write-lock
////////////////////////////////////////////////////////////////////////////////
static void LockCompaction(TRI_vocbase_t* vocbase) {
  for (;;) {
    if (TryLockCompaction(vocbase)) {
      // got the write-lock
      return;
    }
    // lock is busy: pause for 1000 microseconds, then try again
    usleep(1000);
  }
}
////////////////////////////////////////////////////////////////////////////////
/// @brief release the write-lock on the compaction blockers
/// counterpart to TryLockCompaction / LockCompaction
////////////////////////////////////////////////////////////////////////////////
static void UnlockCompaction(TRI_vocbase_t* vocbase) {
  auto* blockersLock = &vocbase->_compactionBlockers._lock;
  TRI_WriteUnlockReadWriteLock(blockersLock);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief decide whether compaction may run right now, and lock if so
/// returns true only if the write-lock on the compactionBlockers structure
/// could be acquired at once AND no registered blocker is still unexpired.
/// on true, the write-lock is held and the caller must release it eventually.
/// on false, no lock is held
////////////////////////////////////////////////////////////////////////////////
static bool CheckAndLockCompaction(TRI_vocbase_t* vocbase) {
  if (!TryLockCompaction(vocbase)) {
    // write-lock is not available right now
    return false;
  }

  // from here on the write-lock is ours
  double const stamp = TRI_microtime();
  auto* blockers = &vocbase->_compactionBlockers._data;
  size_t const count = TRI_LengthVector(blockers);

  size_t pos = 0;
  while (pos < count) {
    auto const* entry =
        static_cast<compaction_blocker_t const*>(TRI_AtVector(blockers, pos));

    if (entry->_expires > stamp) {
      // this blocker has not expired yet: give the lock back and report
      // that compaction is disallowed
      UnlockCompaction(vocbase);
      return false;
    }
    ++pos;
  }

  // no active blocker; keep holding the lock for the caller
  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief set up the compaction blockers structure of a vocbase
/// initializes the read-write lock and an empty vector of
/// compaction_blocker_t elements. always returns TRI_ERROR_NO_ERROR
////////////////////////////////////////////////////////////////////////////////
int TRI_InitCompactorVocBase(TRI_vocbase_t* vocbase) {
  auto& blockers = vocbase->_compactionBlockers;

  TRI_InitReadWriteLock(&blockers._lock);
  TRI_InitVector(&blockers._data, TRI_UNKNOWN_MEM_ZONE,
                 sizeof(compaction_blocker_t));

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tear down the compaction blockers structure of a vocbase
/// the blocker vector is destroyed first, then the read-write lock
////////////////////////////////////////////////////////////////////////////////
void TRI_DestroyCompactorVocBase(TRI_vocbase_t* vocbase) {
  auto& blockers = vocbase->_compactionBlockers;

  TRI_DestroyVector(&blockers._data);
  TRI_DestroyReadWriteLock(&blockers._lock);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief drop all compaction blockers whose expiry time has passed
/// returns false without doing anything if the write-lock cannot be acquired
/// at once. otherwise removes the expired blockers, releases the lock and
/// returns true
////////////////////////////////////////////////////////////////////////////////
bool TRI_CleanupCompactorVocBase(TRI_vocbase_t* vocbase) {
  if (!TryLockCompaction(vocbase)) {
    // lock is busy, skip the cleanup this time
    return false;
  }

  // from here on the write-lock is ours
  double const stamp = TRI_microtime();
  auto* blockers = &vocbase->_compactionBlockers._data;
  size_t remaining = TRI_LengthVector(blockers);

  for (size_t pos = 0; pos < remaining; /* advanced inside the loop */) {
    auto const* entry =
        static_cast<compaction_blocker_t const*>(TRI_AtVector(blockers, pos));

    if (entry->_expires < stamp) {
      // expired: remove it. the next element moves into this slot, so the
      // position must not be advanced
      TRI_RemoveVector(blockers, pos);
      --remaining;
      continue;
    }

    ++pos;
  }

  UnlockCompaction(vocbase);
  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief register a new compaction blocker
/// the blocker expires `lifetime` after the current TRI_microtime() value.
/// returns TRI_ERROR_BAD_PARAMETER if lifetime is <= 0.0, the error code of
/// the vector insertion if that fails, and TRI_ERROR_NO_ERROR otherwise.
/// `*id` is only written on success and then holds the new blocker's id
////////////////////////////////////////////////////////////////////////////////
int TRI_InsertBlockerCompactorVocBase(TRI_vocbase_t* vocbase, double lifetime,
                                      TRI_voc_tick_t* id) {
  if (lifetime <= 0.0) {
    return TRI_ERROR_BAD_PARAMETER;
  }

  // build the blocker before taking the lock
  compaction_blocker_t entry;
  entry._id = TRI_NewTickServer();
  entry._expires = TRI_microtime() + lifetime;

  LockCompaction(vocbase);
  int const rc =
      TRI_PushBackVector(&vocbase->_compactionBlockers._data, &entry);
  UnlockCompaction(vocbase);

  if (rc == TRI_ERROR_NO_ERROR) {
    // hand the id of the stored blocker back to the caller
    *id = entry._id;
  }

  return rc;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief extend the lifetime of an existing compaction blocker
/// the first blocker with the given id gets a new expiry time of
/// TRI_microtime() + lifetime.
/// returns TRI_ERROR_BAD_PARAMETER if lifetime is <= 0.0,
/// TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND if no blocker has that id, and
/// TRI_ERROR_NO_ERROR otherwise
////////////////////////////////////////////////////////////////////////////////
int TRI_TouchBlockerCompactorVocBase(TRI_vocbase_t* vocbase, TRI_voc_tick_t id,
                                     double lifetime) {
  if (lifetime <= 0.0) {
    return TRI_ERROR_BAD_PARAMETER;
  }

  // assume "not found" until a matching blocker shows up
  int result = TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND;

  LockCompaction(vocbase);

  auto* blockers = &vocbase->_compactionBlockers._data;
  size_t const count = TRI_LengthVector(blockers);

  for (size_t pos = 0; pos < count; ++pos) {
    auto* entry =
        static_cast<compaction_blocker_t*>(TRI_AtVector(blockers, pos));

    if (entry->_id != id) {
      continue;
    }

    entry->_expires = TRI_microtime() + lifetime;
    result = TRI_ERROR_NO_ERROR;
    break;
  }

  UnlockCompaction(vocbase);

  return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief try to write-lock the compactor
/// makes one attempt to acquire the write-lock on the compaction blockers
/// structure. the registered blockers themselves are not inspected here.
/// if the function returns true, then the write-lock was acquired and must
/// eventually be freed by the caller (see TRI_UnlockCompactorVocBase)
////////////////////////////////////////////////////////////////////////////////
bool TRI_CheckAndLockCompactorVocBase(TRI_vocbase_t* vocbase) {
  bool const locked =
      TRI_TryWriteLockReadWriteLock(&vocbase->_compactionBlockers._lock);
  return locked;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief release the compactor write-lock
/// to be called after TRI_CheckAndLockCompactorVocBase returned true
////////////////////////////////////////////////////////////////////////////////
void TRI_UnlockCompactorVocBase(TRI_vocbase_t* vocbase) {
  TRI_WriteUnlockReadWriteLock(&vocbase->_compactionBlockers._lock);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief unregister a compaction blocker
/// removes the first blocker with the given id.
/// returns TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND if no blocker has that id,
/// and TRI_ERROR_NO_ERROR otherwise
////////////////////////////////////////////////////////////////////////////////
int TRI_RemoveBlockerCompactorVocBase(TRI_vocbase_t* vocbase,
                                      TRI_voc_tick_t id) {
  // assume "not found" until a matching blocker shows up
  int result = TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND;

  LockCompaction(vocbase);

  auto* blockers = &vocbase->_compactionBlockers._data;
  size_t const count = TRI_LengthVector(blockers);

  for (size_t pos = 0; pos < count; ++pos) {
    auto const* entry =
        static_cast<compaction_blocker_t const*>(TRI_AtVector(blockers, pos));

    if (entry->_id != id) {
      continue;
    }

    TRI_RemoveVector(blockers, pos);
    result = TRI_ERROR_NO_ERROR;
    break;
  }

  UnlockCompaction(vocbase);

  return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief compactor event loop
///
/// thread entry point. `data` must point to the TRI_vocbase_t to work on.
/// each round of the loop:
/// - tries to get the compaction write-lock and skips the compaction work if
///   that fails or if an unexpired compaction blocker exists
/// - copies the list of collections and tries to compact every loaded
///   collection that has compaction enabled and whose last compaction stamp
///   is at least COMPACTOR_COLLECTION_INTERVAL old
/// - signals the cleanup condition for every collection that was compacted
/// - sleeps briefly if something was compacted in this round, otherwise waits
///   on the compactor condition for up to COMPACTOR_INTERVAL
/// the loop ends after a round that started with vocbase->_state == 2
/// (server shutdown)
////////////////////////////////////////////////////////////////////////////////
void TRI_CompactorVocBase(void* data) {
  TRI_vocbase_t* vocbase = static_cast<TRI_vocbase_t*>(data);
  TRI_ASSERT(vocbase->_state == 1);

  // kept outside the loop so the per-round copy can reuse its storage
  std::vector<TRI_vocbase_col_t*> collections;

  while (true) {
    // keep initial _state value as vocbase->_state might change during
    // compaction loop
    int state = vocbase->_state;

    // number of collections compacted in THIS round. it must start at zero in
    // every round, including rounds in which compaction is disallowed.
    // otherwise a stale count from an earlier round would make the thread
    // poll every millisecond for as long as compaction stays blocked,
    // instead of going into the timed wait below
    int numCompacted = 0;

    // check if compaction is currently disallowed
    if (CheckAndLockCompaction(vocbase)) {
      // compaction is currently allowed
      double now = TRI_microtime();

      try {
        READ_LOCKER(readLocker, vocbase->_collectionsLock);
        // copy all collections
        collections = vocbase->_collections;
      } catch (...) {
        collections.clear();
      }

      for (auto& collection : collections) {
        bool worked = false;

        {
          TRY_READ_LOCKER(readLocker, collection->_lock);

          if (!readLocker.isLocked()) {
            // if we can't acquire the read lock instantly, we continue directly
            // we don't want to stall here for too long
            continue;
          }

          TRI_document_collection_t* document = collection->_collection;

          if (document == nullptr) {
            continue;
          }

          bool doCompact = document->_info.doCompact();

          // for document collection, compactify datafiles
          if (collection->_status == TRI_VOC_COL_STATUS_LOADED && doCompact) {
            // check whether someone else holds a read-lock on the compaction
            // lock
            TRY_WRITE_LOCKER(locker, document->_compactionLock);

            if (!locker.isLocked()) {
              // someone else is holding the compactor lock, we'll not compact
              continue;
            }

            try {
              if (document->_lastCompaction + COMPACTOR_COLLECTION_INTERVAL <=
                  now) {
                auto ce = document->ditches()->createCompactionDitch(__FILE__,
                                                                     __LINE__);

                if (ce == nullptr) {
                  // out of memory
                  LOG_TOPIC(WARN, Logger::COMPACTOR) << "out of memory when trying to create compaction ditch";
                } else {
                  try {
                    worked = CompactifyDocumentCollection(document);

                    if (!worked) {
                      // set compaction stamp
                      document->_lastCompaction = now;
                    }
                    // if we worked, then we don't set the compaction stamp to
                    // force another round of compaction
                  } catch (...) {
                    LOG_TOPIC(ERR, Logger::COMPACTOR) << "an unknown exception occurred during compaction";
                    // in case an error occurs, we must still free this ditch
                  }

                  document->ditches()->freeDitch(ce);
                }
              }
            } catch (...) {
              // in case an error occurs, we must still release the lock
              LOG_TOPIC(ERR, Logger::COMPACTOR) << "an unknown exception occurred during compaction";
            }
          }
        }  // end of lock

        if (worked) {
          ++numCompacted;

          // signal the cleanup thread that we worked and that it can now wake
          // up
          TRI_LockCondition(&vocbase->_cleanupCondition);
          TRI_SignalCondition(&vocbase->_cleanupCondition);
          TRI_UnlockCondition(&vocbase->_cleanupCondition);
        }
      }

      UnlockCompaction(vocbase);
    }

    if (numCompacted > 0) {
      // no need to sleep long or go into wait state if we worked.
      // maybe there's still work left
      usleep(1000);
    } else if (state != 2 && vocbase->_state == 1) {
      // only sleep while server is still running
      TRI_LockCondition(&vocbase->_compactorCondition);
      TRI_TimedWaitCondition(&vocbase->_compactorCondition,
                             (uint64_t)COMPACTOR_INTERVAL);
      TRI_UnlockCondition(&vocbase->_compactorCondition);
    }

    if (state == 2) {
      // server shutdown
      break;
    }
  }

  LOG_TOPIC(DEBUG, Logger::COMPACTOR) << "shutting down compactor thread";
}