////////////////////////////////////////////////////////////////////////////////
/// @brief compactor
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2014 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
/// @author Copyright 2014, ArangoDB GmbH, Cologne, Germany
/// @author Copyright 2011-2013, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////

#ifdef _WIN32
#include "Basics/win-utils.h"
#endif

#include "compactor.h"

#include "Basics/conversions.h"
#include "Basics/files.h"
#include "Basics/logging.h"
#include "Basics/tri-strings.h"
#include "Basics/memory-map.h"
#include "Utils/transactions.h"
#include "VocBase/document-collection.h"
#include "VocBase/server.h"
#include "VocBase/vocbase.h"
#include "VocBase/VocShaper.h"

// -----------------------------------------------------------------------------
// --SECTION--                                                 private constants
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief minimum size of dead data (in bytes) in a datafile that will make
/// the datafile eligible for compaction at all.
///
/// Any datafile with less dead data than the threshold will not become a
/// candidate for compaction.
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_DEAD_SIZE_THRESHOLD (1024 * 128)

////////////////////////////////////////////////////////////////////////////////
/// @brief share of dead documents in a datafile that will trigger the
/// compaction
///
/// for example, if the collection contains 800 bytes of alive and 400 bytes of
/// dead documents, the share of the dead documents is 400 / (400 + 800) = 33 %.
/// if this value is higher than the threshold, the datafile will be compacted
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_DEAD_SIZE_SHARE (0.1)
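////////////////////////////////////////////////////////////////////////////////
/// worked example (editorial, illustrative only): with 800 bytes of alive and
/// 400 bytes of dead documents, the dead share is
///   400.0 / (400.0 + 800.0) = 0.33,
/// which exceeds COMPACTOR_DEAD_SIZE_SHARE (0.1), so the datafile becomes a
/// compaction candidate, provided it also passes the other checks in
/// CompactifyDocumentCollection below.
////////////////////////////////////////////////////////////////////////////////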
////////////////////////////////////////////////////////////////////////////////
/// @brief maximum number of datafiles to join together in one compaction run
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_MAX_FILES 4

////////////////////////////////////////////////////////////////////////////////
/// @brief maximum multiple of the journal filesize of a compacted file
/// a value of 3 means that the maximum filesize of the compacted file is
/// 3 x (collection->journalSize)
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_MAX_SIZE_FACTOR (3)

////////////////////////////////////////////////////////////////////////////////
/// @brief maximum filesize of the resulting compacted file
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_MAX_RESULT_FILESIZE (128 * 1024 * 1024)

////////////////////////////////////////////////////////////////////////////////
/// @brief datafiles smaller than the following value will be merged with others
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_MIN_SIZE (128 * 1024)

////////////////////////////////////////////////////////////////////////////////
/// @brief re-try compaction of a specific collection in this interval (in s)
////////////////////////////////////////////////////////////////////////////////

#define COMPACTOR_COLLECTION_INTERVAL (10.0)

////////////////////////////////////////////////////////////////////////////////
/// @brief compaction interval in microseconds
////////////////////////////////////////////////////////////////////////////////

static int const COMPACTOR_INTERVAL = (1 * 1000 * 1000);

// -----------------------------------------------------------------------------
// --SECTION--                                                     private types
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief compaction blocker entry
////////////////////////////////////////////////////////////////////////////////

typedef struct compaction_blocker_s {
  TRI_voc_tick_t _id;
  double         _expires;
}
compaction_blocker_t;

////////////////////////////////////////////////////////////////////////////////
/// @brief auxiliary struct used when initialising compaction
////////////////////////////////////////////////////////////////////////////////

typedef struct compaction_initial_context_s {
  TRI_document_collection_t* _document;
  int64_t                    _targetSize;
  TRI_voc_fid_t              _fid;
  bool                       _keepDeletions;
  bool                       _failed;
}
compaction_initial_context_t;

////////////////////////////////////////////////////////////////////////////////
/// @brief compaction state
////////////////////////////////////////////////////////////////////////////////

typedef struct compaction_context_s {
  TRI_document_collection_t* _document;
  TRI_datafile_t*            _compactor;
  TRI_doc_datafile_info_t    _dfi;
  bool                       _keepDeletions;
}
compaction_context_t;

////////////////////////////////////////////////////////////////////////////////
/// @brief compaction instruction for a single datafile
////////////////////////////////////////////////////////////////////////////////

typedef struct compaction_info_s {
  TRI_datafile_t* _datafile;
  bool            _keepDeletions;
}
compaction_info_t;
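////////////////////////////////////////////////////////////////////////////////
/// note (editorial): the three compaction structs above describe one run:
/// InitCompaction fills a compaction_initial_context_t to size the target
/// file, Compactifier then carries a compaction_context_t while copying
/// markers, and each input datafile is described by a compaction_info_t.
/// compaction_blocker_t is independent of a single run; it temporarily
/// disables compaction for the whole vocbase.
////////////////////////////////////////////////////////////////////////////////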
// -----------------------------------------------------------------------------
// --SECTION--                                                 private functions
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief return a marker's aligned size
////////////////////////////////////////////////////////////////////////////////

static inline int64_t AlignedSize (TRI_df_marker_t const* marker) {
  return static_cast<int64_t>(TRI_DF_ALIGN_BLOCK(marker->_size));
}

////////////////////////////////////////////////////////////////////////////////
/// @brief creates a compactor file, based on a datafile
////////////////////////////////////////////////////////////////////////////////

static TRI_datafile_t* CreateCompactor (TRI_document_collection_t* document,
                                        TRI_voc_fid_t fid,
                                        int64_t maximalSize) {
  TRI_collection_t* collection = document;

  // reserve room for one additional entry
  if (TRI_ReserveVectorPointer(&collection->_compactors, 1) != TRI_ERROR_NO_ERROR) {
    // could not get memory, exit early
    return nullptr;
  }

  TRI_LOCK_JOURNAL_ENTRIES_DOC_COLLECTION(document);

  TRI_datafile_t* compactor = TRI_CreateDatafileDocumentCollection(document, fid, static_cast<TRI_voc_size_t>(maximalSize), true);

  if (compactor != nullptr) {
    int res TRI_UNUSED = TRI_PushBackVectorPointer(&collection->_compactors, compactor);

    // we have reserved space before, so we can be sure the push succeeds
    TRI_ASSERT(res == TRI_ERROR_NO_ERROR);
  }

  TRI_UNLOCK_JOURNAL_ENTRIES_DOC_COLLECTION(document);

  return compactor;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief write a copy of the marker into the datafile
////////////////////////////////////////////////////////////////////////////////

static int CopyMarker (TRI_document_collection_t* document,
                       TRI_datafile_t* compactor,
                       TRI_df_marker_t const* marker,
                       TRI_df_marker_t** result) {
  int res = TRI_ReserveElementDatafile(compactor, marker->_size, result, 0);

  if (res != TRI_ERROR_NO_ERROR) {
    document->_lastError = TRI_set_errno(TRI_ERROR_ARANGO_NO_JOURNAL);

    return TRI_ERROR_ARANGO_NO_JOURNAL;
  }

  return TRI_WriteElementDatafile(compactor, *result, marker, false);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief locate a datafile, identified by fid, in a vector of datafiles
////////////////////////////////////////////////////////////////////////////////

static bool LocateDatafile (TRI_vector_pointer_t const* vector,
                            const TRI_voc_fid_t fid,
                            size_t* position) {
  size_t const n = vector->_length;

  for (size_t i = 0; i < n; ++i) {
    TRI_datafile_t* df = static_cast<TRI_datafile_t*>(vector->_buffer[i]);

    if (df->_fid == fid) {
      *position = i;
      return true;
    }
  }

  return false;
}
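////////////////////////////////////////////////////////////////////////////////
/// usage sketch for LocateDatafile (editorial, illustrative only): callers
/// hold the datafiles write-lock and use the returned position to splice the
/// vector, e.g.
///
///   size_t position;
///   if (LocateDatafile(&document->_datafiles, fid, &position)) {
///     TRI_RemoveVectorPointer(&document->_datafiles, position);
///   }
////////////////////////////////////////////////////////////////////////////////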
////////////////////////////////////////////////////////////////////////////////
/// @brief callback to drop a datafile
////////////////////////////////////////////////////////////////////////////////

static void DropDatafileCallback (TRI_datafile_t* datafile, void* data) {
  TRI_voc_fid_t fid;
  char* filename;
  char* name;
  char* number;
  char* copy;
  bool ok;

  TRI_document_collection_t* document = static_cast<TRI_document_collection_t*>(data);
  fid  = datafile->_fid;
  copy = nullptr;

  number   = TRI_StringUInt64(fid);
  name     = TRI_Concatenate3String("deleted-", number, ".db");
  filename = TRI_Concatenate2File(document->_directory, name);

  TRI_FreeString(TRI_CORE_MEM_ZONE, number);
  TRI_FreeString(TRI_CORE_MEM_ZONE, name);

  if (datafile->isPhysical(datafile)) {
    // copy the current filename
    copy = TRI_DuplicateStringZ(TRI_CORE_MEM_ZONE, datafile->_filename);

    ok = TRI_RenameDatafile(datafile, filename);

    if (! ok) {
      LOG_ERROR("cannot rename obsolete datafile '%s' to '%s': %s",
                copy,
                filename,
                TRI_last_error());
    }
  }

  LOG_DEBUG("finished compacting datafile '%s'", datafile->getName(datafile));

  ok = TRI_CloseDatafile(datafile);

  if (! ok) {
    LOG_ERROR("cannot close obsolete datafile '%s': %s",
              datafile->getName(datafile),
              TRI_last_error());
  }
  else if (datafile->isPhysical(datafile)) {
    int res;

    LOG_DEBUG("wiping compacted datafile from disk");

    res = TRI_UnlinkFile(filename);

    if (res != TRI_ERROR_NO_ERROR) {
      LOG_ERROR("cannot wipe obsolete datafile '%s': %s",
                datafile->getName(datafile),
                TRI_last_error());
    }

    // check for .dead files
    if (copy != nullptr) {
      // remove .dead file for datafile
      char* deadfile = TRI_Concatenate2String(copy, ".dead");

      if (deadfile != nullptr) {
        // check if .dead file exists, then remove it
        if (TRI_ExistsFile(deadfile)) {
          TRI_UnlinkFile(deadfile);
        }

        TRI_FreeString(TRI_CORE_MEM_ZONE, deadfile);
      }
    }
  }

  TRI_FreeDatafile(datafile);
  TRI_FreeString(TRI_CORE_MEM_ZONE, filename);

  if (copy != nullptr) {
    TRI_FreeString(TRI_CORE_MEM_ZONE, copy);
  }
}
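////////////////////////////////////////////////////////////////////////////////
/// worked example for the rename protocol below (editorial; assumes the usual
/// "datafile-<fid>.db" naming): for fid 123, the datafile is first renamed to
/// "temp-123.db"; only if that succeeds is the compactor renamed to
/// "datafile-123.db". Whichever step fails, the surviving file names tell the
/// startup code which file holds the authoritative data.
////////////////////////////////////////////////////////////////////////////////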
////////////////////////////////////////////////////////////////////////////////
/// @brief callback to rename a datafile
///
/// The datafile will be renamed to "temp-abc.db" (where "abc" is the fid of
/// the datafile) first. If this rename operation fails, there will be a
/// compactor file and a datafile. On startup, the datafile will be preferred
/// in this case.
/// If renaming succeeds, the compactor will be renamed to the original
/// datafile name. If that does not succeed, there is a compactor file and a
/// renamed datafile. On startup, the compactor file will be used, and the
/// renamed datafile will be treated as a temporary file and dropped.
////////////////////////////////////////////////////////////////////////////////

static void RenameDatafileCallback (TRI_datafile_t* datafile, void* data) {
  compaction_context_t* context;
  TRI_datafile_t* compactor;
  bool ok;

  context   = static_cast<compaction_context_t*>(data);
  compactor = context->_compactor;

  TRI_document_collection_t* document = context->_document;

  ok = false;
  TRI_ASSERT(datafile->_fid == compactor->_fid);

  if (datafile->isPhysical(datafile)) {
    char* number;
    char* jname;
    char* tempFilename;
    char* realName;

    realName = TRI_DuplicateString(datafile->_filename);

    // construct a suitable tempname
    number       = TRI_StringUInt64(datafile->_fid);
    jname        = TRI_Concatenate3String("temp-", number, ".db");
    tempFilename = TRI_Concatenate2File(document->_directory, jname);

    TRI_FreeString(TRI_CORE_MEM_ZONE, number);
    TRI_FreeString(TRI_CORE_MEM_ZONE, jname);

    if (! TRI_RenameDatafile(datafile, tempFilename)) {
      LOG_ERROR("unable to rename datafile '%s' to '%s'",
                datafile->getName(datafile),
                tempFilename);
    }
    else {
      if (! TRI_RenameDatafile(compactor, realName)) {
        LOG_ERROR("unable to rename compaction file '%s' to '%s'",
                  compactor->getName(compactor),
                  realName);
      }
      else {
        ok = true;
      }
    }

    TRI_FreeString(TRI_CORE_MEM_ZONE, tempFilename);
    TRI_FreeString(TRI_CORE_MEM_ZONE, realName);
  }
  else {
    ok = true;
  }

  if (ok) {
    TRI_doc_datafile_info_t* dfi;
    size_t i;

    // must acquire a write-lock as we're about to change the datafiles vector
    TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

    if (! LocateDatafile(&document->_datafiles, datafile->_fid, &i)) {
      TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

      LOG_ERROR("logic error: could not locate datafile");
      return;
    }

    // put the compactor in place of the datafile
    document->_datafiles._buffer[i] = compactor;

    // update dfi
    dfi = TRI_FindDatafileInfoDocumentCollection(document, compactor->_fid, false);

    if (dfi != nullptr) {
      memcpy(dfi, &context->_dfi, sizeof(TRI_doc_datafile_info_t));
    }
    else {
      LOG_ERROR("logic error: could not find compactor file information");
    }

    if (! LocateDatafile(&document->_compactors, compactor->_fid, &i)) {
      TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

      LOG_ERROR("logic error: could not locate compactor");
      return;
    }

    // remove the compactor from the list of compactors
    TRI_RemoveVectorPointer(&document->_compactors, i);

    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

    DropDatafileCallback(datafile, document);
  }

  TRI_Free(TRI_CORE_MEM_ZONE, context);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief datafile iterator, copies "live" data from datafile into compactor
///
/// this function is called for all markers in the collected datafiles. Its
/// purpose is to find the still-alive markers and copy them into the compactor
/// file.
/// IMPORTANT: if the logic inside this function is adjusted, the total size
/// calculated by function CalculateSize might need adjustment, too!!
////////////////////////////////////////////////////////////////////////////////

static bool Compactifier (TRI_df_marker_t const* marker,
                          void* data,
                          TRI_datafile_t* datafile) {
  TRI_df_marker_t* result;
  int res;

  compaction_context_t* context = static_cast<compaction_context_t*>(data);
  TRI_document_collection_t* document = context->_document;

  // new or updated document
  if (marker->_type == TRI_DOC_MARKER_KEY_DOCUMENT ||
      marker->_type == TRI_DOC_MARKER_KEY_EDGE) {
    bool deleted;

    TRI_doc_document_key_marker_t const* d = reinterpret_cast<TRI_doc_document_key_marker_t const*>(marker);
    TRI_voc_key_t key = (char*) d + d->_offsetKey;

    // check if the document is still active
    auto primaryIndex = document->primaryIndex();
    auto found = static_cast<TRI_doc_mptr_t const*>(primaryIndex->lookupKey(key));
    deleted = (found == nullptr || found->_rid > d->_rid);

    if (deleted) {
      LOG_TRACE("found a stale document: %s", key);
      return true;
    }

    context->_keepDeletions = true;

    // write to compactor files
    res = CopyMarker(document, context->_compactor, marker, &result);

    if (res != TRI_ERROR_NO_ERROR) {
      // TODO: don't fail but recover from this state
      LOG_FATAL_AND_EXIT("cannot write compactor file: %s", TRI_last_error());
    }

    // check if the document is still active
    found = static_cast<TRI_doc_mptr_t const*>(primaryIndex->lookupKey(key));
    deleted = (found == nullptr);

    if (deleted) {
      context->_dfi._numberDead += 1;
      context->_dfi._sizeDead += AlignedSize(marker);

      LOG_DEBUG("found a stale document after copying: %s", key);

      return true;
    }

    TRI_doc_mptr_t* found2 = const_cast<TRI_doc_mptr_t*>(found);
    TRI_ASSERT(found2->getDataPtr() != nullptr);  // ONLY in COMPACTIFIER, PROTECTED by fake trx outside
    TRI_ASSERT(((TRI_df_marker_t*) found2->getDataPtr())->_size > 0);  // ONLY in COMPACTIFIER, PROTECTED by fake trx outside

    // the fid might change
    if (found->_fid != context->_compactor->_fid) {
      // update old datafile's info
      TRI_doc_datafile_info_t* dfi = TRI_FindDatafileInfoDocumentCollection(document, found->_fid, false);

      if (dfi != nullptr) {
        dfi->_numberDead += 1;
        dfi->_sizeDead += AlignedSize(marker);
      }

      found2->_fid = context->_compactor->_fid;
    }

    // let marker point to the new position
    found2->setDataPtr(result);

    // update datafile info
    context->_dfi._numberAlive += 1;
    context->_dfi._sizeAlive += AlignedSize(marker);
  }

  // deletions
  else if (marker->_type == TRI_DOC_MARKER_KEY_DELETION &&
           context->_keepDeletions) {
    // write to compactor files
    res = CopyMarker(document, context->_compactor, marker, &result);

    if (res != TRI_ERROR_NO_ERROR) {
      // TODO: don't fail but recover from this state
      LOG_FATAL_AND_EXIT("cannot write document marker to compactor file: %s", TRI_last_error());
    }

    // update datafile info
    context->_dfi._numberDeletion++;
  }

  // shapes
  else if (marker->_type == TRI_DF_MARKER_SHAPE) {
    // write to compactor files
    res = CopyMarker(document, context->_compactor, marker, &result);

    if (res != TRI_ERROR_NO_ERROR) {
      // TODO: don't fail but recover from this state
      LOG_FATAL_AND_EXIT("cannot write shape marker to compactor file: %s", TRI_last_error());
    }

    res = document->getShaper()->moveMarker(result, nullptr);  // ONLY IN COMPACTOR, PROTECTED by fake trx in caller

    if (res != TRI_ERROR_NO_ERROR) {
      LOG_FATAL_AND_EXIT("cannot re-locate shape marker");
    }

    context->_dfi._numberShapes++;
    context->_dfi._sizeShapes += AlignedSize(marker);
  }

  // attributes
  else if (marker->_type == TRI_DF_MARKER_ATTRIBUTE) {
    // write to compactor files
    res = CopyMarker(document, context->_compactor, marker, &result);

    if (res != TRI_ERROR_NO_ERROR) {
      // TODO: don't fail but recover from this state
      LOG_FATAL_AND_EXIT("cannot write attribute marker to compactor file: %s", TRI_last_error());
    }

    res = document->getShaper()->moveMarker(result, nullptr);  // ONLY IN COMPACTOR, PROTECTED by fake trx in caller

    if (res != TRI_ERROR_NO_ERROR) {
      LOG_FATAL_AND_EXIT("cannot re-locate attribute marker");
    }

    context->_dfi._numberAttributes++;
    context->_dfi._sizeAttributes += AlignedSize(marker);
  }

  // transaction markers
  else if (marker->_type == TRI_DOC_MARKER_BEGIN_TRANSACTION ||
           marker->_type == TRI_DOC_MARKER_COMMIT_TRANSACTION ||
           marker->_type == TRI_DOC_MARKER_ABORT_TRANSACTION ||
           marker->_type == TRI_DOC_MARKER_PREPARE_TRANSACTION) {
    // these markers are not used from ArangoDB 2.2 onwards.
    // still, datafiles of older collections might contain these
    // markers and we need to copy them

    if (document->_failedTransactions != nullptr) {
      // write to compactor files
      res = CopyMarker(document, context->_compactor, marker, &result);

      if (res != TRI_ERROR_NO_ERROR) {
        // TODO: don't fail but recover from this state
        LOG_FATAL_AND_EXIT("cannot write transaction marker to compactor file: %s", TRI_last_error());
      }

      context->_dfi._numberTransactions++;
      context->_dfi._sizeTransactions += AlignedSize(marker);
    }
    // otherwise don't copy
  }

  return true;
}
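////////////////////////////////////////////////////////////////////////////////
/// note (editorial): Compactifier consults the primary index twice per
/// document marker on purpose. The first lookup skips markers that are
/// already stale; the second, performed after the copy, catches documents
/// that were removed while the marker was being written, so their copies are
/// accounted as dead in the new datafile rather than as alive.
////////////////////////////////////////////////////////////////////////////////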
////////////////////////////////////////////////////////////////////////////////
/// @brief remove an empty compactor file
////////////////////////////////////////////////////////////////////////////////

static int RemoveCompactor (TRI_document_collection_t* document,
                            TRI_datafile_t* compactor) {
  size_t i;

  LOG_TRACE("removing empty compaction file '%s'", compactor->getName(compactor));

  // remove the datafile from the list of datafiles
  TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

  // remove the compactor from the list of compactors
  if (! LocateDatafile(&document->_compactors, compactor->_fid, &i)) {
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

    LOG_ERROR("logic error: could not locate compactor");
    return TRI_ERROR_INTERNAL;
  }

  TRI_RemoveVectorPointer(&document->_compactors, i);

  TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  // close the file & remove it
  if (compactor->isPhysical(compactor)) {
    char* filename;

    filename = TRI_DuplicateString(compactor->getName(compactor));

    TRI_CloseDatafile(compactor);
    TRI_FreeDatafile(compactor);

    TRI_UnlinkFile(filename);
    TRI_Free(TRI_CORE_MEM_ZONE, filename);
  }
  else {
    TRI_CloseDatafile(compactor);
    TRI_FreeDatafile(compactor);
  }

  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief remove an empty datafile
////////////////////////////////////////////////////////////////////////////////

static int RemoveDatafile (TRI_document_collection_t* document,
                           TRI_datafile_t* df) {
  TRI_doc_datafile_info_t* dfi;
  size_t i;

  LOG_TRACE("removing empty datafile '%s'", df->getName(df));

  // remove the datafile from the list of datafiles
  TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

  if (! LocateDatafile(&document->_datafiles, df->_fid, &i)) {
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

    LOG_ERROR("logic error: could not locate datafile");
    return TRI_ERROR_INTERNAL;
  }

  TRI_RemoveVectorPointer(&document->_datafiles, i);

  // update dfi
  dfi = TRI_FindDatafileInfoDocumentCollection(document, df->_fid, false);

  if (dfi != nullptr) {
    TRI_RemoveDatafileInfoDocumentCollection(document, df->_fid);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, dfi);
  }

  TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief datafile iterator, calculates necessary total size
////////////////////////////////////////////////////////////////////////////////

static bool CalculateSize (TRI_df_marker_t const* marker,
                           void* data,
                           TRI_datafile_t* datafile) {
  compaction_initial_context_t* context = static_cast<compaction_initial_context_t*>(data);
  TRI_document_collection_t* document = context->_document;

  // new or updated document
  if (marker->_type == TRI_DOC_MARKER_KEY_DOCUMENT ||
      marker->_type == TRI_DOC_MARKER_KEY_EDGE) {
    bool deleted;

    TRI_doc_document_key_marker_t const* d = reinterpret_cast<TRI_doc_document_key_marker_t const*>(marker);
    TRI_voc_key_t key = (char*) d + d->_offsetKey;

    // check if the document is still active
    auto primaryIndex = document->primaryIndex();
    auto found = static_cast<TRI_doc_mptr_t const*>(primaryIndex->lookupKey(key));
    deleted = (found == nullptr || found->_rid > d->_rid);

    if (deleted) {
      return true;
    }

    context->_keepDeletions = true;
    context->_targetSize += AlignedSize(marker);
  }

  // deletions
  else if (marker->_type == TRI_DOC_MARKER_KEY_DELETION &&
           context->_keepDeletions) {
    context->_targetSize += AlignedSize(marker);
  }

  // shapes, attributes
  else if (marker->_type == TRI_DF_MARKER_SHAPE ||
           marker->_type == TRI_DF_MARKER_ATTRIBUTE) {
    context->_targetSize += AlignedSize(marker);
  }

  // transaction markers
  else if (marker->_type == TRI_DOC_MARKER_BEGIN_TRANSACTION ||
           marker->_type == TRI_DOC_MARKER_COMMIT_TRANSACTION ||
           marker->_type == TRI_DOC_MARKER_ABORT_TRANSACTION ||
           marker->_type == TRI_DOC_MARKER_PREPARE_TRANSACTION) {
    if (document->_failedTransactions != nullptr) {
      // these markers only need to be copied if there are "old" failed transactions
      context->_targetSize += AlignedSize(marker);
    }
  }

  return true;
}
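////////////////////////////////////////////////////////////////////////////////
/// note (editorial): together with the base size added in InitCompaction
/// below, the compactor's target size is effectively
///
///   sizeof(TRI_df_header_marker_t) + sizeof(TRI_col_header_marker_t) +
///   sizeof(TRI_df_footer_marker_t) + 256
///   + sum of AlignedSize(marker) over all markers CalculateSize keeps.
///
/// if the keep/skip logic in Compactifier changes, CalculateSize must change
/// with it, or the compactor file may end up sized too small.
////////////////////////////////////////////////////////////////////////////////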
////////////////////////////////////////////////////////////////////////////////
/// @brief calculate the target size for the compactor to be created
////////////////////////////////////////////////////////////////////////////////

static compaction_initial_context_t InitCompaction (TRI_document_collection_t* document,
                                                    TRI_vector_t const* compactions) {
  compaction_initial_context_t context;

  memset(&context, 0, sizeof(compaction_initial_context_t));
  context._failed = false;
  context._document = document;

  // this is the minimum required size
  context._targetSize = sizeof(TRI_df_header_marker_t) +
                        sizeof(TRI_col_header_marker_t) +
                        sizeof(TRI_df_footer_marker_t) +
                        256;  // allow for some overhead

  size_t const n = TRI_LengthVector(compactions);

  for (size_t i = 0; i < n; ++i) {
    compaction_info_t* compaction = static_cast<compaction_info_t*>(TRI_AtVector(compactions, i));
    TRI_datafile_t* df = compaction->_datafile;

    // we will sequentially scan the datafile, so tell the kernel
    if (df->isPhysical(df)) {
      TRI_MMFileAdvise(df->_data, df->_maximalSize, TRI_MADVISE_SEQUENTIAL);
      TRI_MMFileAdvise(df->_data, df->_maximalSize, TRI_MADVISE_WILLNEED);
    }

    if (i == 0) {
      // extract and store fid
      context._fid = compaction->_datafile->_fid;
    }

    context._keepDeletions = compaction->_keepDeletions;

    bool ok = TRI_IterateDatafile(df, CalculateSize, &context);

    if (df->isPhysical(df)) {
      TRI_MMFileAdvise(df->_data, df->_maximalSize, TRI_MADVISE_RANDOM);
    }

    if (! ok) {
      context._failed = true;
      break;
    }
  }

  return context;
}
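////////////////////////////////////////////////////////////////////////////////
/// note (editorial): InitCompaction advises the kernel with
/// TRI_MADVISE_SEQUENTIAL and TRI_MADVISE_WILLNEED before each full datafile
/// scan and resets the mapping to TRI_MADVISE_RANDOM afterwards, because
/// regular document access after the scan is point-wise rather than
/// sequential.
////////////////////////////////////////////////////////////////////////////////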
////////////////////////////////////////////////////////////////////////////////
/// @brief compact a list of datafiles
////////////////////////////////////////////////////////////////////////////////

static void CompactifyDatafiles (TRI_document_collection_t* document,
                                 TRI_vector_t const* compactions) {
  TRI_datafile_t* compactor;
  compaction_initial_context_t initial;
  compaction_context_t context;
  size_t i, j, n;

  n = TRI_LengthVector(compactions);
  TRI_ASSERT(n > 0);

  // create a fake transaction
  triagens::arango::TransactionBase trx(true);

  initial = InitCompaction(document, compactions);

  if (initial._failed) {
    LOG_ERROR("could not initialise compaction");

    return;
  }

  LOG_TRACE("compactify called for collection '%llu' for %d datafiles of total size %llu",
            (unsigned long long) document->_info._cid,
            (int) n,
            (unsigned long long) initial._targetSize);

  // now create a new compactor file
  // we are re-using the _fid of the first original datafile!
  compactor = CreateCompactor(document, initial._fid, initial._targetSize);

  if (compactor == nullptr) {
    // some error occurred
    LOG_ERROR("could not create compactor file");

    return;
  }

  LOG_DEBUG("created new compactor file '%s'", compactor->getName(compactor));

  memset(&context._dfi, 0, sizeof(TRI_doc_datafile_info_t));
  // these attributes remain the same for all datafiles we collect
  context._document = document;
  context._compactor = compactor;
  context._dfi._fid = compactor->_fid;

  // now compact all datafiles
  for (i = 0; i < n; ++i) {
    compaction_info_t* compaction = static_cast<compaction_info_t*>(TRI_AtVector(compactions, i));
    TRI_datafile_t* df = compaction->_datafile;

    LOG_TRACE("compacting datafile '%s' into '%s', number: %d, keep deletions: %d",
              df->getName(df),
              compactor->getName(compactor),
              (int) i,
              (int) compaction->_keepDeletions);

    // if this is the first datafile in the list of datafiles, we can also collect
    // deletion markers
    context._keepDeletions = compaction->_keepDeletions;

    TRI_WRITE_LOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(document);

    // run the actual compaction of a single datafile
    bool ok = TRI_IterateDatafile(df, Compactifier, &context);

    TRI_WRITE_UNLOCK_DOCUMENTS_INDEXES_PRIMARY_COLLECTION(document);

    if (! ok) {
      LOG_WARNING("failed to compact datafile '%s'", df->getName(df));
      // compactor file does not need to be removed now. will be removed on next startup
      // TODO: Remove
      return;
    }
  } // next file

  // locate the compactor
  // must acquire a write-lock as we're about to change the datafiles vector
  TRI_WRITE_LOCK_DATAFILES_DOC_COLLECTION(document);

  if (! LocateDatafile(&document->_compactors, compactor->_fid, &j)) {
    // not found
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

    LOG_ERROR("logic error in CompactifyDatafiles: could not find compactor");
    return;
  }

  if (! TRI_CloseDatafileDocumentCollection(document, j, true)) {
    TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

    LOG_ERROR("could not close compactor file");
    // TODO: how do we recover from this state?
    return;
  }

  TRI_WRITE_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  if (context._dfi._numberAlive == 0 &&
      context._dfi._numberDead == 0 &&
      context._dfi._numberDeletion == 0 &&
      context._dfi._numberShapes == 0 &&
      context._dfi._numberAttributes == 0 &&
      context._dfi._numberTransactions == 0) {

    if (n > 1) {
      // create .dead files for all collected files
      for (i = 0; i < n; ++i) {
        compaction_info_t* compaction = static_cast<compaction_info_t*>(TRI_AtVector(compactions, i));
        TRI_datafile_t* datafile = compaction->_datafile;

        if (datafile->isPhysical(datafile)) {
          char* filename = TRI_Concatenate2String(datafile->getName(datafile), ".dead");

          if (filename != nullptr) {
            TRI_WriteFile(filename, "", 0);
            TRI_FreeString(TRI_CORE_MEM_ZONE, filename);
          }
        }
      }
    }

    // compactor is fully empty. remove it
    RemoveCompactor(document, compactor);

    for (i = 0; i < n; ++i) {
      compaction_info_t* compaction = static_cast<compaction_info_t*>(TRI_AtVector(compactions, i));

      // datafile is also empty after compaction and thus useless
      RemoveDatafile(document, compaction->_datafile);

      // add a deletion ditch to the collection
      auto b = document->ditches()->createDropDatafileDitch(compaction->_datafile, document, DropDatafileCallback, __FILE__, __LINE__);

      if (b == nullptr) {
        LOG_ERROR("out of memory when creating datafile-drop ditch");
      }
    }
  }
  else {
    if (n > 1) {
      // create .dead files for all collected files but the first
      for (i = 1; i < n; ++i) {
        compaction_info_t* compaction = static_cast<compaction_info_t*>(TRI_AtVector(compactions, i));
        TRI_datafile_t* datafile = compaction->_datafile;

        if (datafile->isPhysical(datafile)) {
          char* filename = TRI_Concatenate2String(datafile->getName(datafile), ".dead");

          if (filename != nullptr) {
            TRI_WriteFile(filename, "", 0);
            TRI_FreeString(TRI_CORE_MEM_ZONE, filename);
          }
        }
      }
    }

    for (i = 0; i < n; ++i) {
      compaction_info_t* compaction = static_cast<compaction_info_t*>(TRI_AtVector(compactions, i));

      if (i == 0) {
        // add a rename marker
        void* copy = TRI_Allocate(TRI_CORE_MEM_ZONE, sizeof(compaction_context_t), false);

        if (copy == nullptr) {
          // allocation failure: skip the rename ditch instead of dereferencing a nullptr
          LOG_ERROR("out of memory when allocating compaction context");
        }
        else {
          memcpy(copy, &context, sizeof(compaction_context_t));

          auto b = document->ditches()->createRenameDatafileDitch(compaction->_datafile, copy, RenameDatafileCallback, __FILE__, __LINE__);

          if (b == nullptr) {
            LOG_ERROR("out of memory when creating datafile-rename ditch");
            TRI_Free(TRI_CORE_MEM_ZONE, copy);
          }
        }
      }
      else {
        // datafile is empty after compaction and thus useless
        RemoveDatafile(document, compaction->_datafile);

        // add a drop datafile marker
        auto b = document->ditches()->createDropDatafileDitch(compaction->_datafile, document, DropDatafileCallback, __FILE__, __LINE__);

        if (b == nullptr) {
          LOG_ERROR("out of memory when creating datafile-drop ditch");
        }
      }
    }
  }
}
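////////////////////////////////////////////////////////////////////////////////
/// note (editorial): the empty ".dead" files written above act as sentinels
/// for fully compacted datafiles. If the server stops before the drop ditches
/// run, the sentinels presumably allow the next startup to identify and
/// remove the obsolete datafiles; in the normal case, DropDatafileCallback
/// removes the sentinel together with its datafile.
////////////////////////////////////////////////////////////////////////////////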
////////////////////////////////////////////////////////////////////////////////
/// @brief checks all datafiles of a collection
////////////////////////////////////////////////////////////////////////////////

static bool CompactifyDocumentCollection (TRI_document_collection_t* document) {
  // we can hopefully get away without the lock here...
  // if (! TRI_IsFullyCollectedDocumentCollection(document)) {
  //   return false;
  // }

  // if we cannot acquire the read lock instantly, we will exit directly.
  // otherwise we'll risk a multi-thread deadlock between synchroniser,
  // compactor and data-modification threads (e.g. POST /_api/document)
  if (! TRI_TRY_READ_LOCK_DATAFILES_DOC_COLLECTION(document)) {
    return false;
  }

  size_t const n = document->_datafiles._length;

  if (document->_compactors._length > 0 || n == 0) {
    // we have already created a compactor file in progress.
    // if this happens, then a previous compaction attempt for this collection failed.
    // additionally, if there are no datafiles, then there's no need to compact
    TRI_READ_UNLOCK_DATAFILES_DOC_COLLECTION(document);
    return false;
  }

  // get maximum size of result file
  uint64_t maxSize = (uint64_t) COMPACTOR_MAX_SIZE_FACTOR * (uint64_t) document->_info._maximalSize;
  if (maxSize < 8 * 1024 * 1024) {
    maxSize = 8 * 1024 * 1024;
  }
  if (maxSize >= COMPACTOR_MAX_RESULT_FILESIZE) {
    maxSize = COMPACTOR_MAX_RESULT_FILESIZE;
  }

  // copy datafile information
  TRI_vector_t vector;
  TRI_InitVector(&vector, TRI_UNKNOWN_MEM_ZONE, sizeof(compaction_info_t));

  int64_t numAlive = 0;
  bool compactNext = false;
  // total size of the datafiles collected so far; must accumulate across loop
  // iterations so the maxSize check below considers all collected files
  uint64_t totalSize = 0;

  for (size_t i = 0; i < n; ++i) {
    TRI_doc_datafile_info_t* dfi;
    compaction_info_t compaction;
    bool shouldCompact;

    TRI_datafile_t* df = static_cast<TRI_datafile_t*>(document->_datafiles._buffer[i]);

    TRI_ASSERT(df != nullptr);

    dfi = TRI_FindDatafileInfoDocumentCollection(document, df->_fid, false);

    if (dfi == nullptr) {
      // datafile info not found. this shouldn't happen
      LOG_WARNING("datafile info not found for datafile %llu", (unsigned long long) df->_fid);
      continue;
    }

    shouldCompact = false;

    if (! compactNext &&
        df->_maximalSize < COMPACTOR_MIN_SIZE &&
        i < n - 1) {
      // very small datafile. let's compact it so it's merged with others
      shouldCompact = true;
      compactNext = true;
    }
    else if (numAlive == 0 && dfi->_numberAlive == 0 && dfi->_numberDeletion > 0) {
      // compact first datafile(s) already if they have some deletions
      shouldCompact = true;
      compactNext = true;
    }
    else {
      // in all other cases, only check the number and size of "dead" objects
      if (dfi->_sizeDead >= (int64_t) COMPACTOR_DEAD_SIZE_THRESHOLD) {
        // the size of dead objects is above the absolute threshold
        shouldCompact = true;
        compactNext = true;
      }
      else if (dfi->_sizeDead > 0) {
        // check whether the share of dead objects is above the threshold
        double share = (double) dfi->_sizeDead / ((double) dfi->_sizeDead + (double) dfi->_sizeAlive);

        if (share >= COMPACTOR_DEAD_SIZE_SHARE) {
          // the share of dead objects is above the threshold
          shouldCompact = true;
          compactNext = true;
        }
      }
    }

    if (! shouldCompact) {
      // only use those datafiles that contain dead objects
      if (! compactNext) {
        numAlive += (int64_t) dfi->_numberAlive;
        continue;
      }
    }
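    // worked example (editorial, illustrative only): with
    // COMPACTOR_DEAD_SIZE_THRESHOLD at 128 KB, a datafile holding 200 KB of
    // dead documents qualifies outright; one holding 40 KB dead next to
    // 100 KB alive qualifies via its dead share of
    //   40.0 / (40.0 + 100.0) = 0.29 >= COMPACTOR_DEAD_SIZE_SHARE (0.1).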
    LOG_TRACE("found datafile eligible for compaction. fid: %llu, size: %llu "
              "numberDead: %llu, numberAlive: %llu, numberDeletion: %llu, "
              "numberShapes: %llu, numberAttributes: %llu, transactions: %llu, "
              "sizeDead: %llu, sizeAlive: %llu, sizeShapes %llu, sizeAttributes: %llu, "
              "sizeTransactions: %llu",
              (unsigned long long) df->_fid,
              (unsigned long long) df->_maximalSize,
              (unsigned long long) dfi->_numberDead,
              (unsigned long long) dfi->_numberAlive,
              (unsigned long long) dfi->_numberDeletion,
              (unsigned long long) dfi->_numberShapes,
              (unsigned long long) dfi->_numberAttributes,
              (unsigned long long) dfi->_numberTransactions,
              (unsigned long long) dfi->_sizeDead,
              (unsigned long long) dfi->_sizeAlive,
              (unsigned long long) dfi->_sizeShapes,
              (unsigned long long) dfi->_sizeAttributes,
              (unsigned long long) dfi->_sizeTransactions);
    totalSize += (uint64_t) df->_maximalSize;

    compaction._datafile = df;
    compaction._keepDeletions = (numAlive > 0 && i > 0);

    TRI_PushBackVector(&vector, &compaction);

    // we stop at the first few datafiles.
    // this is better than going over all datafiles in a collection in one go
    // because the compactor is single-threaded, and collecting all datafiles
    // might take a long time (it might even be that there is a request to
    // delete the collection in the middle of compaction, but the compactor
    // will not pick this up as it is read-locking the collection status)
    if (TRI_LengthVector(&vector) >= COMPACTOR_MAX_FILES ||
        totalSize >= maxSize) {
      // found enough to compact
      break;
    }

    numAlive += (int64_t) dfi->_numberAlive;
  }

  // can now continue without the lock
  TRI_READ_UNLOCK_DATAFILES_DOC_COLLECTION(document);

  if (TRI_LengthVector(&vector) == 0) {
    // cleanup local variables
    TRI_DestroyVector(&vector);
    return false;
  }

  // handle datafiles with dead objects
  TRI_ASSERT(TRI_LengthVector(&vector) >= 1);

  CompactifyDatafiles(document, &vector);

  // cleanup local variables
  TRI_DestroyVector(&vector);

  return true;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief try to write-lock the compaction
/// returns true if lock acquisition was successful. the caller is responsible
/// for eventually releasing the write lock
////////////////////////////////////////////////////////////////////////////////

static bool TryLockCompaction (TRI_vocbase_t* vocbase) {
  return TRI_TryWriteLockReadWriteLock(&vocbase->_compactionBlockers._lock);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief write-lock the compaction
////////////////////////////////////////////////////////////////////////////////

static void LockCompaction (TRI_vocbase_t* vocbase) {
  while (! TryLockCompaction(vocbase)) {
    // cycle in 1 ms steps until we have acquired the write-lock
    usleep(1000);
  }
}

////////////////////////////////////////////////////////////////////////////////
/// @brief write-unlock the compaction
////////////////////////////////////////////////////////////////////////////////

static void UnlockCompaction (TRI_vocbase_t* vocbase) {
  TRI_WriteUnlockReadWriteLock(&vocbase->_compactionBlockers._lock);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief atomic check and lock for running the compaction
/// if this function returns true, it has acquired a write-lock on the
/// compactionBlockers structure, which the caller must free eventually
////////////////////////////////////////////////////////////////////////////////

static bool CheckAndLockCompaction (TRI_vocbase_t* vocbase) {
  // check if we can acquire the write lock instantly
  if (! TryLockCompaction(vocbase)) {
    // couldn't acquire the write lock
    return false;
  }

  // we are now holding the write lock

  double now = TRI_microtime();

  // check if we have a still-valid compaction blocker
  size_t const n = TRI_LengthVector(&vocbase->_compactionBlockers._data);

  for (size_t i = 0; i < n; ++i) {
    compaction_blocker_t* blocker = static_cast<compaction_blocker_t*>(TRI_AtVector(&vocbase->_compactionBlockers._data, i));

    if (blocker->_expires > now) {
      // found a compaction blocker. unlock and return
      UnlockCompaction(vocbase);
      return false;
    }
  }

  return true;
}

// -----------------------------------------------------------------------------
// --SECTION--                                                  public functions
// -----------------------------------------------------------------------------

////////////////////////////////////////////////////////////////////////////////
/// @brief initialise the compaction blockers structure
////////////////////////////////////////////////////////////////////////////////

int TRI_InitCompactorVocBase (TRI_vocbase_t* vocbase) {
  TRI_InitReadWriteLock(&vocbase->_compactionBlockers._lock);
  TRI_InitVector(&vocbase->_compactionBlockers._data, TRI_UNKNOWN_MEM_ZONE, sizeof(compaction_blocker_t));

  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief destroy the compaction blockers structure
////////////////////////////////////////////////////////////////////////////////

void TRI_DestroyCompactorVocBase (TRI_vocbase_t* vocbase) {
  TRI_DestroyVector(&vocbase->_compactionBlockers._data);
  TRI_DestroyReadWriteLock(&vocbase->_compactionBlockers._lock);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief remove data of expired compaction blockers
////////////////////////////////////////////////////////////////////////////////

bool TRI_CleanupCompactorVocBase (TRI_vocbase_t* vocbase) {
  // check if we can instantly acquire the lock
  if (! TryLockCompaction(vocbase)) {
    // couldn't acquire lock
    return false;
  }

  // we are now holding the write lock

  double now = TRI_microtime();

  size_t n = TRI_LengthVector(&vocbase->_compactionBlockers._data);
  size_t i = 0;

  while (i < n) {
    compaction_blocker_t* blocker = static_cast<compaction_blocker_t*>(TRI_AtVector(&vocbase->_compactionBlockers._data, i));

    if (blocker->_expires < now) {
      // remove the expired blocker. do not advance i, as the next element
      // has moved into the current position
      TRI_RemoveVector(&vocbase->_compactionBlockers._data, i);
      n--;
    }
    else {
      i++;
    }
  }

  UnlockCompaction(vocbase);

  return true;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief insert a compaction blocker
////////////////////////////////////////////////////////////////////////////////

int TRI_InsertBlockerCompactorVocBase (TRI_vocbase_t* vocbase,
                                       double lifetime,
                                       TRI_voc_tick_t* id) {
  if (lifetime <= 0.0) {
    return TRI_ERROR_BAD_PARAMETER;
  }

  compaction_blocker_t blocker;
  blocker._id      = TRI_NewTickServer();
  blocker._expires = TRI_microtime() + lifetime;

  LockCompaction(vocbase);

  int res = TRI_PushBackVector(&vocbase->_compactionBlockers._data, &blocker);

  UnlockCompaction(vocbase);

  if (res != TRI_ERROR_NO_ERROR) {
    return res;
  }

  *id = blocker._id;

  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief touch an existing compaction blocker
////////////////////////////////////////////////////////////////////////////////

int TRI_TouchBlockerCompactorVocBase (TRI_vocbase_t* vocbase,
                                      TRI_voc_tick_t id,
                                      double lifetime) {
  bool found = false;

  if (lifetime <= 0.0) {
    return TRI_ERROR_BAD_PARAMETER;
  }

  LockCompaction(vocbase);

  size_t const n = TRI_LengthVector(&vocbase->_compactionBlockers._data);

  for (size_t i = 0; i < n; ++i) {
    compaction_blocker_t* blocker = static_cast<compaction_blocker_t*>(TRI_AtVector(&vocbase->_compactionBlockers._data, i));

    if (blocker->_id == id) {
      blocker->_expires = TRI_microtime() + lifetime;
      found = true;
      break;
    }
  }

  UnlockCompaction(vocbase);

  if (! found) {
    return TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND;
  }

  return TRI_ERROR_NO_ERROR;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief atomically check-and-lock the compactor
/// if the function returns true, then a write-lock on the compactor was
/// acquired, which must eventually be freed by the caller
////////////////////////////////////////////////////////////////////////////////

bool TRI_CheckAndLockCompactorVocBase (TRI_vocbase_t* vocbase) {
  return TryLockCompaction(vocbase);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief unlock the compactor
////////////////////////////////////////////////////////////////////////////////

void TRI_UnlockCompactorVocBase (TRI_vocbase_t* vocbase) {
  UnlockCompaction(vocbase);
}

////////////////////////////////////////////////////////////////////////////////
/// @brief remove an existing compaction blocker
////////////////////////////////////////////////////////////////////////////////

int TRI_RemoveBlockerCompactorVocBase (TRI_vocbase_t* vocbase,
                                       TRI_voc_tick_t id) {
  bool found = false;

  LockCompaction(vocbase);

  size_t const n = TRI_LengthVector(&vocbase->_compactionBlockers._data);

  for (size_t i = 0; i < n; ++i) {
    compaction_blocker_t* blocker = static_cast<compaction_blocker_t*>(TRI_AtVector(&vocbase->_compactionBlockers._data, i));

    if (blocker->_id == id) {
      TRI_RemoveVector(&vocbase->_compactionBlockers._data, i);
      found = true;
      break;
    }
  }

  UnlockCompaction(vocbase);

  if (! found) {
    return TRI_ERROR_ARANGO_DOCUMENT_NOT_FOUND;
  }

  return TRI_ERROR_NO_ERROR;
}
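////////////////////////////////////////////////////////////////////////////////
/// usage sketch for the blocker API above (editorial, illustrative only,
/// error handling abbreviated): a caller that must keep the compactor away
/// for a while, e.g. during a dump, could do:
///
///   TRI_voc_tick_t blockerId;
///   int res = TRI_InsertBlockerCompactorVocBase(vocbase, 120.0, &blockerId);
///
///   if (res == TRI_ERROR_NO_ERROR) {
///     // ... work; extend the lifetime with TRI_TouchBlockerCompactorVocBase
///     TRI_RemoveBlockerCompactorVocBase(vocbase, blockerId);
///   }
////////////////////////////////////////////////////////////////////////////////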
////////////////////////////////////////////////////////////////////////////////
/// @brief compactor event loop
////////////////////////////////////////////////////////////////////////////////

void TRI_CompactorVocBase (void* data) {
  TRI_vocbase_t* vocbase = static_cast<TRI_vocbase_t*>(data);
  int numCompacted = 0;

  TRI_ASSERT(vocbase->_state == 1);

  std::vector<TRI_vocbase_col_t*> collections;

  while (true) {
    // keep initial _state value as vocbase->_state might change during compaction loop
    int state = vocbase->_state;

    // check if compaction is currently disallowed
    if (CheckAndLockCompaction(vocbase)) {
      // compaction is currently allowed
      double now = TRI_microtime();
      numCompacted = 0;

      try {
        READ_LOCKER(vocbase->_collectionsLock);
        // copy all collections
        collections = vocbase->_collections;
      }
      catch (...) {
        collections.clear();
      }

      for (auto& collection : collections) {
        if (! TRI_TRY_READ_LOCK_STATUS_VOCBASE_COL(collection)) {
          // if we can't acquire the read lock instantly, we continue directly.
          // we don't want to stall here for too long
          continue;
        }

        TRI_document_collection_t* document = collection->_collection;

        if (document == nullptr) {
          TRI_READ_UNLOCK_STATUS_VOCBASE_COL(collection);
          continue;
        }

        bool worked = false;
        bool doCompact = document->_info._doCompact;

        // for a document collection, compact its datafiles
        if (collection->_status == TRI_VOC_COL_STATUS_LOADED && doCompact) {
          // check whether someone else holds a read-lock on the compaction lock
          if (! TRI_TryWriteLockReadWriteLock(&document->_compactionLock)) {
            // someone else is holding the compactor lock, we'll not compact
            TRI_READ_UNLOCK_STATUS_VOCBASE_COL(collection);
            continue;
          }

          if (document->_lastCompaction + COMPACTOR_COLLECTION_INTERVAL <= now) {
            auto ce = document->ditches()->createCompactionDitch(__FILE__, __LINE__);

            if (ce == nullptr) {
              // out of memory
              LOG_WARNING("out of memory when trying to create compaction ditch");
            }
            else {
              worked = CompactifyDocumentCollection(document);

              if (! worked) {
                // set compaction stamp
                document->_lastCompaction = now;
              }
              // if we worked, then we don't set the compaction stamp to force
              // another round of compaction

              document->ditches()->freeDitch(ce);
            }
          }

          // write-unlock the compaction lock
          TRI_WriteUnlockReadWriteLock(&document->_compactionLock);
        }

        TRI_READ_UNLOCK_STATUS_VOCBASE_COL(collection);

        if (worked) {
          ++numCompacted;

          // signal the cleanup thread that we worked and that it can now wake up
          TRI_LockCondition(&vocbase->_cleanupCondition);
          TRI_SignalCondition(&vocbase->_cleanupCondition);
          TRI_UnlockCondition(&vocbase->_cleanupCondition);
        }
      }

      UnlockCompaction(vocbase);
    }

    if (numCompacted > 0) {
      // no need to sleep long or go into wait state if we worked.
      // maybe there's still work left
      usleep(1000);
    }
    else if (state != 2 && vocbase->_state == 1) {
      // only sleep while server is still running
      TRI_LockCondition(&vocbase->_compactorCondition);
      TRI_TimedWaitCondition(&vocbase->_compactorCondition, (uint64_t) COMPACTOR_INTERVAL);
      TRI_UnlockCondition(&vocbase->_compactorCondition);
    }

    if (state == 2) {
      // server shutdown
      break;
    }
  }

  LOG_TRACE("shutting down compactor thread");
}

// -----------------------------------------------------------------------------
// --SECTION--                                                       END-OF-FILE
// -----------------------------------------------------------------------------

// Local Variables:
// mode: outline-minor
// outline-regexp: "/// @brief\\|/// {@inheritDoc}\\|/// @page\\|// --SECTION--\\|/// @\\}"
// End: