From a80d6635ab14aef031ba4d3739ac84eec3aedfc9 Mon Sep 17 00:00:00 2001 From: Jan Steemann Date: Wed, 2 Sep 2015 10:18:24 +0200 Subject: [PATCH] create fat datafiles This prevents SIGBUS when a sparse datafile is accessed and the disk is full. In this case the mmapped region is not necessarily backed by physical memory, and accessing the memory may crash the program --- UnitTests/Makefile.unittests | 9 +- arangod/VocBase/datafile.cpp | 131 ++++++++----- arangod/VocBase/document-collection.cpp | 4 +- arangod/Wal/AllocatorThread.cpp | 64 +++++-- arangod/Wal/AllocatorThread.h | 22 ++- arangod/Wal/CollectorThread.cpp | 128 ++++++++++--- arangod/Wal/CollectorThread.h | 22 ++- arangod/Wal/Logfile.cpp | 3 +- arangod/Wal/LogfileManager.cpp | 67 +++++-- arangod/Wal/LogfileManager.h | 9 +- arangod/Wal/Slots.cpp | 178 +++++++++++------- arangod/Wal/Slots.h | 5 +- .../frontend/js/bootstrap/module-internal.js | 17 -- js/common/bootstrap/module-internal.js | 17 -- .../tests/recovery/disk-full-datafile.js | 103 ++++++++++ .../tests/recovery/disk-full-logfile-data.js | 105 +++++++++++ js/server/tests/recovery/disk-full-logfile.js | 97 ++++++++++ .../shell-datafile-timecritical-noncluster.js | 177 +++++++++++++++++ .../tests/shell-transactions-noncluster.js | 10 +- lib/Basics/files.cpp | 24 +++ lib/Basics/files.h | 12 ++ lib/Basics/memory-map-posix.cpp | 7 +- 22 files changed, 987 insertions(+), 224 deletions(-) create mode 100644 js/server/tests/recovery/disk-full-datafile.js create mode 100644 js/server/tests/recovery/disk-full-logfile-data.js create mode 100644 js/server/tests/recovery/disk-full-logfile.js create mode 100644 js/server/tests/shell-datafile-timecritical-noncluster.js diff --git a/UnitTests/Makefile.unittests b/UnitTests/Makefile.unittests index 488aef7f2f..8661da5764 100755 --- a/UnitTests/Makefile.unittests +++ b/UnitTests/Makefile.unittests @@ -214,7 +214,7 @@ if ENABLE_FAILURE_TESTS execute-recovery-test: @rm -rf "$(VOCDIR)" @mkdir -p "$(VOCDIR)/databases" - @builddir@/bin/arangod "$(VOCDIR)" --no-server $(SERVER_OPT) --server.threads 1 --wal.reserve-logfiles 1 --javascript.script "@top_srcdir@/js/server/tests/recovery/$(RECOVERY_SCRIPT).js" --javascript.script-parameter setup || true # the server will crash with segfault intentionally in this test + @builddir@/bin/arangod "$(VOCDIR)" --no-server $(SERVER_OPT) --server.threads 1 --wal.reserve-logfiles 1 --javascript.script "@top_srcdir@/js/server/tests/recovery/$(RECOVERY_SCRIPT).js" --javascript.script-parameter setup --log.level warning || true # the server will crash with segfault intentionally in this test @rm -f core $(VALGRIND) @builddir@/bin/arangod --no-server "$(VOCDIR)" $(SERVER_OPT) --server.threads 1 --wal.ignore-logfile-errors true --wal.reserve-logfiles 1 --javascript.script "@top_srcdir@/js/server/tests/recovery/$(RECOVERY_SCRIPT).js" --javascript.script-parameter recover || test "x$(FORCE)" == "x1" @@ -225,6 +225,10 @@ unittests-recovery: @echo "================================================================================" @echo + $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="disk-full-logfile" + $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="disk-full-logfile-data" + $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="disk-full-datafile" + $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="disk-full-datafile" $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="create-with-temp" $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="create-collection-fail" $(MAKE) execute-recovery-test PID=$(PID) RECOVERY_SCRIPT="create-database-fail" @@ -500,8 +504,9 @@ SHELL_COMMON = \ SHELL_SERVER_ONLY = \ @top_srcdir@/js/server/tests/shell-readonly-noncluster-disabled.js \ - @top_srcdir@/js/server/tests/shell-collection-not-loaded-timecritical-noncluster.js \ @top_srcdir@/js/server/tests/shell-wal-noncluster-memoryintense.js \ + @top_srcdir@/js/server/tests/shell-datafile-timecritical-noncluster.js \ + @top_srcdir@/js/server/tests/shell-collection-not-loaded-timecritical-noncluster.js \ @top_srcdir@/js/server/tests/shell-sharding-helpers.js \ @top_srcdir@/js/server/tests/shell-compaction-noncluster-timecritical.js \ @top_srcdir@/js/server/tests/shell-shaped-noncluster.js \ diff --git a/arangod/VocBase/datafile.cpp b/arangod/VocBase/datafile.cpp index a016271c66..0ddd3176f0 100644 --- a/arangod/VocBase/datafile.cpp +++ b/arangod/VocBase/datafile.cpp @@ -32,18 +32,16 @@ #endif #include "datafile.h" - #include "Basics/conversions.h" +#include "Basics/files.h" #include "Basics/hashes.h" #include "Basics/logging.h" #include "Basics/memory-map.h" #include "Basics/tri-strings.h" -#include "Basics/files.h" #include "VocBase/server.h" - // #define DEBUG_DATAFILE 1 - + // ----------------------------------------------------------------------------- // --SECTION-- private functions // ----------------------------------------------------------------------------- @@ -228,33 +226,78 @@ static bool CheckCrcMarker (TRI_df_marker_t const* marker, } //////////////////////////////////////////////////////////////////////////////// -/// @brief creates a new sparse datafile +/// @brief creates a new datafile /// /// returns the file descriptor or -1 if the file cannot be created //////////////////////////////////////////////////////////////////////////////// -static int CreateSparseFile (char const* filename, - const TRI_voc_size_t maximalSize) { +static int CreateDatafile (char const* filename, + TRI_voc_size_t maximalSize) { TRI_ERRORBUF; - TRI_lseek_t offset; - char zero; - ssize_t res; - int fd; - + // open the file - fd = TRI_CREATE(filename, O_CREAT | O_EXCL | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); + int fd = TRI_CREATE(filename, O_CREAT | O_EXCL | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); + + TRI_IF_FAILURE("CreateDatafile1") { + // intentionally fail + TRI_CLOSE(fd); + fd = -1; + errno = ENOSPC; + } if (fd < 0) { - TRI_SYSTEM_ERROR(); + if (errno == ENOSPC) { + TRI_set_errno(TRI_ERROR_ARANGO_FILESYSTEM_FULL); + LOG_ERROR("cannot create datafile '%s': %s", filename, TRI_last_error()); + } + else { + TRI_SYSTEM_ERROR(); - TRI_set_errno(TRI_ERROR_SYS_ERROR); - - LOG_ERROR("cannot create datafile '%s': %s", filename, TRI_GET_ERRORBUF); + TRI_set_errno(TRI_ERROR_SYS_ERROR); + LOG_ERROR("cannot create datafile '%s': %s", filename, TRI_GET_ERRORBUF); + } return -1; } - // create sparse file - offset = TRI_LSEEK(fd, (TRI_lseek_t) (maximalSize - 1), SEEK_SET); + // fill file with zeros from FileNullBuffer + size_t written = 0; + while (written < maximalSize) { + size_t writeSize = TRI_GetNullBufferSizeFiles(); + + if (writeSize +written > maximalSize) { + writeSize = maximalSize - written; + } + + ssize_t writeResult = TRI_WRITE(fd, TRI_GetNullBufferFiles(), writeSize); + + TRI_IF_FAILURE("CreateDatafile2") { + // intentionally fail + writeResult = -1; + errno = ENOSPC; + } + + if (writeResult < 0) { + if (errno == ENOSPC) { + TRI_set_errno(TRI_ERROR_ARANGO_FILESYSTEM_FULL); + LOG_ERROR("cannot create datafile '%s': %s", filename, TRI_last_error()); + } + else { + TRI_SYSTEM_ERROR(); + TRI_set_errno(TRI_ERROR_SYS_ERROR); + LOG_ERROR("cannot create datafile '%s': %s", filename, TRI_GET_ERRORBUF); + } + + TRI_CLOSE(fd); + TRI_UnlinkFile(filename); + + return -1; + } + + written += static_cast(writeResult); + } + + // go back to offset 0 + TRI_lseek_t offset = TRI_LSEEK(fd, (TRI_lseek_t) 0, SEEK_SET); if (offset == (TRI_lseek_t) -1) { TRI_SYSTEM_ERROR(); @@ -268,21 +311,6 @@ static int CreateSparseFile (char const* filename, return -1; } - zero = 0; - res = TRI_WRITE(fd, &zero, 1); - - if (res < 0) { - TRI_SYSTEM_ERROR(); - TRI_set_errno(TRI_ERROR_SYS_ERROR); - TRI_CLOSE(fd); - - // remove empty file - TRI_UnlinkFile(filename); - - LOG_ERROR("cannot create sparse datafile '%s': '%s'", filename, TRI_GET_ERRORBUF); - return -1; - } - return fd; } @@ -355,8 +383,6 @@ static int TruncateAndSealDatafile (TRI_datafile_t* datafile, TRI_voc_size_t vocSize) { TRI_ERRORBUF; char* oldname; - char zero; - int res; void* data; void* mmHandle; @@ -368,7 +394,7 @@ static int TruncateAndSealDatafile (TRI_datafile_t* datafile, // sanity check if (sizeof(TRI_df_header_marker_t) + sizeof(TRI_df_footer_marker_t) > maximalSize) { - LOG_ERROR("cannot create datafile '%s', maximal size '%u' is too small", datafile->getName(datafile), (unsigned int) maximalSize); + LOG_ERROR("cannot create datafile '%s', maximal size %u is too small", datafile->getName(datafile), (unsigned int) maximalSize); return TRI_set_errno(TRI_ERROR_ARANGO_MAXIMAL_SIZE_TOO_SMALL); } @@ -379,13 +405,13 @@ static int TruncateAndSealDatafile (TRI_datafile_t* datafile, if (fd < 0) { TRI_SYSTEM_ERROR(); - LOG_ERROR("cannot create new datafile '%s': '%s'", filename, TRI_GET_ERRORBUF); + LOG_ERROR("cannot create new datafile '%s': %s", filename, TRI_GET_ERRORBUF); TRI_FreeString(TRI_CORE_MEM_ZONE, filename); return TRI_set_errno(TRI_ERROR_SYS_ERROR); } - // create sparse file + // go back to the beginning of the file TRI_lseek_t offset = TRI_LSEEK(fd, (TRI_lseek_t) (maximalSize - 1), SEEK_SET); if (offset == (TRI_lseek_t) -1) { @@ -396,14 +422,14 @@ static int TruncateAndSealDatafile (TRI_datafile_t* datafile, // remove empty file TRI_UnlinkFile(filename); - LOG_ERROR("cannot seek in new datafile '%s': '%s'", filename, TRI_GET_ERRORBUF); + LOG_ERROR("cannot seek in new datafile '%s': %s", filename, TRI_GET_ERRORBUF); TRI_FreeString(TRI_CORE_MEM_ZONE, filename); return TRI_ERROR_SYS_ERROR; } - zero = 0; - res = TRI_WRITE(fd, &zero, 1); + char zero = 0; + int res = TRI_WRITE(fd, &zero, 1); if (res < 0) { TRI_SYSTEM_ERROR(); @@ -413,7 +439,7 @@ static int TruncateAndSealDatafile (TRI_datafile_t* datafile, // remove empty file TRI_UnlinkFile(filename); - LOG_ERROR("cannot create sparse datafile '%s': '%s'", filename, TRI_GET_ERRORBUF); + LOG_ERROR("cannot create datafile '%s': %s", filename, TRI_GET_ERRORBUF); TRI_FreeString(TRI_CORE_MEM_ZONE, filename); return TRI_ERROR_SYS_ERROR; @@ -430,7 +456,7 @@ static int TruncateAndSealDatafile (TRI_datafile_t* datafile, // remove empty file TRI_UnlinkFile(filename); - LOG_ERROR("cannot memory map file '%s': '%s'", filename, TRI_GET_ERRORBUF); + LOG_ERROR("cannot memory map file '%s': %s", filename, TRI_GET_ERRORBUF); TRI_FreeString(TRI_CORE_MEM_ZONE, filename); return TRI_errno(); @@ -1250,7 +1276,6 @@ TRI_datafile_t* TRI_CreateDatafile (char const* filename, return nullptr; } - datafile->_state = TRI_DF_STATE_WRITE; if (withInitialMarkers) { @@ -1358,12 +1383,9 @@ TRI_datafile_t* TRI_CreateAnonymousDatafile (TRI_voc_fid_t fid, TRI_datafile_t* TRI_CreatePhysicalDatafile (char const* filename, TRI_voc_fid_t fid, TRI_voc_size_t maximalSize) { - void* data; - void* mmHandle; - TRI_ASSERT(filename != nullptr); - int fd = CreateSparseFile(filename, maximalSize); + int fd = CreateDatafile(filename, maximalSize); if (fd < 0) { // an error occurred @@ -1371,7 +1393,14 @@ TRI_datafile_t* TRI_CreatePhysicalDatafile (char const* filename, } // memory map the data - ssize_t res = TRI_MMFile(0, maximalSize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, &mmHandle, 0, &data); + void* data; + void* mmHandle; + int flags = MAP_SHARED; +#ifdef __linux__ + // try populating the mapping already + flags |= MAP_POPULATE; +#endif + ssize_t res = TRI_MMFile(0, maximalSize, PROT_WRITE | PROT_READ, flags, fd, &mmHandle, 0, &data); if (res != TRI_ERROR_NO_ERROR) { TRI_set_errno(res); @@ -1620,7 +1649,7 @@ int TRI_ReserveElementDatafile (TRI_datafile_t* datafile, LOG_TRACE("cannot write marker, not enough space"); - return datafile->_lastError; + return TRI_ERROR_ARANGO_DATAFILE_FULL; } *position = (TRI_df_marker_t*) datafile->_next; diff --git a/arangod/VocBase/document-collection.cpp b/arangod/VocBase/document-collection.cpp index b1d3898bfd..88e0e4f7d9 100644 --- a/arangod/VocBase/document-collection.cpp +++ b/arangod/VocBase/document-collection.cpp @@ -2426,9 +2426,9 @@ TRI_datafile_t* TRI_CreateDatafileDocumentCollection (TRI_document_collection_t* TRI_IF_FAILURE("CreateJournalDocumentCollection") { // simulate disk full TRI_FreeString(TRI_CORE_MEM_ZONE, filename); - document->_lastError = TRI_set_errno(TRI_ERROR_OUT_OF_MEMORY_MMAP); + document->_lastError = TRI_set_errno(TRI_ERROR_ARANGO_FILESYSTEM_FULL); - EnsureErrorCode(ENOSPC); + EnsureErrorCode(TRI_ERROR_ARANGO_FILESYSTEM_FULL); return nullptr; } diff --git a/arangod/Wal/AllocatorThread.cpp b/arangod/Wal/AllocatorThread.cpp index b12a736ffd..b83558ad49 100644 --- a/arangod/Wal/AllocatorThread.cpp +++ b/arangod/Wal/AllocatorThread.cpp @@ -60,7 +60,9 @@ AllocatorThread::AllocatorThread (LogfileManager* logfileManager) _recoveryLock(), _requestedSize(0), _stop(0), - _inRecovery(true) { + _inRecovery(true), + _allocatorResultCondition(), + _allocatorResult(TRI_ERROR_NO_ERROR) { allowAsynchronousCancelation(); } @@ -76,6 +78,22 @@ AllocatorThread::~AllocatorThread () { // --SECTION-- public methods // ----------------------------------------------------------------------------- +//////////////////////////////////////////////////////////////////////////////// +/// @brief wait for the collector result +//////////////////////////////////////////////////////////////////////////////// + +int AllocatorThread::waitForResult (uint64_t timeout) { + CONDITION_LOCKER(guard, _allocatorResultCondition); + + if (_allocatorResult == TRI_ERROR_NO_ERROR) { + if (guard.wait(timeout)) { + return TRI_ERROR_LOCK_TIMEOUT; + } + } + + return _allocatorResult; +} + //////////////////////////////////////////////////////////////////////////////// /// @brief stops the allocator thread //////////////////////////////////////////////////////////////////////////////// @@ -109,14 +127,16 @@ void AllocatorThread::signal (uint32_t markerSize) { guard.signal(); } +// ----------------------------------------------------------------------------- +// --SECTION-- private methods +// ----------------------------------------------------------------------------- + //////////////////////////////////////////////////////////////////////////////// /// @brief creates a new reserve logfile //////////////////////////////////////////////////////////////////////////////// -bool AllocatorThread::createReserveLogfile (uint32_t size) { - int res = _logfileManager->createReserveLogfile(size); - - return (res == TRI_ERROR_NO_ERROR); +int AllocatorThread::createReserveLogfile (uint32_t size) { + return _logfileManager->createReserveLogfile(size); } // ----------------------------------------------------------------------------- @@ -130,43 +150,59 @@ bool AllocatorThread::createReserveLogfile (uint32_t size) { void AllocatorThread::run () { while (_stop == 0) { uint32_t requestedSize = 0; - + { CONDITION_LOCKER(guard, _condition); requestedSize = _requestedSize; _requestedSize = 0; } + int res = TRI_ERROR_NO_ERROR; + try { if (requestedSize == 0 && ! inRecovery() && ! _logfileManager->hasReserveLogfiles()) { // only create reserve files if we are not in the recovery mode - if (createReserveLogfile(0)) { + res = createReserveLogfile(0); + + if (res == TRI_ERROR_NO_ERROR) { continue; } - - LOG_ERROR("unable to create new WAL reserve logfile"); + + LOG_ERROR("unable to create new WAL reserve logfile for sized marker: %s", + TRI_errno_string(res)); } else if (requestedSize > 0 && - _logfileManager->logfileCreationAllowed(requestedSize)) { - if (createReserveLogfile(requestedSize)) { + _logfileManager->logfileCreationAllowed(requestedSize)) { + + res = createReserveLogfile(requestedSize); + + if (res == TRI_ERROR_NO_ERROR) { continue; } - - LOG_ERROR("unable to create new WAL reserve logfile"); + + LOG_ERROR("unable to create new WAL reserve logfile: %s", + TRI_errno_string(res)); } } catch (triagens::basics::Exception const& ex) { - int res = ex.code(); + res = ex.code(); LOG_ERROR("got unexpected error in allocatorThread: %s", TRI_errno_string(res)); } catch (...) { LOG_ERROR("got unspecific error in allocatorThread"); } + // reset allocator status + { + CONDITION_LOCKER(guard, _allocatorResultCondition); + _allocatorResult = res; + } + { CONDITION_LOCKER(guard, _condition); + guard.wait(Interval); } } diff --git a/arangod/Wal/AllocatorThread.h b/arangod/Wal/AllocatorThread.h index ddddf217c2..f86f0c0406 100644 --- a/arangod/Wal/AllocatorThread.h +++ b/arangod/Wal/AllocatorThread.h @@ -93,10 +93,10 @@ namespace triagens { void signal (uint32_t); //////////////////////////////////////////////////////////////////////////////// -/// @brief creates a new reserve logfile +/// @brief wait for allocator result //////////////////////////////////////////////////////////////////////////////// - bool createReserveLogfile (uint32_t); + int waitForResult (uint64_t); //////////////////////////////////////////////////////////////////////////////// /// @brief tell the thread that the recovery phase is over @@ -134,6 +134,12 @@ namespace triagens { private: +//////////////////////////////////////////////////////////////////////////////// +/// @brief creates a new reserve logfile +//////////////////////////////////////////////////////////////////////////////// + + int createReserveLogfile (uint32_t); + // ----------------------------------------------------------------------------- // --SECTION-- private variables // ----------------------------------------------------------------------------- @@ -176,6 +182,18 @@ namespace triagens { bool _inRecovery; +//////////////////////////////////////////////////////////////////////////////// +/// @brief condition variable for the allocator thread +//////////////////////////////////////////////////////////////////////////////// + + basics::ConditionVariable _allocatorResultCondition; + +//////////////////////////////////////////////////////////////////////////////// +/// @brief allocation result +//////////////////////////////////////////////////////////////////////////////// + + int _allocatorResult; + //////////////////////////////////////////////////////////////////////////////// /// @brief wait interval for the allocator thread when idle //////////////////////////////////////////////////////////////////////////////// diff --git a/arangod/Wal/CollectorThread.cpp b/arangod/Wal/CollectorThread.cpp index 76e64b92e7..13a2363e94 100644 --- a/arangod/Wal/CollectorThread.cpp +++ b/arangod/Wal/CollectorThread.cpp @@ -292,7 +292,9 @@ CollectorThread::CollectorThread (LogfileManager* logfileManager, _operationsQueue(), _operationsQueueInUse(false), _stop(0), - _numPendingOperations(0) { + _numPendingOperations(0), + _collectorResultCondition(), + _collectorResult(TRI_ERROR_NO_ERROR) { allowAsynchronousCancelation(); } @@ -308,6 +310,22 @@ CollectorThread::~CollectorThread () { // --SECTION-- public methods // ----------------------------------------------------------------------------- +//////////////////////////////////////////////////////////////////////////////// +/// @brief wait for the collector result +//////////////////////////////////////////////////////////////////////////////// + +int CollectorThread::waitForResult (uint64_t timeout) { + CONDITION_LOCKER(guard, _collectorResultCondition); + + if (_collectorResult == TRI_ERROR_NO_ERROR) { + if (guard.wait(timeout)) { + return TRI_ERROR_LOCK_TIMEOUT; + } + } + + return _collectorResult; +} + //////////////////////////////////////////////////////////////////////////////// /// @brief stops the collector thread //////////////////////////////////////////////////////////////////////////////// @@ -347,18 +365,35 @@ void CollectorThread::run () { while (true) { int stop = (int) _stop; - bool worked = false; + bool hasWorked = false; + bool doDelay = false; try { // step 1: collect a logfile if any qualifies - if (stop == 0) { + if (stop == 0) { // don't collect additional logfiles in case we want to shut down - worked |= this->collectLogfiles(); + bool worked; + int res = this->collectLogfiles(worked); + + if (res == TRI_ERROR_NO_ERROR) { + hasWorked |= worked; + } + else if (res == TRI_ERROR_ARANGO_FILESYSTEM_FULL) { + doDelay = true; + } } // step 2: update master pointers try { - worked |= this->processQueuedOperations(); + bool worked; + int res = this->processQueuedOperations(worked); + + if (res == TRI_ERROR_NO_ERROR) { + hasWorked |= worked; + } + else if (res == TRI_ERROR_ARANGO_FILESYSTEM_FULL) { + doDelay = true; + } } catch (...) { // re-activate the queue @@ -375,11 +410,19 @@ void CollectorThread::run () { LOG_ERROR("got unspecific error in collectorThread::run"); } - if (stop == 0 && ! worked) { + uint64_t interval = Interval; + + if (doDelay) { + hasWorked = false; + // wait longer before retrying in case disk is full + interval *= 2; + } + + if (stop == 0 && ! hasWorked) { // sleep only if there was nothing to do CONDITION_LOCKER(guard, _condition); - if (! guard.wait(Interval)) { + if (! guard.wait(interval)) { if (++counter > 10) { LOG_TRACE("wal collector has queued operations: %d", (int) numQueuedOperations()); counter = 0; @@ -408,38 +451,68 @@ void CollectorThread::run () { /// @brief step 1: perform collection of a logfile (if any) //////////////////////////////////////////////////////////////////////////////// -bool CollectorThread::collectLogfiles () { +int CollectorThread::collectLogfiles (bool& worked) { + // always init result variable + worked = false; + TRI_IF_FAILURE("CollectorThreadCollect") { - return false; + return TRI_ERROR_NO_ERROR; } Logfile* logfile = _logfileManager->getCollectableLogfile(); if (logfile == nullptr) { - return false; + return TRI_ERROR_NO_ERROR; } + worked = true; _logfileManager->setCollectionRequested(logfile); - int res = collect(logfile); + try { + int res = collect(logfile); + // LOG_TRACE("collected logfile: %llu. result: %d", (unsigned long long) logfile->id(), res); - if (res == TRI_ERROR_NO_ERROR) { - _logfileManager->setCollectionDone(logfile); - return true; + if (res == TRI_ERROR_NO_ERROR) { + // reset collector status + { + CONDITION_LOCKER(guard, _collectorResultCondition); + _collectorResult = TRI_ERROR_NO_ERROR; + } + + _logfileManager->setCollectionDone(logfile); + } + else { + // return the logfile to the logfile manager in case of errors + _logfileManager->forceStatus(logfile, Logfile::StatusType::SEALED); + + // set error in collector + { + CONDITION_LOCKER(guard, _collectorResultCondition); + _collectorResult = res; + _collectorResultCondition.broadcast(); + } + } + + return res; + } + catch (...) { + LOG_DEBUG("collecting logfile %llu failed", (unsigned long long) logfile->id()); + _logfileManager->setCollectionDone(logfile); + + return TRI_ERROR_INTERNAL; } - - // return the logfile to the logfile manager in case of errors - _logfileManager->forceStatus(logfile, Logfile::StatusType::SEALED); - return false; } //////////////////////////////////////////////////////////////////////////////// /// @brief step 2: process all still-queued collection operations //////////////////////////////////////////////////////////////////////////////// -bool CollectorThread::processQueuedOperations () { +int CollectorThread::processQueuedOperations (bool& worked) { + // always init result variable + worked = false; + TRI_IF_FAILURE("CollectorThreadProcessQueuedOperations") { - return false; + return TRI_ERROR_NO_ERROR; } { @@ -448,7 +521,7 @@ bool CollectorThread::processQueuedOperations () { if (_operationsQueue.empty()) { // nothing to do - return false; + return TRI_ERROR_NO_ERROR; } // this flag indicates that no one else must write to the queue @@ -542,7 +615,9 @@ bool CollectorThread::processQueuedOperations () { _operationsQueueInUse = false; } - return true; + worked = true; + + return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// @@ -800,7 +875,6 @@ int CollectorThread::collect (Logfile* logfile) { if (state.documentOperations.find(cid) != state.documentOperations.end()) { DocumentOperationsType const& ops = state.documentOperations[cid]; - for (auto it2 = ops.begin(); it2 != ops.end(); ++it2) { sortedOperations.push_back((*it2).second); } @@ -827,7 +901,13 @@ int CollectorThread::collect (Logfile* logfile) { if (res != TRI_ERROR_NO_ERROR && res != TRI_ERROR_ARANGO_DATABASE_NOT_FOUND && res != TRI_ERROR_ARANGO_COLLECTION_NOT_FOUND) { - LOG_WARNING("got unexpected error in CollectorThread::collect: %s", TRI_errno_string(res)); + + if (res != TRI_ERROR_ARANGO_FILESYSTEM_FULL) { + // other places already log this error, and making the logging conditional here + // prevents the log message from being shown over and over again in case the + // file system is full + LOG_WARNING("got unexpected error in CollectorThread::collect: %s", TRI_errno_string(res)); + } // abort early return res; } diff --git a/arangod/Wal/CollectorThread.h b/arangod/Wal/CollectorThread.h index 557679e131..1074e7df0c 100644 --- a/arangod/Wal/CollectorThread.h +++ b/arangod/Wal/CollectorThread.h @@ -242,6 +242,12 @@ namespace triagens { public: +//////////////////////////////////////////////////////////////////////////////// +/// @brief wait for the collector result +//////////////////////////////////////////////////////////////////////////////// + + int waitForResult (uint64_t); + //////////////////////////////////////////////////////////////////////////////// /// @brief stops the collector thread //////////////////////////////////////////////////////////////////////////////// @@ -288,13 +294,13 @@ namespace triagens { /// @brief step 1: perform collection of a logfile (if any) //////////////////////////////////////////////////////////////////////////////// - bool collectLogfiles (); + int collectLogfiles (bool&); //////////////////////////////////////////////////////////////////////////////// /// @brief step 2: process all still-queued collection operations //////////////////////////////////////////////////////////////////////////////// - bool processQueuedOperations (); + int processQueuedOperations (bool&); //////////////////////////////////////////////////////////////////////////////// /// @brief process all operations for a single collection @@ -428,6 +434,18 @@ namespace triagens { uint64_t _numPendingOperations; +//////////////////////////////////////////////////////////////////////////////// +/// @brief condition variable for the collector thread result +//////////////////////////////////////////////////////////////////////////////// + + basics::ConditionVariable _collectorResultCondition; + +//////////////////////////////////////////////////////////////////////////////// +/// @brief last collector result +//////////////////////////////////////////////////////////////////////////////// + + int _collectorResult; + //////////////////////////////////////////////////////////////////////////////// /// @brief wait interval for the collector thread when idle //////////////////////////////////////////////////////////////////////////////// diff --git a/arangod/Wal/Logfile.cpp b/arangod/Wal/Logfile.cpp index 79e99669c6..382047d115 100644 --- a/arangod/Wal/Logfile.cpp +++ b/arangod/Wal/Logfile.cpp @@ -86,8 +86,7 @@ Logfile* Logfile::createNew (std::string const& filename, } } - Logfile* logfile = new Logfile(id, df, StatusType::EMPTY); - return logfile; + return new Logfile(id, df, StatusType::EMPTY); } //////////////////////////////////////////////////////////////////////////////// diff --git a/arangod/Wal/LogfileManager.cpp b/arangod/Wal/LogfileManager.cpp index afde0c9c17..2e15bb480a 100644 --- a/arangod/Wal/LogfileManager.cpp +++ b/arangod/Wal/LogfileManager.cpp @@ -971,24 +971,26 @@ int LogfileManager::flush (bool waitForSync, if (res == TRI_ERROR_NO_ERROR) { // we need to wait for the collector... - this->waitForCollector(lastOpenLogfileId, maxWaitTime); + // LOG_TRACE("entering waitForCollector with lastOpenLogfileId %llu", (unsigned long long) lastOpenLogfileId); + res = this->waitForCollector(lastOpenLogfileId, maxWaitTime); } else if (res == TRI_ERROR_ARANGO_DATAFILE_EMPTY) { // current logfile is empty and cannot be collected // we need to wait for the collector to collect the previously sealed datafile if (lastSealedLogfileId > 0) { - this->waitForCollector(lastSealedLogfileId, maxWaitTime); + res = this->waitForCollector(lastSealedLogfileId, maxWaitTime); } } } - if (writeShutdownFile) { + if (writeShutdownFile && + (res == TRI_ERROR_NO_ERROR || res == TRI_ERROR_ARANGO_DATAFILE_EMPTY)) { // update the file with the last tick, last sealed etc. return writeShutdownInfo(false); } - return TRI_ERROR_NO_ERROR; + return res; } //////////////////////////////////////////////////////////////////////////////// @@ -1283,16 +1285,20 @@ Logfile* LogfileManager::getLogfile (Logfile::IdType id, /// @brief get a logfile for writing. this may return nullptr //////////////////////////////////////////////////////////////////////////////// -Logfile* LogfileManager::getWriteableLogfile (uint32_t size, - Logfile::StatusType& status) { +int LogfileManager::getWriteableLogfile (uint32_t size, + Logfile::StatusType& status, + Logfile*& result) { static const uint64_t SleepTime = 10 * 1000; static const uint64_t MaxIterations = 1500; size_t iterations = 0; bool haveSignalled = false; + // always initialize the result + result = nullptr; + TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { // intentionally don't return a logfile - return nullptr; + return TRI_ERROR_DEBUG; } while (++iterations < MaxIterations) { @@ -1309,12 +1315,15 @@ Logfile* LogfileManager::getWriteableLogfile (uint32_t size, // found a logfile, update the status variable and return the logfile { + // LOG_TRACE("setting lastOpenedId %llu", (unsigned long long) logfile->id()); MUTEX_LOCKER(_idLock); _lastOpenedId = logfile->id(); } + result = logfile; status = logfile->status(); - return logfile; + + return TRI_ERROR_NO_ERROR; } if (logfile->status() == Logfile::StatusType::EMPTY && @@ -1339,12 +1348,22 @@ Logfile* LogfileManager::getWriteableLogfile (uint32_t size, _allocatorThread->signal(size); haveSignalled = true; } - usleep(SleepTime); + + int res = _allocatorThread->waitForResult(SleepTime); + + if (res != TRI_ERROR_LOCK_TIMEOUT && + res != TRI_ERROR_NO_ERROR) { + TRI_ASSERT(result == nullptr); + + // some error occurred + return res; + } } + TRI_ASSERT(result == nullptr); LOG_WARNING("unable to acquire writeable WAL logfile after %llu ms", (unsigned long long) (MaxIterations * SleepTime) / 1000); - return nullptr; + return TRI_ERROR_LOCK_TIMEOUT; } //////////////////////////////////////////////////////////////////////////////// @@ -1501,6 +1520,7 @@ void LogfileManager::setCollectionDone (Logfile* logfile) { TRI_ASSERT(logfile != nullptr); Logfile::IdType id = logfile->id(); + // LOG_ERROR("setCollectionDone setting lastCollectedId to %llu", (unsigned long long) id); { WRITE_LOCKER(_logfilesLock); logfile->setStatus(Logfile::StatusType::COLLECTED); @@ -1605,8 +1625,8 @@ void LogfileManager::removeLogfile (Logfile* logfile) { /// @brief wait until a specific logfile has been collected //////////////////////////////////////////////////////////////////////////////// -void LogfileManager::waitForCollector (Logfile::IdType logfileId, - double maxWaitTime) { +int LogfileManager::waitForCollector (Logfile::IdType logfileId, + double maxWaitTime) { static const int64_t SingleWaitPeriod = 50 * 1000; int64_t maxIterations = INT64_MAX; // wait forever @@ -1616,19 +1636,32 @@ void LogfileManager::waitForCollector (Logfile::IdType logfileId, LOG_TRACE("will wait for max. %f seconds for collector to finish", maxWaitTime); } - LOG_TRACE("waiting for collector thread to collect logfile %llu", (unsigned long long) logfileId); + LOG_TRACE("waiting for collector thread to collect logfile %llu", + (unsigned long long) logfileId); // wait for the collector thread to finish the collection int64_t iterations = 0; while (++iterations < maxIterations) { if (_lastCollectedId >= logfileId) { - return; + return TRI_ERROR_NO_ERROR; } - LOG_TRACE("waiting for collector"); - usleep(SingleWaitPeriod); + int res = _collectorThread->waitForResult(SingleWaitPeriod); + + // LOG_TRACE("still waiting for collector. logfileId: %llu lastCollected: %llu, result: %d", (unsigned long long) logfileId, (unsigned long long) _lastCollectedId, (int) res); + + if (res != TRI_ERROR_LOCK_TIMEOUT && + res != TRI_ERROR_NO_ERROR) { + // some error occurred + return res; + } + + // try again } + + // waited for too long + return TRI_ERROR_LOCK_TIMEOUT; } //////////////////////////////////////////////////////////////////////////////// @@ -1763,6 +1796,7 @@ int LogfileManager::writeShutdownInfo (bool writeShutdownTime) { if (json == nullptr) { LOG_ERROR("unable to write WAL state file '%s'", filename.c_str()); + return TRI_ERROR_OUT_OF_MEMORY; } @@ -1801,7 +1835,6 @@ int LogfileManager::writeShutdownInfo (bool writeShutdownTime) { if (! ok) { LOG_ERROR("unable to write WAL state file '%s'", filename.c_str()); - return TRI_ERROR_CANNOT_WRITE_FILE; } diff --git a/arangod/Wal/LogfileManager.h b/arangod/Wal/LogfileManager.h index 32758ea64f..65840ffde6 100644 --- a/arangod/Wal/LogfileManager.h +++ b/arangod/Wal/LogfileManager.h @@ -606,8 +606,9 @@ namespace triagens { /// @brief get a logfile for writing. this may return nullptr //////////////////////////////////////////////////////////////////////////////// - Logfile* getWriteableLogfile (uint32_t, - Logfile::StatusType&); + int getWriteableLogfile (uint32_t, + Logfile::StatusType&, + Logfile*&); //////////////////////////////////////////////////////////////////////////////// /// @brief get a logfile to collect. this may return nullptr @@ -682,8 +683,8 @@ namespace triagens { /// @brief wait for the collector thread to collect a specific logfile //////////////////////////////////////////////////////////////////////////////// - void waitForCollector (Logfile::IdType, - double); + int waitForCollector (Logfile::IdType, + double); //////////////////////////////////////////////////////////////////////////////// /// @brief run the recovery procedure diff --git a/arangod/Wal/Slots.cpp b/arangod/Wal/Slots.cpp index 074fac7362..2eef45a95f 100644 --- a/arangod/Wal/Slots.cpp +++ b/arangod/Wal/Slots.cpp @@ -52,7 +52,7 @@ Slots::Slots (LogfileManager* logfileManager, : _logfileManager(logfileManager), _condition(), _lock(), - _slots(new Slot[numberOfSlots]), + _slots(nullptr), _numberOfSlots(numberOfSlots), _freeSlots(numberOfSlots), _waiting(0), @@ -63,6 +63,8 @@ Slots::Slots (LogfileManager* logfileManager, _lastCommittedTick(0), _lastCommittedDataTick(0), _numEvents(0) { + + _slots = new Slot[numberOfSlots]; } //////////////////////////////////////////////////////////////////////////////// @@ -70,9 +72,7 @@ Slots::Slots (LogfileManager* logfileManager, //////////////////////////////////////////////////////////////////////////////// Slots::~Slots () { - if (_slots != nullptr) { - delete[] _slots; - } + delete[] _slots; } // ----------------------------------------------------------------------------- @@ -110,6 +110,9 @@ int Slots::flush (bool waitForSync) { if (! waitForTick(lastTick)) { res = TRI_ERROR_ARANGO_SYNC_TIMEOUT; } + else if (! worked) { + res = TRI_ERROR_ARANGO_DATAFILE_EMPTY; + } } else if (! worked) { // logfile to flush was still empty and thus not flushed @@ -176,33 +179,41 @@ SlotInfo Slots::nextUnused (uint32_t size) { _logfile = nullptr; } + + TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { + return SlotInfo(TRI_ERROR_ARANGO_NO_JOURNAL); + } // fetch the next free logfile (this may create a new one) - Logfile::StatusType status = newLogfile(alignedSize); + Logfile::StatusType status; + int res = newLogfile(alignedSize, status); - if (_logfile == nullptr) { - usleep(10 * 1000); - - TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { - return SlotInfo(TRI_ERROR_ARANGO_NO_JOURNAL); - } - - // try again in next iteration - } - else if (status == Logfile::StatusType::EMPTY) { - // inititialise the empty logfile by writing a header marker - int res = writeHeader(slot); - - if (res != TRI_ERROR_NO_ERROR) { + if (res != TRI_ERROR_NO_ERROR) { + if (res != TRI_ERROR_ARANGO_NO_JOURNAL) { return SlotInfo(res); } - // advance to next slot - slot = &_slots[_handoutIndex]; - _logfileManager->setLogfileOpen(_logfile); + usleep(10 * 1000); + // try again in next iteration } else { - TRI_ASSERT(status == Logfile::StatusType::OPEN); + TRI_ASSERT(_logfile != nullptr); + + if (status == Logfile::StatusType::EMPTY) { + // inititialise the empty logfile by writing a header marker + int res = writeHeader(slot); + + if (res != TRI_ERROR_NO_ERROR) { + return SlotInfo(res); + } + + // advance to next slot + slot = &_slots[_handoutIndex]; + _logfileManager->setLogfileOpen(_logfile); + } + else { + TRI_ASSERT(status == Logfile::StatusType::OPEN); + } } } @@ -220,7 +231,7 @@ SlotInfo Slots::nextUnused (uint32_t size) { return SlotInfo(slot); } } - + // if we get here, all slots are busy CONDITION_LOCKER(guard, _condition); if (! hasWaited) { @@ -237,6 +248,7 @@ SlotInfo Slots::nextUnused (uint32_t size) { if (mustWait) { guard.wait(10 * 1000); } + } return SlotInfo(TRI_ERROR_ARANGO_NO_JOURNAL); @@ -295,33 +307,41 @@ SlotInfo Slots::nextUnused (uint32_t size, _logfile = nullptr; } + + TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { + return SlotInfo(TRI_ERROR_ARANGO_NO_JOURNAL); + } // fetch the next free logfile (this may create a new one) - Logfile::StatusType status = newLogfile(alignedSize); + Logfile::StatusType status; + int res = newLogfile(alignedSize, status); - if (_logfile == nullptr) { - usleep(10 * 1000); - - TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { - return SlotInfo(TRI_ERROR_ARANGO_NO_JOURNAL); - } - - // try again in next iteration - } - else if (status == Logfile::StatusType::EMPTY) { - // inititialise the empty logfile by writing a header marker - int res = writeHeader(slot); - - if (res != TRI_ERROR_NO_ERROR) { + if (res != TRI_ERROR_NO_ERROR) { + if (res != TRI_ERROR_ARANGO_NO_JOURNAL) { return SlotInfo(res); } - // advance to next slot - slot = &_slots[_handoutIndex]; - _logfileManager->setLogfileOpen(_logfile); + usleep(10 * 1000); + // try again in next iteration } else { - TRI_ASSERT(status == Logfile::StatusType::OPEN); + TRI_ASSERT(_logfile != nullptr); + + if (status == Logfile::StatusType::EMPTY) { + // inititialise the empty logfile by writing a header marker + int res = writeHeader(slot); + + if (res != TRI_ERROR_NO_ERROR) { + return SlotInfo(res); + } + + // advance to next slot + slot = &_slots[_handoutIndex]; + _logfileManager->setLogfileOpen(_logfile); + } + else { + TRI_ASSERT(status == Logfile::StatusType::OPEN); + } } } @@ -354,7 +374,7 @@ SlotInfo Slots::nextUnused (uint32_t size, return SlotInfo(slot); } } - + // if we get here, all slots are busy CONDITION_LOCKER(guard, _condition); if (! hasWaited) { @@ -404,7 +424,8 @@ void Slots::returnUsed (SlotInfo& slotInfo, /// @brief get the next synchronisable region //////////////////////////////////////////////////////////////////////////////// -SyncRegion Slots::getSyncRegion () { +SyncRegion Slots::getSyncRegion () { + bool sealRequested = false; SyncRegion region; MUTEX_LOCKER(_lock); @@ -415,11 +436,16 @@ SyncRegion Slots::getSyncRegion () { Slot const* slot = &_slots[slotIndex]; TRI_ASSERT(slot != nullptr); + if (sealRequested && slot->isUnused()) { + region.canSeal = true; + } + if (! slot->isReturned()) { // found a slot that is not yet returned // if it belongs to another logfile, we can seal the logfile we created // the region for auto otherId = slot->logfileId(); + if (region.logfileId != 0 && otherId != 0 && otherId != region.logfileId) { region.canSeal = true; @@ -440,6 +466,10 @@ SyncRegion Slots::getSyncRegion () { region.firstSlotIndex = slotIndex; region.lastSlotIndex = slotIndex; region.waitForSync = slot->waitForSync(); + + if (status == Logfile::StatusType::SEAL_REQUESTED) { + sealRequested = true; + } } else { if (slot->logfileId() != region.logfileId) { @@ -617,37 +647,46 @@ int Slots::closeLogfile (Slot::TickType& lastCommittedTick, // fall-through intentional } + + TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { + return TRI_ERROR_ARANGO_NO_JOURNAL; + } TRI_ASSERT(_logfile == nullptr); // fetch the next free logfile (this may create a new one) // note: as we don't have a real marker to write the size does // not matter (we use a size of 1 as it must be > 0) - Logfile::StatusType status = newLogfile(1); + Logfile::StatusType status; + int res = newLogfile(1, status); - if (_logfile == nullptr) { - TRI_IF_FAILURE("LogfileManagerGetWriteableLogfile") { - return TRI_ERROR_ARANGO_NO_JOURNAL; + if (res != TRI_ERROR_NO_ERROR) { + if (res != TRI_ERROR_ARANGO_NO_JOURNAL) { + return res; } usleep(10 * 1000); // try again in next iteration } - else if (status == Logfile::StatusType::EMPTY) { - // inititialise the empty logfile by writing a header marker - int res = writeHeader(slot); + else { + TRI_ASSERT(_logfile != nullptr); - if (res != TRI_ERROR_NO_ERROR) { - LOG_ERROR("could not write logfile header: %s", TRI_errno_string(res)); - return res; + if (status == Logfile::StatusType::EMPTY) { + // inititialise the empty logfile by writing a header marker + res = writeHeader(slot); + + if (res != TRI_ERROR_NO_ERROR) { + LOG_ERROR("could not write logfile header: %s", TRI_errno_string(res)); + return res; + } + + _logfileManager->setLogfileOpen(_logfile); + worked = true; + } + else { + TRI_ASSERT(status == Logfile::StatusType::OPEN); + worked = false; } - _logfileManager->setLogfileOpen(_logfile); - worked = true; - return TRI_ERROR_NO_ERROR; - } - else { - TRI_ASSERT(status == Logfile::StatusType::OPEN); - worked = false; return TRI_ERROR_NO_ERROR; } } @@ -757,13 +796,20 @@ bool Slots::waitForTick (Slot::TickType tick) { /// specified size //////////////////////////////////////////////////////////////////////////////// -Logfile::StatusType Slots::newLogfile (uint32_t size) { +int Slots::newLogfile (uint32_t size, + Logfile::StatusType& status) { TRI_ASSERT(size > 0); - Logfile::StatusType status = Logfile::StatusType::UNKNOWN; - _logfile = _logfileManager->getWriteableLogfile(size, status); + status = Logfile::StatusType::UNKNOWN; + Logfile* logfile = nullptr; + int res = _logfileManager->getWriteableLogfile(size, status, logfile); + + if (res == TRI_ERROR_NO_ERROR) { + TRI_ASSERT(logfile != nullptr); + _logfile = logfile; + } - return status; + return res; } // ----------------------------------------------------------------------------- diff --git a/arangod/Wal/Slots.h b/arangod/Wal/Slots.h index acf87d6357..ace6b9e419 100644 --- a/arangod/Wal/Slots.h +++ b/arangod/Wal/Slots.h @@ -249,7 +249,8 @@ namespace triagens { /// specified size //////////////////////////////////////////////////////////////////////////////// - Logfile::StatusType newLogfile (uint32_t); + int newLogfile (uint32_t, + Logfile::StatusType& status); // ----------------------------------------------------------------------------- // --SECTION-- private variables @@ -279,7 +280,7 @@ namespace triagens { /// @brief all slots //////////////////////////////////////////////////////////////////////////////// - Slot* const _slots; + Slot* _slots; //////////////////////////////////////////////////////////////////////////////// /// @brief the total number of slots diff --git a/js/apps/system/_admin/aardvark/APP/frontend/js/bootstrap/module-internal.js b/js/apps/system/_admin/aardvark/APP/frontend/js/bootstrap/module-internal.js index a40bfee027..4e05fbf121 100644 --- a/js/apps/system/_admin/aardvark/APP/frontend/js/bootstrap/module-internal.js +++ b/js/apps/system/_admin/aardvark/APP/frontend/js/bootstrap/module-internal.js @@ -1814,23 +1814,6 @@ exports.stopColorPrint = function (silent) { // --SECTION-- public utility functions // ----------------------------------------------------------------------------- -//////////////////////////////////////////////////////////////////////////////// -/// @brief exponentialBackoff -//////////////////////////////////////////////////////////////////////////////// - -exports.exponentialBackOff = function (n, i) { - if (i === 0) { - return 0; - } - if (n === 0) { - return 0; - } - if (n === 1) { - return Math.random() < 0.5 ? 0 : i; - } - return Math.floor(Math.random() * (n + 1)) * i; -}; - //////////////////////////////////////////////////////////////////////////////// /// @brief env //////////////////////////////////////////////////////////////////////////////// diff --git a/js/common/bootstrap/module-internal.js b/js/common/bootstrap/module-internal.js index a40bfee027..4e05fbf121 100644 --- a/js/common/bootstrap/module-internal.js +++ b/js/common/bootstrap/module-internal.js @@ -1814,23 +1814,6 @@ exports.stopColorPrint = function (silent) { // --SECTION-- public utility functions // ----------------------------------------------------------------------------- -//////////////////////////////////////////////////////////////////////////////// -/// @brief exponentialBackoff -//////////////////////////////////////////////////////////////////////////////// - -exports.exponentialBackOff = function (n, i) { - if (i === 0) { - return 0; - } - if (n === 0) { - return 0; - } - if (n === 1) { - return Math.random() < 0.5 ? 0 : i; - } - return Math.floor(Math.random() * (n + 1)) * i; -}; - //////////////////////////////////////////////////////////////////////////////// /// @brief env //////////////////////////////////////////////////////////////////////////////// diff --git a/js/server/tests/recovery/disk-full-datafile.js b/js/server/tests/recovery/disk-full-datafile.js new file mode 100644 index 0000000000..6bc38a7e99 --- /dev/null +++ b/js/server/tests/recovery/disk-full-datafile.js @@ -0,0 +1,103 @@ +/*jshint globalstrict:false, strict:false, unused : false */ +/*global fail, assertEqual */ + +//////////////////////////////////////////////////////////////////////////////// +/// @brief tests for dump/reload +/// +/// @file +/// +/// DISCLAIMER +/// +/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is triAGENS GmbH, Cologne, Germany +/// +/// @author Jan Steemann +/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany +//////////////////////////////////////////////////////////////////////////////// + +var db = require("org/arangodb").db; +var internal = require("internal"); +var jsunity = require("jsunity"); +var fs = require("fs"); + +function runSetup () { + 'use strict'; + internal.debugClearFailAt(); + + db._drop("UnitTestsRecovery"); + var c = db._create("UnitTestsRecovery"); + internal.wal.flush(true, true); + + for (var i = 0; i < 10; ++i) { + c.insert({ value: i }); + } + + internal.debugSetFailAt("CreateDatafile1"); + internal.debugSetFailAt("LogfileManagerGetWriteableLogfile"); + + try { + internal.wal.flush(); + fail(); + } + catch (err) { + } + + internal.wait(3); + + internal.debugSegfault("crashing server"); +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +//////////////////////////////////////////////////////////////////////////////// + +function recoverySuite () { + 'use strict'; + jsunity.jsUnity.attachAssertions(); + + return { + setUp: function () { + }, + tearDown: function () { + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test whether we can start the server +//////////////////////////////////////////////////////////////////////////////// + + testDiskFullLogfileData : function () { + assertEqual(10, db._collection("UnitTestsRecovery").count()); + } + + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +function main (argv) { + 'use strict'; + if (argv[1] === "setup") { + runSetup(); + return 0; + } + else { + jsunity.run(recoverySuite); + return jsunity.done().status ? 0 : 1; + } +} + diff --git a/js/server/tests/recovery/disk-full-logfile-data.js b/js/server/tests/recovery/disk-full-logfile-data.js new file mode 100644 index 0000000000..2d88eb679c --- /dev/null +++ b/js/server/tests/recovery/disk-full-logfile-data.js @@ -0,0 +1,105 @@ +/*jshint globalstrict:false, strict:false, unused : false */ +/*global fail, assertEqual */ + +//////////////////////////////////////////////////////////////////////////////// +/// @brief tests for dump/reload +/// +/// @file +/// +/// DISCLAIMER +/// +/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is triAGENS GmbH, Cologne, Germany +/// +/// @author Jan Steemann +/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany +//////////////////////////////////////////////////////////////////////////////// + +var db = require("org/arangodb").db; +var internal = require("internal"); +var jsunity = require("jsunity"); +var fs = require("fs"); + +function runSetup () { + 'use strict'; + internal.debugClearFailAt(); + + db._drop("UnitTestsRecovery"); + var c = db._create("UnitTestsRecovery"); + internal.wal.flush(true, true); + internal.debugSetFailAt("CreateDatafile1"); + internal.debugSetFailAt("LogfileManagerGetWriteableLogfile"); + + c.insert({ value: 1 }); + try { + internal.wal.flush(); + fail(); + } + catch (err) { + } + + internal.wait(3); + try { + c.save({ _key: "crashme" }, true); // wait for sync + fail(); + } + catch (err) { + } + + internal.debugSegfault("crashing server"); +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +//////////////////////////////////////////////////////////////////////////////// + +function recoverySuite () { + 'use strict'; + jsunity.jsUnity.attachAssertions(); + + return { + setUp: function () { + }, + tearDown: function () { + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test whether we can start the server +//////////////////////////////////////////////////////////////////////////////// + + testDiskFullLogfileData : function () { + assertEqual(1, db._collection("UnitTestsRecovery").count()); + } + + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +function main (argv) { + 'use strict'; + if (argv[1] === "setup") { + runSetup(); + return 0; + } + else { + jsunity.run(recoverySuite); + return jsunity.done().status ? 0 : 1; + } +} + diff --git a/js/server/tests/recovery/disk-full-logfile.js b/js/server/tests/recovery/disk-full-logfile.js new file mode 100644 index 0000000000..de7c0500ee --- /dev/null +++ b/js/server/tests/recovery/disk-full-logfile.js @@ -0,0 +1,97 @@ +/*jshint globalstrict:false, strict:false, unused : false */ +/*global fail, assertEqual */ + +//////////////////////////////////////////////////////////////////////////////// +/// @brief tests for dump/reload +/// +/// @file +/// +/// DISCLAIMER +/// +/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is triAGENS GmbH, Cologne, Germany +/// +/// @author Jan Steemann +/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany +//////////////////////////////////////////////////////////////////////////////// + +var db = require("org/arangodb").db; +var internal = require("internal"); +var jsunity = require("jsunity"); +var fs = require("fs"); + +function runSetup () { + 'use strict'; + internal.debugClearFailAt(); + + db._drop("UnitTestsRecovery"); + var c = db._create("UnitTestsRecovery"); + + internal.wal.flush(true, true); + + for (var i = 0; i < 1000; ++i) { + c.insert({ value: i }); + } + c.save({ _key: "crashme" }, true); // wait for sync + internal.debugSetFailAt("CreateDatafile1"); + internal.debugSetFailAt("LogfileManagerGetWriteableLogfile"); + internal.wal.flush(false, false); + internal.wait(2); + + internal.debugSegfault("crashing server"); +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +//////////////////////////////////////////////////////////////////////////////// + +function recoverySuite () { + 'use strict'; + jsunity.jsUnity.attachAssertions(); + + return { + setUp: function () { + }, + tearDown: function () { + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test whether we can start the server +//////////////////////////////////////////////////////////////////////////////// + + testDiskFullLogfile : function () { + assertEqual(1001, db._collection("UnitTestsRecovery").count()); + } + + }; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suite +//////////////////////////////////////////////////////////////////////////////// + +function main (argv) { + 'use strict'; + if (argv[1] === "setup") { + runSetup(); + return 0; + } + else { + jsunity.run(recoverySuite); + return jsunity.done().status ? 0 : 1; + } +} + diff --git a/js/server/tests/shell-datafile-timecritical-noncluster.js b/js/server/tests/shell-datafile-timecritical-noncluster.js new file mode 100644 index 0000000000..ddae8d0c20 --- /dev/null +++ b/js/server/tests/shell-datafile-timecritical-noncluster.js @@ -0,0 +1,177 @@ +/*jshint globalstrict:false, strict:false, maxlen : 200 */ +/*global fail, assertEqual */ + +//////////////////////////////////////////////////////////////////////////////// +/// @brief tests for transactions +/// +/// @file +/// +/// DISCLAIMER +/// +/// Copyright 2010-2012 triagens GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is triAGENS GmbH, Cologne, Germany +/// +/// @author Jan Steemann +/// @author Copyright 2013, triAGENS GmbH, Cologne, Germany +//////////////////////////////////////////////////////////////////////////////// + +var jsunity = require("jsunity"); +var internal = require("internal"); +var arangodb = require("org/arangodb"); +var db = arangodb.db; + +// ----------------------------------------------------------------------------- +// --SECTION-- test suite +// ----------------------------------------------------------------------------- + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test suite +//////////////////////////////////////////////////////////////////////////////// + +function datafileFailuresSuite () { + 'use strict'; + var cn = "UnitTestsDatafile"; + var c = null; + + return { + +//////////////////////////////////////////////////////////////////////////////// +/// @brief set up +//////////////////////////////////////////////////////////////////////////////// + + setUp : function () { + internal.debugClearFailAt(); + db._drop(cn); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief tear down +//////////////////////////////////////////////////////////////////////////////// + + tearDown : function () { + internal.debugClearFailAt(); + + if (c !== null) { + c.drop(); + } + + c = null; + internal.wait(4); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test: disk full +//////////////////////////////////////////////////////////////////////////////// + + testDiskFullDuringLogfileCreationNoJournal : function () { + ["CreateDatafile1", "CreateDatafile2"].forEach(function(what) { + internal.debugClearFailAt(); + db._drop(cn); + + while (true) { + try { + internal.wal.flush(true, true); + break; + } + catch (err) { + } + internal.wait(0.5, false); + } + c = db._create(cn); + + internal.debugSetFailAt(what); + internal.debugSetFailAt("LogfileManagerGetWriteableLogfile"); + try { + internal.wal.flush(); + fail(); + } + catch (err) { + assertEqual(internal.errors.ERROR_ARANGO_NO_JOURNAL.code, err.errorNum); + } + + internal.wait(3, false); + internal.debugClearFailAt(); + assertEqual(0, c.count()); + }); + }, + +//////////////////////////////////////////////////////////////////////////////// +/// @brief test: disk full +//////////////////////////////////////////////////////////////////////////////// + + testDiskFullDuringCollectionNoJournal : function () { + ["CreateDatafile1", "CreateDatafile2"].forEach(function(what) { + internal.debugClearFailAt(); + db._drop(cn); + + while (true) { + try { + internal.wal.flush(true, true); + break; + } + catch (err) { + } + internal.wait(0.5, false); + } + + c = db._create(cn); + for (var i = 0; i < 1000; ++i) { + c.insert({ value: i }); + } + + internal.debugSetFailAt(what); + internal.debugSetFailAt("LogfileManagerGetWriteableLogfile"); + try { + internal.wal.flush(); + fail(); + } + catch (err) { + assertEqual(internal.errors.ERROR_ARANGO_NO_JOURNAL.code, err.errorNum); + } + + internal.wait(3, false); + internal.debugClearFailAt(); + assertEqual(1000, c.count()); + }); + } + + }; +} + +// ----------------------------------------------------------------------------- +// --SECTION-- main +// ----------------------------------------------------------------------------- + +//////////////////////////////////////////////////////////////////////////////// +/// @brief executes the test suites +//////////////////////////////////////////////////////////////////////////////// + +// only run this test suite if server-side failures are enabled +if (internal.debugCanUseFailAt()) { + jsunity.run(datafileFailuresSuite); +} + +return jsunity.done(); + +// ----------------------------------------------------------------------------- +// --SECTION-- END-OF-FILE +// ----------------------------------------------------------------------------- + +// Local Variables: +// mode: outline-minor +// outline-regexp: "\\(/// @brief\\|/// @addtogroup\\|// --SECTION--\\|/// @page\\|/// @\\}\\)" +// End: + diff --git a/js/server/tests/shell-transactions-noncluster.js b/js/server/tests/shell-transactions-noncluster.js index 88da7d2f71..23b0ee056e 100644 --- a/js/server/tests/shell-transactions-noncluster.js +++ b/js/server/tests/shell-transactions-noncluster.js @@ -5365,7 +5365,15 @@ function transactionServerFailuresSuite () { assertEqual(100160, fig.uncollectedLogfileEntries); internal.debugClearFailAt(); - internal.wal.flush(true, true); + while (true) { + try { + internal.wal.flush(true, true); + break; + } + catch (err) { + internal.wait(0.5, false); + } + } assertEqual(100150, c.count()); diff --git a/lib/Basics/files.cpp b/lib/Basics/files.cpp index 081592ae74..64de051b7e 100644 --- a/lib/Basics/files.cpp +++ b/lib/Basics/files.cpp @@ -70,6 +70,12 @@ using namespace std; // --SECTION-- private variables // ----------------------------------------------------------------------------- +//////////////////////////////////////////////////////////////////////////////// +/// @brief a static buffer of zeros, used to initialize files +//////////////////////////////////////////////////////////////////////////////// + +static char NullBuffer[4096]; + //////////////////////////////////////////////////////////////////////////////// /// @brief already initialised //////////////////////////////////////////////////////////////////////////////// @@ -2439,6 +2445,22 @@ char* TRI_LocateConfigDirectory () { #endif +//////////////////////////////////////////////////////////////////////////////// +/// @brief get the address of the null buffer +//////////////////////////////////////////////////////////////////////////////// + +char* TRI_GetNullBufferFiles () { + return &NullBuffer[0]; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief get the size of the null buffer +//////////////////////////////////////////////////////////////////////////////// + +size_t TRI_GetNullBufferSizeFiles () { + return sizeof(NullBuffer); +} + // ----------------------------------------------------------------------------- // --SECTION-- module functions // ----------------------------------------------------------------------------- @@ -2452,6 +2474,8 @@ char* TRI_LocateConfigDirectory () { void TRI_InitialiseFiles (void) { // clear user-defined temp path TempPath = nullptr; + + memset(TRI_GetNullBufferFiles(), 0, TRI_GetNullBufferSizeFiles()); } //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Basics/files.h b/lib/Basics/files.h index 7e1ecb3b61..f79342479c 100644 --- a/lib/Basics/files.h +++ b/lib/Basics/files.h @@ -355,6 +355,18 @@ std::string TRI_LocateInstallDirectory (void); char* TRI_LocateConfigDirectory (void); +//////////////////////////////////////////////////////////////////////////////// +/// @brief get the address of the null buffer +//////////////////////////////////////////////////////////////////////////////// + +char* TRI_GetNullBufferFiles (); + +//////////////////////////////////////////////////////////////////////////////// +/// @brief get the size of the null buffer +//////////////////////////////////////////////////////////////////////////////// + +size_t TRI_GetNullBufferSizeFiles (); + // ----------------------------------------------------------------------------- // --SECTION-- module functions // ----------------------------------------------------------------------------- diff --git a/lib/Basics/memory-map-posix.cpp b/lib/Basics/memory-map-posix.cpp index de9635e1ac..725c05e1dd 100644 --- a/lib/Basics/memory-map-posix.cpp +++ b/lib/Basics/memory-map-posix.cpp @@ -91,7 +91,9 @@ int TRI_MMFile (void* memoryAddress, int64_t offset, void** result) { - off_t offsetRetyped = (off_t)(offset); + TRI_ASSERT(memoryAddress == nullptr); + off_t offsetRetyped = (off_t) offset; + TRI_ASSERT(offsetRetyped == 0); *mmHandle = nullptr; // only useful for Windows @@ -127,6 +129,9 @@ int TRI_UNMMFile (void* memoryAddress, if (errno == ENOSPC) { return TRI_ERROR_ARANGO_FILESYSTEM_FULL; } + if (errno == ENOMEM) { + return TRI_ERROR_OUT_OF_MEMORY_MMAP; + } return TRI_ERROR_SYS_ERROR; }