//////////////////////////////////////////////////////////////////////////////// /// DISCLAIMER /// /// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Dr. Frank Celler //////////////////////////////////////////////////////////////////////////////// #include "MMFilesDatafile.h" #include "ApplicationFeatures/PageSizeFeature.h" #include "Basics/FileUtils.h" #include "Basics/StaticStrings.h" #include "Basics/StringUtils.h" #include "Basics/encoding.h" #include "Basics/files.h" #include "Basics/hashes.h" #include "Basics/memory-map.h" #include "Basics/tri-strings.h" #include "Basics/ScopeGuard.h" #include "Logger/Logger.h" #include "MMFiles/MMFilesDatafileHelper.h" #include "VocBase/ticks.h" #include #include #include #include #include #ifdef TRI_HAVE_UNISTD_H #include #endif using namespace arangodb; using namespace arangodb::basics; namespace { /// @brief create a hex string representation of the value static std::string hexValue(uint64_t value) { static const uint64_t Bits[] = {56, 48, 40, 32, 24, 16, 8, 0}; std::string line("0x"); for (uint64_t i = 0; i < 8; ++i) { uint8_t c = static_cast((static_cast(value) >> Bits[i]) & 0xFFULL); uint8_t n1 = c >> 4; uint8_t n2 = c & 0x0F; line.push_back((n1 < 10) ? ('0' + n1) : 'A' + n1 - 10); line.push_back((n2 < 10) ? ('0' + n2) : 'A' + n2 - 10); } return line; } /// @brief calculate a CRC value the same way as ArangoDB 2.8 did static TRI_voc_crc_t Crc28(TRI_voc_crc_t crc, void const* data, size_t length) { static TRI_voc_crc_t const CrcPolynomial = 0xEDB88320; unsigned char* current = (unsigned char*)data; while (length--) { crc ^= *current++; for (unsigned int i = 0; i < 8; ++i) { if (crc & 1) { crc = (crc >> 1) ^ CrcPolynomial; } else { crc = crc >> 1; } } } return crc; } /// @brief check if a marker appears to be created by ArangoDB 2.8 static bool IsMarker28(void const* marker, size_t length) { struct Marker28 { uint32_t _size; TRI_voc_crc_t _crc; uint32_t _type; #ifdef TRI_PADDING_32 char _padding_df_marker[4]; #endif TRI_voc_tick_t _tick; }; uint32_t zero = 0; off_t o = offsetof(Marker28, _crc); size_t n = sizeof(TRI_voc_crc_t); char const* ptr = static_cast(marker); Marker28 const* m = static_cast(marker); if (m->_size < o + n || (m->_size - o - n > length)) { return false; } TRI_voc_crc_t crc = TRI_InitialCrc32(); crc = Crc28(crc, ptr, o); crc = Crc28(crc, (char*)&zero, n); crc = Crc28(crc, ptr + o + n, m->_size - o - n); crc = TRI_FinalCrc32(crc); return crc == m->_crc; } /// @brief calculates the actual CRC of a marker, without bounds checks static TRI_voc_crc_t CalculateCrcValue(MMFilesMarker const* marker) { uint32_t zero = 0; off_t o = marker->offsetOfCrc(); size_t n = sizeof(TRI_voc_crc_t); char const* ptr = reinterpret_cast(marker); TRI_voc_crc_t crc = TRI_InitialCrc32(); crc = TRI_BlockCrc32(crc, ptr, o); crc = TRI_BlockCrc32(crc, (char*)&zero, n); crc = TRI_BlockCrc32(crc, ptr + o + n, marker->getSize() - o - n); crc = TRI_FinalCrc32(crc); return crc; } /// @brief checks a CRC of a marker, with bounds checks static bool CheckCrcMarker(MMFilesMarker const* marker, char const* end) { uint32_t const size = marker->getSize(); if (size < sizeof(MMFilesMarker)) { return false; } if (reinterpret_cast(marker) + size > end) { return false; } auto expected = CalculateCrcValue(marker); return marker->getCrc() == expected; } //////////////////////////////////////////////////////////////////////////////// /// @brief extract the numeric part from a filename /// the filename must look like this: /.*type-abc\.ending$/, where abc is /// a number, and type and ending are arbitrary letters //////////////////////////////////////////////////////////////////////////////// static uint64_t GetNumericFilenamePart(char const* filename) { char const* pos1 = strrchr(filename, '.'); if (pos1 == nullptr) { return 0; } char const* pos2 = strrchr(filename, '-'); if (pos2 == nullptr || pos2 > pos1) { return 0; } return StringUtils::uint64(pos2 + 1, pos1 - pos2 - 1); } } // namespace //////////////////////////////////////////////////////////////////////////////// /// @brief creates a new anonymous datafile /// /// this is only supported on certain platforms (Linux, MacOS) //////////////////////////////////////////////////////////////////////////////// #ifdef TRI_HAVE_ANONYMOUS_MMAP static MMFilesDatafile* CreateAnonymousDatafile(TRI_voc_fid_t fid, uint32_t maximalSize) { #ifdef TRI_MMAP_ANONYMOUS // fd -1 is required for "real" anonymous regions int fd = -1; int flags = TRI_MMAP_ANONYMOUS | MAP_SHARED; #else // ugly workaround if MAP_ANONYMOUS is not available int fd = TRI_OPEN("/dev/zero", O_RDWR | TRI_O_CLOEXEC); if (fd == -1) { return nullptr; } int flags = MAP_PRIVATE; #endif // memory map the data void* data; void* mmHandle; int res = TRI_MMFile(nullptr, maximalSize, PROT_WRITE | PROT_READ, flags, fd, &mmHandle, 0, &data); #ifdef MAP_ANONYMOUS // nothing to do #else // close auxilliary file TRI_CLOSE(fd); fd = -1; #endif if (res != TRI_ERROR_NO_ERROR) { TRI_set_errno(res); LOG_TOPIC("48c6f", ERR, arangodb::Logger::DATAFILES) << "cannot memory map anonymous region: " << TRI_last_error(); LOG_TOPIC("f4929", ERR, arangodb::Logger::DATAFILES) << "The database directory might reside on a shared folder " "(VirtualBox, VMWare) or an NFS " "mounted volume which does not allow memory mapped files."; return nullptr; } return new MMFilesDatafile(StaticStrings::Empty, fd, mmHandle, maximalSize, 0, fid, static_cast(data)); } #endif //////////////////////////////////////////////////////////////////////////////// /// @brief creates a new physical datafile //////////////////////////////////////////////////////////////////////////////// static MMFilesDatafile* CreatePhysicalDatafile(std::string const& filename, TRI_voc_fid_t fid, uint32_t maximalSize) { TRI_ASSERT(!filename.empty()); int fd = TRI_CreateDatafile(filename, maximalSize); if (fd < 0) { // an error occurred return nullptr; } // memory map the data void* data; void* mmHandle; int flags = MAP_SHARED; #ifdef __linux__ // try populating the mapping already flags |= MAP_POPULATE; #endif int res = TRI_MMFile(nullptr, maximalSize, PROT_WRITE | PROT_READ, flags, fd, &mmHandle, 0, &data); if (res != TRI_ERROR_NO_ERROR) { TRI_set_errno(res); TRI_CLOSE(fd); // remove empty file TRI_UnlinkFile(filename.c_str()); LOG_TOPIC("fb701", ERR, arangodb::Logger::DATAFILES) << "cannot memory map file '" << filename << "': '" << TRI_errno_string(res) << "'"; LOG_TOPIC("0c2e1", ERR, arangodb::Logger::DATAFILES) << "The database directory might reside on a shared folder " "(VirtualBox, VMWare) or an NFS-mounted volume which does not allow " "memory mapped files."; return nullptr; } // create datafile structure try { return new MMFilesDatafile(filename, fd, mmHandle, maximalSize, 0, fid, static_cast(data)); } catch (...) { TRI_CLOSE(fd); return nullptr; } } /// @brief whether or not a datafile is empty int MMFilesDatafile::judge(std::string const& filename) { off_t filesize = basics::FileUtils::size(filename); if (filesize == 0) { // empty logfile return TRI_ERROR_ARANGO_DATAFILE_EMPTY; } if (filesize < static_cast(256 * sizeof(uint64_t))) { // too small return TRI_ERROR_ARANGO_DATAFILE_UNREADABLE; } int fd = TRI_OPEN(filename.c_str(), O_RDONLY | TRI_O_CLOEXEC); if (fd < 0) { return TRI_ERROR_ARANGO_DATAFILE_UNREADABLE; } uint64_t buffer[256]; if (!TRI_ReadPointer(fd, &buffer, 256 * sizeof(uint64_t))) { TRI_CLOSE(fd); return TRI_ERROR_ARANGO_DATAFILE_UNREADABLE; } uint64_t* ptr = buffer; uint64_t* end = buffer + 256; while (ptr < end) { if (*ptr != 0) { TRI_CLOSE(fd); return TRI_ERROR_NO_ERROR; } ++ptr; } TRI_CLOSE(fd); return TRI_ERROR_ARANGO_DATAFILE_EMPTY; } /// @brief creates either an anonymous or a physical datafile MMFilesDatafile* MMFilesDatafile::create(std::string const& filename, TRI_voc_fid_t fid, uint32_t maximalSize, bool withInitialMarkers) { size_t pageSize = PageSizeFeature::getPageSize(); TRI_ASSERT(pageSize >= 256); // use multiples of page-size maximalSize = (uint32_t)(((maximalSize + pageSize - 1) / pageSize) * pageSize); // sanity check maximal size if (sizeof(MMFilesDatafileHeaderMarker) + sizeof(MMFilesDatafileFooterMarker) > maximalSize) { LOG_TOPIC("4e7f0", ERR, arangodb::Logger::DATAFILES) << "cannot create datafile, maximal size '" << maximalSize << "' is too small"; TRI_set_errno(TRI_ERROR_ARANGO_MAXIMAL_SIZE_TOO_SMALL); return nullptr; } // create either an anonymous or a physical datafile std::unique_ptr datafile; if (filename.empty()) { #ifdef TRI_HAVE_ANONYMOUS_MMAP datafile.reset(CreateAnonymousDatafile(fid, maximalSize)); #endif } else { datafile.reset(CreatePhysicalDatafile(filename, fid, maximalSize)); } if (datafile == nullptr) { // an error occurred during creation return nullptr; } datafile->setState(TRI_DF_STATE_WRITE); if (withInitialMarkers) { int res = datafile->writeInitialHeaderMarker(fid, maximalSize); if (res != TRI_ERROR_NO_ERROR) { LOG_TOPIC("ae8c8", ERR, arangodb::Logger::DATAFILES) << "cannot write header to datafile '" << datafile->getName() << "'"; TRI_UNMMFile(const_cast(datafile->data()), datafile->initSize(), datafile->fd(), &datafile->_mmHandle); datafile->close(); return nullptr; } } LOG_TOPIC("d0f2e", DEBUG, arangodb::Logger::DATAFILES) << "created datafile '" << datafile->getName() << "' of size " << maximalSize << " and page-size " << pageSize; return datafile.release(); } //////////////////////////////////////////////////////////////////////////////// /// @brief returns the name for a marker //////////////////////////////////////////////////////////////////////////////// char const* TRI_NameMarkerDatafile(MMFilesMarkerType type) { switch (type) { // general markers case TRI_DF_MARKER_HEADER: return "datafile header"; case TRI_DF_MARKER_FOOTER: return "footer"; case TRI_DF_MARKER_BLANK: return "blank marker (used when repairing datafiles)"; case TRI_DF_MARKER_COL_HEADER: return "collection header"; case TRI_DF_MARKER_PROLOGUE: return "prologue"; case TRI_DF_MARKER_VPACK_DOCUMENT: return "document"; case TRI_DF_MARKER_VPACK_REMOVE: return "remove document"; case TRI_DF_MARKER_VPACK_CREATE_COLLECTION: return "create collection"; case TRI_DF_MARKER_VPACK_DROP_COLLECTION: return "drop collection"; case TRI_DF_MARKER_VPACK_RENAME_COLLECTION: return "rename collection"; case TRI_DF_MARKER_VPACK_CHANGE_COLLECTION: return "change collection"; case TRI_DF_MARKER_VPACK_CREATE_INDEX: return "create index"; case TRI_DF_MARKER_VPACK_DROP_INDEX: return "drop index"; case TRI_DF_MARKER_VPACK_CREATE_DATABASE: return "create database"; case TRI_DF_MARKER_VPACK_DROP_DATABASE: return "drop database"; case TRI_DF_MARKER_VPACK_BEGIN_TRANSACTION: return "begin transaction"; case TRI_DF_MARKER_VPACK_COMMIT_TRANSACTION: return "commit transaction"; case TRI_DF_MARKER_VPACK_ABORT_TRANSACTION: return "abort transaction"; case TRI_DF_MARKER_VPACK_CREATE_VIEW: return "create view"; case TRI_DF_MARKER_VPACK_DROP_VIEW: return "drop view"; case TRI_DF_MARKER_VPACK_CHANGE_VIEW: return "change view"; default: return "unknown"; } } //////////////////////////////////////////////////////////////////////////////// /// @brief checks whether a marker is valid //////////////////////////////////////////////////////////////////////////////// bool TRI_IsValidMarkerDatafile(MMFilesMarker const* marker) { if (marker == nullptr) { return false; } // check marker type MMFilesMarkerType const type = marker->getType(); if (type <= TRI_DF_MARKER_MIN) { // marker type is less than minimum allowed type value return false; } if (type >= TRI_DF_MARKER_MAX) { // marker type is greater than maximum allowed type value return false; } if (marker->getSize() >= MMFilesDatafileHelper::MaximalMarkerSize()) { // a single marker bigger than this limit seems unreasonable // note: this is an arbitrary limit return false; } return true; } /// @brief reserves room for an element, advances the pointer /// note: maximalJournalSize is the collection's maximalJournalSize property, /// which may be different from the size of the current datafile /// some callers do not set the value of maximalJournalSize int MMFilesDatafile::reserveElement(uint32_t size, MMFilesMarker** position, uint32_t maximalJournalSize) { *position = nullptr; size = encoding::alignedSize(size); if (_state != TRI_DF_STATE_WRITE) { if (_state == TRI_DF_STATE_READ) { LOG_TOPIC("d84c7", ERR, arangodb::Logger::DATAFILES) << "cannot reserve marker, datafile is read-only"; return TRI_ERROR_ARANGO_READ_ONLY; } return TRI_ERROR_ARANGO_ILLEGAL_STATE; } // check the maximal size if (size + MMFilesDatafileHelper::JournalOverhead() > _maximalSize) { // marker is bigger than journal size. // adding the marker to this datafile will not work if (maximalJournalSize <= _maximalSize) { // the collection property 'maximalJournalSize' is equal to // or smaller than the size of this datafile // creating a new file and writing the marker into it will not work either return TRI_ERROR_ARANGO_DOCUMENT_TOO_LARGE; } // if we get here, the collection's 'maximalJournalSize' property is // higher than the size of this datafile. // maybe the marker will fit into a new datafile with the bigger size? if (size + MMFilesDatafileHelper::JournalOverhead() > maximalJournalSize) { // marker still won't fit return TRI_ERROR_ARANGO_DOCUMENT_TOO_LARGE; } // intentionally falls through } // add the marker, leave enough room for the footer if (_currentSize + size + _footerSize > _maximalSize) { _lastError = TRI_set_errno(TRI_ERROR_ARANGO_DATAFILE_FULL); _full = true; LOG_TOPIC("c0ce8", TRACE, arangodb::Logger::DATAFILES) << "cannot write marker, not enough space"; return TRI_ERROR_ARANGO_DATAFILE_FULL; } *position = reinterpret_cast(_next); TRI_ASSERT(*position != nullptr); advanceWritePosition(size); return TRI_ERROR_NO_ERROR; } void MMFilesDatafile::sequentialAccess() { TRI_MMFileAdvise(_data, _initSize, TRI_MADVISE_SEQUENTIAL); } void MMFilesDatafile::randomAccess() { TRI_MMFileAdvise(_data, _initSize, TRI_MADVISE_RANDOM); } void MMFilesDatafile::willNeed() { TRI_MMFileAdvise(_data, _initSize, TRI_MADVISE_WILLNEED); } void MMFilesDatafile::dontNeed() { TRI_MMFileAdvise(_data, _initSize, TRI_MADVISE_DONTNEED); } bool MMFilesDatafile::readOnly() { return (TRI_ProtectMMFile(_data, _initSize, PROT_READ, _fd) == TRI_ERROR_NO_ERROR); } bool MMFilesDatafile::readWrite() { return (TRI_ProtectMMFile(_data, _initSize, PROT_READ | PROT_WRITE, _fd) == TRI_ERROR_NO_ERROR); } void MMFilesDatafile::dontDump() { TRI_MMFileAdvise(_data, _initSize, TRI_MADVISE_DONTDUMP); } int MMFilesDatafile::lockInMemory() { TRI_ASSERT(!_lockedInMemory); int res = TRI_MMFileLock(_data, _initSize); if (res == TRI_ERROR_NO_ERROR) { _lockedInMemory = true; } return res; } int MMFilesDatafile::unlockFromMemory() { if (!_lockedInMemory) { return TRI_ERROR_NO_ERROR; } int res = TRI_MMFileUnlock(_data, _initSize); if (res == TRI_ERROR_NO_ERROR) { _lockedInMemory = false; } return res; } /// @brief writes a marker to the datafile /// this function will write the marker as-is, without any CRC or tick updates int MMFilesDatafile::writeElement(void* position, MMFilesMarker const* marker) { TRI_ASSERT(marker->getTick() > 0); TRI_ASSERT(marker->getSize() > 0); TRI_UpdateTicksDatafile(this, marker); if (_state != TRI_DF_STATE_WRITE) { if (_state == TRI_DF_STATE_READ) { LOG_TOPIC("313ef", ERR, arangodb::Logger::DATAFILES) << "cannot write marker, datafile is read-only"; return TRI_ERROR_ARANGO_READ_ONLY; } return TRI_ERROR_ARANGO_ILLEGAL_STATE; } TRI_ASSERT(position != nullptr); // out of bounds check for writing into a datafile if (position == nullptr || position < (void*)_data || position >= (void*)(_data + maximalSize())) { LOG_TOPIC("4e443", ERR, arangodb::Logger::DATAFILES) << "logic error. writing out of bounds of datafile '" << getName() << "'"; return TRI_ERROR_ARANGO_ILLEGAL_STATE; } memcpy(position, marker, static_cast(marker->getSize())); TRI_IF_FAILURE("BreakHeaderMarker") { #ifdef ARANGODB_ENABLE_FAILURE_TESTS if (marker->getType() == TRI_DF_MARKER_HEADER && getName().find("logfile-") == std::string::npos) { // intentionally corrupt the marker reinterpret_cast(position)->breakIt(); } #endif } return TRI_ERROR_NO_ERROR; } //////////////////////////////////////////////////////////////////////////////// /// @brief update tick values for a datafile //////////////////////////////////////////////////////////////////////////////// void TRI_UpdateTicksDatafile(MMFilesDatafile* datafile, MMFilesMarker const* marker) { MMFilesMarkerType const type = marker->getType(); if (type != TRI_DF_MARKER_HEADER && type != TRI_DF_MARKER_FOOTER && type != TRI_DF_MARKER_COL_HEADER) { // every marker but headers / footers counts TRI_voc_tick_t tick = marker->getTick(); if (datafile->_tickMin == 0) { datafile->_tickMin = tick; } if (datafile->_tickMax < tick) { datafile->_tickMax = tick; } if (datafile->_dataMin == 0) { datafile->_dataMin = tick; } if (datafile->_dataMax < tick) { datafile->_dataMax = tick; } } } //////////////////////////////////////////////////////////////////////////////// /// @brief checksums and writes a marker to the datafile //////////////////////////////////////////////////////////////////////////////// int MMFilesDatafile::writeCrcElement(void* position, MMFilesMarker* marker) { TRI_ASSERT(marker->getTick() != 0); if (isPhysical()) { TRI_voc_crc_t crc = TRI_InitialCrc32(); crc = TRI_BlockCrc32(crc, (char const*)marker, marker->getSize()); marker->setCrc(TRI_FinalCrc32(crc)); } return writeElement(position, marker); } //////////////////////////////////////////////////////////////////////////////// /// @brief iterates over a datafile /// also may set datafile's min/max tick values /// deprecated //////////////////////////////////////////////////////////////////////////////// bool TRI_IterateDatafile(MMFilesDatafile* datafile, bool (*iterator)(MMFilesMarker const*, void*, MMFilesDatafile*), void* data) { TRI_ASSERT(iterator != nullptr); LOG_TOPIC("8ccdf", DEBUG, arangodb::Logger::DATAFILES) << "iterating over datafile '" << datafile->getName() << "', fid: " << datafile->fid() << ", size: " << datafile->currentSize(); char const* ptr = datafile->data(); char const* end = ptr + datafile->currentSize(); if (datafile->state() != TRI_DF_STATE_READ && datafile->state() != TRI_DF_STATE_WRITE) { TRI_set_errno(TRI_ERROR_ARANGO_ILLEGAL_STATE); return false; } TRI_voc_tick_t maxTick = 0; TRI_DEFER(TRI_UpdateTickServer(maxTick)); while (ptr < end) { auto const* marker = reinterpret_cast(ptr); if (marker->getSize() == 0) { return true; } TRI_voc_tick_t tick = marker->getTick(); if (tick > maxTick) { maxTick = tick; } // update the tick statistics TRI_UpdateTicksDatafile(datafile, marker); if (!iterator(marker, data, datafile)) { return false; } ptr += MMFilesDatafileHelper::AlignedMarkerSize(marker); } return true; } Result TRI_IterateDatafile(MMFilesDatafile* datafile, Result (*iterator)(MMFilesMarker const*, void*, MMFilesDatafile*), void* data) { TRI_ASSERT(iterator != nullptr); LOG_TOPIC("32e89", DEBUG, arangodb::Logger::DATAFILES) << "iterating over datafile '" << datafile->getName() << "', fid: " << datafile->fid() << ", size: " << datafile->currentSize(); char const* ptr = datafile->data(); char const* end = ptr + datafile->currentSize(); if (datafile->state() != TRI_DF_STATE_READ && datafile->state() != TRI_DF_STATE_WRITE) { return TRI_ERROR_ARANGO_ILLEGAL_STATE; } TRI_voc_tick_t maxTick = 0; TRI_DEFER(TRI_UpdateTickServer(maxTick)); while (ptr < end) { auto const* marker = reinterpret_cast(ptr); if (marker->getSize() == 0) { return Result(); } TRI_voc_tick_t tick = marker->getTick(); if (tick > maxTick) { maxTick = tick; } // update the tick statistics TRI_UpdateTicksDatafile(datafile, marker); Result res = iterator(marker, data, datafile); if (res.fail()) { return res; } ptr += MMFilesDatafileHelper::AlignedMarkerSize(marker); } return Result(); } /// @brief iterates over a datafile /// also may set datafile's min/max tick values bool TRI_IterateDatafile(MMFilesDatafile* datafile, std::function const& cb) { LOG_TOPIC("dab1d", DEBUG, arangodb::Logger::DATAFILES) << "iterating over datafile '" << datafile->getName() << "', fid: " << datafile->fid() << ", size: " << datafile->currentSize(); char const* ptr = datafile->data(); char const* end = ptr + datafile->currentSize(); if (datafile->state() != TRI_DF_STATE_READ && datafile->state() != TRI_DF_STATE_WRITE) { TRI_set_errno(TRI_ERROR_ARANGO_ILLEGAL_STATE); return false; } TRI_voc_tick_t maxTick = 0; TRI_DEFER(TRI_UpdateTickServer(maxTick)); while (ptr < end) { auto const* marker = reinterpret_cast(ptr); if (marker->getSize() == 0) { return true; } TRI_voc_tick_t tick = marker->getTick(); if (tick > maxTick) { maxTick = tick; } // update the tick statistics TRI_UpdateTicksDatafile(datafile, marker); if (!cb(marker, datafile)) { return false; } ptr += MMFilesDatafileHelper::AlignedMarkerSize(marker); } return true; } /// @brief renames a datafile int MMFilesDatafile::rename(std::string const& filename) { // this function must not be called for non-physical datafiles TRI_ASSERT(isPhysical()); TRI_ASSERT(!filename.empty()); if (TRI_ExistsFile(filename.c_str())) { LOG_TOPIC("a87f5", ERR, arangodb::Logger::DATAFILES) << "cannot overwrite datafile '" << filename << "'"; _lastError = TRI_ERROR_ARANGO_DATAFILE_ALREADY_EXISTS; return TRI_ERROR_ARANGO_DATAFILE_ALREADY_EXISTS; } int res = TRI_RenameFile(_filename.c_str(), filename.c_str()); if (res != TRI_ERROR_NO_ERROR) { _state = TRI_DF_STATE_RENAME_ERROR; _lastError = TRI_ERROR_SYS_ERROR; return res; } _filename = filename; return TRI_ERROR_NO_ERROR; } /// @brief seals a datafile, writes a footer, sets it to read-only int MMFilesDatafile::seal() { if (_state == TRI_DF_STATE_READ) { return TRI_ERROR_ARANGO_READ_ONLY; } if (_state != TRI_DF_STATE_WRITE) { return TRI_ERROR_ARANGO_ILLEGAL_STATE; } if (_isSealed) { return TRI_ERROR_ARANGO_DATAFILE_SEALED; } // set a proper tick value if (_tickMax == 0) { _tickMax = TRI_NewTickServer(); } // create the footer MMFilesDatafileFooterMarker footer = MMFilesDatafileHelper::CreateFooterMarker(_tickMax); // reserve space and write footer to file _footerSize = 0; MMFilesMarker* position; int res = reserveElement(footer.base.getSize(), &position, 0); if (res == TRI_ERROR_NO_ERROR) { TRI_ASSERT(position != nullptr); res = writeCrcElement(position, &footer.base); } if (res != TRI_ERROR_NO_ERROR) { return res; } // sync file TRI_ASSERT(_data != nullptr); TRI_ASSERT(_written != nullptr); res = this->sync(_synced, reinterpret_cast(_data) + _currentSize); if (res != TRI_ERROR_NO_ERROR) { LOG_TOPIC("14ddd", ERR, arangodb::Logger::DATAFILES) << "msync failed with: " << TRI_errno_string(res); } // intentionally ignore return value of protection here because this call // would only restrict further file accesses (which is not required // for ArangoDB to work) readOnly(); if (res != TRI_ERROR_NO_ERROR) { return _lastError; } // seal datafile _isSealed = true; _state = TRI_DF_STATE_READ; // note: _initSize must remain constant // TRI_ASSERT(_initSize == _maximalSize); _maximalSize = _currentSize; if (isPhysical()) { // From now on we predict random access (until collection or compaction): randomAccess(); } return TRI_ERROR_NO_ERROR; } /// @brief truncates a datafile and seals it /// this is called from the recovery procedure only int MMFilesDatafile::truncate(std::string const& path, uint32_t position) { // this function must not be called for non-physical datafiles TRI_ASSERT(!path.empty()); std::unique_ptr datafile(MMFilesDatafile::openHelper(path, true)); if (datafile == nullptr) { return TRI_ERROR_ARANGO_DATAFILE_UNREADABLE; } return datafile->truncateAndSeal(position); } /// @brief try to repair a datafile bool MMFilesDatafile::tryRepair(std::string const& path) { // this function must not be called for non-physical datafiles TRI_ASSERT(!path.empty()); std::unique_ptr datafile(MMFilesDatafile::openHelper(path, true)); if (datafile == nullptr) { return false; } // set to read/write access if (!datafile->readWrite()) { LOG_TOPIC("afcfd", ERR, arangodb::Logger::DATAFILES) << "unable to change file protection for datafile '" << datafile->getName() << "'"; return false; } return datafile->tryRepair(); } //////////////////////////////////////////////////////////////////////////////// /// @brief diagnoses a marker //////////////////////////////////////////////////////////////////////////////// static std::string DiagnoseMarker(MMFilesMarker const* marker, char const* end) { std::ostringstream result; if (marker == nullptr) { return "marker is undefined. should not happen"; } // check marker type MMFilesMarkerType type = marker->getType(); if (type <= TRI_DF_MARKER_MIN) { // marker type is less than minimum allowed type value result << "marker type value (" << static_cast(type) << ") is wrong. expecting value higher than " << TRI_DF_MARKER_MIN; return result.str(); } if (type >= TRI_DF_MARKER_MAX) { // marker type is greater than maximum allowed type value result << "marker type value (" << static_cast(type) << ") is wrong. expecting value less than " << TRI_DF_MARKER_MAX; return result.str(); } uint32_t size = marker->getSize(); if (size >= MMFilesDatafileHelper::MaximalMarkerSize()) { // a single marker bigger than this size seems unreasonable // note: this is an arbitrary limit result << "marker size value (" << size << ") is wrong. expecting value less than " << MMFilesDatafileHelper::MaximalMarkerSize(); return result.str(); } if (size < sizeof(MMFilesMarker)) { result << "marker size is too small (" << size << "). expecting at least " << sizeof(MMFilesMarker) << " bytes"; return result.str(); } if (reinterpret_cast(marker) + size > end) { return "marker size is beyond end of datafile"; } TRI_voc_crc_t crc = CalculateCrcValue(marker); if (marker->getCrc() == crc) { return "crc checksum is correct"; } result << "crc checksum (hex " << std::hex << marker->getCrc() << ") is wrong. expecting (hex " << std::hex << crc << ")"; return result.str(); } MMFilesDatafile::MMFilesDatafile(std::string const& filename, int fd, void* mmHandle, uint32_t maximalSize, uint32_t currentSize, TRI_voc_fid_t fid, char* data) : _filename(filename), _fid(fid), _state(TRI_DF_STATE_READ), _fd(fd), _mmHandle(mmHandle), _initSize(maximalSize), _maximalSize(maximalSize), _currentSize(currentSize), _footerSize(sizeof(MMFilesDatafileFooterMarker)), _full(false), _isSealed(false), _lockedInMemory(false), _data(data), _next(data + currentSize), _tickMin(0), _tickMax(0), _dataMin(0), _dataMax(0), _lastError(TRI_ERROR_NO_ERROR), _synced(data), _written(nullptr) { // filename is a string for physical datafiles, and NULL for anonymous regions // fd is a positive value for physical datafiles, and -1 for anonymous regions if (filename.empty()) { TRI_ASSERT(fd == -1); } else { TRI_ASSERT(fd >= 0); // Advise OS that sequential access is going to happen: sequentialAccess(); } dontDump(); } MMFilesDatafile::~MMFilesDatafile() { try { this->close(); } catch (...) { // silently continue as this is the destructor } } /// @brief return the name of a datafile std::string MMFilesDatafile::getName() const { if (_filename.empty()) { // anonymous regions do not have a filename return "anonymous region"; } // return name of the physical file return _filename; } /// @brief close a datafile int MMFilesDatafile::close() { if (_state == TRI_DF_STATE_READ || _state == TRI_DF_STATE_WRITE || _state == TRI_DF_STATE_OPEN_ERROR) { int res = TRI_UNMMFile(_data, _initSize, _fd, &_mmHandle); if (res != TRI_ERROR_NO_ERROR) { // leave file open here as it will still be memory-mapped LOG_TOPIC("c3c4b", ERR, arangodb::Logger::DATAFILES) << "munmap failed with: " << res; _state = TRI_DF_STATE_WRITE_ERROR; _lastError = res; return res; } if (isPhysical()) { TRI_ASSERT(_fd >= 0); int res = TRI_CLOSE(_fd); if (res != TRI_ERROR_NO_ERROR) { LOG_TOPIC("38043", ERR, arangodb::Logger::DATAFILES) << "unable to close datafile '" << getName() << "': " << res; } } _state = TRI_DF_STATE_CLOSED; _data = nullptr; _next = nullptr; _fd = -1; return TRI_ERROR_NO_ERROR; } if (_state == TRI_DF_STATE_CLOSED) { TRI_ASSERT(_fd == -1); LOG_TOPIC("846fc", TRACE, arangodb::Logger::DATAFILES) << "closing an already closed datafile '" << getName() << "'"; return TRI_ERROR_NO_ERROR; } LOG_TOPIC("d209b", ERR, arangodb::Logger::DATAFILES) << "attempting to close datafile with an invalid state"; return TRI_ERROR_ARANGO_ILLEGAL_STATE; } int MMFilesDatafile::sync() { if (_synced < _written) { return sync(_synced, _written); } return TRI_ERROR_NO_ERROR; } /// @brief sync the data of a datafile int MMFilesDatafile::sync(char const* begin, char const* end) { if (!isPhysical()) { // we only need to care about physical datafiles // anonymous regions do not need to be synced return TRI_ERROR_NO_ERROR; } TRI_ASSERT(_fd >= 0); if (begin == end) { // no need to sync return TRI_ERROR_NO_ERROR; } int res = TRI_MSync(_fd, begin, end); if (res != TRI_ERROR_NO_ERROR) { _state = TRI_DF_STATE_WRITE_ERROR; _lastError = res; LOG_TOPIC("2ebe8", ERR, Logger::COLLECTOR) << "msync failed with: " << TRI_errno_string(res); } else { _synced = const_cast(end); } return res; } /// @brief truncates a datafile /// Create a truncated datafile, seal it and rename the old. int MMFilesDatafile::truncateAndSeal(uint32_t position) { TRI_ERRORBUF; void* data; void* mmHandle; // this function must not be called for non-physical datafiles TRI_ASSERT(isPhysical()); size_t pageSize = PageSizeFeature::getPageSize(); // use multiples of page-size size_t maximalSize = ((position + sizeof(MMFilesDatafileFooterMarker) + pageSize - 1) / pageSize) * pageSize; // sanity check if (sizeof(MMFilesDatafileHeaderMarker) + sizeof(MMFilesDatafileFooterMarker) > maximalSize) { LOG_TOPIC("73851", ERR, arangodb::Logger::DATAFILES) << "cannot create datafile '" << getName() << "', maximal size " << maximalSize << " is too small"; return TRI_ERROR_ARANGO_MAXIMAL_SIZE_TOO_SMALL; } // open the file std::string filename = getName() + ".new"; int fd = TRI_CREATE(filename.c_str(), O_CREAT | O_EXCL | O_RDWR | TRI_O_CLOEXEC, S_IRUSR | S_IWUSR); if (fd < 0) { TRI_SYSTEM_ERROR(); LOG_TOPIC("e1655", ERR, arangodb::Logger::DATAFILES) << "cannot create new datafile '" << filename << "': " << TRI_GET_ERRORBUF; return TRI_set_errno(TRI_ERROR_SYS_ERROR); } // go back to the beginning of the file TRI_lseek_t offset = TRI_LSEEK(fd, (TRI_lseek_t)(maximalSize - 1), SEEK_SET); if (offset == (TRI_lseek_t)-1) { TRI_SYSTEM_ERROR(); TRI_set_errno(TRI_ERROR_SYS_ERROR); TRI_CLOSE(fd); // remove empty file TRI_UnlinkFile(filename.c_str()); LOG_TOPIC("c7e78", ERR, arangodb::Logger::DATAFILES) << "cannot seek in new datafile '" << filename << "': " << TRI_GET_ERRORBUF; return TRI_ERROR_SYS_ERROR; } char zero = 0; long written = TRI_WRITE(fd, &zero, 1); if (written < 0) { TRI_SYSTEM_ERROR(); TRI_set_errno(TRI_ERROR_SYS_ERROR); TRI_CLOSE(fd); // remove empty file TRI_UnlinkFile(filename.c_str()); LOG_TOPIC("50f59", ERR, arangodb::Logger::DATAFILES) << "cannot create datafile '" << filename << "': " << TRI_GET_ERRORBUF; return TRI_ERROR_SYS_ERROR; } // memory map the data int res = TRI_MMFile(nullptr, maximalSize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, &mmHandle, 0, &data); if (res != TRI_ERROR_NO_ERROR) { TRI_SYSTEM_ERROR(); TRI_set_errno(res); TRI_CLOSE(fd); // remove empty file TRI_UnlinkFile(filename.c_str()); LOG_TOPIC("7c0c1", ERR, arangodb::Logger::DATAFILES) << "cannot memory map file '" << filename << "': " << TRI_GET_ERRORBUF; LOG_TOPIC("c07ea", ERR, arangodb::Logger::DATAFILES) << "The database directory might reside on a shared folder " "(VirtualBox, VMWare) or an NFS " "mounted volume which does not allow memory mapped files."; return TRI_errno(); } // copy the data memcpy(data, _data, position); // patch the datafile structure res = TRI_UNMMFile(_data, _initSize, _fd, &_mmHandle); if (res < 0) { TRI_CLOSE(_fd); LOG_TOPIC("d77be", ERR, arangodb::Logger::DATAFILES) << "munmap failed with: " << res; return res; } // ............................................................................................. // For windows: Mem mapped files use handles // the datafile->_mmHandle handle object has been closed in the underlying // TRI_UNMMFile(...) call above so we do not need to close it for the // associated file below // ............................................................................................. TRI_CLOSE(_fd); _data = static_cast(data); _next = (char*)(data) + position; _currentSize = position; TRI_ASSERT(_initSize == _maximalSize); TRI_ASSERT(maximalSize <= _initSize); _maximalSize = static_cast(maximalSize); _initSize = static_cast(maximalSize); _fd = fd; _mmHandle = mmHandle; _state = TRI_DF_STATE_WRITE; _full = false; _isSealed = false; _synced = static_cast(data); _written = _next; // rename files std::string oldname = _filename + ".corrupted"; res = TRI_RenameFile(_filename.c_str(), oldname.c_str()); if (res != TRI_ERROR_NO_ERROR) { LOG_TOPIC("dcdb0", ERR, Logger::DATAFILES) << "unable to rename file '" << filename << "' to '" << oldname << "': " << TRI_errno_string(res); return res; } res = TRI_RenameFile(filename.c_str(), _filename.c_str()); if (res != TRI_ERROR_NO_ERROR) { LOG_TOPIC("bd82c", ERR, Logger::DATAFILES) << "unable to rename file '" << filename << "' to '" << _filename << "': " << TRI_errno_string(res); return res; } // need to reset the datafile state here to write, otherwise the following // call will return an error _state = TRI_DF_STATE_WRITE; return seal(); } /// @brief checks a datafile bool MMFilesDatafile::check(bool ignoreFailures, bool autoSeal) { // this function must not be called for non-physical datafiles TRI_ASSERT(isPhysical()); LOG_TOPIC("a1829", TRACE, arangodb::Logger::DATAFILES) << "checking markers in datafile '" << getName() << "', autoSeal: " << autoSeal; char const* ptr = _data; char const* end = ptr + _currentSize; char const* lastGood = nullptr; uint32_t currentSize = 0; if (_currentSize == 0) { LOG_TOPIC("5a86d", WARN, arangodb::Logger::DATAFILES) << "current size is 0 in read-only datafile '" << getName() << "', trying to fix"; end = _data + _maximalSize; } TRI_voc_tick_t maxTick = 0; TRI_DEFER(TRI_UpdateTickServer(maxTick)); while (ptr < end) { bool const canRead = (ptr + sizeof(MMFilesMarker) <= end); MMFilesMarker const* marker = reinterpret_cast(ptr); uint32_t const size = marker->getSize(); TRI_voc_tick_t const tick = marker->getTick(); MMFilesMarkerType const type = marker->getType(); if (canRead) { if (size == 0) { LOG_TOPIC("296ef", DEBUG, arangodb::Logger::DATAFILES) << "reached end of datafile '" << getName() << "' data, current size " << currentSize << ", autoSeal: " << autoSeal; _currentSize = currentSize; _next = _data + _currentSize; if (autoSeal) { _isSealed = true; } return true; } } if (size < sizeof(MMFilesMarker)) { if (ignoreFailures) { return fix(currentSize); } _lastError = TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); _currentSize = currentSize; _next = _data + _currentSize; _state = TRI_DF_STATE_OPEN_ERROR; LOG_TOPIC("253f0", WARN, arangodb::Logger::DATAFILES) << "marker in datafile '" << getName() << "' too small, size " << size << ", should be at least " << sizeof(MMFilesMarker); return false; } // prevent reading over the end of the file if (ptr + size > end) { if (ignoreFailures) { return fix(currentSize); } _lastError = TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); _currentSize = currentSize; _next = _data + _currentSize; _state = TRI_DF_STATE_OPEN_ERROR; LOG_TOPIC("b7826", WARN, arangodb::Logger::DATAFILES) << "marker in datafile '" << getName() << "' points with size " << size << " beyond end of file"; if (lastGood != nullptr) { LOG_TOPIC("46e12", INFO, arangodb::Logger::DATAFILES) << "last good marker found at: " << hexValue(static_cast(static_cast(lastGood - _data))); } printMarker(marker, static_cast(end - ptr), _data, end); return false; } // the following sanity check offers some, but not 100% crash-protection // when reading totally corrupted datafiles if (!TRI_IsValidMarkerDatafile(marker)) { if (type == 0 && size < 128) { // ignore markers with type 0 and a small size LOG_TOPIC("b6959", WARN, arangodb::Logger::DATAFILES) << "ignoring suspicious marker in datafile '" << getName() << "': type: " << type << ", size: " << size; } else { if (ignoreFailures) { return fix(currentSize); } _lastError = TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); _currentSize = currentSize; _next = _data + _currentSize; _state = TRI_DF_STATE_OPEN_ERROR; LOG_TOPIC("59440", WARN, arangodb::Logger::DATAFILES) << "marker in datafile '" << getName() << "' is corrupt: type: " << type << ", size: " << size; if (lastGood != nullptr) { LOG_TOPIC("d84cc", INFO, arangodb::Logger::DATAFILES) << "last good marker found at: " << hexValue(static_cast(static_cast(lastGood - _data))); } printMarker(marker, size, _data, end); return false; } } if (type != 0) { bool ok = CheckCrcMarker(marker, end); if (!ok) { // CRC mismatch! bool nextMarkerOk = false; if (size > 0) { auto next = reinterpret_cast(marker) + encoding::alignedSize(size); auto p = next; if (p < end) { // check if the rest of the datafile is only followed by NULL bytes bool isFollowedByNullBytes = true; while (p < end) { if (*p != '\0') { isFollowedByNullBytes = false; break; } ++p; } if (isFollowedByNullBytes) { // only last marker in datafile was corrupt. fix the datafile in // place LOG_TOPIC("c7162", WARN, arangodb::Logger::DATAFILES) << "datafile '" << getName() << "' automatically truncated at last marker"; ignoreFailures = true; } else { // there is some other stuff following. now inspect it... TRI_ASSERT(next <= end); if (next < end) { // there is a next marker auto nextMarker = reinterpret_cast(next); if (nextMarker->getType() != 0 && nextMarker->getSize() >= sizeof(MMFilesMarker) && next + nextMarker->getSize() <= end && TRI_IsValidMarkerDatafile(nextMarker) && CheckCrcMarker(nextMarker, end)) { // next marker looks good. nextMarkerOk = true; } } else { // EOF nextMarkerOk = true; } } } } if (!ignoreFailures) { _lastError = TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); _currentSize = currentSize; _next = _data + _currentSize; _state = TRI_DF_STATE_OPEN_ERROR; LOG_TOPIC("3a215", WARN, arangodb::Logger::DATAFILES) << "crc mismatch found in datafile '" << getName() << "' of size " << _maximalSize << ", at position " << currentSize; LOG_TOPIC("23e69", WARN, arangodb::Logger::DATAFILES) << "crc mismatch found inside marker of type '" << TRI_NameMarkerDatafile(marker) << "' and size " << size << ". expected crc: " << CalculateCrcValue(marker) << ", actual crc: " << marker->getCrc(); if (lastGood != nullptr) { LOG_TOPIC("824b4", INFO, arangodb::Logger::DATAFILES) << "last good marker found at: " << hexValue(static_cast(static_cast(lastGood - _data))); } printMarker(marker, size, _data, end); if (nextMarkerOk) { LOG_TOPIC("d6e23", INFO, arangodb::Logger::DATAFILES) << "data directly following this marker looks ok so repairing " "the marker manually may recover it..."; LOG_TOPIC("77894", INFO, arangodb::Logger::DATAFILES) << "to truncate the file at this marker, please restart the " "server with the parameter '--wal.ignore-logfile-errors " "true' if the error happening during WAL recovery, or with " "parameter '--database.ignore-datafile-errors true' if it " "happened after WAL recovery"; } else { LOG_TOPIC("3bd1e", WARN, arangodb::Logger::DATAFILES) << "data directly following this marker cannot be analyzed"; } return false; } // ignore failures... // truncate return fix(currentSize); } } if (tick > maxTick) { maxTick = tick; } size_t alignedSize = MMFilesDatafileHelper::AlignedMarkerSize(marker); currentSize += static_cast(alignedSize); if (marker->getType() == TRI_DF_MARKER_FOOTER) { LOG_TOPIC("eb9ed", DEBUG, arangodb::Logger::DATAFILES) << "found footer, reached end of datafile '" << getName() << "', current size " << currentSize; _isSealed = true; _currentSize = currentSize; _next = _data + _currentSize; return true; } lastGood = ptr; ptr += alignedSize; } return true; } void MMFilesDatafile::printMarker(MMFilesMarker const* marker, uint32_t size, char const* begin, char const* end) { LOG_TOPIC("8dd71", INFO, arangodb::Logger::DATAFILES) << "raw marker data following:"; LOG_TOPIC("0e551", INFO, arangodb::Logger::DATAFILES) << "type: " << TRI_NameMarkerDatafile(marker) << ", size: " << marker->getSize() << ", crc: " << marker->getCrc(); LOG_TOPIC("5548e", INFO, arangodb::Logger::DATAFILES) << "(expected layout: size (4 bytes), crc (4 bytes), type and tick (8 " "bytes), payload following)"; char const* p = reinterpret_cast(marker); char const* e = reinterpret_cast(marker) + encoding::alignedSize(size); if (e + 16 < end) { // add some extra bytes for following data e += 16; } std::string line; std::string raw; size_t printed = 0; while (p < e) { // print offset line.append(hexValue(static_cast(static_cast(p - begin)))); // print data line.append(": "); for (size_t i = 0; i < 16; ++i) { if (i == 8) { // separate groups of 8 bytes line.push_back(' '); raw.push_back(' '); } if (p >= e) { line.append(" "); } else { uint8_t c = static_cast(*p++); uint8_t n1 = c >> 4; uint8_t n2 = c & 0x0F; line.push_back((n1 < 10) ? ('0' + n1) : 'A' + n1 - 10); line.push_back((n2 < 10) ? ('0' + n2) : 'A' + n2 - 10); line.push_back(' '); raw.push_back((c < 32 || c >= 127) ? '.' : static_cast(c)); ++printed; } } LOG_TOPIC("9daf7", INFO, arangodb::Logger::DATAFILES) << line << " " << raw; line.clear(); raw.clear(); if (printed >= 2048) { LOG_TOPIC("f529a", INFO, arangodb::Logger::DATAFILES) << "(output truncated due to excessive length)"; break; } } } /// @brief fixes a corrupted datafile bool MMFilesDatafile::fix(uint32_t currentSize) { LOG_TOPIC("9d2dc", WARN, arangodb::Logger::DATAFILES) << "datafile '" << getName() << "' is corrupted at position " << currentSize; LOG_TOPIC("439c6", WARN, arangodb::Logger::DATAFILES) << "setting datafile '" << getName() << "' to read-only and ignoring all data from this file beyond this " "position"; _currentSize = currentSize; TRI_ASSERT(_initSize == _maximalSize); TRI_ASSERT(currentSize <= _initSize); _maximalSize = static_cast(currentSize); _next = _data + _currentSize; _full = true; _state = TRI_DF_STATE_READ; _isSealed = true; return true; } /// @brief scans a datafile DatafileScan MMFilesDatafile::scanHelper() { // this function must not be called for non-physical datafiles TRI_ASSERT(isPhysical()); char* ptr = _data; char* end = _data + _currentSize; uint32_t currentSize = 0; DatafileScan scan; scan.currentSize = _currentSize; scan.maximalSize = _maximalSize; if (_currentSize == 0) { end = _data + _maximalSize; } while (ptr < end) { MMFilesMarker* marker = reinterpret_cast(ptr); DatafileScanEntry entry; entry.position = static_cast(ptr - _data); entry.size = marker->getSize(); entry.realSize = static_cast(MMFilesDatafileHelper::AlignedMarkerSize(marker)); entry.tick = marker->getTick(); entry.type = marker->getType(); entry.status = 1; entry.typeName = TRI_NameMarkerDatafile(marker); if (marker->getSize() == 0 && marker->getCrc() == 0 && marker->getType() == 0 && marker->getTick() == 0) { entry.status = 2; scan.endPosition = currentSize; scan.entries.emplace_back(entry); return scan; } ++scan.numberMarkers; if (marker->getSize() == 0) { entry.status = 3; scan.status = 2; scan.endPosition = currentSize; scan.entries.emplace_back(entry); return scan; } if (marker->getSize() < sizeof(MMFilesMarker)) { entry.status = 4; entry.diagnosis = DiagnoseMarker(marker, end); scan.endPosition = currentSize; scan.status = 3; scan.entries.emplace_back(entry); return scan; } if (!TRI_IsValidMarkerDatafile(marker)) { entry.status = 4; entry.diagnosis = DiagnoseMarker(marker, end); scan.endPosition = currentSize; scan.status = 3; scan.entries.emplace_back(entry); return scan; } bool ok = CheckCrcMarker(marker, end); if (!ok) { entry.status = 5; entry.diagnosis = DiagnoseMarker(marker, end); scan.status = 4; } MMFilesMarkerType const type = marker->getType(); if (ok) { if (type == TRI_DF_MARKER_VPACK_DOCUMENT || type == TRI_DF_MARKER_VPACK_REMOVE) { VPackSlice const slice(reinterpret_cast(marker) + MMFilesDatafileHelper::VPackOffset(type)); TRI_ASSERT(slice.isObject()); try { entry.key = slice.get(StaticStrings::KeyString).copyString(); } catch (...) { entry.key = "(unable to read value of _key)"; } } } scan.entries.emplace_back(entry); size_t size = MMFilesDatafileHelper::AlignedMarkerSize(marker); currentSize += static_cast(size); if (marker->getType() == TRI_DF_MARKER_FOOTER) { scan.endPosition = currentSize; scan.isSealed = true; return scan; } ptr += size; } return scan; } /// @brief create the initial datafile header marker int MMFilesDatafile::writeInitialHeaderMarker(TRI_voc_fid_t fid, uint32_t maximalSize) { // create the header MMFilesDatafileHeaderMarker header = MMFilesDatafileHelper::CreateHeaderMarker(maximalSize, static_cast(fid)); // reserve space and write header to file MMFilesMarker* position; int res = reserveElement(header.base.getSize(), &position, 0); if (res == TRI_ERROR_NO_ERROR) { TRI_ASSERT(position != nullptr); res = writeCrcElement(position, &header.base); } return res; } /// @brief tries to repair a datafile bool MMFilesDatafile::tryRepair() { // this function must not be called for non-physical datafiles TRI_ASSERT(isPhysical()); char* ptr = _data; char* end = _data + _currentSize; if (_currentSize == 0) { end = _data + _maximalSize; } uint32_t currentSize = 0; while (ptr < end) { if (ptr + sizeof(MMFilesMarker) > end) { // remaining data chunk is too small return false; } MMFilesMarker* marker = reinterpret_cast(ptr); uint32_t const size = marker->getSize(); if (size == 0) { // reached end return true; } if (size < sizeof(MMFilesMarker) || ptr + size > end) { // marker too small or too big return false; } if (!TRI_IsValidMarkerDatafile(marker)) { // unknown marker type return false; } if (marker->getType() != 0) { if (!CheckCrcMarker(marker, end)) { // CRC mismatch! auto next = reinterpret_cast(marker) + size; auto p = next; if (p < end) { // check if the rest of the datafile is only followed by NULL bytes bool isFollowedByNullBytes = true; while (p < end) { if (*p != '\0') { isFollowedByNullBytes = false; break; } ++p; } if (isFollowedByNullBytes) { // only last marker in datafile was corrupt. fix the datafile in // place LOG_TOPIC("faf7c", INFO, arangodb::Logger::DATAFILES) << "truncating datafile '" << getName() << "' at position " << currentSize; int res = truncateAndSeal(currentSize); return (res == TRI_ERROR_NO_ERROR); } // there is some other stuff following. now inspect it... TRI_ASSERT(next <= end); if (next < end) { // there is a next marker auto nextMarker = reinterpret_cast(next); if (nextMarker->getType() != 0 && nextMarker->getSize() >= sizeof(MMFilesMarker) && next + nextMarker->getSize() <= end && TRI_IsValidMarkerDatafile(nextMarker) && CheckCrcMarker(nextMarker, end)) { // next marker looks good. // create a temporary buffer auto buffer = std::unique_ptr(new char[size]); // create a new marker in the temporary buffer auto temp = reinterpret_cast(buffer.get()); MMFilesDatafileHelper::InitMarker(reinterpret_cast( buffer.get()), TRI_DF_MARKER_BLANK, static_cast(size)); temp->setCrc(CalculateCrcValue(temp)); // all done. now copy back the marker into the file memcpy(static_cast(ptr), buffer.get(), static_cast(size)); buffer.reset(); // don't need the buffer anymore int res = this->sync(ptr, (ptr + size)); if (res == TRI_ERROR_NO_ERROR) { LOG_TOPIC("0ce37", INFO, arangodb::Logger::DATAFILES) << "zeroed single invalid marker in datafile '" << getName() << "' at position " << currentSize; } else { LOG_TOPIC("d345b", ERR, arangodb::Logger::DATAFILES) << "could not zero single invalid marker in datafile '" << getName() << "' at position " << currentSize; return false; } } else { // next marker looks broken, too. int res = truncateAndSeal(currentSize); return (res == TRI_ERROR_NO_ERROR); } } } } } size_t alignedSize = MMFilesDatafileHelper::AlignedMarkerSize(marker); currentSize += static_cast(alignedSize); if (marker->getType() == TRI_DF_MARKER_FOOTER) { return true; } ptr += alignedSize; } return true; } /// @brief opens an existing datafile /// The datafile will be opened read-only if a footer is found MMFilesDatafile* MMFilesDatafile::open(std::string const& filename, bool ignoreFailures, bool autoSeal) { // this function must not be called for non-physical datafiles TRI_ASSERT(!filename.empty()); std::unique_ptr datafile(MMFilesDatafile::openHelper(filename, ignoreFailures)); if (datafile == nullptr) { return nullptr; } // check the datafile by scanning markers bool ok = datafile->check(ignoreFailures, autoSeal); if (!ok) { TRI_UNMMFile(const_cast(datafile->data()), datafile->initSize(), datafile->fd(), &datafile->_mmHandle); TRI_CLOSE(datafile->fd()); LOG_TOPIC("5bc05", ERR, arangodb::Logger::DATAFILES) << "datafile '" << datafile->getName() << "' is corrupt"; // must free datafile here return nullptr; } // change to read-write if no footer has been found if (!datafile->_isSealed) { if (!datafile->readWrite()) { LOG_TOPIC("47b43", ERR, arangodb::Logger::DATAFILES) << "unable to change memory protection for memory backed by datafile " "'" << datafile->getName() << "'. please check file permissions and mount options."; return nullptr; } datafile->setState(TRI_DF_STATE_WRITE); } // Advise on sequential use: datafile->sequentialAccess(); datafile->willNeed(); return datafile.release(); } /// @brief opens a datafile MMFilesDatafile* MMFilesDatafile::openHelper(std::string const& filename, bool ignoreErrors) { TRI_ERRORBUF; // this function must not be called for non-physical datafiles TRI_ASSERT(!filename.empty()); TRI_voc_fid_t fid = GetNumericFilenamePart(filename.c_str()); // attempt to open a datafile file int fd = TRI_OPEN(filename.c_str(), O_RDWR | TRI_O_CLOEXEC); if (fd < 0) { TRI_SYSTEM_ERROR(); TRI_set_errno(TRI_ERROR_SYS_ERROR); LOG_TOPIC("73fe5", ERR, arangodb::Logger::DATAFILES) << "cannot open datafile '" << filename << "': '" << TRI_GET_ERRORBUF << "'"; return nullptr; } // compute the size of the file TRI_stat_t status; int res = TRI_FSTAT(fd, &status); if (res < 0) { TRI_SYSTEM_ERROR(); TRI_set_errno(TRI_ERROR_SYS_ERROR); TRI_CLOSE(fd); LOG_TOPIC("c9845", ERR, arangodb::Logger::DATAFILES) << "cannot get status of datafile '" << filename << "': " << TRI_GET_ERRORBUF; return nullptr; } // check that file is not too small uint32_t size = static_cast(status.st_size); if (size < sizeof(MMFilesDatafileHeaderMarker) + sizeof(MMFilesDatafileFooterMarker)) { TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); TRI_CLOSE(fd); LOG_TOPIC("79cef", ERR, arangodb::Logger::DATAFILES) << "datafile '" << filename << "' is corrupt, size is only " << size; return nullptr; } // read header from file char buffer[128]; memset(&buffer[0], 0, sizeof(buffer)); size_t const len = sizeof(MMFilesDatafileHeaderMarker); ssize_t toRead = sizeof(buffer); if (toRead > static_cast(status.st_size)) { toRead = static_cast(status.st_size); } bool ok = TRI_ReadPointer(fd, &buffer[0], toRead); if (!ok) { LOG_TOPIC("64ff5", ERR, arangodb::Logger::DATAFILES) << "cannot read datafile header from '" << filename << "': " << TRI_last_error(); TRI_CLOSE(fd); return nullptr; } char const* ptr = reinterpret_cast(&buffer[0]); char const* end = static_cast(ptr) + len; MMFilesDatafileHeaderMarker const* header = reinterpret_cast(&buffer[0]); // check CRC ok = CheckCrcMarker(reinterpret_cast(ptr), end); if (!ok) { if (IsMarker28(ptr, len)) { TRI_CLOSE(fd); LOG_TOPIC("bbb3c", ERR, arangodb::Logger::DATAFILES) << "datafile found from older version of ArangoDB. " << "Please dump data from that version with arangodump " << "and reload it into this ArangoDB instance with arangorestore"; FATAL_ERROR_EXIT(); } TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); LOG_TOPIC("867dd", ERR, arangodb::Logger::DATAFILES) << "corrupted datafile header read from '" << filename << "'"; printMarker(reinterpret_cast(ptr), len, &buffer[0], end); if (!ignoreErrors) { TRI_CLOSE(fd); return nullptr; } } // check the datafile version if (ok) { if (header->_version != TRI_DF_VERSION) { TRI_set_errno(TRI_ERROR_ARANGO_CORRUPTED_DATAFILE); LOG_TOPIC("91d98", ERR, arangodb::Logger::DATAFILES) << "unknown datafile version '" << header->_version << "' in datafile '" << filename << "'"; if (!ignoreErrors) { TRI_CLOSE(fd); return nullptr; } } } // check the maximal size if (size > header->_maximalSize) { LOG_TOPIC("942e7", DEBUG, arangodb::Logger::DATAFILES) << "datafile '" << filename << "' has size " << size << ", but maximal size is " << header->_maximalSize; } // map datafile into memory void* data; void* mmHandle; res = TRI_MMFile(nullptr, size, PROT_READ, MAP_SHARED, fd, &mmHandle, 0, &data); if (res != TRI_ERROR_NO_ERROR) { TRI_set_errno(res); TRI_CLOSE(fd); LOG_TOPIC("70318", ERR, arangodb::Logger::DATAFILES) << "cannot memory map datafile '" << filename << "': " << TRI_errno_string(res); LOG_TOPIC("9909b", ERR, arangodb::Logger::DATAFILES) << "The database directory might reside on a shared folder " "(VirtualBox, VMWare) or an NFS-mounted volume which does not allow " "memory mapped files."; return nullptr; } // create datafile structure try { return new MMFilesDatafile(filename, fd, mmHandle, size, size, fid, static_cast(data)); } catch (...) { TRI_UNMMFile(data, size, fd, &mmHandle); TRI_CLOSE(fd); return nullptr; } } /// @brief returns information about the datafile DatafileScan MMFilesDatafile::scan(std::string const& path) { // this function must not be called for non-physical datafiles TRI_ASSERT(!path.empty()); std::unique_ptr datafile(MMFilesDatafile::openHelper(path, true)); if (datafile != nullptr) { return datafile->scanHelper(); } DatafileScan scan; scan.currentSize = 0; scan.maximalSize = 0; scan.endPosition = 0; scan.numberMarkers = 0; scan.status = 5; scan.isSealed = false; return scan; }