////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Matthew Von-Maszewski
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// based upon leveldb/util/throttle.cc
// Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBThrottle.h"

#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#endif

#include "Basics/ConditionLocker.h"
#include "Basics/MutexLocker.h"
#include "Logger/Logger.h"

namespace arangodb {

////////////////////////////////////////////////////////////////////////////////
/// AdjustThreadPriority() below uses the Linux setpriority() function to
/// dynamically lower and raise a given thread's scheduling priority.  The Linux
/// default is to only allow a thread to lower its priority, not to raise it,
/// even if the raise would merely restore a previous priority.
///
/// Servers with 4 cores or fewer REALLY need the full benefit of
/// AdjustThreadPriority().
///
/// To get the full performance benefit of this code, the server needs three
/// settings:
///
///  1. /etc/pam.d/login must contain the line "auth required pam_cap.so"
///  2. /etc/security/capability.conf must contain "cap_sys_nice arangodb"
///  3. root must execute the command "setcap cap_sys_nice+ie arangod" on
///     the arangod binary executable
///
/// The above settings allow the code to vary the threads across 3 priorities
/// based upon the current compaction's level.  Without the settings, threads
/// eventually lock into only 2 different priorities (which is still far better
/// than having everything at the same priority).
///
/// Setting 3 above must be re-applied to the arangod binary after every build
/// or installation.
///
/// The code does not (yet) support Windows.
///
/// An illustrative sketch of the setpriority() call pattern follows the
/// declarations below.
////////////////////////////////////////////////////////////////////////////////

// code will dynamically change a thread's priority based upon the compaction's level:
//   base +1 : flush mem buffer to level 0
//   base +2 : level 0 compaction to level 1
//   base +3 : all other compactions
struct sPriorityInfo {
  bool _baseSet;
  int _basePriority;
  int _currentPriority;
};

thread_local sPriorityInfo gThreadPriority = {false, 0, 0};
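
// The sketch below is illustrative only and is not the actual
// AdjustThreadPriority() implementation: it shows the setpriority()/gettid()
// call pattern that the comment block above describes, assuming Linux.  The
// helper name, the error handling, and the adjustment policy shown here are
// hypothetical.
//
//   #include <sys/resource.h>   // getpriority(), setpriority()
//   #include <sys/syscall.h>    // SYS_gettid
//   #include <unistd.h>         // syscall()
//
//   static void ExampleAdjustPriority(int adjustment) {
//     // Linux thread id: PRIO_PROCESS with a tid addresses a single thread
//     pid_t tid = static_cast<pid_t>(syscall(SYS_gettid));
//
//     if (!gThreadPriority._baseSet) {
//       // getpriority() can legitimately return -1, so check errno instead
//       errno = 0;
//       int base = getpriority(PRIO_PROCESS, tid);
//       if (0 == errno) {
//         gThreadPriority = {true, base, base};
//       }
//     }
//
//     if (gThreadPriority._baseSet &&
//         gThreadPriority._currentPriority != gThreadPriority._basePriority + adjustment) {
//       // lowering always works; raising back toward the base priority needs
//       // CAP_SYS_NICE, i.e. settings 1-3 from the comment block above
//       setpriority(PRIO_PROCESS, tid, gThreadPriority._basePriority + adjustment);
//       gThreadPriority._currentPriority = gThreadPriority._basePriority + adjustment;
//     }
//   }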
// rocksdb flushes and compactions start and stop within same thread, no overlapping
//  (OSX 10.12 requires a static initializer for thread_local ... time_point on mac
//   does not have one in clang 9.0.0)
thread_local uint8_t gFlushStart[sizeof(std::chrono::steady_clock::time_point)];

//
// Setup the object, clearing variables, but do no real work
//
RocksDBThrottle::RocksDBThrottle()
    : _internalRocksDB(nullptr),
      _threadRunning(false),
      _replaceIdx(2),
      _throttleBps(0),
      _firstThrottle(true) {
  memset(&_throttleData, 0, sizeof(_throttleData));
}

//
// Shutdown the background thread only if it was ever started
//
RocksDBThrottle::~RocksDBThrottle() {
  StopThread();
}

//
// Shutdown the background thread only if it was ever started
//
void RocksDBThrottle::StopThread() {
  if (_threadRunning.load()) {
    {
      CONDITION_LOCKER(guard, _threadCondvar);
      _threadRunning.store(false);
      _threadCondvar.signal();
    }  // lock

    _threadFuture.wait();

    {
      CONDITION_LOCKER(guard, _threadCondvar);

      _internalRocksDB = nullptr;
      _delayToken.reset();
    }  // lock
  }  // if
}  // RocksDBThrottle::StopThread

///
/// @brief rocksdb does not track flush time in its statistics.  Save start time
///  in a thread specific storage
///
void RocksDBThrottle::OnFlushBegin(rocksdb::DB* db,
                                   const rocksdb::FlushJobInfo& flush_job_info) {
  // save start time in thread local storage
  std::chrono::steady_clock::time_point osx_hack = std::chrono::steady_clock::now();
  memcpy(gFlushStart, &osx_hack, sizeof(std::chrono::steady_clock::time_point));
  AdjustThreadPriority(1);

  return;
}  // RocksDBThrottle::OnFlushBegin

void RocksDBThrottle::OnFlushCompleted(rocksdb::DB* db,
                                       const rocksdb::FlushJobInfo& flush_job_info) {
  std::chrono::microseconds flush_time;
  uint64_t flush_size;

  std::chrono::steady_clock::time_point osx_hack;
  memcpy(&osx_hack, gFlushStart, sizeof(std::chrono::steady_clock::time_point));

  flush_time = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now() - osx_hack);
  flush_size = flush_job_info.table_properties.data_size +
               flush_job_info.table_properties.index_size +
               flush_job_info.table_properties.filter_size;

  SetThrottleWriteRate(flush_time, flush_job_info.table_properties.num_entries,
                       flush_size, true);

  // start throttle after first data is posted
  //  (have seen some odd zero and small size flushes early)
  //  (64<<20) is default size for write_buffer_size in column family options,
  //  too hard to read from here
  if ((64<<19) < flush_size) {

      if (((WriteController&)_internalRocksDB->write_controller()).max_delayed_write_rate() < _throttleBps) {
        ((WriteController&)_internalRocksDB->write_controller()).set_max_delayed_write_rate(_throttleBps);
      }  // if

      // Only replace the token when absolutely necessary.  GetDelayToken()
      //  also resets internal timers which can result in long pauses if
      //  flushes/compactions are happening often.
      if (nullptr == _delayToken.get()) {
        _delayToken = ((WriteController&)_internalRocksDB->write_controller()).GetDelayToken(_throttleBps);
        LOG_TOPIC(DEBUG, arangodb::Logger::ENGINES)
            << "SetThrottle(): GetDelayToken(" << _throttleBps << ")";
      } else {
        LOG_TOPIC(DEBUG, arangodb::Logger::ENGINES)
            << "SetThrottle(): set_delayed_write_rate(" << _throttleBps << ")";
        ((WriteController&)_internalRocksDB->write_controller()).set_delayed_write_rate(_throttleBps);
      }  // else
    } else {
      _delayToken.reset();
      LOG_TOPIC(DEBUG, arangodb::Logger::ENGINES) << "SetThrottle(): _delayToken.reset()";
    }  // else
  }  // if
  }  // lock
}  // RocksDBThrottle::SetThrottle

///
/// @brief Use rocksdb's internal statistics to determine if
///  additional slowing of writes is warranted
///
int64_t RocksDBThrottle::ComputeBacklog() {
  int64_t compaction_backlog, imm_backlog, imm_trigger;
  bool ret_flag;
  std::string ret_string, property_name;
  int temp;

  // want count of level 0 files to estimate if compactions "behind"
  //  and therefore likely to start stalling / stopping
  compaction_backlog = 0;
  imm_backlog = 0;
  if (_families.size()) {
    imm_trigger = _internalRocksDB->GetOptions(_families[0]).max_write_buffer_number / 2;
  } else {
    imm_trigger = 3;
  }  // else

  // loop through column families to obtain family specific counts
  for (auto& cf : _families) {
    property_name = rocksdb::DB::Properties::kNumFilesAtLevelPrefix;
    property_name.append("0");
    ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string);
    if (ret_flag) {
      temp = std::stoi(ret_string);
    } else {
      temp = 0;
    }  // else

    if (kL0_SlowdownWritesTrigger <= temp) {
      temp -= (kL0_SlowdownWritesTrigger - 1);
    } else {
      temp = 0;
    }  // else

    compaction_backlog += temp;

    property_name = rocksdb::DB::Properties::kNumImmutableMemTable;
    ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string);
    if (ret_flag) {
      temp = std::stoi(ret_string);
      imm_backlog += temp;
    }  // if
  }  // for

  if (imm_trigger