//////////////////////////////////////////////////////////////////////////////// /// DISCLAIMER /// /// Copyright 2017 ArangoDB GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Matthew Von-Maszewski //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // based upon leveldb/util/throttle.cc // Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved. // // This file is provided to you under the Apache License, // Version 2.0 (the "License"); you may not use this file // except in compliance with the License. You may obtain // a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 //////////////////////////////////////////////////////////////////////////////// #include "RocksDBThrottle.h" #ifndef _WIN32 #include #endif #ifdef TRI_HAVE_UNISTD_H #include #include #endif #include "Basics/ConditionLocker.h" #include "Basics/MutexLocker.h" #include "Logger/Logger.h" namespace arangodb { //////////////////////////////////////////////////////////////////////////////// /// AdjustThreadPriority() below uses the Linux setpriority() function to /// dynamically /// lower and raise a given thread's scheduling priority. The Linux default is /// to only allow a thread to lower its priority, not to raise it. Even if the /// raise would be to a previous priority. /// /// Servers with 4 cores or less REALLY need the full benefit of /// AdjustThreadPriority(). /// /// To get full performance benefit of this code, the server needs three /// settings: /// /// 1. /etc/pam.d/login must contain the line "auth require pam_cap.so" /// 2. /etc/security/capability.conf must contain "cap_sys_nice arangodb" /// 3. root must execute this command "setcap cap_sys_nice+ie arangod" on /// the arangodb binary executable /// /// The above settings allow the code to vary the threads across 3 priorities /// based upon /// the current compaction's level. Without the settings, threads eventual /// lock into only 2 different priorities (which is still far better having /// everything at same priority). /// /// Setting 3 above must be applied to the arangod binary after every build or /// installation. /// /// The code does not (yet) support Windows. //////////////////////////////////////////////////////////////////////////////// // code will dynamically change a thread's priority based upon the compaction's // level: // base +1 : flush mem buffer to level 0 // base +2 : level 0 compaction to level 1 // base +3 : all other compactions struct sPriorityInfo { bool _baseSet; int _basePriority; int _currentPriority; }; thread_local sPriorityInfo gThreadPriority = {false, 0, 0}; // rocksdb flushes and compactions start and stop within same thread, no // overlapping // (OSX 10.12 requires a static initializer for thread_local ... time_point on // mac does not have // one in clang 9.0.0) thread_local uint8_t gFlushStart[sizeof(std::chrono::steady_clock::time_point)]; // // Setup the object, clearing variables, but do no real work // RocksDBThrottle::RocksDBThrottle() : _internalRocksDB(nullptr), _threadRunning(false), _replaceIdx(2), _throttleBps(0), _firstThrottle(true) { memset(&_throttleData, 0, sizeof(_throttleData)); } // // Shutdown the background thread only if it was ever started // RocksDBThrottle::~RocksDBThrottle() { StopThread(); } // // Shutdown the background thread only if it was ever started // void RocksDBThrottle::StopThread() { if (_threadRunning.load()) { { CONDITION_LOCKER(guard, _threadCondvar); _threadRunning.store(false); _threadCondvar.signal(); } // lock _threadFuture.wait(); { CONDITION_LOCKER(guard, _threadCondvar); _internalRocksDB = nullptr; _delayToken.reset(); } // lock } // if } // RocksDBThrottle::StopThread /// /// @brief rocksdb does not track flush time in its statistics. Save start time /// in /// a thread specific storage /// void RocksDBThrottle::OnFlushBegin(rocksdb::DB* db, const rocksdb::FlushJobInfo& flush_job_info) { // save start time in thread local storage std::chrono::steady_clock::time_point osx_hack = std::chrono::steady_clock::now(); memcpy(gFlushStart, &osx_hack, sizeof(std::chrono::steady_clock::time_point)); AdjustThreadPriority(1); return; } // RocksDBThrottle::OnFlushBegin void RocksDBThrottle::OnFlushCompleted(rocksdb::DB* db, const rocksdb::FlushJobInfo& flush_job_info) { std::chrono::microseconds flush_time; uint64_t flush_size; std::chrono::steady_clock::time_point osx_hack; memcpy(&osx_hack, gFlushStart, sizeof(std::chrono::steady_clock::time_point)); flush_time = std::chrono::duration_cast( std::chrono::steady_clock::now() - osx_hack); flush_size = flush_job_info.table_properties.data_size + flush_job_info.table_properties.index_size + flush_job_info.table_properties.filter_size; SetThrottleWriteRate(flush_time, flush_job_info.table_properties.num_entries, flush_size, true); // start throttle after first data is posted // (have seen some odd zero and small size flushes early) // (64<<20) is default size for write_buffer_size in column family options, // too hard to read from here if ((64 << 19) < flush_size) { std::call_once(_initFlag, &RocksDBThrottle::Startup, this, db); } // if } // RocksDBThrottle::OnFlushCompleted void RocksDBThrottle::OnCompactionCompleted(rocksdb::DB* db, const rocksdb::CompactionJobInfo& ci) { std::chrono::microseconds elapsed(ci.stats.elapsed_micros); SetThrottleWriteRate(elapsed, ci.stats.num_output_records, ci.stats.total_output_bytes, false); // rocksdb 5.6 had an API call for when a standard compaction started. 5.14 // has no such thing. // this line fakes "compaction start" by making the wild assumption that the // next level compacting is likely similar to the previous. This is only for // thread priority manipulation, approximate is fine. (and you must have used // "setcap" on the arangod binary for it to even matter, see comments at top) RocksDBThrottle::AdjustThreadPriority((0 == ci.base_input_level) ? 2 : 3); } // RocksDBThrottle::OnCompactionCompleted void RocksDBThrottle::Startup(rocksdb::DB* db) { CONDITION_LOCKER(guard, _threadCondvar); _internalRocksDB = (rocksdb::DBImpl*)db; // addresses race condition during fast start/stop _threadFuture = std::async(std::launch::async, &RocksDBThrottle::ThreadLoop, this); while (!_threadRunning.load()) { _threadCondvar.wait(10000); } // while } // RocksDBThrottle::Startup void RocksDBThrottle::SetThrottleWriteRate(std::chrono::microseconds Micros, uint64_t Keys, uint64_t Bytes, bool IsLevel0) { // lock _threadMutex while we update _throttleData MUTEX_LOCKER(mutexLocker, _threadMutex); unsigned target_idx; // throw out anything smaller than 32Mbytes ... be better if this // was calculated against write_buffer_size, but that varies by column family if ((64 << 19) < Bytes) { // index 0 for level 0 compactions, index 1 for all others target_idx = (IsLevel0 ? 0 : 1); _throttleData[target_idx]._micros += Micros; _throttleData[target_idx]._keys += Keys; _throttleData[target_idx]._bytes += Bytes; _throttleData[target_idx]._compactions += 1; // attempt to override throttle changes by rocksdb ... hammer this often // (note that _threadMutex IS HELD) SetThrottle(); } // if LOG_TOPIC("7afe9", DEBUG, arangodb::Logger::ENGINES) << "SetThrottleWriteRate: Micros " << Micros.count() << ", Keys " << Keys << ", Bytes " << Bytes << ", IsLevel0 " << IsLevel0; return; } // RocksDBThrottle::SetThrottleWriteRate void RocksDBThrottle::ThreadLoop() { _replaceIdx = 2; // addresses race condition during fast start/stop { CONDITION_LOCKER(guard, _threadCondvar); _threadRunning.store(true); _threadCondvar.signal(); } // lock LOG_TOPIC("a4a57", DEBUG, arangodb::Logger::ENGINES) << "ThreadLoop() started"; while (_threadRunning.load()) { // // start actual throttle work // try { RecalculateThrottle(); } catch (...) { LOG_TOPIC("b0a2e", ERR, arangodb::Logger::ENGINES) << "RecalculateThrottle() sent a throw. RocksDB?"; _threadRunning.store(false); } // try/catchxs ++_replaceIdx; if (THROTTLE_INTERVALS == _replaceIdx) _replaceIdx = 2; // wait on _threadCondvar { CONDITION_LOCKER(guard, _threadCondvar); if (_threadRunning.load()) { // test in case of race at shutdown _threadCondvar.wait(THROTTLE_SECONDS * 1000000); } // if } // lock } // while LOG_TOPIC("eebbe", DEBUG, arangodb::Logger::ENGINES) << "ThreadLoop() ended"; } // RocksDBThrottle::ThreadLoop // // Routine to actually perform the throttle calculation, // now is external routing from ThreadLoop() to easy unit test void RocksDBThrottle::RecalculateThrottle() { unsigned loop; std::chrono::microseconds tot_micros; uint64_t tot_bytes, tot_keys, tot_compact, adjustment_bytes; int64_t new_throttle, compaction_backlog, temp_rate; bool no_data; tot_micros *= 0; tot_keys = 0; tot_bytes = 0; tot_compact = 0; temp_rate = 0; compaction_backlog = ComputeBacklog(); { MUTEX_LOCKER(mutexLocker, _threadMutex); _throttleData[_replaceIdx] = _throttleData[1]; memset(&_throttleData[1], 0, sizeof(_throttleData[1])); // this could be faster by keeping running totals and // subtracting [_replaceIdx] before copying [0] into it, // then adding new [_replaceIdx]. But that needs more // time for testing. for (loop = 2; loop < THROTTLE_INTERVALS; ++loop) { tot_micros += _throttleData[loop]._micros; tot_keys += _throttleData[loop]._keys; tot_bytes += _throttleData[loop]._bytes; tot_compact += _throttleData[loop]._compactions; } // for // flag to skip throttle changes if zero data available no_data = (0 == tot_bytes && 0 == _throttleData[0]._bytes); } // unique_lock // reduce bytes by 10% for each excess level_0 files and/or excess write // buffers adjustment_bytes = (tot_bytes * compaction_backlog) / 10; if (adjustment_bytes < tot_bytes) { tot_bytes -= adjustment_bytes; } else { tot_bytes = 1; // not zero, let smoothing drift number down instead of // taking level-0 } // lock _threadMutex while we update _throttleData if (!no_data) { MUTEX_LOCKER(mutexLocker, _threadMutex); // non-level0 data available? if (0 != tot_bytes && 0 != tot_micros.count()) { // average bytes per secon for level 1+ compactions // (adjust bytes upward by 1000000 since dividing by microseconds, // yields integer bytes per second) new_throttle = ((tot_bytes * 1000000) / tot_micros.count()); } // if // attempt to most recent level0 // (only use most recent level0 until level1+ data becomes available, // useful on restart of heavily loaded server) else if (0 != _throttleData[0]._bytes && 0 != _throttleData[0]._micros.count()) { new_throttle = (_throttleData[0]._bytes * 1000000) / _throttleData[0]._micros.count(); } // else if else { new_throttle = 1; } // else if (0 == new_throttle) new_throttle = 1; // throttle must have an effect // change the throttle slowly // (+1 & +2 keep throttle moving toward goal when difference new and // old is less than THROTTLE_SCALING) if (!_firstThrottle) { temp_rate = _throttleBps; if (temp_rate < new_throttle) temp_rate += (new_throttle - temp_rate) / THROTTLE_SCALING + 1; else temp_rate -= (temp_rate - new_throttle) / THROTTLE_SCALING + 2; // +2 can make this go negative if (temp_rate < 1) temp_rate = 1; // throttle must always have an effect LOG_TOPIC("46d4a", DEBUG, arangodb::Logger::ENGINES) << "RecalculateThrottle(): old " << _throttleBps << ", new " << temp_rate; _throttleBps = temp_rate; // prepare for next interval memset(&_throttleData[0], 0, sizeof(_throttleData[0])); } else if (1 < new_throttle) { // never had a valid throttle, and have first hint now _throttleBps = new_throttle; LOG_TOPIC("e0bbb", DEBUG, arangodb::Logger::ENGINES) << "RecalculateThrottle(): first " << _throttleBps; _firstThrottle = false; } // else if // This SetThrottle() call currently occurs without holding the // rocksdb db mutex. Not safe, seen likely crash from it. // Add back only if this becomes a pluggable WriteController with // access to db mutex. // SetThrottle(); } // !no_data && unlock _threadMutex } // RocksDBThrottle::RecalculateThrottle /// /// @brief Hack a throttle rate into the WriteController object /// void RocksDBThrottle::SetThrottle() { // called by routine with _threadMutex held // using condition variable's mutex to protect _internalRocksDB race { CONDITION_LOCKER(guard, _threadCondvar); // this routine can get called before _internalRocksDB is set if (nullptr != _internalRocksDB) { // inform write_controller_ of our new rate // (column_family.cc RecalculateWriteStallConditions() makes assumptions // that could force a divide by zero if _throttleBps is less than four // ... using 100 for safety) if (100 < _throttleBps) { // hard casting away of "const" ... if (((WriteController&)_internalRocksDB->write_controller()).max_delayed_write_rate() < _throttleBps) { ((WriteController&)_internalRocksDB->write_controller()).set_max_delayed_write_rate(_throttleBps); } // if // Only replace the token when absolutely necessary. GetDelayToken() // also resets internal timers which can result in long pauses if // flushes/compactions are happening often. if (nullptr == _delayToken.get()) { _delayToken = (((WriteController&)_internalRocksDB->write_controller()).GetDelayToken(_throttleBps)); LOG_TOPIC("7c51e", DEBUG, arangodb::Logger::ENGINES) << "SetThrottle(): GetDelayTokey(" << _throttleBps << ")"; } else { LOG_TOPIC("2eb9e", DEBUG, arangodb::Logger::ENGINES) << "SetThrottle(): set_delayed_write_rate(" << _throttleBps << ")"; ((WriteController&)_internalRocksDB->write_controller()).set_delayed_write_rate(_throttleBps); } // else } else { _delayToken.reset(); LOG_TOPIC("af180", DEBUG, arangodb::Logger::ENGINES) << "SetThrottle(): _delaytoken.reset()"; } // else } // if } // lock } // RocksDBThrottle::SetThrottle /// /// @brief Use rocksdb's internal statistics to determine if /// additional slowing of writes is warranted /// int64_t RocksDBThrottle::ComputeBacklog() { int64_t compaction_backlog, imm_backlog, imm_trigger; bool ret_flag; std::string ret_string, property_name; int temp; // want count of level 0 files to estimate if compactions "behind" // and therefore likely to start stalling / stopping compaction_backlog = 0; imm_backlog = 0; if (_families.size()) { imm_trigger = _internalRocksDB->GetOptions(_families[0]).max_write_buffer_number / 2; } else { imm_trigger = 3; } // else // loop through column families to obtain family specific counts for (auto& cf : _families) { property_name = rocksdb::DB::Properties::kNumFilesAtLevelPrefix; property_name.append("0"); ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string); if (ret_flag) { temp = std::stoi(ret_string); } else { temp = 0; } // else if (kL0_SlowdownWritesTrigger <= temp) { temp -= (kL0_SlowdownWritesTrigger - 1); } else { temp = 0; } // else compaction_backlog += temp; property_name = rocksdb::DB::Properties::kNumImmutableMemTable; ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string); if (ret_flag) { temp = std::stoi(ret_string); imm_backlog += temp; } // if } // for if (imm_trigger < imm_backlog) { compaction_backlog += (imm_backlog - imm_trigger); } // if return compaction_backlog; } // RocksDBThrottle::Computebacklog /// @brief Adjust the active thread's priority to match the work /// it is performing. The routine is called HEAVILY. void RocksDBThrottle::AdjustThreadPriority(int Adjustment) { #ifndef _WIN32 // initialize thread infor if this the first time the thread has ever called if (!gThreadPriority._baseSet) { pid_t tid; int ret_val; tid = syscall(SYS_gettid); if (-1 != (int)tid) { errno = 0; ret_val = getpriority(PRIO_PROCESS, tid); // ret_val could be -1 legally, so double test if (-1 != ret_val || 0 == errno) { gThreadPriority._baseSet = true; gThreadPriority._basePriority = ret_val; gThreadPriority._currentPriority = ret_val; } // if } // if } // if // only change priorities if we if (gThreadPriority._baseSet && (gThreadPriority._basePriority + Adjustment) != gThreadPriority._currentPriority) { pid_t tid; tid = syscall(SYS_gettid); if (-1 != (int)tid) { gThreadPriority._currentPriority = gThreadPriority._basePriority + Adjustment; setpriority(PRIO_PROCESS, tid, gThreadPriority._currentPriority); } // if } // if #endif // WIN32 } // RocksDBThrottle::AdjustThreadPriority } // namespace arangodb