// arangod/RocksDBEngine/RocksDBThrottle.cpp
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Matthew Von-Maszewski
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// based upon leveldb/util/throttle.cc
// Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
////////////////////////////////////////////////////////////////////////////////
#include "RocksDBThrottle.h"
#ifndef _WIN32
#include <sys/resource.h>
#endif
#ifdef TRI_HAVE_UNISTD_H
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "Basics/ConditionLocker.h"
#include "Basics/MutexLocker.h"
#include "Logger/Logger.h"
namespace arangodb {
////////////////////////////////////////////////////////////////////////////////
/// AdjustThreadPriority() below uses the Linux setpriority() function to
/// dynamically
/// lower and raise a given thread's scheduling priority. The Linux default is
/// to only allow a thread to lower its priority, not to raise it. Even if the
/// raise would be to a previous priority.
///
/// Servers with 4 cores or less REALLY need the full benefit of
/// AdjustThreadPriority().
///
/// To get full performance benefit of this code, the server needs three
/// settings:
///
/// 1. /etc/pam.d/login must contain the line "auth require pam_cap.so"
/// 2. /etc/security/capability.conf must contain "cap_sys_nice arangodb"
/// 3. root must execute this command "setcap cap_sys_nice+ie arangod" on
/// the arangodb binary executable
///
/// The above settings allow the code to vary the threads across 3 priorities
/// based upon
/// the current compaction's level. Without the settings, threads eventually
/// lock into only 2 different priorities (which is still far better than
/// having everything at the same priority).
///
/// Setting 3 above must be applied to the arangod binary after every build or
/// installation.
///
/// The code does not (yet) support Windows.
////////////////////////////////////////////////////////////////////////////////
// code will dynamically change a thread's priority based upon the compaction's
// level:
// base +1 : flush mem buffer to level 0
// base +2 : level 0 compaction to level 1
// base +3 : all other compactions
// Tracks a worker thread's OS scheduling priority so AdjustThreadPriority()
// can shift it relative to the priority the thread started with.
struct sPriorityInfo {
// true once the base priority has been captured for this thread
bool _baseSet;
// the thread's original priority, read via getpriority() on first use
int _basePriority;
// the priority most recently applied via setpriority()
int _currentPriority;
};
// per-thread priority state; flush/compaction callbacks run on rocksdb threads
thread_local sPriorityInfo gThreadPriority = {false, 0, 0};
// rocksdb flushes and compactions start and stop within same thread, no
// overlapping
// (OSX 10.12 requires a static initializer for thread_local ... time_point on
// mac does not have
// one in clang 9.0.0)
// raw storage for a steady_clock::time_point: written via memcpy in
// OnFlushBegin() and read back in OnFlushCompleted() on the same thread
thread_local uint8_t gFlushStart[sizeof(std::chrono::steady_clock::time_point)];
//
// Setup the object, clearing variables, but do no real work.
// The background thread is only started later (lazily) via Startup().
//
RocksDBThrottle::RocksDBThrottle()
: _internalRocksDB(nullptr),
_threadRunning(false),
_replaceIdx(2),
_throttleBps(0),
_firstThrottle(true) {
// zero out the per-interval statistics array
memset(&_throttleData, 0, sizeof(_throttleData));
}
//
// Shutdown the background thread only if it was ever started
// (StopThread() checks _threadRunning, so destruction is safe even
// when Startup() never ran)
//
RocksDBThrottle::~RocksDBThrottle() { StopThread(); }
//
// Shutdown the background thread only if it was ever started
//
void RocksDBThrottle::StopThread() {
if (_threadRunning.load()) {
{
// flag the loop to stop and wake it from its periodic wait
CONDITION_LOCKER(guard, _threadCondvar);
_threadRunning.store(false);
_threadCondvar.signal();
} // lock
// wait for ThreadLoop() to actually exit before tearing down state
_threadFuture.wait();
{
// _threadCondvar's mutex also guards _internalRocksDB (see SetThrottle())
CONDITION_LOCKER(guard, _threadCondvar);
_internalRocksDB = nullptr;
_delayToken.reset();
} // lock
} // if
} // RocksDBThrottle::StopThread
///
/// @brief rocksdb does not track flush time in its statistics. Save start
/// time in thread specific storage so OnFlushCompleted() can compute the
/// flush duration.
///
void RocksDBThrottle::OnFlushBegin(rocksdb::DB* db, const rocksdb::FlushJobInfo& flush_job_info) {
  // save start time in thread local storage. memcpy into a raw byte buffer
  // because a thread_local time_point lacks a usable static initializer on
  // older mac/clang (see gFlushStart declaration)
  std::chrono::steady_clock::time_point osx_hack = std::chrono::steady_clock::now();
  memcpy(gFlushStart, &osx_hack, sizeof(std::chrono::steady_clock::time_point));

  // flush of memtable to level 0 runs at base priority +1
  // (see priority discussion at top of file)
  AdjustThreadPriority(1);
} // RocksDBThrottle::OnFlushBegin
// callback from rocksdb after a memtable flush finished: record the flush's
// duration and output size into the throttle statistics and, once a flush
// of meaningful size has been seen, start the background thread exactly once
void RocksDBThrottle::OnFlushCompleted(rocksdb::DB* db,
const rocksdb::FlushJobInfo& flush_job_info) {
std::chrono::microseconds flush_time;
uint64_t flush_size;
// recover the start time stashed by OnFlushBegin() on this same thread
std::chrono::steady_clock::time_point osx_hack;
memcpy(&osx_hack, gFlushStart, sizeof(std::chrono::steady_clock::time_point));
flush_time = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::steady_clock::now() - osx_hack);
flush_size = flush_job_info.table_properties.data_size +
flush_job_info.table_properties.index_size +
flush_job_info.table_properties.filter_size;
// flushes are accounted as level-0 data (IsLevel0 == true)
SetThrottleWriteRate(flush_time, flush_job_info.table_properties.num_entries,
flush_size, true);
// start throttle after first data is posted
// (have seen some odd zero and small size flushes early)
// (64<<20) is default size for write_buffer_size in column family options,
// too hard to read from here; (64<<19) is half of that, i.e. 32MB
if ((64 << 19) < flush_size) {
std::call_once(_initFlag, &RocksDBThrottle::Startup, this, db);
} // if
} // RocksDBThrottle::OnFlushCompleted
// callback from rocksdb after a compaction finished: feed the compaction's
// elapsed time and total output size into the throttle statistics, then
// adjust this thread's priority for the compaction it will likely run next
void RocksDBThrottle::OnCompactionCompleted(rocksdb::DB* db,
const rocksdb::CompactionJobInfo& ci) {
std::chrono::microseconds elapsed(ci.stats.elapsed_micros);
// compactions are accounted as non-level-0 data (IsLevel0 == false)
SetThrottleWriteRate(elapsed, ci.stats.num_output_records,
ci.stats.total_output_bytes, false);
// rocksdb 5.6 had an API call for when a standard compaction started. 5.14
// has no such thing.
// this line fakes "compaction start" by making the wild assumption that the
// next level compacting is likely similar to the previous. This is only for
// thread priority manipulation, approximate is fine. (and you must have used
// "setcap" on the arangod binary for it to even matter, see comments at top)
RocksDBThrottle::AdjustThreadPriority((0 == ci.base_input_level) ? 2 : 3);
} // RocksDBThrottle::OnCompactionCompleted
// launch the background throttle thread and block until it has signalled
// that it is running; invoked exactly once via std::call_once from
// OnFlushCompleted()
void RocksDBThrottle::Startup(rocksdb::DB* db) {
CONDITION_LOCKER(guard, _threadCondvar);
// the listener API hands us a DB*, but the throttle needs DBImpl internals
_internalRocksDB = (rocksdb::DBImpl*)db;
// addresses race condition during fast start/stop
_threadFuture = std::async(std::launch::async, &RocksDBThrottle::ThreadLoop, this);
while (!_threadRunning.load()) {
// wake periodically and re-test; ThreadLoop() signals once it has
// stored true into _threadRunning
_threadCondvar.wait(10000);
} // while
} // RocksDBThrottle::Startup
///
/// @brief accumulate one flush's/compaction's duration, key count and byte
/// count into the current interval's statistics slot
///
/// @param Micros   elapsed time of the flush/compaction
/// @param Keys     number of keys written
/// @param Bytes    number of bytes written
/// @param IsLevel0 true for memtable flushes (slot 0), false for compactions
///                 (slot 1, the current interval accumulator)
///
void RocksDBThrottle::SetThrottleWriteRate(std::chrono::microseconds Micros,
                                           uint64_t Keys, uint64_t Bytes, bool IsLevel0) {
  // throw out anything smaller than 32MB (half the 64MB default
  // write_buffer_size) ... would be better if this was calculated against
  // write_buffer_size, but that varies by column family
  static constexpr uint64_t kMinReportBytes = uint64_t{64} << 19;

  // lock _threadMutex while we update _throttleData
  MUTEX_LOCKER(mutexLocker, _threadMutex);

  if (kMinReportBytes < Bytes) {
    // index 0 for level 0 compactions, index 1 for all others
    unsigned target_idx = (IsLevel0 ? 0 : 1);
    _throttleData[target_idx]._micros += Micros;
    _throttleData[target_idx]._keys += Keys;
    _throttleData[target_idx]._bytes += Bytes;
    _throttleData[target_idx]._compactions += 1;

    // attempt to override throttle changes by rocksdb ... hammer this often
    // (note that _threadMutex IS HELD)
    SetThrottle();
  } // if

  LOG_TOPIC("7afe9", DEBUG, arangodb::Logger::ENGINES)
      << "SetThrottleWriteRate: Micros " << Micros.count() << ", Keys " << Keys
      << ", Bytes " << Bytes << ", IsLevel0 " << IsLevel0;
} // RocksDBThrottle::SetThrottleWriteRate
// body of the background thread: recompute the throttle rate once per
// interval until StopThread() clears _threadRunning.
// _replaceIdx cycles through slots 2..THROTTLE_INTERVALS-1 of _throttleData;
// RecalculateThrottle() rotates the current interval (slot 1) into the slot
// indexed here.
void RocksDBThrottle::ThreadLoop() {
_replaceIdx = 2;
// addresses race condition during fast start/stop
{
CONDITION_LOCKER(guard, _threadCondvar);
_threadRunning.store(true);
_threadCondvar.signal();
} // lock
LOG_TOPIC("a4a57", DEBUG, arangodb::Logger::ENGINES) << "ThreadLoop() started";
while (_threadRunning.load()) {
//
// start actual throttle work
//
try {
RecalculateThrottle();
} catch (...) {
// a throw here most likely originates in rocksdb calls; stop the loop
// rather than risk repeating the failure every interval
LOG_TOPIC("b0a2e", ERR, arangodb::Logger::ENGINES)
<< "RecalculateThrottle() sent a throw. RocksDB?";
_threadRunning.store(false);
} // try/catch
// advance (and wrap) the history slot to replace next interval
++_replaceIdx;
if (THROTTLE_INTERVALS == _replaceIdx) _replaceIdx = 2;
// wait on _threadCondvar; StopThread()'s signal() wakes us early
{
CONDITION_LOCKER(guard, _threadCondvar);
if (_threadRunning.load()) { // test in case of race at shutdown
_threadCondvar.wait(THROTTLE_SECONDS * 1000000);
} // if
} // lock
} // while
LOG_TOPIC("eebbe", DEBUG, arangodb::Logger::ENGINES) << "ThreadLoop() ended";
} // RocksDBThrottle::ThreadLoop
//
// Routine to actually perform the throttle calculation;
// kept as a separate routine (called from ThreadLoop()) to ease unit testing
void RocksDBThrottle::RecalculateThrottle() {
unsigned loop;
std::chrono::microseconds tot_micros;
uint64_t tot_bytes, tot_keys, tot_compact, adjustment_bytes;
int64_t new_throttle, compaction_backlog, temp_rate;
bool no_data;
// multiply-by-zero trick to zero the duration
tot_micros *= 0;
tot_keys = 0;
tot_bytes = 0;
tot_compact = 0;
temp_rate = 0;
compaction_backlog = ComputeBacklog();
{
MUTEX_LOCKER(mutexLocker, _threadMutex);
// rotate the interval history: slot 1 accumulates the current interval's
// non-level-0 data; move it into the oldest history slot, then clear it
_throttleData[_replaceIdx] = _throttleData[1];
memset(&_throttleData[1], 0, sizeof(_throttleData[1]));
// this could be faster by keeping running totals and
// subtracting [_replaceIdx] before copying [0] into it,
// then adding new [_replaceIdx]. But that needs more
// time for testing.
// sum the history slots (indices 2 .. THROTTLE_INTERVALS-1)
for (loop = 2; loop < THROTTLE_INTERVALS; ++loop) {
tot_micros += _throttleData[loop]._micros;
tot_keys += _throttleData[loop]._keys;
tot_bytes += _throttleData[loop]._bytes;
tot_compact += _throttleData[loop]._compactions;
} // for
// flag to skip throttle changes if zero data available
// (slot 0 holds level-0 flush data, see SetThrottleWriteRate())
no_data = (0 == tot_bytes && 0 == _throttleData[0]._bytes);
} // unique_lock
// reduce bytes by 10% for each excess level_0 files and/or excess write
// buffers
adjustment_bytes = (tot_bytes * compaction_backlog) / 10;
if (adjustment_bytes < tot_bytes) {
tot_bytes -= adjustment_bytes;
} else {
tot_bytes = 1; // not zero, let smoothing drift number down instead of
// taking level-0
}
// lock _threadMutex while we update _throttleData
if (!no_data) {
MUTEX_LOCKER(mutexLocker, _threadMutex);
// non-level0 data available?
if (0 != tot_bytes && 0 != tot_micros.count()) {
// average bytes per second for level 1+ compactions
// (adjust bytes upward by 1000000 since dividing by microseconds,
// yields integer bytes per second)
new_throttle = ((tot_bytes * 1000000) / tot_micros.count());
} // if
// attempt to use most recent level0
// (only use most recent level0 until level1+ data becomes available,
// useful on restart of heavily loaded server)
else if (0 != _throttleData[0]._bytes && 0 != _throttleData[0]._micros.count()) {
new_throttle =
(_throttleData[0]._bytes * 1000000) / _throttleData[0]._micros.count();
} // else if
else {
new_throttle = 1;
} // else
if (0 == new_throttle) new_throttle = 1; // throttle must have an effect
// change the throttle slowly
// (+1 & +2 keep throttle moving toward goal when difference new and
// old is less than THROTTLE_SCALING)
if (!_firstThrottle) {
temp_rate = _throttleBps;
if (temp_rate < new_throttle)
temp_rate += (new_throttle - temp_rate) / THROTTLE_SCALING + 1;
else
temp_rate -= (temp_rate - new_throttle) / THROTTLE_SCALING + 2;
// +2 can make this go negative
if (temp_rate < 1) temp_rate = 1; // throttle must always have an effect
LOG_TOPIC("46d4a", DEBUG, arangodb::Logger::ENGINES)
<< "RecalculateThrottle(): old " << _throttleBps << ", new " << temp_rate;
_throttleBps = temp_rate;
// prepare for next interval
memset(&_throttleData[0], 0, sizeof(_throttleData[0]));
} else if (1 < new_throttle) {
// never had a valid throttle, and have first hint now
_throttleBps = new_throttle;
LOG_TOPIC("e0bbb", DEBUG, arangodb::Logger::ENGINES)
<< "RecalculateThrottle(): first " << _throttleBps;
_firstThrottle = false;
} // else if
// This SetThrottle() call currently occurs without holding the
// rocksdb db mutex. Not safe, seen likely crash from it.
// Add back only if this becomes a pluggable WriteController with
// access to db mutex.
// SetThrottle();
} // !no_data && unlock _threadMutex
} // RocksDBThrottle::RecalculateThrottle
///
/// @brief Hack a throttle rate into the WriteController object
///
void RocksDBThrottle::SetThrottle() {
// called by routine with _threadMutex held
// using condition variable's mutex to protect _internalRocksDB race
{
CONDITION_LOCKER(guard, _threadCondvar);
// this routine can get called before _internalRocksDB is set
if (nullptr != _internalRocksDB) {
// inform write_controller_ of our new rate
// (column_family.cc RecalculateWriteStallConditions() makes assumptions
// that could force a divide by zero if _throttleBps is less than four
// ... using 100 for safety)
if (100 < _throttleBps) {
// hard casting away of "const" ...
if (((WriteController&)_internalRocksDB->write_controller()).max_delayed_write_rate() <
_throttleBps) {
((WriteController&)_internalRocksDB->write_controller()).set_max_delayed_write_rate(_throttleBps);
} // if
// Only replace the token when absolutely necessary. GetDelayToken()
// also resets internal timers which can result in long pauses if
// flushes/compactions are happening often.
if (nullptr == _delayToken.get()) {
_delayToken =
(((WriteController&)_internalRocksDB->write_controller()).GetDelayToken(_throttleBps));
LOG_TOPIC("7c51e", DEBUG, arangodb::Logger::ENGINES)
<< "SetThrottle(): GetDelayTokey(" << _throttleBps << ")";
} else {
LOG_TOPIC("2eb9e", DEBUG, arangodb::Logger::ENGINES)
<< "SetThrottle(): set_delayed_write_rate(" << _throttleBps << ")";
((WriteController&)_internalRocksDB->write_controller()).set_delayed_write_rate(_throttleBps);
} // else
} else {
_delayToken.reset();
LOG_TOPIC("af180", DEBUG, arangodb::Logger::ENGINES)
<< "SetThrottle(): _delaytoken.reset()";
} // else
} // if
} // lock
} // RocksDBThrottle::SetThrottle
///
/// @brief Use rocksdb's internal statistics to determine if
/// additional slowing of writes is warranted
///
int64_t RocksDBThrottle::ComputeBacklog() {
  // want count of level 0 files to estimate if compactions "behind"
  // and therefore likely to start stalling / stopping
  int64_t compaction_backlog = 0;
  int64_t imm_backlog = 0;

  // immutable-memtable backlog threshold: half the configured write buffer
  // count of the first column family, or 3 when none are registered
  int64_t imm_trigger = 3;
  if (!_families.empty()) {
    imm_trigger =
        _internalRocksDB->GetOptions(_families[0]).max_write_buffer_number / 2;
  } // if

  // loop through column families to obtain family specific counts
  for (auto& cf : _families) {
    std::string result;

    // count of files currently sitting at level 0
    std::string level0_property(rocksdb::DB::Properties::kNumFilesAtLevelPrefix);
    level0_property.append("0");
    int level0_count = 0;
    if (_internalRocksDB->GetProperty(cf, level0_property, &result)) {
      level0_count = std::stoi(result);
    } // if

    // only files beyond the slowdown trigger contribute to the backlog
    if (level0_count >= kL0_SlowdownWritesTrigger) {
      compaction_backlog += level0_count - (kL0_SlowdownWritesTrigger - 1);
    } // if

    // accumulate immutable memtables still waiting to be flushed
    if (_internalRocksDB->GetProperty(cf, rocksdb::DB::Properties::kNumImmutableMemTable,
                                      &result)) {
      imm_backlog += std::stoi(result);
    } // if
  } // for

  // excess unflushed memtables also count toward the backlog
  if (imm_trigger < imm_backlog) {
    compaction_backlog += (imm_backlog - imm_trigger);
  } // if

  return compaction_backlog;
} // RocksDBThrottle::ComputeBacklog
/// @brief Adjust the active thread's priority to match the work
/// it is performing. The routine is called HEAVILY.
///
/// @param Adjustment offset added to the thread's base priority:
///        1 for flushes, 2 for level-0 compactions, 3 for all others
void RocksDBThrottle::AdjustThreadPriority(int Adjustment) {
#ifndef _WIN32
  // initialize thread info if this is the first time the thread has called
  if (!gThreadPriority._baseSet) {
    pid_t tid = syscall(SYS_gettid);
    if (-1 != static_cast<int>(tid)) {
      errno = 0;
      int ret_val = getpriority(PRIO_PROCESS, tid);
      // ret_val could be -1 legally, so double test
      if (-1 != ret_val || 0 == errno) {
        gThreadPriority._baseSet = true;
        gThreadPriority._basePriority = ret_val;
        gThreadPriority._currentPriority = ret_val;
      } // if
    } // if
  } // if

  // only change priority when the base is known and the target differs from
  // what was last applied (setpriority() is a syscall; skip it when a no-op)
  if (gThreadPriority._baseSet && (gThreadPriority._basePriority + Adjustment) !=
                                      gThreadPriority._currentPriority) {
    pid_t tid = syscall(SYS_gettid);
    if (-1 != static_cast<int>(tid)) {
      gThreadPriority._currentPriority = gThreadPriority._basePriority + Adjustment;
      setpriority(PRIO_PROCESS, tid, gThreadPriority._currentPriority);
    } // if
  } // if
#endif // _WIN32
} // RocksDBThrottle::AdjustThreadPriority
} // namespace arangodb