////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Matthew Von-Maszewski
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// based upon leveldb/util/throttle.cc
// Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBThrottle.h"

#ifndef _WIN32
#include <sys/resource.h>
#endif

#ifdef TRI_HAVE_UNISTD_H
#include <sys/syscall.h>
#include <unistd.h>
#endif

#include "Basics/ConditionLocker.h"
#include "Basics/MutexLocker.h"
#include "Logger/Logger.h"

namespace arangodb {

////////////////////////////////////////////////////////////////////////////////
/// AdjustThreadPriority() below uses the Linux setpriority() function to
/// dynamically lower and raise a given thread's scheduling priority. The Linux
/// default is to only allow a thread to lower its priority, not to raise it,
/// even if the raise would simply restore a previous priority.
///
/// Servers with 4 cores or fewer REALLY need the full benefit of
/// AdjustThreadPriority().
///
/// To get the full performance benefit of this code, the server needs three
/// settings:
///
///  1. /etc/pam.d/login must contain the line "auth require pam_cap.so"
///  2. /etc/security/capability.conf must contain "cap_sys_nice arangodb"
///  3. root must execute the command "setcap cap_sys_nice+ie arangod" on
///     the arangod binary executable
///
/// The above settings allow the code to vary the threads across 3 priorities
/// based upon the current compaction's level. Without the settings, the
/// threads eventually lock into only 2 different priorities (which is still
/// far better than having everything at the same priority).
///
/// Setting 3 above must be applied to the arangod binary after every build or
/// installation.
///
/// The code does not (yet) support Windows.
////////////////////////////////////////////////////////////////////////////////

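// For illustration only (paths vary by distribution and install method), the
// three settings above could be applied on a typical Linux system roughly as:
//
//   echo "auth require pam_cap.so" >> /etc/pam.d/login
//   echo "cap_sys_nice arangodb"   >> /etc/security/capability.conf
//   sudo setcap cap_sys_nice+ie /usr/sbin/arangod   # substitute the real arangod path
//
// The /usr/sbin/arangod path is just an example; the setcap step must target
// whatever arangod binary is actually executed.
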
// code will dynamically change a thread's priority based upon the compaction's
// level:
//  base +1 : flush mem buffer to level 0
//  base +2 : level 0 compaction to level 1
//  base +3 : all other compactions
struct sPriorityInfo {
  bool _baseSet;
  int _basePriority;
  int _currentPriority;
};

thread_local sPriorityInfo gThreadPriority = {false, 0, 0};

// rocksdb flushes and compactions start and stop within same thread, no
// overlapping
//  (OSX 10.12 requires a static initializer for thread_local ... time_point on
//   mac does not have one in clang 9.0.0)
thread_local uint8_t gFlushStart[sizeof(std::chrono::steady_clock::time_point)];

//
// Setup the object, clearing variables, but do no real work
//
RocksDBThrottle::RocksDBThrottle()
    : _internalRocksDB(nullptr),
      _threadRunning(false),
      _replaceIdx(2),
      _throttleBps(0),
      _firstThrottle(true) {
  memset(&_throttleData, 0, sizeof(_throttleData));
}

//
// Shutdown the background thread only if it was ever started
//
RocksDBThrottle::~RocksDBThrottle() { StopThread(); }

//
// Shutdown the background thread only if it was ever started
//
void RocksDBThrottle::StopThread() {
  if (_threadRunning.load()) {
    {
      CONDITION_LOCKER(guard, _threadCondvar);

      _threadRunning.store(false);
      _threadCondvar.signal();
    }  // lock

    _threadFuture.wait();

    {
      CONDITION_LOCKER(guard, _threadCondvar);

      _internalRocksDB = nullptr;
      _delayToken.reset();
    }  // lock
  }    // if
}  // RocksDBThrottle::StopThread

///
/// @brief rocksdb does not track flush time in its statistics. Save start time
///  in a thread specific storage
///
void RocksDBThrottle::OnFlushBegin(rocksdb::DB* db, const rocksdb::FlushJobInfo& flush_job_info) {
  // save start time in thread local storage
  std::chrono::steady_clock::time_point osx_hack = std::chrono::steady_clock::now();

  memcpy(gFlushStart, &osx_hack, sizeof(std::chrono::steady_clock::time_point));

  AdjustThreadPriority(1);

  return;
}  // RocksDBThrottle::OnFlushBegin

void RocksDBThrottle::OnFlushCompleted(rocksdb::DB* db,
                                       const rocksdb::FlushJobInfo& flush_job_info) {
  std::chrono::microseconds flush_time;
  uint64_t flush_size;
  std::chrono::steady_clock::time_point osx_hack;

  memcpy(&osx_hack, gFlushStart, sizeof(std::chrono::steady_clock::time_point));

  flush_time = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now() - osx_hack);
  flush_size = flush_job_info.table_properties.data_size +
               flush_job_info.table_properties.index_size +
               flush_job_info.table_properties.filter_size;

  SetThrottleWriteRate(flush_time, flush_job_info.table_properties.num_entries,
                       flush_size, true);

  // start throttle after first data is posted
  //  (have seen some odd zero and small size flushes early)
  //  (64<<20) is default size for write_buffer_size in column family options,
  //  too hard to read from here
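  //  (for reference: (64 << 19) == 32 MiB, i.e. half of that 64 MiB default)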
  if ((64 << 19) < flush_size) {
    std::call_once(_initFlag, &RocksDBThrottle::Startup, this, db);
  }  // if
}  // RocksDBThrottle::OnFlushCompleted

void RocksDBThrottle::OnCompactionCompleted(rocksdb::DB* db,
                                            const rocksdb::CompactionJobInfo& ci) {
  std::chrono::microseconds elapsed(ci.stats.elapsed_micros);
  SetThrottleWriteRate(elapsed, ci.stats.num_output_records,
                       ci.stats.total_output_bytes, false);

  // rocksdb 5.6 had an API call for when a standard compaction started. 5.14
  // has no such thing. this line fakes "compaction start" by making the wild
  // assumption that the next level compacting is likely similar to the
  // previous. This is only for thread priority manipulation, approximate is
  // fine. (and you must have used "setcap" on the arangod binary for it to
  // even matter, see comments at top)
  RocksDBThrottle::AdjustThreadPriority((0 == ci.base_input_level) ? 2 : 3);
}  // RocksDBThrottle::OnCompactionCompleted

void RocksDBThrottle::Startup(rocksdb::DB* db) {
  CONDITION_LOCKER(guard, _threadCondvar);

  _internalRocksDB = (rocksdb::DBImpl*)db;

  // addresses race condition during fast start/stop
  _threadFuture = std::async(std::launch::async, &RocksDBThrottle::ThreadLoop, this);

  while (!_threadRunning.load()) {
    _threadCondvar.wait(10000);
  }  // while
}  // RocksDBThrottle::Startup

void RocksDBThrottle::SetThrottleWriteRate(std::chrono::microseconds Micros,
                                           uint64_t Keys, uint64_t Bytes, bool IsLevel0) {
  // lock _threadMutex while we update _throttleData
  MUTEX_LOCKER(mutexLocker, _threadMutex);
  unsigned target_idx;

  // throw out anything smaller than 32Mbytes ... would be better if this
  // was calculated against write_buffer_size, but that varies by column family
  if ((64 << 19) < Bytes) {
    // index 0 for level 0 compactions, index 1 for all others
    target_idx = (IsLevel0 ? 0 : 1);

    _throttleData[target_idx]._micros += Micros;
    _throttleData[target_idx]._keys += Keys;
    _throttleData[target_idx]._bytes += Bytes;
    _throttleData[target_idx]._compactions += 1;

    // attempt to override throttle changes by rocksdb ... hammer this often
    //  (note that _threadMutex IS HELD)
    SetThrottle();
  }  // if

  LOG_TOPIC("7afe9", DEBUG, arangodb::Logger::ENGINES)
      << "SetThrottleWriteRate: Micros " << Micros.count() << ", Keys " << Keys
      << ", Bytes " << Bytes << ", IsLevel0 " << IsLevel0;

  return;
}  // RocksDBThrottle::SetThrottleWriteRate

void RocksDBThrottle::ThreadLoop() {
  _replaceIdx = 2;

  // addresses race condition during fast start/stop
  {
    CONDITION_LOCKER(guard, _threadCondvar);

    _threadRunning.store(true);
    _threadCondvar.signal();
  }  // lock

  LOG_TOPIC("a4a57", DEBUG, arangodb::Logger::ENGINES) << "ThreadLoop() started";

  while (_threadRunning.load()) {
    //
    // start actual throttle work
    //
    try {
      RecalculateThrottle();
    } catch (...) {
      LOG_TOPIC("b0a2e", ERR, arangodb::Logger::ENGINES)
          << "RecalculateThrottle() sent a throw. RocksDB?";
      _threadRunning.store(false);
    }  // try/catch

    ++_replaceIdx;
    if (THROTTLE_INTERVALS == _replaceIdx) _replaceIdx = 2;

    // wait on _threadCondvar
    {
      CONDITION_LOCKER(guard, _threadCondvar);

      if (_threadRunning.load()) {  // test in case of race at shutdown
        _threadCondvar.wait(THROTTLE_SECONDS * 1000000);
      }  // if
    }    // lock
  }      // while

  LOG_TOPIC("eebbe", DEBUG, arangodb::Logger::ENGINES) << "ThreadLoop() ended";
}  // RocksDBThrottle::ThreadLoop

//
// Routine to actually perform the throttle calculation,
//  now an external routine from ThreadLoop() to ease unit testing
void RocksDBThrottle::RecalculateThrottle() {
  unsigned loop;
  std::chrono::microseconds tot_micros;
  uint64_t tot_bytes, tot_keys, tot_compact, adjustment_bytes;
  int64_t new_throttle, compaction_backlog, temp_rate;
  bool no_data;

  tot_micros *= 0;
  tot_keys = 0;
  tot_bytes = 0;
  tot_compact = 0;
  temp_rate = 0;

  compaction_backlog = ComputeBacklog();

  {
    MUTEX_LOCKER(mutexLocker, _threadMutex);

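    // descriptive note: as used here, _throttleData slot 0 accumulates level-0
    //  flush data, slot 1 accumulates level-1+ compaction data for the current
    //  interval, and slots 2 .. THROTTLE_INTERVALS-1 hold the history ring that
    //  _replaceIdx cycles through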
    _throttleData[_replaceIdx] = _throttleData[1];
    memset(&_throttleData[1], 0, sizeof(_throttleData[1]));

    // this could be faster by keeping running totals and
    //  subtracting [_replaceIdx] before copying [0] into it,
    //  then adding new [_replaceIdx].  But that needs more
    //  time for testing.
    for (loop = 2; loop < THROTTLE_INTERVALS; ++loop) {
      tot_micros += _throttleData[loop]._micros;
      tot_keys += _throttleData[loop]._keys;
      tot_bytes += _throttleData[loop]._bytes;
      tot_compact += _throttleData[loop]._compactions;
    }  // for

    // flag to skip throttle changes if zero data available
    no_data = (0 == tot_bytes && 0 == _throttleData[0]._bytes);
  }  // unique_lock

  // reduce bytes by 10% for each excess level-0 file and/or excess write
  // buffer
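  //  (e.g. a backlog of 3 removes 30% of tot_bytes before the rate is derived)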
  adjustment_bytes = (tot_bytes * compaction_backlog) / 10;
  if (adjustment_bytes < tot_bytes) {
    tot_bytes -= adjustment_bytes;
  } else {
    tot_bytes = 1;  // not zero, let smoothing drift number down instead of
                    // taking level-0
  }

  // lock _threadMutex while we update _throttleData
  if (!no_data) {
    MUTEX_LOCKER(mutexLocker, _threadMutex);

    // non-level0 data available?
    if (0 != tot_bytes && 0 != tot_micros.count()) {
      // average bytes per second for level 1+ compactions
      //  (adjust bytes upward by 1000000 since dividing by microseconds
      //   yields integer bytes per second)
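      //  (for example, 96 MiB written over 3 seconds of compaction time gives
      //   (100663296 * 1000000) / 3000000 = 33554432, i.e. about 32 MiB/s)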
      new_throttle = ((tot_bytes * 1000000) / tot_micros.count());
    }  // if

    // attempt to use most recent level0
    //  (only use most recent level0 until level1+ data becomes available,
    //   useful on restart of heavily loaded server)
    else if (0 != _throttleData[0]._bytes && 0 != _throttleData[0]._micros.count()) {
      new_throttle =
          (_throttleData[0]._bytes * 1000000) / _throttleData[0]._micros.count();
    }  // else if
    else {
      new_throttle = 1;
    }  // else

    if (0 == new_throttle) new_throttle = 1;  // throttle must have an effect

    // change the throttle slowly
    //  (+1 & +2 keep the throttle moving toward the goal when the difference
    //   between new and old is less than THROTTLE_SCALING)
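    //  (illustration only: if THROTTLE_SCALING were 17, an old rate of
    //   10,000,000 and a new target of 27,000,000 would move the rate to
    //   10,000,000 + 17,000,000 / 17 + 1 = 11,000,001 in this interval)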
    if (!_firstThrottle) {
      temp_rate = _throttleBps;

      if (temp_rate < new_throttle)
        temp_rate += (new_throttle - temp_rate) / THROTTLE_SCALING + 1;
      else
        temp_rate -= (temp_rate - new_throttle) / THROTTLE_SCALING + 2;

      // +2 can make this go negative
      if (temp_rate < 1) temp_rate = 1;  // throttle must always have an effect

      LOG_TOPIC("46d4a", DEBUG, arangodb::Logger::ENGINES)
          << "RecalculateThrottle(): old " << _throttleBps << ", new " << temp_rate;

      _throttleBps = temp_rate;

      // prepare for next interval
      memset(&_throttleData[0], 0, sizeof(_throttleData[0]));
    } else if (1 < new_throttle) {
      // never had a valid throttle, and have first hint now
      _throttleBps = new_throttle;

      LOG_TOPIC("e0bbb", DEBUG, arangodb::Logger::ENGINES)
          << "RecalculateThrottle(): first " << _throttleBps;

      _firstThrottle = false;
    }  // else if

    // This SetThrottle() call currently occurs without holding the
    // rocksdb db mutex. Not safe, seen likely crash from it.
    // Add back only if this becomes a pluggable WriteController with
    //  access to db mutex.
    // SetThrottle();
  }  // !no_data && unlock _threadMutex
}  // RocksDBThrottle::RecalculateThrottle

///
/// @brief Hack a throttle rate into the WriteController object
///
void RocksDBThrottle::SetThrottle() {
  // called by routine with _threadMutex held

  // using condition variable's mutex to protect _internalRocksDB race
  {
    CONDITION_LOCKER(guard, _threadCondvar);

    // this routine can get called before _internalRocksDB is set
    if (nullptr != _internalRocksDB) {
      // inform write_controller_ of our new rate
      //  (column_family.cc RecalculateWriteStallConditions() makes assumptions
      //   that could force a divide by zero if _throttleBps is less than four
      //   ... using 100 for safety)
      if (100 < _throttleBps) {
        // hard casting away of "const" ...
        if (((WriteController&)_internalRocksDB->write_controller()).max_delayed_write_rate() <
            _throttleBps) {
          ((WriteController&)_internalRocksDB->write_controller()).set_max_delayed_write_rate(_throttleBps);
        }  // if

        // Only replace the token when absolutely necessary. GetDelayToken()
        // also resets internal timers which can result in long pauses if
        // flushes/compactions are happening often.
        if (nullptr == _delayToken.get()) {
          _delayToken =
              (((WriteController&)_internalRocksDB->write_controller()).GetDelayToken(_throttleBps));
          LOG_TOPIC("7c51e", DEBUG, arangodb::Logger::ENGINES)
              << "SetThrottle(): GetDelayToken(" << _throttleBps << ")";
        } else {
          LOG_TOPIC("2eb9e", DEBUG, arangodb::Logger::ENGINES)
              << "SetThrottle(): set_delayed_write_rate(" << _throttleBps << ")";
          ((WriteController&)_internalRocksDB->write_controller()).set_delayed_write_rate(_throttleBps);
        }  // else
      } else {
        _delayToken.reset();
        LOG_TOPIC("af180", DEBUG, arangodb::Logger::ENGINES)
            << "SetThrottle(): _delayToken.reset()";
      }  // else
    }    // if
  }      // lock
}  // RocksDBThrottle::SetThrottle

///
/// @brief Use rocksdb's internal statistics to determine if
///  additional slowing of writes is warranted
///
int64_t RocksDBThrottle::ComputeBacklog() {
  int64_t compaction_backlog, imm_backlog, imm_trigger;
  bool ret_flag;
  std::string ret_string, property_name;
  int temp;

  // want count of level 0 files to estimate if compactions "behind"
  //  and therefore likely to start stalling / stopping
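  //  (for example, with a hypothetical slowdown trigger of 8, a column family
  //   reporting 9 level-0 files would add 9 - (8 - 1) = 2 to the backlog)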
  compaction_backlog = 0;
  imm_backlog = 0;
  if (_families.size()) {
    imm_trigger = _internalRocksDB->GetOptions(_families[0]).max_write_buffer_number / 2;
  } else {
    imm_trigger = 3;
  }  // else

  // loop through column families to obtain family specific counts
  for (auto& cf : _families) {
    property_name = rocksdb::DB::Properties::kNumFilesAtLevelPrefix;
    property_name.append("0");
    ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string);
    if (ret_flag) {
      temp = std::stoi(ret_string);
    } else {
      temp = 0;
    }  // else

    if (kL0_SlowdownWritesTrigger <= temp) {
      temp -= (kL0_SlowdownWritesTrigger - 1);
    } else {
      temp = 0;
    }  // else

    compaction_backlog += temp;

    property_name = rocksdb::DB::Properties::kNumImmutableMemTable;
    ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string);

    if (ret_flag) {
      temp = std::stoi(ret_string);
      imm_backlog += temp;
    }  // if
  }    // for

  if (imm_trigger < imm_backlog) {
    compaction_backlog += (imm_backlog - imm_trigger);
  }  // if

  return compaction_backlog;
}  // RocksDBThrottle::ComputeBacklog

/// @brief Adjust the active thread's priority to match the work
///  it is performing.  The routine is called HEAVILY.
void RocksDBThrottle::AdjustThreadPriority(int Adjustment) {
#ifndef _WIN32
  // initialize thread info if this is the first time the thread has ever called
  if (!gThreadPriority._baseSet) {
    pid_t tid;
    int ret_val;

    tid = syscall(SYS_gettid);
    if (-1 != (int)tid) {
      errno = 0;
      ret_val = getpriority(PRIO_PROCESS, tid);
      // ret_val could be -1 legally, so double test
      if (-1 != ret_val || 0 == errno) {
        gThreadPriority._baseSet = true;
        gThreadPriority._basePriority = ret_val;
        gThreadPriority._currentPriority = ret_val;
      }  // if
    }    // if
  }      // if

  // only change priorities if the target priority differs from the current one
  if (gThreadPriority._baseSet && (gThreadPriority._basePriority + Adjustment) !=
                                      gThreadPriority._currentPriority) {
    pid_t tid;
    tid = syscall(SYS_gettid);
    if (-1 != (int)tid) {
      gThreadPriority._currentPriority = gThreadPriority._basePriority + Adjustment;
      setpriority(PRIO_PROCESS, tid, gThreadPriority._currentPriority);
    }  // if
  }    // if

#endif  // _WIN32
}  // RocksDBThrottle::AdjustThreadPriority

}  // namespace arangodb