////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Matthew Von-Maszewski
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// based upon leveldb/util/throttle.cc
// Copyright (c) 2011-2017 Basho Technologies, Inc. All Rights Reserved.
//
// This file is provided to you under the Apache License,
// Version 2.0 (the "License"); you may not use this file
// except in compliance with the License.  You may obtain
// a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
////////////////////////////////////////////////////////////////////////////////

#include "RocksDBThrottle.h"

#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#endif

#include "Basics/ConditionLocker.h"
#include "Basics/MutexLocker.h"
#include "Logger/Logger.h"

namespace arangodb {

////////////////////////////////////////////////////////////////////////////////
/// AdjustThreadPriority() below uses the Linux setpriority() function to
/// dynamically lower and raise a given thread's scheduling priority.  The Linux
/// default is to only allow a thread to lower its priority, not to raise it,
/// even if the raise would merely restore a previous priority.
///
/// Servers with 4 cores or fewer REALLY need the full benefit of
/// AdjustThreadPriority().
///
/// To get the full performance benefit of this code, the server needs three
/// settings:
///
///  1. /etc/pam.d/login must contain the line "auth required pam_cap.so"
///  2. /etc/security/capability.conf must contain "cap_sys_nice arangodb"
///  3. root must execute the command "setcap cap_sys_nice+ie arangod" on
///     the arangod binary executable
///
/// The above settings allow the code to vary the threads across 3 priorities
/// based upon the current compaction's level.  Without the settings, threads
/// eventually lock into only 2 different priorities (which is still far better
/// than having everything at the same priority).
///
/// Setting 3 above must be re-applied to the arangod binary after every build
/// or installation.
///
/// The code does not (yet) support Windows.
///
/// An illustrative sketch of the setpriority() call pattern follows the
/// declarations below.
////////////////////////////////////////////////////////////////////////////////

// code will dynamically change a thread's priority based upon the compaction's level:
//   base +1 : flush mem buffer to level 0
//   base +2 : level 0 compaction to level 1
//   base +3 : all other compactions
struct sPriorityInfo {
  bool _baseSet;
  int _basePriority;
  int _currentPriority;
};

thread_local sPriorityInfo gThreadPriority = {false, 0, 0};
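
// The sketch below is illustrative only and is not the actual
// AdjustThreadPriority() implementation: it shows the setpriority()/gettid()
// call pattern that the comment block above describes, assuming Linux.  The
// helper name, the error handling, and the adjustment policy shown here are
// hypothetical.
//
//   #include <sys/resource.h>   // getpriority(), setpriority()
//   #include <sys/syscall.h>    // SYS_gettid
//   #include <unistd.h>         // syscall()
//
//   static void ExampleAdjustPriority(int adjustment) {
//     // Linux thread id: PRIO_PROCESS with a tid addresses a single thread
//     pid_t tid = static_cast<pid_t>(syscall(SYS_gettid));
//
//     if (!gThreadPriority._baseSet) {
//       // getpriority() can legitimately return -1, so check errno instead
//       errno = 0;
//       int base = getpriority(PRIO_PROCESS, tid);
//       if (0 == errno) {
//         gThreadPriority = {true, base, base};
//       }
//     }
//
//     if (gThreadPriority._baseSet &&
//         gThreadPriority._currentPriority != gThreadPriority._basePriority + adjustment) {
//       // lowering always works; raising back toward the base priority needs
//       // CAP_SYS_NICE, i.e. settings 1-3 from the comment block above
//       setpriority(PRIO_PROCESS, tid, gThreadPriority._basePriority + adjustment);
//       gThreadPriority._currentPriority = gThreadPriority._basePriority + adjustment;
//     }
//   }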
// rocksdb flushes and compactions start and stop within same thread, no overlapping
//  (OSX 10.12 requires a static initializer for thread_local ... time_point on mac
//   does not have one in clang 9.0.0)
thread_local uint8_t gFlushStart[sizeof(std::chrono::steady_clock::time_point)];

//
// Setup the object, clearing variables, but do no real work
//
RocksDBThrottle::RocksDBThrottle()
    : _internalRocksDB(nullptr),
      _threadRunning(false),
      _replaceIdx(2),
      _throttleBps(0),
      _firstThrottle(true) {
  memset(&_throttleData, 0, sizeof(_throttleData));
}

//
// Shutdown the background thread only if it was ever started
//
RocksDBThrottle::~RocksDBThrottle() {
  StopThread();
}

//
// Shutdown the background thread only if it was ever started
//
void RocksDBThrottle::StopThread() {
  if (_threadRunning.load()) {
    {
      CONDITION_LOCKER(guard, _threadCondvar);
      _threadRunning.store(false);
      _threadCondvar.signal();
    }  // lock

    _threadFuture.wait();

    {
      CONDITION_LOCKER(guard, _threadCondvar);

      _internalRocksDB = nullptr;
      _delayToken.reset();
    }  // lock
  }  // if
}  // RocksDBThrottle::StopThread

///
/// @brief rocksdb does not track flush time in its statistics.  Save start time
///  in a thread specific storage
///
void RocksDBThrottle::OnFlushBegin(rocksdb::DB* db,
                                   const rocksdb::FlushJobInfo& flush_job_info) {
  // save start time in thread local storage
  std::chrono::steady_clock::time_point osx_hack = std::chrono::steady_clock::now();
  memcpy(gFlushStart, &osx_hack, sizeof(std::chrono::steady_clock::time_point));
  AdjustThreadPriority(1);

  return;
}  // RocksDBThrottle::OnFlushBegin

void RocksDBThrottle::OnFlushCompleted(rocksdb::DB* db,
                                       const rocksdb::FlushJobInfo& flush_job_info) {
  std::chrono::microseconds flush_time;
  uint64_t flush_size;

  std::chrono::steady_clock::time_point osx_hack;
  memcpy(&osx_hack, gFlushStart, sizeof(std::chrono::steady_clock::time_point));

  flush_time = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::steady_clock::now() - osx_hack);
  flush_size = flush_job_info.table_properties.data_size +
               flush_job_info.table_properties.index_size +
               flush_job_info.table_properties.filter_size;

  SetThrottleWriteRate(flush_time, flush_job_info.table_properties.num_entries,
                       flush_size, true);

  // start throttle after first data is posted
  //  (have seen some odd zero and small size flushes early)
  //  (64<<20) is default size for write_buffer_size in column family options,
  //  too hard to read from here
  if ((64<<19) < flush_size) {

      if (((WriteController&)_internalRocksDB->write_controller()).max_delayed_write_rate() < _throttleBps) {
        ((WriteController&)_internalRocksDB->write_controller()).set_max_delayed_write_rate(_throttleBps);
      }  // if

      // Only replace the token when absolutely necessary.  GetDelayToken()
      //  also resets internal timers which can result in long pauses if
      //  flushes/compactions are happening often.
      if (nullptr == _delayToken.get()) {
        _delayToken = ((WriteController&)_internalRocksDB->write_controller()).GetDelayToken(_throttleBps);
        LOG_TOPIC(DEBUG, arangodb::Logger::ENGINES)
            << "SetThrottle(): GetDelayToken(" << _throttleBps << ")";
      } else {
        LOG_TOPIC(DEBUG, arangodb::Logger::ENGINES)
            << "SetThrottle(): set_delayed_write_rate(" << _throttleBps << ")";
        ((WriteController&)_internalRocksDB->write_controller()).set_delayed_write_rate(_throttleBps);
      }  // else
    } else {
      _delayToken.reset();
      LOG_TOPIC(DEBUG, arangodb::Logger::ENGINES) << "SetThrottle(): _delayToken.reset()";
    }  // else
  }  // if
  }  // lock
}  // RocksDBThrottle::SetThrottle

///
/// @brief Use rocksdb's internal statistics to determine if
///  additional slowing of writes is warranted
///
int64_t RocksDBThrottle::ComputeBacklog() {
  int64_t compaction_backlog, imm_backlog, imm_trigger;
  bool ret_flag;
  std::string ret_string, property_name;
  int temp;

  // want count of level 0 files to estimate if compactions "behind"
  //  and therefore likely to start stalling / stopping
  compaction_backlog = 0;
  imm_backlog = 0;
  if (_families.size()) {
    imm_trigger = _internalRocksDB->GetOptions(_families[0]).max_write_buffer_number / 2;
  } else {
    imm_trigger = 3;
  }  // else

  // loop through column families to obtain family specific counts
  for (auto& cf : _families) {
    property_name = rocksdb::DB::Properties::kNumFilesAtLevelPrefix;
    property_name.append("0");
    ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string);
    if (ret_flag) {
      temp = std::stoi(ret_string);
    } else {
      temp = 0;
    }  // else

    if (kL0_SlowdownWritesTrigger <= temp) {
      temp -= (kL0_SlowdownWritesTrigger - 1);
    } else {
      temp = 0;
    }  // else

    compaction_backlog += temp;

    property_name = rocksdb::DB::Properties::kNumImmutableMemTable;
    ret_flag = _internalRocksDB->GetProperty(cf, property_name, &ret_string);
    if (ret_flag) {
      temp = std::stoi(ret_string);
      imm_backlog += temp;
    }  // if
  }  // for

  if (imm_trigger