1
0
Fork 0

Fixing count

This commit is contained in:
Simon Grätzer 2017-04-12 15:22:18 +02:00
parent 1d90520a9d
commit d9510eee0b
4 changed files with 151 additions and 142 deletions

View File

@ -35,41 +35,44 @@
#include <rocksdb/utilities/transaction_db.h>
#include <rocksdb/write_batch.h>
#include <rocksdb/utilities/write_batch_with_index.h>
#include <velocypack/Iterator.h>
#include <velocypack/velocypack-aliases.h>
using namespace arangodb;
RocksDBCounterManager::Counter::Counter(VPackSlice const& slice) {
RocksDBCounterManager::CMValue::CMValue(VPackSlice const& slice) {
TRI_ASSERT(slice.isArray());
velocypack::ArrayIterator array(slice);
if (array.valid()) {
this->_sequenceNumber = (*array).getUInt();
this->_sequenceNum = (*array).getUInt();
this->_count = (*(++array)).getUInt();
this->_revisionId = (*(++array)).getUInt();
}
}
RocksDBCounterManager::Counter::Counter() : _sequenceNumber(0), _count(0), _revisionId(0) {}
void RocksDBCounterManager::Counter::serialize(VPackBuilder& b) const {
void RocksDBCounterManager::CMValue::serialize(VPackBuilder& b) const {
b.openArray();
b.add(VPackValue(this->_sequenceNumber));
b.add(VPackValue(this->_sequenceNum));
b.add(VPackValue(this->_count));
b.add(VPackValue(this->_revisionId));
b.close();
}
void RocksDBCounterManager::Counter::reset() {
_sequenceNumber = 0;
_count = 0;
_revisionId = 0;
}
/// Constructor needs to be called synchrunously,
/// will load counts from the db and scan the WAL
RocksDBCounterManager::RocksDBCounterManager(rocksdb::DB* db, double interval)
: Thread("RocksDBCounters"), _db(db), _interval(interval) {
usleep(1000000);
usleep(1000000);
usleep(1000000);
usleep(1000000);
usleep(1000000);
usleep(1000000);
usleep(1000000);
readCounterValues();
if (_counters.size() > 0) {
@ -98,44 +101,52 @@ void RocksDBCounterManager::run() {
}
}
RocksDBCounterManager::Counter const RocksDBCounterManager::loadCounter(uint64_t objectId) {
static const Counter empty{};
{
READ_LOCKER(guard, _rwLock);
auto const& it = _counters.find(objectId);
if (it != _counters.end()) {
return it->second;
}
} return empty; // do not create
RocksDBCounterManager::CounterUpdate
RocksDBCounterManager::loadCounter(uint64_t objectId) const {
TRI_ASSERT(objectId != 0);// TODO fix this
READ_LOCKER(guard, _rwLock);
auto const& it = _counters.find(objectId);
if (it != _counters.end()) {
return CounterUpdate(it->second._sequenceNum,
it->second._count,
it->second._revisionId);
}
return CounterUpdate(0,0,0); // do not create
}
/// collections / views / indexes can call this method to update
/// their total counts. Thread-Safe needs the snapshot so we know
/// the sequence number used
void RocksDBCounterManager::updateCounter(uint64_t objectId,
rocksdb::Snapshot const* snapshot,
int64_t delta, uint64_t revisionId) {
CounterUpdate const& update) {
// From write_batch.cc in rocksdb: first 64 bits in the internal rep_
// buffer are the sequence number
/*TRI_ASSERT(trx->GetState() == rocksdb::Transaction::COMMITED);
rocksdb::WriteBatchWithIndex *batch = trx->GetWriteBatch();
rocksdb::SequenceNumber seq = DecodeFixed64(batch->GetWriteBatch()->Data().data());*/
bool needsSync = false;
{
WRITE_LOCKER(guard, _rwLock);
auto const& it = _counters.find(objectId);
auto it = _counters.find(objectId);
if (it != _counters.end()) {
// already have a counter for objectId. now adjust its value
it->second._count += delta;
// just use the latest snapshot
if (snapshot->GetSequenceNumber() > it->second._sequenceNumber) {
it->second._sequenceNumber = snapshot->GetSequenceNumber();
it->second._revisionId = revisionId;
it->second._count += update.count();
// just use the latest trx info
if (update.sequenceNumber() > it->second._sequenceNum) {
it->second._sequenceNum = update.sequenceNumber();
it->second._revisionId = update.revisionId();
}
} else {
// insert new counter
TRI_ASSERT(delta >= 0);
auto tmpCounter = Counter(snapshot->GetSequenceNumber(), delta, revisionId);
_counters.emplace(std::make_pair(objectId,std::move(tmpCounter)));
needsSync = true;
TRI_ASSERT(update.count() != 0);
_counters.emplace(std::make_pair(objectId, CMValue(update.sequenceNumber(),
update.count(),
update.revisionId())));
needsSync = true;// only count values from WAL if they are in the DB
}
}
if (needsSync) {
@ -164,7 +175,7 @@ Result RocksDBCounterManager::sync() {
return Result();
}
std::unordered_map<uint64_t, Counter> copy;
std::unordered_map<uint64_t, CMValue> copy;
{ // block all updates
WRITE_LOCKER(guard, _rwLock);
if (_syncing) {
@ -180,10 +191,11 @@ Result RocksDBCounterManager::sync() {
db->BeginTransaction(writeOptions));
VPackBuilder b;
for (std::pair<uint64_t, Counter> const& pair : copy) {
for (std::pair<uint64_t, CMValue> const& pair : copy) {
// Skip values which we did not change
auto const& it = _syncedSeqNum.find(pair.first);
if (it != _syncedSeqNum.end() && it->second == pair.second._sequenceNumber) {
auto const& it = _syncedSeqNums.find(pair.first);
if (it != _syncedSeqNums.end()
&& it->second == pair.second._sequenceNum) {
continue;
}
@ -200,37 +212,35 @@ Result RocksDBCounterManager::sync() {
return rocksutils::convertStatus(s);
}
}
// we have to commit all counters in one batch otherwise
// there would be the possibility of
rocksdb::Status s = rtrx->Commit();
if (s.ok()) {
for (std::pair<uint64_t, Counter> const& pair : copy) {
_syncedSeqNum[pair.first] = pair.second._sequenceNumber;
for (std::pair<uint64_t, CMValue> const& pair : copy) {
_syncedSeqNums[pair.first] = pair.second._sequenceNum;
}
}
_syncing = false;
_syncing = false;
return rocksutils::convertStatus(s);
}
/// Parse counter values from rocksdb
void RocksDBCounterManager::readCounterValues() {
WRITE_LOCKER(guard, _rwLock);
rocksdb::TransactionDB* db = rocksutils::globalRocksDB();
rocksdb::Comparator const* cmp = db->GetOptions().comparator;
RocksDBKeyBounds bounds = RocksDBKeyBounds::CounterValues();
rocksdb::Comparator const* cmp = _db->GetOptions().comparator;
rocksdb::ReadOptions readOptions;
std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(readOptions));
std::unique_ptr<rocksdb::Iterator> iter(_db->NewIterator(readOptions));
iter->Seek(bounds.start());
while (iter->Valid() && cmp->Compare(iter->key(), bounds.end()) < 0) {
uint64_t objectId = RocksDBKey::counterObjectId(iter->key());
Counter counter(VPackSlice(iter->value().data()));
_counters.emplace(objectId, counter);
_syncedSeqNum[objectId] = counter._sequenceNumber;
auto const& it = _counters.emplace(objectId, CMValue(VPackSlice(iter->value().data())));
_syncedSeqNums[objectId] = it.first->second._sequenceNum;
iter->Next();
}
@ -238,39 +248,22 @@ void RocksDBCounterManager::readCounterValues() {
/// WAL parser, no locking required here, because we have been locked from the outside
struct WBReader : public rocksdb::WriteBatch::Handler {
std::unordered_map<uint64_t, rocksdb::SequenceNumber> prevCounters;
std::unordered_map<uint64_t, RocksDBCounterManager::Counter> deltas;
rocksdb::SequenceNumber seqNum = UINT64_MAX;
// must be set by the counter manager
std::unordered_map<uint64_t, rocksdb::SequenceNumber> seqStart;
std::unordered_map<uint64_t, RocksDBCounterManager::CounterUpdate> deltas;
rocksdb::SequenceNumber currentSeqNum;
explicit WBReader() {}
bool prepKey(const rocksdb::Slice& key) {
if (RocksDBKey::type(key) == RocksDBEntryType::CounterValue) {
if (RocksDBKey::type(key) == RocksDBEntryType::Document) {
uint64_t objectId = RocksDBKey::counterObjectId(key);
auto const& it = prevCounters.find(objectId);
if (it == prevCounters.end()) {
// We found a counter value, now we can track all writes from this
// point in the WAL
prevCounters.emplace(objectId, seqNum);
deltas.emplace(objectId, RocksDBCounterManager::Counter());
} else if (it->second < seqNum) {
// we found our counter again at a later point in the WAL
// which means we can forget about every write we have seen
// up to this point
TRI_ASSERT(deltas.find(objectId) != deltas.end());
it->second = seqNum;
deltas[objectId].reset();
}
return false;
} else if (RocksDBKey::type(key) == RocksDBEntryType::Document) {
uint64_t objectId = RocksDBKey::counterObjectId(key);
auto const& it = prevCounters.find(objectId);
if (it == prevCounters.end()) {
return false;
} else {
TRI_ASSERT(deltas.find(objectId) != deltas.end());
return it->second <= seqNum;
auto const& it = seqStart.find(objectId);
if (it != seqStart.end()) {
if (deltas.find(objectId) == deltas.end()) {
deltas.emplace(objectId, RocksDBCounterManager::CounterUpdate(0,0,0));
}
return it->second < currentSeqNum;
}
}
return false;
@ -281,12 +274,11 @@ struct WBReader : public rocksdb::WriteBatch::Handler {
if (prepKey(key)) {
uint64_t objectId = RocksDBKey::counterObjectId(key);
uint64_t revisionId = RocksDBKey::revisionId(key);
RocksDBCounterManager::Counter& cc = deltas[objectId];
if (cc._sequenceNumber <= seqNum) {
cc._sequenceNumber = seqNum;
cc._count++;
cc._revisionId = revisionId;
auto const& it = deltas.find(objectId);
if (it != deltas.end()) {
it->second._count++;
it->second._revisionId = revisionId;
}
}
}
@ -295,12 +287,11 @@ struct WBReader : public rocksdb::WriteBatch::Handler {
if (prepKey(key)) {
uint64_t objectId = RocksDBKey::counterObjectId(key);
uint64_t revisionId = RocksDBKey::revisionId(key);
RocksDBCounterManager::Counter& cc = deltas[objectId];
if (cc._sequenceNumber <= seqNum) {
cc._sequenceNumber = seqNum;
cc._count--;
cc._revisionId = revisionId;
auto const& it = deltas.find(objectId);
if (it != deltas.end()) {
it->second._count--;
it->second._revisionId = revisionId;
}
}
}
@ -309,12 +300,11 @@ struct WBReader : public rocksdb::WriteBatch::Handler {
if (prepKey(key)) {
uint64_t objectId = RocksDBKey::counterObjectId(key);
uint64_t revisionId = RocksDBKey::revisionId(key);
RocksDBCounterManager::Counter& cc = deltas[objectId];
if (cc._sequenceNumber <= seqNum) {
cc._sequenceNumber = seqNum;
cc._count--;
cc._revisionId = revisionId;
auto const& it = deltas.find(objectId);
if (it != deltas.end()) {
it->second._count--;
it->second._revisionId = revisionId;
}
}
}
@ -325,27 +315,27 @@ bool RocksDBCounterManager::parseRocksWAL() {
WRITE_LOCKER(guard, _rwLock);
TRI_ASSERT(_counters.size() > 0);
// Determine which counter values might be outdated
rocksdb::SequenceNumber lastWrittenSeq = UINT64_MAX;
for (auto const& it : _syncedSeqNum) {
if (it.second < lastWrittenSeq) {
lastWrittenSeq = it.second;
}
}
rocksdb::SequenceNumber start = UINT64_MAX;
// Tell the WriteBatch reader the transaction markers to look for
std::unique_ptr<WBReader> handler(new WBReader());
for (auto const& pair : _counters) {
handler->seqStart.emplace(pair.first, pair.second._sequenceNum);
start = std::min(start, pair.second._sequenceNum);
}
std::unique_ptr<rocksdb::TransactionLogIterator> iterator; // reader();
rocksdb::Status s = _db->GetUpdatesSince(lastWrittenSeq-1, &iterator);
rocksdb::Status s = _db->GetUpdatesSince(start, &iterator);
if (!s.ok()) { // TODO do something?
return false;
}
while (iterator->Valid()) {
s = iterator->status();
if (s.ok()) {
rocksdb::BatchResult result = iterator->GetBatch();
handler->seqNum = result.sequence;
s = result.writeBatchPtr->Iterate(handler.get());
rocksdb::BatchResult batch = iterator->GetBatch();
start = batch.sequence;
handler->currentSeqNum = start;
s = batch.writeBatchPtr->Iterate(handler.get());
}
if (!s.ok()) {
LOG_TOPIC(ERR, Logger::ENGINES) << "Error during WAL scan";
@ -357,12 +347,10 @@ bool RocksDBCounterManager::parseRocksWAL() {
}
LOG_TOPIC(WARN, Logger::FIXME) << "Finished WAL scan";
for (std::pair<uint64_t, RocksDBCounterManager::Counter> pair : handler->deltas) {
for (std::pair<uint64_t, RocksDBCounterManager::CounterUpdate> pair : handler->deltas) {
auto const& it = _counters.find(pair.first);
if (it != _counters.end()
&& it->second._sequenceNumber < pair.second._sequenceNumber) {
it->second._sequenceNumber = pair.second._sequenceNumber;
if (it != _counters.end()) {
it->second._sequenceNum = start;
it->second._count += pair.second._count;
it->second._revisionId = pair.second._revisionId;
LOG_TOPIC(WARN, Logger::FIXME) << "WAL recovered " << pair.second._count
@ -372,3 +360,4 @@ bool RocksDBCounterManager::parseRocksWAL() {
}
return handler->deltas.size() > 0;
}

View File

@ -30,53 +30,50 @@
#include "Basics/ReadWriteLock.h"
#include "Basics/Result.h"
#include "Basics/Thread.h"
#include "VocBase/voc-types.h"
#include "RocksDBEngine/RocksDBTypes.h"
#include <velocypack/Builder.h>
#include <velocypack/Slice.h>
#include <velocypack/velocypack-aliases.h>
namespace rocksdb {
class DB;
class Transaction;
class Snapshot;
}
namespace arangodb {
class RocksDBCounterManager : Thread {
friend class RocksDBEngine;
/// Constructor needs to be called synchronously,
/// will load counts from the db and scan the WAL
RocksDBCounterManager(rocksdb::DB* db, double interval);
public:
struct Counter {
rocksdb::SequenceNumber _sequenceNumber;
struct CounterUpdate {
rocksdb::SequenceNumber _sequenceNum;
uint64_t _count; // used for number of documents
uint64_t _revisionId; // used for revision id
Counter();
explicit Counter(VPackSlice const&);
Counter(rocksdb::SequenceNumber seq, uint64_t count, uint64_t revisionId)
: _sequenceNumber(seq), _count(count), _revisionId(revisionId) {}
void serialize(VPackBuilder&) const;
void reset();
uint64_t count() { return _count; }
uint64_t revisionId() { return _revisionId; }
TRI_voc_rid_t _revisionId; // used for revision id
CounterUpdate(rocksdb::SequenceNumber seq, uint64_t count, TRI_voc_rid_t revisionId)
: _sequenceNum(seq), _count(count), _revisionId(revisionId) {}
rocksdb::SequenceNumber sequenceNumber() const {return _sequenceNum;};
uint64_t count() const { return _count; }
TRI_voc_rid_t revisionId() const { return _revisionId; }
};
/// Thread-Safe load a counter
Counter const loadCounter(uint64_t objectId);
CounterUpdate loadCounter(uint64_t objectId) const;
/// collections / views / indexes can call this method to update
/// their total counts. Thread-Safe needs the snapshot so we know
/// the sequence number used
void updateCounter(uint64_t objectId, rocksdb::Snapshot const* snapshot,
int64_t delta, uint64_t revisionId);
void updateCounter(uint64_t objectId,
CounterUpdate const&);
/// Thread-Safe remove a counter
void removeCounter(uint64_t objectId);
@ -88,20 +85,35 @@ class RocksDBCounterManager : Thread {
protected:
void run() override;
struct CMValue {
/// ArangoDB transaction ID
rocksdb::SequenceNumber _sequenceNum;
/// used for number of documents
uint64_t _count;
/// used for revision id
TRI_voc_rid_t _revisionId;
CMValue(rocksdb::SequenceNumber sq, uint64_t cc, TRI_voc_rid_t rid)
: _sequenceNum(sq), _count(cc), _revisionId(rid) {}
explicit CMValue(arangodb::velocypack::Slice const&);
void serialize(arangodb::velocypack::Builder&) const;
};
private:
void readCounterValues();
bool parseRocksWAL();
//////////////////////////////////////////////////////////////////////////////
/// @brief counter values
//////////////////////////////////////////////////////////////////////////////
std::unordered_map<uint64_t, Counter> _counters;
std::unordered_map<uint64_t, CMValue> _counters;
//////////////////////////////////////////////////////////////////////////////
/// @brief synced sequence numbers
//////////////////////////////////////////////////////////////////////////////
std::unordered_map<uint64_t, rocksdb::SequenceNumber> _syncedSeqNum;
std::unordered_map<uint64_t, rocksdb::SequenceNumber> _syncedSeqNums;
//////////////////////////////////////////////////////////////////////////////
/// @brief currently syncing
@ -126,7 +138,7 @@ class RocksDBCounterManager : Thread {
//////////////////////////////////////////////////////////////////////////////
/// @brief protect _syncing and _counters
//////////////////////////////////////////////////////////////////////////////
basics::ReadWriteLock _rwLock;
mutable basics::ReadWriteLock _rwLock;
};
}

View File

@ -129,6 +129,10 @@ Result RocksDBTransactionState::beginTransaction(transaction::Hints hints) {
_rocksWriteOptions, rocksdb::TransactionOptions()));
_rocksTransaction->SetSnapshot();
_rocksReadOptions.snapshot = _rocksTransaction->GetSnapshot();
// add WAL trx marker; Always in before all other operations!!
RocksDBValue tid = RocksDBValue::TransactionID(_id);
_rocksTransaction->PutLogData(tid.string());
/*LOG_TOPIC(ERR, Logger::FIXME)
<< "#" << _id << " BEGIN (read-only: " << isReadOnlyTransaction()
@ -177,7 +181,9 @@ Result RocksDBTransactionState::commitTransaction(
_rocksWriteOptions.sync = true;
}
// TODO wait for response on github issue to see how we can use the sequence number
result = rocksutils::convertStatus(_rocksTransaction->Commit());
rocksdb::SequenceNumber latestSeq = rocksutils::globalRocksDB()->GetLatestSequenceNumber();
if (!result.ok()) {
abortTransaction(activeTrx);
return result;
@ -209,8 +215,10 @@ Result RocksDBTransactionState::commitTransaction(
RocksDBEngine* engine =
static_cast<RocksDBEngine*>(EngineSelectorFeature::ENGINE);
engine->counterManager()->updateCounter(
coll->objectId(), snap, adjustment, collection->revision());
RocksDBCounterManager::CounterUpdate update(latestSeq,
adjustment,
collection->revision());
engine->counterManager()->updateCounter(coll->objectId(), update);
}
}

View File

@ -1377,9 +1377,9 @@ static void JS_BinaryWrite(v8::FunctionCallbackInfo<v8::Value> const& args) {
max_length = MIN(length, MIN(buffer->_length - offset, max_length));
int written = DecodeWrite(isolate, p, max_length, s, BINARY);
ssize_t written = DecodeWrite(isolate, p, max_length, s, BINARY);
TRI_V8_RETURN(v8::Integer::New(isolate, written));
TRI_V8_RETURN(v8::Integer::New(isolate, (int32_t)written));
}
////////////////////////////////////////////////////////////////////////////////