mirror of https://gitee.com/bigwinds/arangodb
581 lines
15 KiB
C++
581 lines
15 KiB
C++
// By Emil Ernerfeldt 2014-2016
|
|
// LICENSE:
|
|
// This software is dual-licensed to the public domain and under the following
|
|
// license: you are granted a perpetual, irrevocable license to copy, modify,
|
|
// publish, and distribute this file as you see fit.
|
|
|
|
#ifndef ARANGODB_HASH_SET_H
|
|
#define ARANGODB_HASH_SET_H 1
|
|
|
|
#include <iterator>
|
|
#include <type_traits>
|
|
#include <utility>
|
|
#include <initializer_list>
|
|
#include <cstring>
|
|
|
|
namespace emilib {
|
|
|
|
/// like std::equal_to but no need to `#include <functional>`
|
|
template<typename T>
|
|
struct HashSetEqualTo {
|
|
constexpr bool operator()(const T& lhs, const T& rhs) const {
|
|
return lhs == rhs;
|
|
}
|
|
};
|
|
|
|
/// A cache-friendly hash set with open addressing, linear probing and power-of-two capacity
|
|
template <typename KeyT, typename HashT = std::hash<KeyT>, typename EqT = HashSetEqualTo<KeyT>>
|
|
class HashSet {
|
|
private:
|
|
using MyType = HashSet<KeyT, HashT, EqT>;
|
|
|
|
public:
|
|
using size_type = size_t;
|
|
using value_type = KeyT;
|
|
using reference = KeyT&;
|
|
using const_reference = const KeyT&;
|
|
|
|
class iterator {
|
|
public:
|
|
using iterator_category = std::forward_iterator_tag;
|
|
using difference_type = size_t;
|
|
using distance_type = size_t;
|
|
using value_type = KeyT;
|
|
using pointer = value_type*;
|
|
using reference = value_type&;
|
|
|
|
iterator() { }
|
|
|
|
iterator(MyType* hash_set, size_t bucket) : _set(hash_set), _bucket(bucket) {}
|
|
|
|
iterator& operator++() {
|
|
this->goto_next_element();
|
|
return *this;
|
|
}
|
|
|
|
iterator operator++(int) {
|
|
size_t old_index = _bucket;
|
|
this->goto_next_element();
|
|
return iterator(_set, old_index);
|
|
}
|
|
|
|
reference operator*() const {
|
|
return _set->_keys[_bucket];
|
|
}
|
|
|
|
pointer operator->() const {
|
|
return _set->_keys + _bucket;
|
|
}
|
|
|
|
bool operator==(const iterator& rhs) const {
|
|
return this->_bucket == rhs._bucket;
|
|
}
|
|
|
|
bool operator!=(const iterator& rhs) const {
|
|
return this->_bucket != rhs._bucket;
|
|
}
|
|
|
|
private:
|
|
void goto_next_element() {
|
|
do {
|
|
_bucket++;
|
|
} while (_bucket < _set->_num_buckets && _set->_states[_bucket] != State::FILLED);
|
|
}
|
|
|
|
public:
|
|
MyType* _set;
|
|
size_t _bucket;
|
|
};
|
|
|
|
class const_iterator {
|
|
public:
|
|
using iterator_category = std::forward_iterator_tag;
|
|
using difference_type = size_t;
|
|
using distance_type = size_t;
|
|
using value_type = const KeyT;
|
|
using pointer = value_type*;
|
|
using reference = value_type&;
|
|
|
|
const_iterator() { }
|
|
|
|
const_iterator(iterator proto) : _set(proto._set), _bucket(proto._bucket) {}
|
|
|
|
const_iterator(const MyType* hash_set, size_t bucket) : _set(hash_set), _bucket(bucket) {}
|
|
|
|
const_iterator& operator++() {
|
|
this->goto_next_element();
|
|
return *this;
|
|
}
|
|
|
|
const_iterator operator++(int) {
|
|
size_t old_index = _bucket;
|
|
this->goto_next_element();
|
|
return const_iterator(_set, old_index);
|
|
}
|
|
|
|
reference operator*() const {
|
|
return _set->_keys[_bucket];
|
|
}
|
|
|
|
pointer operator->() const {
|
|
return _set->_keys + _bucket;
|
|
}
|
|
|
|
bool operator==(const const_iterator& rhs) const {
|
|
return this->_bucket == rhs._bucket;
|
|
}
|
|
|
|
bool operator!=(const const_iterator& rhs) const {
|
|
return this->_bucket != rhs._bucket;
|
|
}
|
|
|
|
private:
|
|
void goto_next_element() {
|
|
do {
|
|
_bucket++;
|
|
} while (_bucket < _set->_num_buckets && _set->_states[_bucket] != State::FILLED);
|
|
}
|
|
|
|
public:
|
|
const MyType* _set;
|
|
size_t _bucket;
|
|
};
|
|
|
|
HashSet() = default;
|
|
|
|
HashSet(const HashSet& other) {
|
|
reserve(other.size());
|
|
insert(other.cbegin(), other.cend());
|
|
}
|
|
|
|
HashSet(HashSet&& other) noexcept {
|
|
doMove(std::move(other));
|
|
}
|
|
|
|
HashSet(std::initializer_list<KeyT> const& values) : HashSet() {
|
|
for (auto const& v : values) {
|
|
insert(v);
|
|
}
|
|
}
|
|
|
|
HashSet& operator=(const HashSet& other) {
|
|
if (this != &other) {
|
|
clear();
|
|
reserve(other.size());
|
|
insert(other.cbegin(), other.cend());
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
HashSet& operator=(HashSet&& other) noexcept {
|
|
if (this != &other) {
|
|
for (size_t bucket=0; bucket<_num_buckets; ++bucket) {
|
|
if (_states[bucket] == State::FILLED) {
|
|
_keys[bucket].~KeyT();
|
|
}
|
|
}
|
|
if (_buffer != &_local_buffer[0]) {
|
|
delete[] _buffer;
|
|
}
|
|
doMove(std::move(other));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
~HashSet() {
|
|
for (size_t bucket=0; bucket<_num_buckets; ++bucket) {
|
|
if (_states[bucket] == State::FILLED) {
|
|
_keys[bucket].~KeyT();
|
|
}
|
|
}
|
|
if (_buffer != &_local_buffer[0]) {
|
|
delete[] _buffer;
|
|
}
|
|
}
|
|
|
|
void swap(HashSet& other) {
|
|
std::swap(_hasher, other._hasher);
|
|
std::swap(_eq, other._eq);
|
|
std::swap(_buffer, other._buffer);
|
|
std::swap(_states, other._states);
|
|
std::swap(_keys, other._keys);
|
|
std::swap(_num_buckets, other._num_buckets);
|
|
std::swap(_num_filled, other._num_filled);
|
|
std::swap(_max_probe_length, other._max_probe_length);
|
|
std::swap(_mask, other._mask);
|
|
}
|
|
|
|
// -------------------------------------------------------------
|
|
|
|
iterator begin() {
|
|
size_t bucket = 0;
|
|
while (bucket<_num_buckets && _states[bucket] != State::FILLED) {
|
|
++bucket;
|
|
}
|
|
return iterator(this, bucket);
|
|
}
|
|
|
|
const_iterator cbegin() const {
|
|
size_t bucket = 0;
|
|
while (bucket<_num_buckets && _states[bucket] != State::FILLED) {
|
|
++bucket;
|
|
}
|
|
return const_iterator(this, bucket);
|
|
}
|
|
|
|
const_iterator begin() const {
|
|
return cbegin();
|
|
}
|
|
|
|
iterator end() {
|
|
return iterator(this, _num_buckets);
|
|
}
|
|
|
|
const_iterator cend() const {
|
|
return const_iterator(this, _num_buckets);
|
|
}
|
|
|
|
const_iterator end() const {
|
|
return cend();
|
|
}
|
|
|
|
size_t size() const {
|
|
return _num_filled;
|
|
}
|
|
|
|
bool empty() const {
|
|
return _num_filled == 0;
|
|
}
|
|
|
|
// Returns the number of buckets.
|
|
size_t bucket_count() const {
|
|
return _num_buckets;
|
|
}
|
|
|
|
/// Returns average number of elements per bucket.
|
|
float load_factor() const {
|
|
return static_cast<float>(_num_filled) / static_cast<float>(_num_buckets);
|
|
}
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
iterator find(const KeyT& key) {
|
|
auto bucket = this->find_filled_bucket(key);
|
|
if (bucket == (size_t)-1) {
|
|
return this->end();
|
|
}
|
|
return iterator(this, bucket);
|
|
}
|
|
|
|
const_iterator find(const KeyT& key) const {
|
|
auto bucket = this->find_filled_bucket(key);
|
|
if (bucket == (size_t)-1) {
|
|
return this->end();
|
|
}
|
|
return const_iterator(this, bucket);
|
|
}
|
|
|
|
bool contains(const KeyT& k) const {
|
|
return find_filled_bucket(k) != (size_t)-1;
|
|
}
|
|
|
|
size_t count(const KeyT& k) const {
|
|
return find_filled_bucket(k) != (size_t)-1 ? 1 : 0;
|
|
}
|
|
|
|
// -----------------------------------------------------
|
|
|
|
/// Insert an element, unless it already exists.
|
|
/// Returns a pair consisting of an iterator to the inserted element
|
|
/// (or to the element that prevented the insertion)
|
|
/// and a bool denoting whether the insertion took place.
|
|
std::pair<iterator, bool> insert(const KeyT& key) {
|
|
check_expand_need();
|
|
|
|
auto bucket = find_or_allocate(key);
|
|
|
|
if (_states[bucket] == State::FILLED) {
|
|
return { iterator(this, bucket), false };
|
|
} else {
|
|
_states[bucket] = State::FILLED;
|
|
new(_keys + bucket) KeyT(key);
|
|
_num_filled++;
|
|
return { iterator(this, bucket), true };
|
|
}
|
|
}
|
|
|
|
/// Insert an element, unless it already exists.
|
|
/// Returns a pair consisting of an iterator to the inserted element
|
|
/// (or to the element that prevented the insertion)
|
|
/// and a bool denoting whether the insertion took place.
|
|
std::pair<iterator, bool> insert(KeyT&& key) {
|
|
check_expand_need();
|
|
|
|
auto bucket = find_or_allocate(key);
|
|
|
|
if (_states[bucket] == State::FILLED) {
|
|
return { iterator(this, bucket), false };
|
|
} else {
|
|
_states[bucket] = State::FILLED;
|
|
new(_keys + bucket) KeyT(std::move(key));
|
|
_num_filled++;
|
|
return { iterator(this, bucket), true };
|
|
}
|
|
}
|
|
|
|
template<class... Args>
|
|
std::pair<iterator, bool> emplace(Args&&... args) {
|
|
return insert(KeyT(std::forward<Args>(args)...));
|
|
}
|
|
|
|
void insert(const_iterator begin, const_iterator end) {
|
|
// TODO: reserve space exactly once.
|
|
for (; begin != end; ++begin) {
|
|
insert(*begin);
|
|
}
|
|
}
|
|
|
|
// -------------------------------------------------------
|
|
|
|
/// Erase an element from the hash set.
|
|
/// return false if element was not found.
|
|
bool erase(const KeyT& key) {
|
|
auto bucket = find_filled_bucket(key);
|
|
if (bucket != (size_t)-1) {
|
|
_states[bucket] = State::ACTIVE;
|
|
_keys[bucket].~KeyT();
|
|
_num_filled -= 1;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/// Erase an element using an iterator.
|
|
/// Returns an iterator to the next element (or end()).
|
|
iterator erase(iterator it) {
|
|
_states[it._bucket] = State::ACTIVE;
|
|
_keys[it._bucket].~KeyT();
|
|
_num_filled -= 1;
|
|
return ++it;
|
|
}
|
|
|
|
/// Remove all elements, keeping full capacity.
|
|
void clear() {
|
|
for (size_t bucket=0; bucket<_num_buckets; ++bucket) {
|
|
if (_states[bucket] == State::FILLED) {
|
|
_states[bucket] = State::INACTIVE;
|
|
_keys[bucket].~KeyT();
|
|
}
|
|
}
|
|
_num_filled = 0;
|
|
_max_probe_length = -1;
|
|
}
|
|
|
|
/// Make room for this many elements
|
|
void reserve(size_t num_elems) {
|
|
size_t required_buckets = num_elems + num_elems/2 + 1;
|
|
if (required_buckets <= _num_buckets) {
|
|
return;
|
|
}
|
|
size_t num_buckets = 8;
|
|
while (num_buckets < required_buckets) { num_buckets *= 2; }
|
|
|
|
size_t states_size = ((num_buckets * sizeof(State) + 8 - 1) / 8) * 8;
|
|
size_t keys_size = num_buckets * sizeof(KeyT);
|
|
// intentionally uninitialized here, initialization will be done below
|
|
char* new_buffer;
|
|
if (_buffer == nullptr && states_size + keys_size <= sizeof(_local_buffer)) {
|
|
new_buffer = &_local_buffer[0];
|
|
} else {
|
|
new_buffer = new char[states_size + keys_size];
|
|
}
|
|
auto new_states = (State*)new_buffer;
|
|
auto new_keys = (KeyT*)(new_buffer + states_size);
|
|
|
|
// auto old_num_filled = _num_filled;
|
|
auto old_num_buckets = _num_buckets;
|
|
auto old_buffer = _buffer;
|
|
auto old_states = _states;
|
|
auto old_keys = _keys;
|
|
|
|
_num_filled = 0;
|
|
_num_buckets = num_buckets;
|
|
_mask = _num_buckets - 1;
|
|
_buffer = new_buffer;
|
|
_states = new_states;
|
|
_keys = new_keys;
|
|
|
|
std::fill_n(_states, num_buckets, State::INACTIVE);
|
|
|
|
_max_probe_length = -1;
|
|
|
|
for (size_t src_bucket=0; src_bucket<old_num_buckets; src_bucket++) {
|
|
if (old_states[src_bucket] == State::FILLED) {
|
|
auto& src = old_keys[src_bucket];
|
|
|
|
auto dst_bucket = find_empty_bucket(src);
|
|
_states[dst_bucket] = State::FILLED;
|
|
new(_keys + dst_bucket) KeyT(std::move(src));
|
|
_num_filled += 1;
|
|
|
|
src.~KeyT();
|
|
}
|
|
}
|
|
|
|
if (old_buffer != &_local_buffer[0]) {
|
|
delete[] old_buffer;
|
|
}
|
|
}
|
|
|
|
private:
|
|
void doMove(HashSet&& other) noexcept {
|
|
if (other._buffer == nullptr) {
|
|
_buffer = nullptr;
|
|
_states = nullptr;
|
|
_keys = nullptr;
|
|
} else if (other._buffer != &other._local_buffer[0]) {
|
|
// we can just steal the other's buffer
|
|
_buffer = other._buffer;
|
|
_states = other._states;
|
|
_keys = other._keys;
|
|
} else {
|
|
// we can copy the other's local buffer
|
|
_buffer = &_local_buffer[0];
|
|
_states = (State*)_buffer;
|
|
size_t states_size = ((other._num_buckets * sizeof(State) + 8 - 1) / 8) * 8;
|
|
memcpy(&_local_buffer[0], &other._local_buffer[0], states_size);
|
|
_keys = (KeyT*)(_buffer + states_size);
|
|
|
|
for (size_t src_bucket=0; src_bucket<other._num_buckets; src_bucket++) {
|
|
if (other._states[src_bucket] == State::FILLED) {
|
|
auto& src = other._keys[src_bucket];
|
|
new(_keys + src_bucket) KeyT(std::move(src));
|
|
}
|
|
}
|
|
}
|
|
|
|
_num_buckets = other._num_buckets;
|
|
_num_filled = other._num_filled;
|
|
_max_probe_length = other._max_probe_length;
|
|
_mask = other._mask;
|
|
|
|
other._buffer = nullptr;
|
|
other._states = nullptr;
|
|
other._keys = nullptr;
|
|
other._num_buckets = 0;
|
|
other._num_filled = 0;
|
|
other._max_probe_length = -1;
|
|
other._mask = 0;
|
|
}
|
|
|
|
// Can we fit another element?
|
|
void check_expand_need() {
|
|
reserve(_num_filled + 1);
|
|
}
|
|
|
|
// Find the bucket with this key, or return (size_t)-1
|
|
size_t find_filled_bucket(const KeyT& key) const {
|
|
if (empty()) { return (size_t)-1; } // Optimization
|
|
|
|
auto hash_value = _hasher(key);
|
|
for (int offset=0; offset<=_max_probe_length; ++offset) {
|
|
auto bucket = (hash_value + offset) & _mask;
|
|
if (_states[bucket] == State::FILLED) {
|
|
if (_eq(_keys[bucket], key)) {
|
|
return bucket;
|
|
}
|
|
} else if (_states[bucket] == State::INACTIVE) {
|
|
return (size_t)-1; // End of the chain!
|
|
}
|
|
}
|
|
return (size_t)-1;
|
|
}
|
|
|
|
// Find the bucket with this key, or return a good empty bucket to place the key in.
|
|
// In the latter case, the bucket is expected to be filled.
|
|
size_t find_or_allocate(const KeyT& key) {
|
|
auto hash_value = _hasher(key);
|
|
size_t hole = (size_t)-1;
|
|
int offset=0;
|
|
for (; offset<=_max_probe_length; ++offset) {
|
|
auto bucket = (hash_value + offset) & _mask;
|
|
|
|
if (_states[bucket] == State::FILLED) {
|
|
if (_eq(_keys[bucket], key)) {
|
|
return bucket;
|
|
}
|
|
} else if (_states[bucket] == State::INACTIVE) {
|
|
return bucket;
|
|
} else {
|
|
// ACTIVE: keep searching
|
|
if (hole == (size_t)-1) {
|
|
hole = bucket;
|
|
}
|
|
}
|
|
}
|
|
|
|
// No key found - but maybe a hole for it
|
|
if (hole != (size_t)-1) {
|
|
return hole;
|
|
}
|
|
|
|
// No hole found within _max_probe_length
|
|
for (; ; ++offset) {
|
|
auto bucket = (hash_value + offset) & _mask;
|
|
|
|
if (_states[bucket] != State::FILLED) {
|
|
_max_probe_length = offset;
|
|
return bucket;
|
|
}
|
|
}
|
|
}
|
|
|
|
// key is not in this map. Find a place to put it.
|
|
size_t find_empty_bucket(const KeyT& key) {
|
|
auto hash_value = _hasher(key);
|
|
for (int offset=0; ; ++offset) {
|
|
auto bucket = (hash_value + offset) & _mask;
|
|
if (_states[bucket] != State::FILLED) {
|
|
if (offset > _max_probe_length) {
|
|
_max_probe_length = offset;
|
|
}
|
|
return bucket;
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
enum class State : uint8_t {
|
|
INACTIVE, // Never been touched
|
|
ACTIVE, // Is inside a search-chain, but is empty
|
|
FILLED // Is set with key/value
|
|
};
|
|
|
|
HashT _hasher;
|
|
EqT _eq;
|
|
char* _buffer = nullptr;
|
|
State* _states = nullptr;
|
|
KeyT* _keys = nullptr;
|
|
size_t _num_buckets = 0;
|
|
size_t _num_filled = 0;
|
|
int _max_probe_length = -1; // Our longest bucket-brigade is this long. ONLY when we have zero elements is this ever negative (-1).
|
|
size_t _mask = 0; // _num_buckets minus one
|
|
char _local_buffer[80]; // first few bytes are always allocated here
|
|
};
|
|
|
|
} // namespace emilib
|
|
|
|
// map HashSet into arangodb namespace
|
|
namespace arangodb {
|
|
|
|
template<typename T>
|
|
using HashSetEqualTo = emilib::HashSetEqualTo<T>;
|
|
|
|
template<typename KeyT, typename HashT = std::hash<KeyT>, typename EqT = HashSetEqualTo<KeyT>>
|
|
using HashSet = emilib::HashSet<KeyT, HashT, EqT>;
|
|
|
|
}
|
|
|
|
#endif
|