arangodb/lib/Basics/HashSet.h

// By Emil Ernerfeldt 2014-2016
// LICENSE:
//   This software is dual-licensed to the public domain and under the following
//   license: you are granted a perpetual, irrevocable license to copy, modify,
//   publish, and distribute this file as you see fit.

#ifndef ARANGODB_HASH_SET_H
#define ARANGODB_HASH_SET_H 1

#include <iterator>
#include <type_traits>
#include <utility>
#include <initializer_list>
#include <cstring>

namespace emilib {

/// like std::equal_to but no need to `#include <functional>`
template<typename T>
struct HashSetEqualTo {
  constexpr bool operator()(const T& lhs, const T& rhs) const {
    return lhs == rhs;
  }
};

/// A cache-friendly hash set with open addressing, linear probing and power-of-two capacity
template <typename KeyT, typename HashT = std::hash<KeyT>, typename EqT = HashSetEqualTo<KeyT>>
class HashSet {
 private:
  using MyType = HashSet<KeyT, HashT, EqT>;

 public:
  using size_type       = size_t;
  using value_type      = KeyT;
  using reference       = KeyT&;
  using const_reference = const KeyT&;

  class iterator {
   public:
    using iterator_category = std::forward_iterator_tag;
    using difference_type   = size_t;
    using distance_type     = size_t;
    using value_type        = KeyT;
    using pointer           = value_type*;
    using reference         = value_type&;

    iterator() { }

    iterator(MyType* hash_set, size_t bucket) : _set(hash_set), _bucket(bucket) {}

    iterator& operator++() {
      this->goto_next_element();
      return *this;
    }

    iterator operator++(int) {
      size_t old_index = _bucket;
      this->goto_next_element();
      return iterator(_set, old_index);
    }

    reference operator*() const {
      return _set->_keys[_bucket];
    }

    pointer operator->() const {
      return _set->_keys + _bucket;
    }

    bool operator==(const iterator& rhs) const {
      return this->_bucket == rhs._bucket;
    }

    bool operator!=(const iterator& rhs) const {
      return this->_bucket != rhs._bucket;
    }

   private:
    void goto_next_element() {
      do {
        _bucket++;
      } while (_bucket < _set->_num_buckets && _set->_states[_bucket] != State::FILLED);
    }

   public:
    MyType* _set;
    size_t  _bucket;
  };

  class const_iterator {
   public:
    using iterator_category = std::forward_iterator_tag;
    using difference_type   = size_t;
    using distance_type     = size_t;
    using value_type        = const KeyT;
    using pointer           = value_type*;
    using reference         = value_type&;

    const_iterator() { }

    const_iterator(iterator proto) : _set(proto._set), _bucket(proto._bucket) {}

    const_iterator(const MyType* hash_set, size_t bucket) : _set(hash_set), _bucket(bucket) {}

    const_iterator& operator++() {
      this->goto_next_element();
      return *this;
    }

    const_iterator operator++(int) {
      size_t old_index = _bucket;
      this->goto_next_element();
      return const_iterator(_set, old_index);
    }

    reference operator*() const {
      return _set->_keys[_bucket];
    }

    pointer operator->() const {
      return _set->_keys + _bucket;
    }

    bool operator==(const const_iterator& rhs) const {
      return this->_bucket == rhs._bucket;
    }

    bool operator!=(const const_iterator& rhs) const {
      return this->_bucket != rhs._bucket;
    }

   private:
    void goto_next_element() {
      do {
        _bucket++;
      } while (_bucket < _set->_num_buckets && _set->_states[_bucket] != State::FILLED);
    }

   public:
    const MyType* _set;
    size_t        _bucket;
  };

  HashSet() = default;

  HashSet(const HashSet& other) {
    reserve(other.size());
    insert(other.cbegin(), other.cend());
  }

  HashSet(HashSet&& other) noexcept {
    doMove(std::move(other));
  }

  HashSet(std::initializer_list<KeyT> const& values) : HashSet() {
    for (auto const& v : values) {
      insert(v);
    }
  }

  HashSet& operator=(const HashSet& other) {
    if (this != &other) {
      clear();
      reserve(other.size());
      insert(other.cbegin(), other.cend());
    }
    return *this;
  }

  HashSet& operator=(HashSet&& other) noexcept {
    if (this != &other) {
      for (size_t bucket=0; bucket<_num_buckets; ++bucket) {
        if (_states[bucket] == State::FILLED) {
          _keys[bucket].~KeyT();
        }
      }
      if (_buffer != &_local_buffer[0]) {
        delete[] _buffer;
      }
      doMove(std::move(other));
    }
    return *this;
  }

  ~HashSet() {
    for (size_t bucket=0; bucket<_num_buckets; ++bucket) {
      if (_states[bucket] == State::FILLED) {
        _keys[bucket].~KeyT();
      }
    }
    if (_buffer != &_local_buffer[0]) {
      delete[] _buffer;
    }
  }

  void swap(HashSet& other) {
    std::swap(_hasher,           other._hasher);
    std::swap(_eq,               other._eq);
    std::swap(_buffer,           other._buffer);
    std::swap(_states,           other._states);
    std::swap(_keys,             other._keys);
    std::swap(_num_buckets,      other._num_buckets);
    std::swap(_num_filled,       other._num_filled);
    std::swap(_max_probe_length, other._max_probe_length);
    std::swap(_mask,             other._mask);
  }

  // -------------------------------------------------------------

  iterator begin() {
    size_t bucket = 0;
    while (bucket<_num_buckets && _states[bucket] != State::FILLED) {
      ++bucket;
    }
    return iterator(this, bucket);
  }

  const_iterator cbegin() const {
    size_t bucket = 0;
    while (bucket<_num_buckets && _states[bucket] != State::FILLED) {
      ++bucket;
    }
    return const_iterator(this, bucket);
  }

  const_iterator begin() const {
    return cbegin();
  }

  iterator end() {
    return iterator(this, _num_buckets);
  }

  const_iterator cend() const {
    return const_iterator(this, _num_buckets);
  }

  const_iterator end() const {
    return cend();
  }

  size_t size() const {
    return _num_filled;
  }

  bool empty() const {
    return _num_filled == 0;
  }

  // Returns the number of buckets.
  size_t bucket_count() const {
    return _num_buckets;
  }

  /// Returns average number of elements per bucket.
  float load_factor() const {
    return static_cast<float>(_num_filled) / static_cast<float>(_num_buckets);
  }

  // ------------------------------------------------------------

  iterator find(const KeyT& key) {
    auto bucket = this->find_filled_bucket(key);
    if (bucket == (size_t)-1) {
      return this->end();
    }
    return iterator(this, bucket);
  }

  const_iterator find(const KeyT& key) const {
    auto bucket = this->find_filled_bucket(key);
    if (bucket == (size_t)-1) {
      return this->end();
    }
    return const_iterator(this, bucket);
  }

  bool contains(const KeyT& k) const {
    return find_filled_bucket(k) != (size_t)-1;
  }

  size_t count(const KeyT& k) const {
    return find_filled_bucket(k) != (size_t)-1 ? 1 : 0;
  }

  // -----------------------------------------------------

  /// Insert an element, unless it already exists.
  /// Returns a pair consisting of an iterator to the inserted element
  /// (or to the element that prevented the insertion)
  /// and a bool denoting whether the insertion took place.
  std::pair<iterator, bool> insert(const KeyT& key) {
    check_expand_need();

    auto bucket = find_or_allocate(key);

    if (_states[bucket] == State::FILLED) {
      return { iterator(this, bucket), false };
    } else {
      _states[bucket] = State::FILLED;
      new(_keys + bucket) KeyT(key);
      _num_filled++;
      return { iterator(this, bucket), true };
    }
  }

  /// Insert an element, unless it already exists.
  /// Returns a pair consisting of an iterator to the inserted element
  /// (or to the element that prevented the insertion)
  /// and a bool denoting whether the insertion took place.
  std::pair<iterator, bool> insert(KeyT&& key) {
    check_expand_need();

    auto bucket = find_or_allocate(key);

    if (_states[bucket] == State::FILLED) {
      return { iterator(this, bucket), false };
    } else {
      _states[bucket] = State::FILLED;
      new(_keys + bucket) KeyT(std::move(key));
      _num_filled++;
      return { iterator(this, bucket), true };
    }
  }

  template<class... Args>
  std::pair<iterator, bool> emplace(Args&&... args) {
    return insert(KeyT(std::forward<Args>(args)...));
  }

  void insert(const_iterator begin, const_iterator end) {
    // TODO: reserve space exactly once.
    for (; begin != end; ++begin) {
      insert(*begin);
    }
  }

  // -------------------------------------------------------

  /// Erase an element from the hash set.
  /// return false if element was not found.
  bool erase(const KeyT& key) {
    auto bucket = find_filled_bucket(key);
    if (bucket != (size_t)-1) {
      _states[bucket] = State::ACTIVE;
      _keys[bucket].~KeyT();
      _num_filled -= 1;
      return true;
    } else {
      return false;
    }
  }

  /// Erase an element using an iterator.
  /// Returns an iterator to the next element (or end()).
  iterator erase(iterator it) {
    _states[it._bucket] = State::ACTIVE;
    _keys[it._bucket].~KeyT();
    _num_filled -= 1;
    return ++it;
  }

  /// Remove all elements, keeping full capacity.
  void clear() {
    for (size_t bucket=0; bucket<_num_buckets; ++bucket) {
      if (_states[bucket] == State::FILLED) {
        _states[bucket] = State::INACTIVE;
        _keys[bucket].~KeyT();
      }
    }
    _num_filled = 0;
    _max_probe_length = -1;
  }

  /// Make room for this many elements
  void reserve(size_t num_elems) {
    size_t required_buckets = num_elems + num_elems/2 + 1;
    if (required_buckets <= _num_buckets) {
      return;
    }
    size_t num_buckets = 8;
    while (num_buckets < required_buckets) { num_buckets *= 2; }

    size_t states_size = ((num_buckets * sizeof(State) + 8 - 1) / 8) * 8;
    size_t keys_size = num_buckets * sizeof(KeyT);
    // intentionally uninitialized here, initialization will be done below
    char* new_buffer;
    if (_buffer == nullptr && states_size + keys_size <= sizeof(_local_buffer)) {
      new_buffer = &_local_buffer[0];
    } else {
      new_buffer = new char[states_size + keys_size];
    }
    auto new_states = (State*)new_buffer;
    auto new_keys  = (KeyT*)(new_buffer + states_size);

    // auto old_num_filled  = _num_filled;
    auto old_num_buckets = _num_buckets;
    auto old_buffer      = _buffer;
    auto old_states      = _states;
    auto old_keys        = _keys;

    _num_filled  = 0;
    _num_buckets = num_buckets;
    _mask        = _num_buckets - 1;
    _buffer      = new_buffer;
    _states      = new_states;
    _keys        = new_keys;

    std::fill_n(_states, num_buckets, State::INACTIVE);

    _max_probe_length = -1;

    for (size_t src_bucket=0; src_bucket<old_num_buckets; src_bucket++) {
      if (old_states[src_bucket] == State::FILLED) {
        auto& src = old_keys[src_bucket];

        auto dst_bucket = find_empty_bucket(src);
        _states[dst_bucket] = State::FILLED;
        new(_keys + dst_bucket) KeyT(std::move(src));
        _num_filled += 1;

        src.~KeyT();
      }
    }

    if (old_buffer != &_local_buffer[0]) {
      delete[] old_buffer;
    }
  }

 private:
  void doMove(HashSet&& other) noexcept {
    if (other._buffer == nullptr) {
      _buffer = nullptr;
      _states = nullptr;
      _keys = nullptr;
    } else if (other._buffer != &other._local_buffer[0]) {
      // we can just steal the other's buffer
      _buffer = other._buffer;
      _states = other._states;
      _keys = other._keys;
    } else {
      // we can copy the other's local buffer
      _buffer = &_local_buffer[0];
      _states = (State*)_buffer;
      size_t states_size = ((other._num_buckets * sizeof(State) + 8 - 1) / 8) * 8;
      memcpy(&_local_buffer[0], &other._local_buffer[0], states_size);
      _keys  = (KeyT*)(_buffer + states_size);

      for (size_t src_bucket=0; src_bucket<other._num_buckets; src_bucket++) {
        if (other._states[src_bucket] == State::FILLED) {
          auto& src = other._keys[src_bucket];
          new(_keys + src_bucket) KeyT(std::move(src));
        }
      }
    }

    _num_buckets = other._num_buckets;
    _num_filled = other._num_filled;
    _max_probe_length = other._max_probe_length;
    _mask = other._mask;

    other._buffer = nullptr;
    other._states = nullptr;
    other._keys = nullptr;
    other._num_buckets = 0;
    other._num_filled = 0;
    other._max_probe_length = -1;
    other._mask = 0;
  }

  // Can we fit another element?
  void check_expand_need() {
    reserve(_num_filled + 1);
  }

  // Find the bucket with this key, or return (size_t)-1
  size_t find_filled_bucket(const KeyT& key) const {
    if (empty()) { return (size_t)-1; } // Optimization

    auto hash_value = _hasher(key);
    for (int offset=0; offset<=_max_probe_length; ++offset) {
      auto bucket = (hash_value + offset) & _mask;
      if (_states[bucket] == State::FILLED) {
        if (_eq(_keys[bucket], key)) {
          return bucket;
        }
      } else if (_states[bucket] == State::INACTIVE) {
        return (size_t)-1; // End of the chain!
      }
    }
    return (size_t)-1;
  }

  // Find the bucket with this key, or return a good empty bucket to place the key in.
  // In the latter case, the bucket is expected to be filled.
  size_t find_or_allocate(const KeyT& key) {
    auto hash_value = _hasher(key);
    size_t hole = (size_t)-1;
    int offset=0;
    for (; offset<=_max_probe_length; ++offset) {
      auto bucket = (hash_value + offset) & _mask;

      if (_states[bucket] == State::FILLED) {
        if (_eq(_keys[bucket], key)) {
          return bucket;
        }
      } else if (_states[bucket] == State::INACTIVE) {
        return bucket;
      } else {
        // ACTIVE: keep searching
        if (hole == (size_t)-1) {
          hole = bucket;
        }
      }
    }

    // No key found - but maybe a hole for it
    if (hole != (size_t)-1) {
      return hole;
    }

    // No hole found within _max_probe_length
    for (; ; ++offset) {
      auto bucket = (hash_value + offset) & _mask;

      if (_states[bucket] != State::FILLED) {
        _max_probe_length = offset;
        return bucket;
      }
    }
  }

  // key is not in this map. Find a place to put it.
  size_t find_empty_bucket(const KeyT& key) {
    auto hash_value = _hasher(key);
    for (int offset=0; ; ++offset) {
      auto bucket = (hash_value + offset) & _mask;
      if (_states[bucket] != State::FILLED) {
        if (offset > _max_probe_length) {
          _max_probe_length = offset;
        }
        return bucket;
      }
    }
  }

 private:
  enum class State : uint8_t {
    INACTIVE, // Never been touched
    ACTIVE,   // Is inside a search-chain, but is empty
    FILLED    // Is set with key/value
  };

  HashT   _hasher;
  EqT     _eq;
  char*   _buffer           = nullptr;
  State*  _states           = nullptr;
  KeyT*   _keys             = nullptr;
  size_t  _num_buckets      =  0;
  size_t  _num_filled       =  0;
  int     _max_probe_length = -1; // Our longest bucket-brigade is this long. ONLY when we have zero elements is this ever negative (-1).
  size_t  _mask             = 0;  // _num_buckets minus one
  char    _local_buffer[80];      // first few bytes are always allocated here
};

} // namespace emilib

// map HashSet into arangodb namespace
namespace arangodb {

template<typename T>
using HashSetEqualTo = emilib::HashSetEqualTo<T>;

template<typename KeyT, typename HashT = std::hash<KeyT>, typename EqT = HashSetEqualTo<KeyT>>
using HashSet = emilib::HashSet<KeyT, HashT, EqT>;

}

#endif