1
0
Fork 0

update ArangoSearch consolidation policy (#7801)

This commit is contained in:
Andrey Abramov 2018-12-19 01:55:08 +03:00 committed by GitHub
parent 4dc64cbab5
commit 7f4740b335
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 99 additions and 81 deletions

View File

@ -505,14 +505,13 @@ class IRESEARCH_API attribute_store
}
template<typename T, typename... Args>
typename ref<T>::type& emplace(Args&&... args) {
typename ref<T>::type& try_emplace(bool& inserted, Args&&... args) {
REGISTER_TIMER_DETAILED();
typedef typename std::enable_if<
std::is_base_of<stored_attribute, T>::value, T
>::type type;
bool inserted;
auto& attr = attribute_map::emplace(inserted, type::type());
if (inserted) {
@ -521,6 +520,12 @@ class IRESEARCH_API attribute_store
return reinterpret_cast<typename ref<T>::type&>(attr);
}
template<typename T, typename... Args>
typename ref<T>::type& emplace(Args&&... args) {
bool inserted;
return try_emplace<T>(inserted, std::forward<Args>(args)...);
}
}; // attribute_store
//////////////////////////////////////////////////////////////////////////////

View File

@ -735,7 +735,14 @@ bool mkdir(const file_path_t path) NOEXCEPT {
// '\\?\' cannot be used with relative paths
if (!abs) {
return 0 != CreateDirectoryW(path, nullptr);
if (0 == ::CreateDirectoryW(path, nullptr)) {
auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
return false;
}
return true;
}
// workaround for path MAX_PATH
@ -746,10 +753,21 @@ bool mkdir(const file_path_t path) NOEXCEPT {
&dirname[0], &dirname[0] + dirname.size(), L'/', file_path_delimiter
);
return 0 != ::CreateDirectoryW(dirname.c_str(), nullptr);
if (0 == ::CreateDirectoryW(dirname.c_str(), nullptr)) {
auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
return false;
}
#else
return 0 == ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO);
if (0 != ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO)) {
IR_FRMT_ERROR("Failed to create path: '%s', error %d", path, errno);
return false;
}
#endif
return true;
}
bool move(const file_path_t src_path, const file_path_t dst_path) NOEXCEPT {

View File

@ -52,11 +52,13 @@ struct segment_stat {
auto& lhs = *this;
if (lhs.size == rhs.size) {
if (lhs.fill_factor == rhs.fill_factor) {
return lhs.meta->name < rhs.meta->name;
if (lhs.fill_factor > rhs.fill_factor) {
return true;
} else if (lhs.fill_factor < rhs.fill_factor) {
return false;
}
return lhs.fill_factor > rhs.fill_factor;
return lhs.meta->name < rhs.meta->name;
}
return lhs.size < rhs.size;
@ -71,8 +73,6 @@ struct consolidation_candidate {
typedef std::set<segment_stat>::const_iterator iterator_t;
typedef std::pair<iterator_t, iterator_t> range_t;
consolidation_candidate() = default;
explicit consolidation_candidate(iterator_t i) NOEXCEPT
: segments(i, i) {
}
@ -101,7 +101,7 @@ struct consolidation_candidate {
range_t segments;
size_t count{ 0 };
size_t size{ 0 }; // estimated size of the level
double_t score{ -1. }; // how good this permutation is
double_t score{ DBL_MIN }; // how good this permutation is
};
struct consolidation {
@ -127,7 +127,7 @@ struct consolidation {
std::vector<segment_stat> segments;
size_t size{ 0 }; // estimated size of the level
double_t score{ -1. }; // how good this permutation is
double_t score{ DBL_MIN }; // how good this permutation is
};
/// @returns score of the consolidation bucket
@ -136,42 +136,63 @@ double_t consolidation_score(
const size_t segments_per_tier,
const size_t floor_segment_bytes
) NOEXCEPT {
// to detect how skewed the consolidation we do the following:
// 1. evaluate coefficient of variation, less is better
// 2. good candidates are in range [0;1]
// 3. favor candidates where number of segments is equal to 'segments_per_tier' approx
// 4. prefer smaller consolidations
// 5. prefer consolidations which clean removals
switch (consolidation.count) {
case 0:
return -1.;
// empty consolidation makes no sense
return DBL_MIN;
case 1: {
auto& meta = *consolidation.segments.first->meta;
if (meta.docs_count == meta.live_docs_count) {
// singleton without removals makes no sense
// note: that is important to return score
// higher than default value to avoid infinite loop
return 0.;
// singleton without removals makes no sense
return DBL_MIN;
}
} break;
// FIXME honor number of deletes???
// singleton with removals makes sense if nothing better is found
return DBL_MIN + DBL_EPSILON;
}
}
size_t size_before_consolidation = 0;
size_t size_after_consolidation = 0;
size_t size_after_consolidation_floored = 0;
for (auto& segment_stat : consolidation) {
size_before_consolidation += segment_stat.meta->size;
size_after_consolidation += segment_stat.size;
size_after_consolidation_floored += std::max(segment_stat.size, floor_segment_bytes);
}
// detect how skewed the consolidation is, we want
// to consolidate segments of approximately the same size
const auto first = std::max(consolidation.front().size, floor_segment_bytes);
const auto last = std::max(consolidation.back().size, floor_segment_bytes);
// evaluate coefficient of variation
double_t sum_square_differences = 0;
const auto segment_size_after_consolidaton_mean = double_t(size_after_consolidation_floored) / consolidation.count;
for (auto& segment_stat : consolidation) {
const double_t diff = std::max(segment_stat.size, floor_segment_bytes)-segment_size_after_consolidaton_mean;
sum_square_differences += diff*diff;
}
auto score = double_t(first) / last;
const auto stdev = std::sqrt(sum_square_differences/consolidation.count);
const auto cv = (stdev / segment_size_after_consolidaton_mean);
// evaluate initial score
auto score = 1. - cv;
// favor consolidations that contain approximately the requested number of segments
score *= std::pow(consolidation.count/double_t(segments_per_tier), 1.5);
// FIXME use relative measure, e.g. consolidation_size/total_size
// carefully prefer smaller consolidations over the bigger ones
score /= std::pow(size_after_consolidation, 0.05);
score /= std::pow(size_after_consolidation, 0.5);
// favor consolidations which clean out removals
score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2.);
score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2);
return score;
}
@ -329,14 +350,14 @@ index_writer::consolidation_policy_t consolidation_policy(
const consolidate_tier& options
) {
// validate input
const auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
auto min_segments_per_tier = (std::max)(size_t(1), options.min_segments); // can't merge less than 1 segment
auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
min_segments_per_tier = (std::min)(min_segments_per_tier, max_segments_per_tier); // ensure min_segments_per_tier <= max_segments_per_tier
auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
auto lookahead = std::max(size_t(1), options.lookahead);
const auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
const auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
const auto min_score = options.min_score; // skip consolidation that have score less than min_score
return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, lookahead, options](
return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, min_score](
std::set<const segment_meta*>& candidates,
const index_meta& meta,
const index_writer::consolidating_segments_t& consolidating_segments
@ -413,22 +434,28 @@ index_writer::consolidation_policy_t consolidation_policy(
/// find candidates
///////////////////////////////////////////////////////////////////////////
std::vector<consolidation> consolidation_candidates;
consolidation_candidate best(sorted_segments.begin());
for (consolidation_candidate best; sorted_segments.size() >= min_segments_per_tier; best.reset()) {
if (sorted_segments.size() >= min_segments_per_tier) {
for (auto i = sorted_segments.begin(), end = sorted_segments.end(); i != end; ++i) {
consolidation_candidate candidate(i);
while (
candidate.segments.second != end
&& candidate.count < max_segments_per_tier
&& candidate.size < max_segments_bytes
) {
candidate.size += candidate.segments.second->size;
if (candidate.size > max_segments_bytes) {
// overcome the limit
break;
}
++candidate.count;
++candidate.segments.second;
if (candidate.count < min_segments_per_tier) {
// not enough segments yet
continue;
}
@ -436,32 +463,16 @@ index_writer::consolidation_policy_t consolidation_policy(
candidate, max_segments_per_tier, floor_segment_bytes
);
if (candidate.score < min_score) {
// score is too small
continue;
}
if (best.score < candidate.score) {
best = candidate;
}
}
}
if (!best.count) {
// can't find a suitable candidate
break;
}
// remember the best candidate
consolidation_candidates.emplace_back(best);
std::push_heap(consolidation_candidates.begin(), consolidation_candidates.end());
// remove picked segments from the list
sorted_segments.erase(best.segments.first, best.segments.second);
if (consolidation_candidates.size() >= lookahead) {
break;
}
}
if (consolidation_candidates.empty()) {
// nothing to merge
return;
}
///////////////////////////////////////////////////////////////////////////
@ -469,8 +480,8 @@ index_writer::consolidation_policy_t consolidation_policy(
/// pick the best candidate
///////////////////////////////////////////////////////////////////////////
for (auto& segment : consolidation_candidates.front().segments) {
candidates.insert(segment.meta);
for (auto& candidate : best) {
candidates.insert(candidate.meta);
}
};
}

View File

@ -74,13 +74,14 @@ struct consolidate_docs_fill {
/// @param max_segments_bytes maximum allowed size of all consolidated segments
/// @param floor_segment_bytes treat all smaller segments as equal for consolidation selection
/// @param lookahead how many tiers have to be inspected
/// @param min_score filter out candidates with score less than min_score
////////////////////////////////////////////////////////////////////////////////
struct consolidate_tier {
size_t min_segments = 1;
size_t max_segments = 10;
size_t max_segments_bytes = size_t(5)*(1<<30);
size_t floor_segment_bytes = size_t(2)*(1<<20);
size_t lookahead = integer_traits<size_t>::const_max;
double_t min_score = 0.;
};
////////////////////////////////////////////////////////////////////////////////

View File

@ -41,14 +41,14 @@ class ref_counter : public util::noncopyable { // noncopyable because shared_ptr
typedef std::shared_ptr<const Key> ref_t;
struct equal_to : Equal {
bool operator()(const ref_t& lhs, const ref_t& rhs) const {
bool operator()(const ref_t& lhs, const ref_t& rhs) const NOEXCEPT {
assert(lhs && rhs);
return Equal::operator()(*lhs, *rhs);
}
}; // equal_to
struct hash : Hash {
size_t operator()(const ref_t& value) const {
size_t operator()(const ref_t& value) const NOEXCEPT {
assert(value);
return Hash::operator()(*value);
}

View File

@ -67,8 +67,8 @@ IRESEARCH_IGNORE_UNUSED static const auto& json = json_t();
////////////////////////////////////////////////////////////////////////////////
/// @class raw text format type without any specific encoding
////////////////////////////////////////////////////////////////////////////////
IRESEARCH_IGNORE_UNUSED IRESEARCH_API const type_id& text_t();
static const auto& text = text_t();
IRESEARCH_API const type_id& text_t();
IRESEARCH_IGNORE_UNUSED static const auto& text = text_t();
////////////////////////////////////////////////////////////////////////////////
/// @class XML format type https://en.wikipedia.org/wiki/XML
@ -83,4 +83,4 @@ IRESEARCH_IGNORE_UNUSED static const auto& xml = xml_t();
NS_END // text_format
NS_END // ROOT
#endif
#endif

View File

@ -98,23 +98,6 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
irs::index_utils::consolidate_tier options;
arangodb::velocypack::Builder properties;
{
// optional size_t
static const std::string fieldName("lookahead");
if (slice.hasKey(fieldName)) {
auto field = slice.get(fieldName);
if (!field.isNumber<size_t>()) {
errorField = fieldName;
return arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy();
}
options.lookahead = field.getNumber<size_t>();
}
}
{
// optional size_t
static const std::string fieldName("segmentsBytesFloor");
@ -185,7 +168,7 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
properties.openObject();
properties.add("type", arangodb::iresearch::toValuePair(POLICY_TIER));
properties.add("lookahead", arangodb::velocypack::Value(options.lookahead));
properties.add("lookahead", arangodb::velocypack::Value(size_t(1))); // FIXME remove in 3.5
properties.add("segmentsBytesFloor", arangodb::velocypack::Value(options.floor_segment_bytes));
properties.add("segmentsBytesMax", arangodb::velocypack::Value(options.max_segments_bytes));
properties.add("segmentsMax", arangodb::velocypack::Value(options.max_segments));
@ -772,4 +755,4 @@ NS_END // arangodb
// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------