From 7f4740b335e963d057fbd0854a6ea0d822f790fc Mon Sep 17 00:00:00 2001
From: Andrey Abramov <gnusmas@kvolitek.ru>
Date: Wed, 19 Dec 2018 01:55:08 +0300
Subject: [PATCH] update ArangoSearch consolidation policy (#7801)

---
 3rdParty/iresearch/core/utils/attributes.hpp  |   9 +-
 3rdParty/iresearch/core/utils/file_utils.cpp  |  24 +++-
 3rdParty/iresearch/core/utils/index_utils.cpp | 113 ++++++++++--------
 3rdParty/iresearch/core/utils/index_utils.hpp |   3 +-
 3rdParty/iresearch/core/utils/ref_counter.hpp |   4 +-
 3rdParty/iresearch/core/utils/text_format.hpp |   6 +-
 arangod/IResearch/IResearchViewMeta.cpp       |  21 +---
 7 files changed, 99 insertions(+), 81 deletions(-)
diff --git a/3rdParty/iresearch/core/utils/attributes.hpp b/3rdParty/iresearch/core/utils/attributes.hpp
index 05dbee0291..675f54dffb 100644
--- a/3rdParty/iresearch/core/utils/attributes.hpp
+++ b/3rdParty/iresearch/core/utils/attributes.hpp
@@ -505,14 +505,13 @@ class IRESEARCH_API attribute_store
   }
 
   template<typename T, typename... Args>
-  typename ref<T>::type& emplace(Args&&... args) {
+  typename ref<T>::type& try_emplace(bool& inserted, Args&&... args) {
     REGISTER_TIMER_DETAILED();
 
     typedef typename std::enable_if<
       std::is_base_of<stored_attribute, T>::value, T
     >::type type;
 
-    bool inserted;
     auto& attr = attribute_map::emplace(inserted, type::type());
 
     if (inserted) {
@@ -521,6 +520,12 @@ class IRESEARCH_API attribute_store
 
     return reinterpret_cast<typename ref<T>::type&>(attr);
   }
+
+  template<typename T, typename... Args>
+  typename ref<T>::type& emplace(Args&&... args) {
+    bool inserted;
+    return try_emplace<T>(inserted, std::forward<Args>(args)...);
+  }
 }; // attribute_store
 
 //////////////////////////////////////////////////////////////////////////////
diff --git a/3rdParty/iresearch/core/utils/file_utils.cpp b/3rdParty/iresearch/core/utils/file_utils.cpp
index 23c974ec64..93eb4f205c 100644
--- a/3rdParty/iresearch/core/utils/file_utils.cpp
+++ b/3rdParty/iresearch/core/utils/file_utils.cpp
@@ -735,7 +735,14 @@ bool mkdir(const file_path_t path) NOEXCEPT {
 
     // '\\?\' cannot be used with relative paths
     if (!abs) {
-      return 0 != CreateDirectoryW(path, nullptr);
+      if (0 == ::CreateDirectoryW(path, nullptr)) {
+        auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
+        IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
+
+        return false;
+      }
+
+      return true;
     }
 
     // workaround for path MAX_PATH
@@ -746,10 +753,21 @@ bool mkdir(const file_path_t path) NOEXCEPT {
       &dirname[0], &dirname[0] + dirname.size(), L'/', file_path_delimiter
     );
 
-    return 0 != ::CreateDirectoryW(dirname.c_str(), nullptr);
+    if (0 == ::CreateDirectoryW(dirname.c_str(), nullptr)) {
+      auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
+      IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
+
+      return false;
+    }
   #else
-    return 0 == ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO);
+    if (0 != ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO)) {
+      IR_FRMT_ERROR("Failed to create path: '%s', error %d", path, errno);
+
+      return false;
+    }
   #endif
+
+  return true;
 }
 
 bool move(const file_path_t src_path, const file_path_t dst_path) NOEXCEPT {
diff --git a/3rdParty/iresearch/core/utils/index_utils.cpp b/3rdParty/iresearch/core/utils/index_utils.cpp
index a38694b1b3..f23ee0b74b 100644
--- a/3rdParty/iresearch/core/utils/index_utils.cpp
+++ b/3rdParty/iresearch/core/utils/index_utils.cpp
@@ -52,11 +52,13 @@ struct segment_stat {
     auto& lhs = *this;
 
     if (lhs.size == rhs.size) {
-      if (lhs.fill_factor == rhs.fill_factor) {
-        return lhs.meta->name < rhs.meta->name;
+      if (lhs.fill_factor > rhs.fill_factor) {
+        return true;
+      } else if (lhs.fill_factor < rhs.fill_factor) {
+        return false;
       }
 
-      return lhs.fill_factor > rhs.fill_factor;
+      return lhs.meta->name < rhs.meta->name;
     }
 
     return lhs.size < rhs.size;
@@ -71,8 +73,6 @@ struct consolidation_candidate {
   typedef std::set<segment_stat>::const_iterator iterator_t;
   typedef std::pair<iterator_t, iterator_t> range_t;
 
-  consolidation_candidate() = default;
-
   explicit consolidation_candidate(iterator_t i) NOEXCEPT
     : segments(i, i) {
   }
@@ -101,7 +101,7 @@ struct consolidation_candidate {
   range_t segments;
   size_t count{ 0 };
   size_t size{ 0 }; // estimated size of the level
-  double_t score{ -1. }; // how good this permutation is
+  double_t score{ DBL_MIN }; // how good this permutation is
 };
 
 struct consolidation {
@@ -127,7 +127,7 @@ struct consolidation {
 
   std::vector<segment_stat> segments;
   size_t size{ 0 }; // estimated size of the level
-  double_t score{ -1. }; // how good this permutation is
+  double_t score{ DBL_MIN }; // how good this permutation is
 };
 
 /// @returns score of the consolidation bucket
@@ -136,42 +136,63 @@ double_t consolidation_score(
     const size_t segments_per_tier,
     const size_t floor_segment_bytes
 ) NOEXCEPT {
+  // to detect how skewed the consolidation we do the following:
+  // 1. evaluate coefficient of variation, less is better
+  // 2. good candidates are in range [0;1]
+  // 3. favor condidates where number of segments is equal to 'segments_per_tier' approx
+  // 4. prefer smaller consolidations
+  // 5. prefer consolidations which clean removals
+
   switch (consolidation.count) {
     case 0:
-      return -1.;
+      // empty consolidation makes not sense
+      return DBL_MIN;
     case 1: {
       auto& meta = *consolidation.segments.first->meta;
+
       if (meta.docs_count == meta.live_docs_count) {
-        // singleton without removals makes no sense
-        // note: that is important to return score
-        // higher than default value to avoid infinite loop
-        return 0.;
+        // singletone without removals makes no sense
+        return DBL_MIN;
       }
-    } break;
+
+      // FIXME honor number of deletes???
+      // signletone with removals makes sense if nothing better is found
+       return DBL_MIN + DBL_EPSILON;
+    }
   }
 
   size_t size_before_consolidation = 0;
   size_t size_after_consolidation = 0;
+  size_t size_after_consolidation_floored = 0;
   for (auto& segment_stat : consolidation) {
     size_before_consolidation += segment_stat.meta->size;
     size_after_consolidation += segment_stat.size;
+    size_after_consolidation_floored += std::max(segment_stat.size, floor_segment_bytes);
   }
 
-  // detect how skewed the consolidation is, we want
-  // to consolidate segments of approximately the same size
-  const auto first = std::max(consolidation.front().size, floor_segment_bytes);
-  const auto last = std::max(consolidation.back().size, floor_segment_bytes);
+  // evaluate coefficient of variation
+  double_t sum_square_differences = 0;
+  const auto segment_size_after_consolidaton_mean = double_t(size_after_consolidation_floored) / consolidation.count;
+  for (auto& segment_stat : consolidation) {
+    const double_t diff = std::max(segment_stat.size, floor_segment_bytes)-segment_size_after_consolidaton_mean;
+    sum_square_differences += diff*diff;
+  }
 
-  auto score = double_t(first) / last;
+  const auto stdev = std::sqrt(sum_square_differences/consolidation.count);
+  const auto cv = (stdev / segment_size_after_consolidaton_mean);
+
+  // evaluate initial score
+  auto score = 1. - cv;
 
   // favor consolidations that contain approximately the requested number of segments
   score *= std::pow(consolidation.count/double_t(segments_per_tier), 1.5);
 
+  // FIXME use relative measure, e.g. cosolidation_size/total_size
   // carefully prefer smaller consolidations over the bigger ones
-  score /= std::pow(size_after_consolidation, 0.05);
+  score /= std::pow(size_after_consolidation, 0.5);
 
   // favor consolidations which clean out removals
-  score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2.);
+  score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2);
 
   return score;
 }
@@ -329,14 +350,14 @@ index_writer::consolidation_policy_t consolidation_policy(
   const consolidate_tier& options
 ) {
   // validate input
+  const auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
   auto min_segments_per_tier = (std::max)(size_t(1), options.min_segments); // can't merge less than 1 segment
-  auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
   min_segments_per_tier = (std::min)(min_segments_per_tier, max_segments_per_tier); // ensure min_segments_per_tier <= max_segments_per_tier
-  auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
-  auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
-  auto lookahead = std::max(size_t(1), options.lookahead);
+  const auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
+  const auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
+  const auto min_score = options.min_score; // skip consolidation that have score less than min_score
 
-  return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, lookahead, options](
+  return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, min_score](
       std::set<const segment_meta*>& candidates,
       const index_meta& meta,
       const index_writer::consolidating_segments_t& consolidating_segments
@@ -413,22 +434,28 @@ index_writer::consolidation_policy_t consolidation_policy(
     /// find candidates
     ///////////////////////////////////////////////////////////////////////////
 
-    std::vector<consolidation> consolidation_candidates;
+    consolidation_candidate best(sorted_segments.begin());
 
-    for (consolidation_candidate best; sorted_segments.size() >= min_segments_per_tier; best.reset()) {
+    if (sorted_segments.size() >= min_segments_per_tier) {
       for (auto i = sorted_segments.begin(), end = sorted_segments.end(); i != end; ++i) {
         consolidation_candidate candidate(i);
 
         while (
             candidate.segments.second != end
             && candidate.count < max_segments_per_tier
-            && candidate.size < max_segments_bytes
         ) {
           candidate.size += candidate.segments.second->size;
+
+          if (candidate.size > max_segments_bytes) {
+            // overcome the limit
+            break;
+          }
+
           ++candidate.count;
           ++candidate.segments.second;
 
           if (candidate.count < min_segments_per_tier) {
+            // not enough segments yet
             continue;
           }
 
@@ -436,32 +463,16 @@ index_writer::consolidation_policy_t consolidation_policy(
             candidate, max_segments_per_tier, floor_segment_bytes
           );
 
+          if (candidate.score < min_score) {
+            // score is too small
+            continue;
+          }
+
           if (best.score < candidate.score) {
             best = candidate;
           }
         }
       }
-
-      if (!best.count) {
-        // can't find a suitable candidate
-        break;
-      }
-
-      // remember the best candidate
-      consolidation_candidates.emplace_back(best);
-      std::push_heap(consolidation_candidates.begin(), consolidation_candidates.end());
-
-      // remove picked segments from the list
-      sorted_segments.erase(best.segments.first, best.segments.second);
-
-      if (consolidation_candidates.size() >= lookahead) {
-        break;
-      }
-    }
-
-    if (consolidation_candidates.empty()) {
-      // nothing ot merge
-      return;
     }
 
     ///////////////////////////////////////////////////////////////////////////
@@ -469,8 +480,8 @@ index_writer::consolidation_policy_t consolidation_policy(
     /// pick the best candidate
     ///////////////////////////////////////////////////////////////////////////
 
-    for (auto& segment : consolidation_candidates.front().segments) {
-      candidates.insert(segment.meta);
+    for (auto& candidate : best) {
+      candidates.insert(candidate.meta);
     }
   };
 }
diff --git a/3rdParty/iresearch/core/utils/index_utils.hpp b/3rdParty/iresearch/core/utils/index_utils.hpp
index c98c607841..5155b1fb6f 100644
--- a/3rdParty/iresearch/core/utils/index_utils.hpp
+++ b/3rdParty/iresearch/core/utils/index_utils.hpp
@@ -74,13 +74,14 @@ struct consolidate_docs_fill {
 /// @param max_segments_bytes maxinum allowed size of all consolidated segments
 /// @param floor_segment_bytes treat all smaller segments as equal for consolidation selection
 /// @param lookahead how many tiers have to be inspected
+/// @param min_score filter out candidates with score less than min_score
 ////////////////////////////////////////////////////////////////////////////////
 struct consolidate_tier {
   size_t min_segments = 1;
   size_t max_segments = 10;
   size_t max_segments_bytes = size_t(5)*(1<<30);
   size_t floor_segment_bytes = size_t(2)*(1<<20);
-  size_t lookahead = integer_traits<size_t>::const_max;
+  double_t min_score = 0.;
 };
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/3rdParty/iresearch/core/utils/ref_counter.hpp b/3rdParty/iresearch/core/utils/ref_counter.hpp
index 4d4da99519..74d1693c21 100644
--- a/3rdParty/iresearch/core/utils/ref_counter.hpp
+++ b/3rdParty/iresearch/core/utils/ref_counter.hpp
@@ -41,14 +41,14 @@ class ref_counter : public util::noncopyable { // noncopyable because shared_ptr
   typedef std::shared_ptr<const Key> ref_t;
 
   struct equal_to : Equal {
-    bool operator()(const ref_t& lhs, const ref_t& rhs) const {
+    bool operator()(const ref_t& lhs, const ref_t& rhs) const NOEXCEPT {
       assert(lhs && rhs);
       return Equal::operator()(*lhs, *rhs);
     }
   }; // equal_to
 
   struct hash : Hash {
-    size_t operator()(const ref_t& value) const {
+    size_t operator()(const ref_t& value) const NOEXCEPT {
       assert(value);
       return Hash::operator()(*value);
     }
diff --git a/3rdParty/iresearch/core/utils/text_format.hpp b/3rdParty/iresearch/core/utils/text_format.hpp
index 2d44c5a309..60963b33a8 100644
--- a/3rdParty/iresearch/core/utils/text_format.hpp
+++ b/3rdParty/iresearch/core/utils/text_format.hpp
@@ -67,8 +67,8 @@ IRESEARCH_IGNORE_UNUSED static const auto& json = json_t();
 ////////////////////////////////////////////////////////////////////////////////
 /// @class raw text format type without any specific encoding
 ////////////////////////////////////////////////////////////////////////////////
-IRESEARCH_IGNORE_UNUSED IRESEARCH_API const type_id& text_t();
-static const auto& text = text_t();
+IRESEARCH_API const type_id& text_t();
+IRESEARCH_IGNORE_UNUSED static const auto& text = text_t();
 
 ////////////////////////////////////////////////////////////////////////////////
 /// @class XML format type https://en.wikipedia.org/wiki/XML
@@ -83,4 +83,4 @@ IRESEARCH_IGNORE_UNUSED static const auto& xml = xml_t();
 NS_END // text_format
 NS_END // ROOT
 
-#endif
+#endif
\ No newline at end of file
diff --git a/arangod/IResearch/IResearchViewMeta.cpp b/arangod/IResearch/IResearchViewMeta.cpp
index 9532e45d4f..7d5c1c8c03 100644
--- a/arangod/IResearch/IResearchViewMeta.cpp
+++ b/arangod/IResearch/IResearchViewMeta.cpp
@@ -98,23 +98,6 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
   irs::index_utils::consolidate_tier options;
   arangodb::velocypack::Builder properties;
 
-  {
-    // optional size_t
-    static const std::string fieldName("lookahead");
-
-    if (slice.hasKey(fieldName)) {
-      auto field = slice.get(fieldName);
-
-      if (!field.isNumber<size_t>()) {
-        errorField = fieldName;
-
-        return arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy();
-      }
-
-      options.lookahead = field.getNumber<size_t>();
-    }
-  }
-
   {
     // optional size_t
     static const std::string fieldName("segmentsBytesFloor");
@@ -185,7 +168,7 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
 
   properties.openObject();
   properties.add("type", arangodb::iresearch::toValuePair(POLICY_TIER));
-  properties.add("lookahead", arangodb::velocypack::Value(options.lookahead));
+  properties.add("lookahead", arangodb::velocypack::Value(size_t(1))); // FIXME remove in 3.5
   properties.add("segmentsBytesFloor", arangodb::velocypack::Value(options.floor_segment_bytes));
   properties.add("segmentsBytesMax", arangodb::velocypack::Value(options.max_segments_bytes));
   properties.add("segmentsMax", arangodb::velocypack::Value(options.max_segments));
@@ -772,4 +755,4 @@ NS_END // arangodb
 
 // -----------------------------------------------------------------------------
 // --SECTION--                                                       END-OF-FILE
-// -----------------------------------------------------------------------------
\ No newline at end of file
+// -----------------------------------------------------------------------------