From 7f4740b335e963d057fbd0854a6ea0d822f790fc Mon Sep 17 00:00:00 2001 From: Andrey Abramov Date: Wed, 19 Dec 2018 01:55:08 +0300 Subject: [PATCH] update ArangoSearch consolidation policy (#7801) --- 3rdParty/iresearch/core/utils/attributes.hpp | 9 +- 3rdParty/iresearch/core/utils/file_utils.cpp | 24 +++- 3rdParty/iresearch/core/utils/index_utils.cpp | 113 ++++++++++-------- 3rdParty/iresearch/core/utils/index_utils.hpp | 3 +- 3rdParty/iresearch/core/utils/ref_counter.hpp | 4 +- 3rdParty/iresearch/core/utils/text_format.hpp | 6 +- arangod/IResearch/IResearchViewMeta.cpp | 21 +--- 7 files changed, 99 insertions(+), 81 deletions(-) diff --git a/3rdParty/iresearch/core/utils/attributes.hpp b/3rdParty/iresearch/core/utils/attributes.hpp index 05dbee0291..675f54dffb 100644 --- a/3rdParty/iresearch/core/utils/attributes.hpp +++ b/3rdParty/iresearch/core/utils/attributes.hpp @@ -505,14 +505,13 @@ class IRESEARCH_API attribute_store } template - typename ref::type& emplace(Args&&... args) { + typename ref::type& try_emplace(bool& inserted, Args&&... args) { REGISTER_TIMER_DETAILED(); typedef typename std::enable_if< std::is_base_of::value, T >::type type; - bool inserted; auto& attr = attribute_map::emplace(inserted, type::type()); if (inserted) { @@ -521,6 +520,12 @@ class IRESEARCH_API attribute_store return reinterpret_cast::type&>(attr); } + + template + typename ref::type& emplace(Args&&... args) { + bool inserted; + return try_emplace(inserted, std::forward(args)...); + } }; // attribute_store ////////////////////////////////////////////////////////////////////////////// diff --git a/3rdParty/iresearch/core/utils/file_utils.cpp b/3rdParty/iresearch/core/utils/file_utils.cpp index 23c974ec64..93eb4f205c 100644 --- a/3rdParty/iresearch/core/utils/file_utils.cpp +++ b/3rdParty/iresearch/core/utils/file_utils.cpp @@ -735,7 +735,14 @@ bool mkdir(const file_path_t path) NOEXCEPT { // '\\?\' cannot be used with relative paths if (!abs) { - return 0 != CreateDirectoryW(path, nullptr); + if (0 == ::CreateDirectoryW(path, nullptr)) { + auto utf8path = boost::locale::conv::utf_to_utf(path); + IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError()); + + return false; + } + + return true; } // workaround for path MAX_PATH @@ -746,10 +753,21 @@ bool mkdir(const file_path_t path) NOEXCEPT { &dirname[0], &dirname[0] + dirname.size(), L'/', file_path_delimiter ); - return 0 != ::CreateDirectoryW(dirname.c_str(), nullptr); + if (0 == ::CreateDirectoryW(dirname.c_str(), nullptr)) { + auto utf8path = boost::locale::conv::utf_to_utf(path); + IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError()); + + return false; + } #else - return 0 == ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO); + if (0 != ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO)) { + IR_FRMT_ERROR("Failed to create path: '%s', error %d", path, errno); + + return false; + } #endif + + return true; } bool move(const file_path_t src_path, const file_path_t dst_path) NOEXCEPT { diff --git a/3rdParty/iresearch/core/utils/index_utils.cpp b/3rdParty/iresearch/core/utils/index_utils.cpp index a38694b1b3..f23ee0b74b 100644 --- a/3rdParty/iresearch/core/utils/index_utils.cpp +++ b/3rdParty/iresearch/core/utils/index_utils.cpp @@ -52,11 +52,13 @@ struct segment_stat { auto& lhs = *this; if (lhs.size == rhs.size) { - if (lhs.fill_factor == rhs.fill_factor) { - return lhs.meta->name < rhs.meta->name; + if (lhs.fill_factor > rhs.fill_factor) { + return true; + } else if (lhs.fill_factor < rhs.fill_factor) { + return false; } - return lhs.fill_factor > rhs.fill_factor; + return lhs.meta->name < rhs.meta->name; } return lhs.size < rhs.size; @@ -71,8 +73,6 @@ struct consolidation_candidate { typedef std::set::const_iterator iterator_t; typedef std::pair range_t; - consolidation_candidate() = default; - explicit consolidation_candidate(iterator_t i) NOEXCEPT : segments(i, i) { } @@ -101,7 +101,7 @@ struct consolidation_candidate { range_t segments; size_t count{ 0 }; size_t size{ 0 }; // estimated size of the level - double_t score{ -1. }; // how good this permutation is + double_t score{ DBL_MIN }; // how good this permutation is }; struct consolidation { @@ -127,7 +127,7 @@ struct consolidation { std::vector segments; size_t size{ 0 }; // estimated size of the level - double_t score{ -1. }; // how good this permutation is + double_t score{ DBL_MIN }; // how good this permutation is }; /// @returns score of the consolidation bucket @@ -136,42 +136,63 @@ double_t consolidation_score( const size_t segments_per_tier, const size_t floor_segment_bytes ) NOEXCEPT { + // to detect how skewed the consolidation we do the following: + // 1. evaluate coefficient of variation, less is better + // 2. good candidates are in range [0;1] + // 3. favor condidates where number of segments is equal to 'segments_per_tier' approx + // 4. prefer smaller consolidations + // 5. prefer consolidations which clean removals + switch (consolidation.count) { case 0: - return -1.; + // empty consolidation makes not sense + return DBL_MIN; case 1: { auto& meta = *consolidation.segments.first->meta; + if (meta.docs_count == meta.live_docs_count) { - // singleton without removals makes no sense - // note: that is important to return score - // higher than default value to avoid infinite loop - return 0.; + // singletone without removals makes no sense + return DBL_MIN; } - } break; + + // FIXME honor number of deletes??? + // signletone with removals makes sense if nothing better is found + return DBL_MIN + DBL_EPSILON; + } } size_t size_before_consolidation = 0; size_t size_after_consolidation = 0; + size_t size_after_consolidation_floored = 0; for (auto& segment_stat : consolidation) { size_before_consolidation += segment_stat.meta->size; size_after_consolidation += segment_stat.size; + size_after_consolidation_floored += std::max(segment_stat.size, floor_segment_bytes); } - // detect how skewed the consolidation is, we want - // to consolidate segments of approximately the same size - const auto first = std::max(consolidation.front().size, floor_segment_bytes); - const auto last = std::max(consolidation.back().size, floor_segment_bytes); + // evaluate coefficient of variation + double_t sum_square_differences = 0; + const auto segment_size_after_consolidaton_mean = double_t(size_after_consolidation_floored) / consolidation.count; + for (auto& segment_stat : consolidation) { + const double_t diff = std::max(segment_stat.size, floor_segment_bytes)-segment_size_after_consolidaton_mean; + sum_square_differences += diff*diff; + } - auto score = double_t(first) / last; + const auto stdev = std::sqrt(sum_square_differences/consolidation.count); + const auto cv = (stdev / segment_size_after_consolidaton_mean); + + // evaluate initial score + auto score = 1. - cv; // favor consolidations that contain approximately the requested number of segments score *= std::pow(consolidation.count/double_t(segments_per_tier), 1.5); + // FIXME use relative measure, e.g. cosolidation_size/total_size // carefully prefer smaller consolidations over the bigger ones - score /= std::pow(size_after_consolidation, 0.05); + score /= std::pow(size_after_consolidation, 0.5); // favor consolidations which clean out removals - score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2.); + score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2); return score; } @@ -329,14 +350,14 @@ index_writer::consolidation_policy_t consolidation_policy( const consolidate_tier& options ) { // validate input + const auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment auto min_segments_per_tier = (std::max)(size_t(1), options.min_segments); // can't merge less than 1 segment - auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment min_segments_per_tier = (std::min)(min_segments_per_tier, max_segments_per_tier); // ensure min_segments_per_tier <= max_segments_per_tier - auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes); - auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes); - auto lookahead = std::max(size_t(1), options.lookahead); + const auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes); + const auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes); + const auto min_score = options.min_score; // skip consolidation that have score less than min_score - return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, lookahead, options]( + return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, min_score]( std::set& candidates, const index_meta& meta, const index_writer::consolidating_segments_t& consolidating_segments @@ -413,22 +434,28 @@ index_writer::consolidation_policy_t consolidation_policy( /// find candidates /////////////////////////////////////////////////////////////////////////// - std::vector consolidation_candidates; + consolidation_candidate best(sorted_segments.begin()); - for (consolidation_candidate best; sorted_segments.size() >= min_segments_per_tier; best.reset()) { + if (sorted_segments.size() >= min_segments_per_tier) { for (auto i = sorted_segments.begin(), end = sorted_segments.end(); i != end; ++i) { consolidation_candidate candidate(i); while ( candidate.segments.second != end && candidate.count < max_segments_per_tier - && candidate.size < max_segments_bytes ) { candidate.size += candidate.segments.second->size; + + if (candidate.size > max_segments_bytes) { + // overcome the limit + break; + } + ++candidate.count; ++candidate.segments.second; if (candidate.count < min_segments_per_tier) { + // not enough segments yet continue; } @@ -436,32 +463,16 @@ index_writer::consolidation_policy_t consolidation_policy( candidate, max_segments_per_tier, floor_segment_bytes ); + if (candidate.score < min_score) { + // score is too small + continue; + } + if (best.score < candidate.score) { best = candidate; } } } - - if (!best.count) { - // can't find a suitable candidate - break; - } - - // remember the best candidate - consolidation_candidates.emplace_back(best); - std::push_heap(consolidation_candidates.begin(), consolidation_candidates.end()); - - // remove picked segments from the list - sorted_segments.erase(best.segments.first, best.segments.second); - - if (consolidation_candidates.size() >= lookahead) { - break; - } - } - - if (consolidation_candidates.empty()) { - // nothing ot merge - return; } /////////////////////////////////////////////////////////////////////////// @@ -469,8 +480,8 @@ index_writer::consolidation_policy_t consolidation_policy( /// pick the best candidate /////////////////////////////////////////////////////////////////////////// - for (auto& segment : consolidation_candidates.front().segments) { - candidates.insert(segment.meta); + for (auto& candidate : best) { + candidates.insert(candidate.meta); } }; } diff --git a/3rdParty/iresearch/core/utils/index_utils.hpp b/3rdParty/iresearch/core/utils/index_utils.hpp index c98c607841..5155b1fb6f 100644 --- a/3rdParty/iresearch/core/utils/index_utils.hpp +++ b/3rdParty/iresearch/core/utils/index_utils.hpp @@ -74,13 +74,14 @@ struct consolidate_docs_fill { /// @param max_segments_bytes maxinum allowed size of all consolidated segments /// @param floor_segment_bytes treat all smaller segments as equal for consolidation selection /// @param lookahead how many tiers have to be inspected +/// @param min_score filter out candidates with score less than min_score //////////////////////////////////////////////////////////////////////////////// struct consolidate_tier { size_t min_segments = 1; size_t max_segments = 10; size_t max_segments_bytes = size_t(5)*(1<<30); size_t floor_segment_bytes = size_t(2)*(1<<20); - size_t lookahead = integer_traits::const_max; + double_t min_score = 0.; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/3rdParty/iresearch/core/utils/ref_counter.hpp b/3rdParty/iresearch/core/utils/ref_counter.hpp index 4d4da99519..74d1693c21 100644 --- a/3rdParty/iresearch/core/utils/ref_counter.hpp +++ b/3rdParty/iresearch/core/utils/ref_counter.hpp @@ -41,14 +41,14 @@ class ref_counter : public util::noncopyable { // noncopyable because shared_ptr typedef std::shared_ptr ref_t; struct equal_to : Equal { - bool operator()(const ref_t& lhs, const ref_t& rhs) const { + bool operator()(const ref_t& lhs, const ref_t& rhs) const NOEXCEPT { assert(lhs && rhs); return Equal::operator()(*lhs, *rhs); } }; // equal_to struct hash : Hash { - size_t operator()(const ref_t& value) const { + size_t operator()(const ref_t& value) const NOEXCEPT { assert(value); return Hash::operator()(*value); } diff --git a/3rdParty/iresearch/core/utils/text_format.hpp b/3rdParty/iresearch/core/utils/text_format.hpp index 2d44c5a309..60963b33a8 100644 --- a/3rdParty/iresearch/core/utils/text_format.hpp +++ b/3rdParty/iresearch/core/utils/text_format.hpp @@ -67,8 +67,8 @@ IRESEARCH_IGNORE_UNUSED static const auto& json = json_t(); //////////////////////////////////////////////////////////////////////////////// /// @class raw text format type without any specific encoding //////////////////////////////////////////////////////////////////////////////// -IRESEARCH_IGNORE_UNUSED IRESEARCH_API const type_id& text_t(); -static const auto& text = text_t(); +IRESEARCH_API const type_id& text_t(); +IRESEARCH_IGNORE_UNUSED static const auto& text = text_t(); //////////////////////////////////////////////////////////////////////////////// /// @class XML format type https://en.wikipedia.org/wiki/XML @@ -83,4 +83,4 @@ IRESEARCH_IGNORE_UNUSED static const auto& xml = xml_t(); NS_END // text_format NS_END // ROOT -#endif +#endif \ No newline at end of file diff --git a/arangod/IResearch/IResearchViewMeta.cpp b/arangod/IResearch/IResearchViewMeta.cpp index 9532e45d4f..7d5c1c8c03 100644 --- a/arangod/IResearch/IResearchViewMeta.cpp +++ b/arangod/IResearch/IResearchViewMeta.cpp @@ -98,23 +98,6 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP irs::index_utils::consolidate_tier options; arangodb::velocypack::Builder properties; - { - // optional size_t - static const std::string fieldName("lookahead"); - - if (slice.hasKey(fieldName)) { - auto field = slice.get(fieldName); - - if (!field.isNumber()) { - errorField = fieldName; - - return arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy(); - } - - options.lookahead = field.getNumber(); - } - } - { // optional size_t static const std::string fieldName("segmentsBytesFloor"); @@ -185,7 +168,7 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP properties.openObject(); properties.add("type", arangodb::iresearch::toValuePair(POLICY_TIER)); - properties.add("lookahead", arangodb::velocypack::Value(options.lookahead)); + properties.add("lookahead", arangodb::velocypack::Value(size_t(1))); // FIXME remove in 3.5 properties.add("segmentsBytesFloor", arangodb::velocypack::Value(options.floor_segment_bytes)); properties.add("segmentsBytesMax", arangodb::velocypack::Value(options.max_segments_bytes)); properties.add("segmentsMax", arangodb::velocypack::Value(options.max_segments)); @@ -772,4 +755,4 @@ NS_END // arangodb // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE -// ----------------------------------------------------------------------------- \ No newline at end of file +// -----------------------------------------------------------------------------