1
0
Fork 0

update ArangoSearch consolidation policy (#7801)

This commit is contained in:
Andrey Abramov 2018-12-19 01:55:08 +03:00 committed by GitHub
parent 4dc64cbab5
commit 7f4740b335
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 99 additions and 81 deletions

View File

@ -505,14 +505,13 @@ class IRESEARCH_API attribute_store
}
template<typename T, typename... Args>
typename ref<T>::type& emplace(Args&&... args) {
typename ref<T>::type& try_emplace(bool& inserted, Args&&... args) {
REGISTER_TIMER_DETAILED();
typedef typename std::enable_if<
std::is_base_of<stored_attribute, T>::value, T
>::type type;
bool inserted;
auto& attr = attribute_map::emplace(inserted, type::type());
if (inserted) {
@ -521,6 +520,12 @@ class IRESEARCH_API attribute_store
return reinterpret_cast<typename ref<T>::type&>(attr);
}
template<typename T, typename... Args>
typename ref<T>::type& emplace(Args&&... args) {
bool inserted;
return try_emplace<T>(inserted, std::forward<Args>(args)...);
}
}; // attribute_store
//////////////////////////////////////////////////////////////////////////////

View File

@ -735,7 +735,14 @@ bool mkdir(const file_path_t path) NOEXCEPT {
// '\\?\' cannot be used with relative paths
if (!abs) {
return 0 != CreateDirectoryW(path, nullptr);
if (0 == ::CreateDirectoryW(path, nullptr)) {
auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
return false;
}
return true;
}
// workaround for path MAX_PATH
@ -746,10 +753,21 @@ bool mkdir(const file_path_t path) NOEXCEPT {
&dirname[0], &dirname[0] + dirname.size(), L'/', file_path_delimiter
);
return 0 != ::CreateDirectoryW(dirname.c_str(), nullptr);
if (0 == ::CreateDirectoryW(dirname.c_str(), nullptr)) {
auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
return false;
}
#else
return 0 == ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO);
if (0 != ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO)) {
IR_FRMT_ERROR("Failed to create path: '%s', error %d", path, errno);
return false;
}
#endif
return true;
}
bool move(const file_path_t src_path, const file_path_t dst_path) NOEXCEPT {

View File

@ -52,11 +52,13 @@ struct segment_stat {
auto& lhs = *this;
if (lhs.size == rhs.size) {
if (lhs.fill_factor == rhs.fill_factor) {
return lhs.meta->name < rhs.meta->name;
if (lhs.fill_factor > rhs.fill_factor) {
return true;
} else if (lhs.fill_factor < rhs.fill_factor) {
return false;
}
return lhs.fill_factor > rhs.fill_factor;
return lhs.meta->name < rhs.meta->name;
}
return lhs.size < rhs.size;
@ -71,8 +73,6 @@ struct consolidation_candidate {
typedef std::set<segment_stat>::const_iterator iterator_t;
typedef std::pair<iterator_t, iterator_t> range_t;
consolidation_candidate() = default;
explicit consolidation_candidate(iterator_t i) NOEXCEPT
: segments(i, i) {
}
@ -101,7 +101,7 @@ struct consolidation_candidate {
range_t segments;
size_t count{ 0 };
size_t size{ 0 }; // estimated size of the level
double_t score{ -1. }; // how good this permutation is
double_t score{ DBL_MIN }; // how good this permutation is
};
struct consolidation {
@ -127,7 +127,7 @@ struct consolidation {
std::vector<segment_stat> segments;
size_t size{ 0 }; // estimated size of the level
double_t score{ -1. }; // how good this permutation is
double_t score{ DBL_MIN }; // how good this permutation is
};
/// @returns score of the consolidation bucket
@ -136,42 +136,63 @@ double_t consolidation_score(
const size_t segments_per_tier,
const size_t floor_segment_bytes
) NOEXCEPT {
// to detect how skewed the consolidation we do the following:
// 1. evaluate coefficient of variation, less is better
// 2. good candidates are in range [0;1]
// 3. favor candidates where number of segments is equal to 'segments_per_tier' approx
// 4. prefer smaller consolidations
// 5. prefer consolidations which clean removals
switch (consolidation.count) {
case 0:
return -1.;
// empty consolidation makes no sense
return DBL_MIN;
case 1: {
auto& meta = *consolidation.segments.first->meta;
if (meta.docs_count == meta.live_docs_count) {
// singleton without removals makes no sense
// note: that is important to return score
// higher than default value to avoid infinite loop
return 0.;
// singleton without removals makes no sense
return DBL_MIN;
}
} break;
// FIXME honor number of deletes???
// singleton with removals makes sense if nothing better is found
return DBL_MIN + DBL_EPSILON;
}
}
size_t size_before_consolidation = 0;
size_t size_after_consolidation = 0;
size_t size_after_consolidation_floored = 0;
for (auto& segment_stat : consolidation) {
size_before_consolidation += segment_stat.meta->size;
size_after_consolidation += segment_stat.size;
size_after_consolidation_floored += std::max(segment_stat.size, floor_segment_bytes);
}
// detect how skewed the consolidation is, we want
// to consolidate segments of approximately the same size
const auto first = std::max(consolidation.front().size, floor_segment_bytes);
const auto last = std::max(consolidation.back().size, floor_segment_bytes);
// evaluate coefficient of variation
double_t sum_square_differences = 0;
const auto segment_size_after_consolidaton_mean = double_t(size_after_consolidation_floored) / consolidation.count;
for (auto& segment_stat : consolidation) {
const double_t diff = std::max(segment_stat.size, floor_segment_bytes)-segment_size_after_consolidaton_mean;
sum_square_differences += diff*diff;
}
auto score = double_t(first) / last;
const auto stdev = std::sqrt(sum_square_differences/consolidation.count);
const auto cv = (stdev / segment_size_after_consolidaton_mean);
// evaluate initial score
auto score = 1. - cv;
// favor consolidations that contain approximately the requested number of segments
score *= std::pow(consolidation.count/double_t(segments_per_tier), 1.5);
// FIXME use relative measure, e.g. consolidation_size/total_size
// carefully prefer smaller consolidations over the bigger ones
score /= std::pow(size_after_consolidation, 0.05);
score /= std::pow(size_after_consolidation, 0.5);
// favor consolidations which clean out removals
score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2.);
score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2);
return score;
}
@ -329,14 +350,14 @@ index_writer::consolidation_policy_t consolidation_policy(
const consolidate_tier& options
) {
// validate input
const auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
auto min_segments_per_tier = (std::max)(size_t(1), options.min_segments); // can't merge less than 1 segment
auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
min_segments_per_tier = (std::min)(min_segments_per_tier, max_segments_per_tier); // ensure min_segments_per_tier <= max_segments_per_tier
auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
auto lookahead = std::max(size_t(1), options.lookahead);
const auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
const auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
const auto min_score = options.min_score; // skip consolidation that have score less than min_score
return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, lookahead, options](
return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, min_score](
std::set<const segment_meta*>& candidates,
const index_meta& meta,
const index_writer::consolidating_segments_t& consolidating_segments
@ -413,22 +434,28 @@ index_writer::consolidation_policy_t consolidation_policy(
/// find candidates
///////////////////////////////////////////////////////////////////////////
std::vector<consolidation> consolidation_candidates;
consolidation_candidate best(sorted_segments.begin());
for (consolidation_candidate best; sorted_segments.size() >= min_segments_per_tier; best.reset()) {
if (sorted_segments.size() >= min_segments_per_tier) {
for (auto i = sorted_segments.begin(), end = sorted_segments.end(); i != end; ++i) {
consolidation_candidate candidate(i);
while (
candidate.segments.second != end
&& candidate.count < max_segments_per_tier
&& candidate.size < max_segments_bytes
) {
candidate.size += candidate.segments.second->size;
if (candidate.size > max_segments_bytes) {
// overcome the limit
break;
}
++candidate.count;
++candidate.segments.second;
if (candidate.count < min_segments_per_tier) {
// not enough segments yet
continue;
}
@ -436,32 +463,16 @@ index_writer::consolidation_policy_t consolidation_policy(
candidate, max_segments_per_tier, floor_segment_bytes
);
if (candidate.score < min_score) {
// score is too small
continue;
}
if (best.score < candidate.score) {
best = candidate;
}
}
}
if (!best.count) {
// can't find a suitable candidate
break;
}
// remember the best candidate
consolidation_candidates.emplace_back(best);
std::push_heap(consolidation_candidates.begin(), consolidation_candidates.end());
// remove picked segments from the list
sorted_segments.erase(best.segments.first, best.segments.second);
if (consolidation_candidates.size() >= lookahead) {
break;
}
}
if (consolidation_candidates.empty()) {
// nothing to merge
return;
}
///////////////////////////////////////////////////////////////////////////
@ -469,8 +480,8 @@ index_writer::consolidation_policy_t consolidation_policy(
/// pick the best candidate
///////////////////////////////////////////////////////////////////////////
for (auto& segment : consolidation_candidates.front().segments) {
candidates.insert(segment.meta);
for (auto& candidate : best) {
candidates.insert(candidate.meta);
}
};
}

View File

@ -74,13 +74,14 @@ struct consolidate_docs_fill {
/// @param max_segments_bytes maximum allowed size of all consolidated segments
/// @param floor_segment_bytes treat all smaller segments as equal for consolidation selection
/// @param lookahead how many tiers have to be inspected
/// @param min_score filter out candidates with score less than min_score
////////////////////////////////////////////////////////////////////////////////
struct consolidate_tier {
size_t min_segments = 1;
size_t max_segments = 10;
size_t max_segments_bytes = size_t(5)*(1<<30);
size_t floor_segment_bytes = size_t(2)*(1<<20);
size_t lookahead = integer_traits<size_t>::const_max;
double_t min_score = 0.;
};
////////////////////////////////////////////////////////////////////////////////

View File

@ -41,14 +41,14 @@ class ref_counter : public util::noncopyable { // noncopyable because shared_ptr
typedef std::shared_ptr<const Key> ref_t;
struct equal_to : Equal {
bool operator()(const ref_t& lhs, const ref_t& rhs) const {
bool operator()(const ref_t& lhs, const ref_t& rhs) const NOEXCEPT {
assert(lhs && rhs);
return Equal::operator()(*lhs, *rhs);
}
}; // equal_to
struct hash : Hash {
size_t operator()(const ref_t& value) const {
size_t operator()(const ref_t& value) const NOEXCEPT {
assert(value);
return Hash::operator()(*value);
}

View File

@ -67,8 +67,8 @@ IRESEARCH_IGNORE_UNUSED static const auto& json = json_t();
////////////////////////////////////////////////////////////////////////////////
/// @class raw text format type without any specific encoding
////////////////////////////////////////////////////////////////////////////////
IRESEARCH_IGNORE_UNUSED IRESEARCH_API const type_id& text_t();
static const auto& text = text_t();
IRESEARCH_API const type_id& text_t();
IRESEARCH_IGNORE_UNUSED static const auto& text = text_t();
////////////////////////////////////////////////////////////////////////////////
/// @class XML format type https://en.wikipedia.org/wiki/XML
@ -83,4 +83,4 @@ IRESEARCH_IGNORE_UNUSED static const auto& xml = xml_t();
NS_END // text_format
NS_END // ROOT
#endif
#endif

View File

@ -98,23 +98,6 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
irs::index_utils::consolidate_tier options;
arangodb::velocypack::Builder properties;
{
// optional size_t
static const std::string fieldName("lookahead");
if (slice.hasKey(fieldName)) {
auto field = slice.get(fieldName);
if (!field.isNumber<size_t>()) {
errorField = fieldName;
return arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy();
}
options.lookahead = field.getNumber<size_t>();
}
}
{
// optional size_t
static const std::string fieldName("segmentsBytesFloor");
@ -185,7 +168,7 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
properties.openObject();
properties.add("type", arangodb::iresearch::toValuePair(POLICY_TIER));
properties.add("lookahead", arangodb::velocypack::Value(options.lookahead));
properties.add("lookahead", arangodb::velocypack::Value(size_t(1))); // FIXME remove in 3.5
properties.add("segmentsBytesFloor", arangodb::velocypack::Value(options.floor_segment_bytes));
properties.add("segmentsBytesMax", arangodb::velocypack::Value(options.max_segments_bytes));
properties.add("segmentsMax", arangodb::velocypack::Value(options.max_segments));
@ -772,4 +755,4 @@ NS_END // arangodb
// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------