mirror of https://gitee.com/bigwinds/arangodb
update ArangoSearch consolidation policy (#7801)
This commit is contained in:
parent
4dc64cbab5
commit
7f4740b335
|
@ -505,14 +505,13 @@ class IRESEARCH_API attribute_store
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, typename... Args>
|
template<typename T, typename... Args>
|
||||||
typename ref<T>::type& emplace(Args&&... args) {
|
typename ref<T>::type& try_emplace(bool& inserted, Args&&... args) {
|
||||||
REGISTER_TIMER_DETAILED();
|
REGISTER_TIMER_DETAILED();
|
||||||
|
|
||||||
typedef typename std::enable_if<
|
typedef typename std::enable_if<
|
||||||
std::is_base_of<stored_attribute, T>::value, T
|
std::is_base_of<stored_attribute, T>::value, T
|
||||||
>::type type;
|
>::type type;
|
||||||
|
|
||||||
bool inserted;
|
|
||||||
auto& attr = attribute_map::emplace(inserted, type::type());
|
auto& attr = attribute_map::emplace(inserted, type::type());
|
||||||
|
|
||||||
if (inserted) {
|
if (inserted) {
|
||||||
|
@ -521,6 +520,12 @@ class IRESEARCH_API attribute_store
|
||||||
|
|
||||||
return reinterpret_cast<typename ref<T>::type&>(attr);
|
return reinterpret_cast<typename ref<T>::type&>(attr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename T, typename... Args>
|
||||||
|
typename ref<T>::type& emplace(Args&&... args) {
|
||||||
|
bool inserted;
|
||||||
|
return try_emplace<T>(inserted, std::forward<Args>(args)...);
|
||||||
|
}
|
||||||
}; // attribute_store
|
}; // attribute_store
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -735,7 +735,14 @@ bool mkdir(const file_path_t path) NOEXCEPT {
|
||||||
|
|
||||||
// '\\?\' cannot be used with relative paths
|
// '\\?\' cannot be used with relative paths
|
||||||
if (!abs) {
|
if (!abs) {
|
||||||
return 0 != CreateDirectoryW(path, nullptr);
|
if (0 == ::CreateDirectoryW(path, nullptr)) {
|
||||||
|
auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
|
||||||
|
IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// workaround for path MAX_PATH
|
// workaround for path MAX_PATH
|
||||||
|
@ -746,10 +753,21 @@ bool mkdir(const file_path_t path) NOEXCEPT {
|
||||||
&dirname[0], &dirname[0] + dirname.size(), L'/', file_path_delimiter
|
&dirname[0], &dirname[0] + dirname.size(), L'/', file_path_delimiter
|
||||||
);
|
);
|
||||||
|
|
||||||
return 0 != ::CreateDirectoryW(dirname.c_str(), nullptr);
|
if (0 == ::CreateDirectoryW(dirname.c_str(), nullptr)) {
|
||||||
|
auto utf8path = boost::locale::conv::utf_to_utf<char>(path);
|
||||||
|
IR_FRMT_ERROR("Failed to create path: '%s', error %d", utf8path.c_str(), GetLastError());
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
return 0 == ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO);
|
if (0 != ::mkdir(path, S_IRWXU|S_IRWXG|S_IRWXO)) {
|
||||||
|
IR_FRMT_ERROR("Failed to create path: '%s', error %d", path, errno);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool move(const file_path_t src_path, const file_path_t dst_path) NOEXCEPT {
|
bool move(const file_path_t src_path, const file_path_t dst_path) NOEXCEPT {
|
||||||
|
|
|
@ -52,11 +52,13 @@ struct segment_stat {
|
||||||
auto& lhs = *this;
|
auto& lhs = *this;
|
||||||
|
|
||||||
if (lhs.size == rhs.size) {
|
if (lhs.size == rhs.size) {
|
||||||
if (lhs.fill_factor == rhs.fill_factor) {
|
if (lhs.fill_factor > rhs.fill_factor) {
|
||||||
return lhs.meta->name < rhs.meta->name;
|
return true;
|
||||||
|
} else if (lhs.fill_factor < rhs.fill_factor) {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return lhs.fill_factor > rhs.fill_factor;
|
return lhs.meta->name < rhs.meta->name;
|
||||||
}
|
}
|
||||||
|
|
||||||
return lhs.size < rhs.size;
|
return lhs.size < rhs.size;
|
||||||
|
@ -71,8 +73,6 @@ struct consolidation_candidate {
|
||||||
typedef std::set<segment_stat>::const_iterator iterator_t;
|
typedef std::set<segment_stat>::const_iterator iterator_t;
|
||||||
typedef std::pair<iterator_t, iterator_t> range_t;
|
typedef std::pair<iterator_t, iterator_t> range_t;
|
||||||
|
|
||||||
consolidation_candidate() = default;
|
|
||||||
|
|
||||||
explicit consolidation_candidate(iterator_t i) NOEXCEPT
|
explicit consolidation_candidate(iterator_t i) NOEXCEPT
|
||||||
: segments(i, i) {
|
: segments(i, i) {
|
||||||
}
|
}
|
||||||
|
@ -101,7 +101,7 @@ struct consolidation_candidate {
|
||||||
range_t segments;
|
range_t segments;
|
||||||
size_t count{ 0 };
|
size_t count{ 0 };
|
||||||
size_t size{ 0 }; // estimated size of the level
|
size_t size{ 0 }; // estimated size of the level
|
||||||
double_t score{ -1. }; // how good this permutation is
|
double_t score{ DBL_MIN }; // how good this permutation is
|
||||||
};
|
};
|
||||||
|
|
||||||
struct consolidation {
|
struct consolidation {
|
||||||
|
@ -127,7 +127,7 @@ struct consolidation {
|
||||||
|
|
||||||
std::vector<segment_stat> segments;
|
std::vector<segment_stat> segments;
|
||||||
size_t size{ 0 }; // estimated size of the level
|
size_t size{ 0 }; // estimated size of the level
|
||||||
double_t score{ -1. }; // how good this permutation is
|
double_t score{ DBL_MIN }; // how good this permutation is
|
||||||
};
|
};
|
||||||
|
|
||||||
/// @returns score of the consolidation bucket
|
/// @returns score of the consolidation bucket
|
||||||
|
@ -136,42 +136,63 @@ double_t consolidation_score(
|
||||||
const size_t segments_per_tier,
|
const size_t segments_per_tier,
|
||||||
const size_t floor_segment_bytes
|
const size_t floor_segment_bytes
|
||||||
) NOEXCEPT {
|
) NOEXCEPT {
|
||||||
|
// to detect how skewed the consolidation is, we do the following:
|
||||||
|
// 1. evaluate coefficient of variation, less is better
|
||||||
|
// 2. good candidates are in range [0;1]
|
||||||
|
// 3. favor candidates where the number of segments is approximately equal to 'segments_per_tier'
|
||||||
|
// 4. prefer smaller consolidations
|
||||||
|
// 5. prefer consolidations which clean removals
|
||||||
|
|
||||||
switch (consolidation.count) {
|
switch (consolidation.count) {
|
||||||
case 0:
|
case 0:
|
||||||
return -1.;
|
// empty consolidation makes no sense
|
||||||
|
return DBL_MIN;
|
||||||
case 1: {
|
case 1: {
|
||||||
auto& meta = *consolidation.segments.first->meta;
|
auto& meta = *consolidation.segments.first->meta;
|
||||||
|
|
||||||
if (meta.docs_count == meta.live_docs_count) {
|
if (meta.docs_count == meta.live_docs_count) {
|
||||||
// singleton without removals makes no sense
|
// singleton without removals makes no sense
|
||||||
// note: that is important to return score
|
return DBL_MIN;
|
||||||
// higher than default value to avoid infinite loop
|
}
|
||||||
return 0.;
|
|
||||||
|
// FIXME honor number of deletes???
|
||||||
|
// singleton with removals makes sense if nothing better is found
|
||||||
|
return DBL_MIN + DBL_EPSILON;
|
||||||
}
|
}
|
||||||
} break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t size_before_consolidation = 0;
|
size_t size_before_consolidation = 0;
|
||||||
size_t size_after_consolidation = 0;
|
size_t size_after_consolidation = 0;
|
||||||
|
size_t size_after_consolidation_floored = 0;
|
||||||
for (auto& segment_stat : consolidation) {
|
for (auto& segment_stat : consolidation) {
|
||||||
size_before_consolidation += segment_stat.meta->size;
|
size_before_consolidation += segment_stat.meta->size;
|
||||||
size_after_consolidation += segment_stat.size;
|
size_after_consolidation += segment_stat.size;
|
||||||
|
size_after_consolidation_floored += std::max(segment_stat.size, floor_segment_bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// detect how skewed the consolidation is, we want
|
// evaluate coefficient of variation
|
||||||
// to consolidate segments of approximately the same size
|
double_t sum_square_differences = 0;
|
||||||
const auto first = std::max(consolidation.front().size, floor_segment_bytes);
|
const auto segment_size_after_consolidaton_mean = double_t(size_after_consolidation_floored) / consolidation.count;
|
||||||
const auto last = std::max(consolidation.back().size, floor_segment_bytes);
|
for (auto& segment_stat : consolidation) {
|
||||||
|
const double_t diff = std::max(segment_stat.size, floor_segment_bytes)-segment_size_after_consolidaton_mean;
|
||||||
|
sum_square_differences += diff*diff;
|
||||||
|
}
|
||||||
|
|
||||||
auto score = double_t(first) / last;
|
const auto stdev = std::sqrt(sum_square_differences/consolidation.count);
|
||||||
|
const auto cv = (stdev / segment_size_after_consolidaton_mean);
|
||||||
|
|
||||||
|
// evaluate initial score
|
||||||
|
auto score = 1. - cv;
|
||||||
|
|
||||||
// favor consolidations that contain approximately the requested number of segments
|
// favor consolidations that contain approximately the requested number of segments
|
||||||
score *= std::pow(consolidation.count/double_t(segments_per_tier), 1.5);
|
score *= std::pow(consolidation.count/double_t(segments_per_tier), 1.5);
|
||||||
|
|
||||||
|
// FIXME use relative measure, e.g. consolidation_size/total_size
|
||||||
// carefully prefer smaller consolidations over the bigger ones
|
// carefully prefer smaller consolidations over the bigger ones
|
||||||
score /= std::pow(size_after_consolidation, 0.05);
|
score /= std::pow(size_after_consolidation, 0.5);
|
||||||
|
|
||||||
// favor consolidations which clean out removals
|
// favor consolidations which clean out removals
|
||||||
score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2.);
|
score /= std::pow(double_t(size_after_consolidation)/size_before_consolidation, 2);
|
||||||
|
|
||||||
return score;
|
return score;
|
||||||
}
|
}
|
||||||
|
@ -329,14 +350,14 @@ index_writer::consolidation_policy_t consolidation_policy(
|
||||||
const consolidate_tier& options
|
const consolidate_tier& options
|
||||||
) {
|
) {
|
||||||
// validate input
|
// validate input
|
||||||
|
const auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
|
||||||
auto min_segments_per_tier = (std::max)(size_t(1), options.min_segments); // can't merge less than 1 segment
|
auto min_segments_per_tier = (std::max)(size_t(1), options.min_segments); // can't merge less than 1 segment
|
||||||
auto max_segments_per_tier = (std::max)(size_t(1), options.max_segments); // can't merge less than 1 segment
|
|
||||||
min_segments_per_tier = (std::min)(min_segments_per_tier, max_segments_per_tier); // ensure min_segments_per_tier <= max_segments_per_tier
|
min_segments_per_tier = (std::min)(min_segments_per_tier, max_segments_per_tier); // ensure min_segments_per_tier <= max_segments_per_tier
|
||||||
auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
|
const auto max_segments_bytes = (std::max)(size_t(1), options.max_segments_bytes);
|
||||||
auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
|
const auto floor_segment_bytes = (std::max)(size_t(1), options.floor_segment_bytes);
|
||||||
auto lookahead = std::max(size_t(1), options.lookahead);
|
const auto min_score = options.min_score; // skip consolidation that have score less than min_score
|
||||||
|
|
||||||
return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, lookahead, options](
|
return [max_segments_per_tier, min_segments_per_tier, floor_segment_bytes, max_segments_bytes, min_score](
|
||||||
std::set<const segment_meta*>& candidates,
|
std::set<const segment_meta*>& candidates,
|
||||||
const index_meta& meta,
|
const index_meta& meta,
|
||||||
const index_writer::consolidating_segments_t& consolidating_segments
|
const index_writer::consolidating_segments_t& consolidating_segments
|
||||||
|
@ -413,22 +434,28 @@ index_writer::consolidation_policy_t consolidation_policy(
|
||||||
/// find candidates
|
/// find candidates
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
std::vector<consolidation> consolidation_candidates;
|
consolidation_candidate best(sorted_segments.begin());
|
||||||
|
|
||||||
for (consolidation_candidate best; sorted_segments.size() >= min_segments_per_tier; best.reset()) {
|
if (sorted_segments.size() >= min_segments_per_tier) {
|
||||||
for (auto i = sorted_segments.begin(), end = sorted_segments.end(); i != end; ++i) {
|
for (auto i = sorted_segments.begin(), end = sorted_segments.end(); i != end; ++i) {
|
||||||
consolidation_candidate candidate(i);
|
consolidation_candidate candidate(i);
|
||||||
|
|
||||||
while (
|
while (
|
||||||
candidate.segments.second != end
|
candidate.segments.second != end
|
||||||
&& candidate.count < max_segments_per_tier
|
&& candidate.count < max_segments_per_tier
|
||||||
&& candidate.size < max_segments_bytes
|
|
||||||
) {
|
) {
|
||||||
candidate.size += candidate.segments.second->size;
|
candidate.size += candidate.segments.second->size;
|
||||||
|
|
||||||
|
if (candidate.size > max_segments_bytes) {
|
||||||
|
// overcome the limit
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
++candidate.count;
|
++candidate.count;
|
||||||
++candidate.segments.second;
|
++candidate.segments.second;
|
||||||
|
|
||||||
if (candidate.count < min_segments_per_tier) {
|
if (candidate.count < min_segments_per_tier) {
|
||||||
|
// not enough segments yet
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -436,32 +463,16 @@ index_writer::consolidation_policy_t consolidation_policy(
|
||||||
candidate, max_segments_per_tier, floor_segment_bytes
|
candidate, max_segments_per_tier, floor_segment_bytes
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (candidate.score < min_score) {
|
||||||
|
// score is too small
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (best.score < candidate.score) {
|
if (best.score < candidate.score) {
|
||||||
best = candidate;
|
best = candidate;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!best.count) {
|
|
||||||
// can't find a suitable candidate
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// remember the best candidate
|
|
||||||
consolidation_candidates.emplace_back(best);
|
|
||||||
std::push_heap(consolidation_candidates.begin(), consolidation_candidates.end());
|
|
||||||
|
|
||||||
// remove picked segments from the list
|
|
||||||
sorted_segments.erase(best.segments.first, best.segments.second);
|
|
||||||
|
|
||||||
if (consolidation_candidates.size() >= lookahead) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (consolidation_candidates.empty()) {
|
|
||||||
// nothing to merge
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -469,8 +480,8 @@ index_writer::consolidation_policy_t consolidation_policy(
|
||||||
/// pick the best candidate
|
/// pick the best candidate
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
for (auto& segment : consolidation_candidates.front().segments) {
|
for (auto& candidate : best) {
|
||||||
candidates.insert(segment.meta);
|
candidates.insert(candidate.meta);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -74,13 +74,14 @@ struct consolidate_docs_fill {
|
||||||
/// @param max_segments_bytes maximum allowed size of all consolidated segments
|
/// @param max_segments_bytes maximum allowed size of all consolidated segments
|
||||||
/// @param floor_segment_bytes treat all smaller segments as equal for consolidation selection
|
/// @param floor_segment_bytes treat all smaller segments as equal for consolidation selection
|
||||||
/// @param lookahead how many tiers have to be inspected
|
/// @param lookahead how many tiers have to be inspected
|
||||||
|
/// @param min_score filter out candidates with score less than min_score
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
struct consolidate_tier {
|
struct consolidate_tier {
|
||||||
size_t min_segments = 1;
|
size_t min_segments = 1;
|
||||||
size_t max_segments = 10;
|
size_t max_segments = 10;
|
||||||
size_t max_segments_bytes = size_t(5)*(1<<30);
|
size_t max_segments_bytes = size_t(5)*(1<<30);
|
||||||
size_t floor_segment_bytes = size_t(2)*(1<<20);
|
size_t floor_segment_bytes = size_t(2)*(1<<20);
|
||||||
size_t lookahead = integer_traits<size_t>::const_max;
|
double_t min_score = 0.;
|
||||||
};
|
};
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -41,14 +41,14 @@ class ref_counter : public util::noncopyable { // noncopyable because shared_ptr
|
||||||
typedef std::shared_ptr<const Key> ref_t;
|
typedef std::shared_ptr<const Key> ref_t;
|
||||||
|
|
||||||
struct equal_to : Equal {
|
struct equal_to : Equal {
|
||||||
bool operator()(const ref_t& lhs, const ref_t& rhs) const {
|
bool operator()(const ref_t& lhs, const ref_t& rhs) const NOEXCEPT {
|
||||||
assert(lhs && rhs);
|
assert(lhs && rhs);
|
||||||
return Equal::operator()(*lhs, *rhs);
|
return Equal::operator()(*lhs, *rhs);
|
||||||
}
|
}
|
||||||
}; // equal_to
|
}; // equal_to
|
||||||
|
|
||||||
struct hash : Hash {
|
struct hash : Hash {
|
||||||
size_t operator()(const ref_t& value) const {
|
size_t operator()(const ref_t& value) const NOEXCEPT {
|
||||||
assert(value);
|
assert(value);
|
||||||
return Hash::operator()(*value);
|
return Hash::operator()(*value);
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,8 +67,8 @@ IRESEARCH_IGNORE_UNUSED static const auto& json = json_t();
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @class raw text format type without any specific encoding
|
/// @class raw text format type without any specific encoding
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
IRESEARCH_IGNORE_UNUSED IRESEARCH_API const type_id& text_t();
|
IRESEARCH_API const type_id& text_t();
|
||||||
static const auto& text = text_t();
|
IRESEARCH_IGNORE_UNUSED static const auto& text = text_t();
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// @class XML format type https://en.wikipedia.org/wiki/XML
|
/// @class XML format type https://en.wikipedia.org/wiki/XML
|
||||||
|
|
|
@ -98,23 +98,6 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
|
||||||
irs::index_utils::consolidate_tier options;
|
irs::index_utils::consolidate_tier options;
|
||||||
arangodb::velocypack::Builder properties;
|
arangodb::velocypack::Builder properties;
|
||||||
|
|
||||||
{
|
|
||||||
// optional size_t
|
|
||||||
static const std::string fieldName("lookahead");
|
|
||||||
|
|
||||||
if (slice.hasKey(fieldName)) {
|
|
||||||
auto field = slice.get(fieldName);
|
|
||||||
|
|
||||||
if (!field.isNumber<size_t>()) {
|
|
||||||
errorField = fieldName;
|
|
||||||
|
|
||||||
return arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy();
|
|
||||||
}
|
|
||||||
|
|
||||||
options.lookahead = field.getNumber<size_t>();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
// optional size_t
|
// optional size_t
|
||||||
static const std::string fieldName("segmentsBytesFloor");
|
static const std::string fieldName("segmentsBytesFloor");
|
||||||
|
@ -185,7 +168,7 @@ arangodb::iresearch::IResearchViewMeta::ConsolidationPolicy createConsolidationP
|
||||||
|
|
||||||
properties.openObject();
|
properties.openObject();
|
||||||
properties.add("type", arangodb::iresearch::toValuePair(POLICY_TIER));
|
properties.add("type", arangodb::iresearch::toValuePair(POLICY_TIER));
|
||||||
properties.add("lookahead", arangodb::velocypack::Value(options.lookahead));
|
properties.add("lookahead", arangodb::velocypack::Value(size_t(1))); // FIXME remove in 3.5
|
||||||
properties.add("segmentsBytesFloor", arangodb::velocypack::Value(options.floor_segment_bytes));
|
properties.add("segmentsBytesFloor", arangodb::velocypack::Value(options.floor_segment_bytes));
|
||||||
properties.add("segmentsBytesMax", arangodb::velocypack::Value(options.max_segments_bytes));
|
properties.add("segmentsBytesMax", arangodb::velocypack::Value(options.max_segments_bytes));
|
||||||
properties.add("segmentsMax", arangodb::velocypack::Value(options.max_segments));
|
properties.add("segmentsMax", arangodb::velocypack::Value(options.max_segments));
|
||||||
|
|
Loading…
Reference in New Issue