1
0
Fork 0
arangodb/3rdParty/iresearch/core/index/segment_writer.cpp

287 lines
8.2 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 by EMC Corporation, All Rights Reserved
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is EMC Corporation
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////
#include "shared.hpp"
#include "segment_writer.hpp"
#include "store/store_utils.hpp"
#include "index_meta.hpp"
#include "analysis/token_stream.hpp"
#include "analysis/token_attributes.hpp"
#include "utils/log.hpp"
#include "utils/map_utils.hpp"
#include "utils/timer_utils.hpp"
#include "utils/type_limits.hpp"
#include "utils/version_utils.hpp"
#include <math.h>
#include <set>
NS_ROOT
segment_writer::column::column(
const string_ref& name,
columnstore_writer& columnstore) {
this->name.assign(name.c_str(), name.size());
this->handle = columnstore.push_column();
}
doc_id_t segment_writer::begin(
const update_context& ctx,
size_t reserve_rollback_extra /*= 0*/
) {
assert(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1 < type_limits<type_t::doc_id_t>::eof());
valid_ = true;
norm_fields_.clear(); // clear norm fields
if (docs_mask_.capacity() <= docs_mask_.size() + 1 + reserve_rollback_extra) {
docs_mask_.reserve(
math::roundup_power2(docs_mask_.size() + 1 + reserve_rollback_extra) // reserve in blocks of power-of-2
); // reserve space for potential rollback
}
if (docs_context_.size() >= docs_context_.capacity()) {
docs_context_.reserve(math::roundup_power2(docs_context_.size() + 1)); // reserve in blocks of power-of-2
}
docs_context_.emplace_back(ctx);
return doc_id_t(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1); // -1 for 0-based offset
}
segment_writer::ptr segment_writer::make(directory& dir) {
// can't use make_unique becuase of the private constructor
return memory::maker<segment_writer>::make(dir);
}
size_t segment_writer::memory() const NOEXCEPT {
return sizeof(segment_writer)
+ (sizeof(update_contexts::value_type) * docs_context_.size())
+ (sizeof(bitvector) + docs_mask_.count() * sizeof(bitvector::word_t))
+ fields_.memory()
;
}
bool segment_writer::remove(doc_id_t doc_id) {
if (!type_limits<type_t::doc_id_t>::valid(doc_id)
|| (doc_id - type_limits<type_t::doc_id_t>::min()) >= docs_cached()
|| docs_mask_.test(doc_id - type_limits<type_t::doc_id_t>::min())) {
return false;
}
docs_mask_.set(doc_id - type_limits<type_t::doc_id_t>::min());
return true;
}
segment_writer::segment_writer(directory& dir) NOEXCEPT
: dir_(dir), initialized_(false) {
}
bool segment_writer::index(
const hashed_string_ref& name,
token_stream& tokens,
const flags& features) {
REGISTER_TIMER_DETAILED();
assert(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1 < type_limits<type_t::doc_id_t>::eof()); // user should check return of begin() != eof()
auto doc_id =
doc_id_t(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1); // -1 for 0-based offset
auto& slot = fields_.get(name);
auto& slot_features = slot.meta().features;
// invert only if new field features are a subset of slot features
if ((slot.empty() || features.is_subset_of(slot_features)) &&
slot.invert(tokens, slot.empty() ? features : slot_features, doc_id)) {
if (features.check<norm>()) {
norm_fields_.insert(&slot);
}
fields_ += features; // accumulate segment features
return true;
}
return false;
}
columnstore_writer::column_output& segment_writer::stream(
doc_id_t doc_id, const hashed_string_ref& name) {
REGISTER_TIMER_DETAILED();
static auto generator = [](
const hashed_string_ref& key,
const column& value) NOEXCEPT {
// reuse hash but point ref at value
return hashed_string_ref(key.hash(), value.name);
};
// replace original reference to 'name' provided by the caller
// with a reference to the cached copy in 'value'
return map_utils::try_emplace_update_key(
columns_, // container
generator, // key generator
name, // key
name, *col_writer_ // value
).first->second.handle.second(doc_id);
}
void segment_writer::finish() {
REGISTER_TIMER_DETAILED();
// write document normalization factors (for each field marked for normalization))
float_t value;
for (auto* field : norm_fields_) {
value = 1.f / float_t(std::sqrt(double_t(field->size())));
if (value != norm::DEFAULT()) {
auto& stream = field->norms(*col_writer_);
write_zvfloat(stream, value);
}
}
}
bool segment_writer::flush(std::string& filename, segment_meta& meta) {
REGISTER_TIMER_DETAILED();
// flush columnstore and columns indices
if (col_writer_->flush() && !columns_.empty()) {
struct less_t {
bool operator()(
const column* lhs,
const column* rhs
) const NOEXCEPT {
return lhs->name < rhs->name;
};
};
std::set<const column*, less_t> columns;
// ensure columns are sorted
for (auto& entry : columns_) {
columns.emplace(&entry.second);
}
// flush columns meta
col_meta_writer_->prepare(dir_, meta);
for (auto& column: columns) {
col_meta_writer_->write(column->name, column->handle.first);
}
col_meta_writer_->flush();
columns_.clear();
meta.column_store = true;
}
// flush fields metadata & inverted data
if (docs_cached()) {
flush_state state;
state.dir = &dir_;
state.doc_count = docs_cached();
state.name = seg_name_;
state.ver = IRESEARCH_VERSION;
fields_.flush(*field_writer_, state);
}
size_t docs_mask_count = 0;
// write non-empty document mask
if (docs_mask_.any()) {
document_mask docs_mask;
auto writer = meta.codec->get_document_mask_writer();
docs_mask.reserve(docs_mask_.size());
for (size_t doc_id = 0, doc_id_end = docs_mask_.size();
doc_id < doc_id_end;
++doc_id) {
if (docs_mask_.test(doc_id)) {
assert(size_t(integer_traits<doc_id_t>::const_max) >= doc_id + type_limits<type_t::doc_id_t>::min());
docs_mask.emplace(
doc_id_t(doc_id + type_limits<type_t::doc_id_t>::min())
);
}
}
writer->write(dir_, meta, docs_mask);
docs_mask_count = docs_mask.size();
}
assert(docs_cached() >= docs_mask_count);
meta.docs_count = docs_cached();
meta.live_docs_count = meta.docs_count - docs_mask_count;
meta.files.clear(); // prepare empy set to be swaped into dir_
if (!dir_.swap_tracked(meta.files)) {
IR_FRMT_ERROR("Failed to swap list of tracked files in: %s", __FUNCTION__);
return false;
}
// flush segment metadata
{
segment_meta_writer::ptr writer = meta.codec->get_segment_meta_writer();
writer->write(dir_, meta);
filename = writer->filename(meta);
}
return true;
}
void segment_writer::reset() {
initialized_ = false;
tracking_directory::file_set empty;
if (!dir_.swap_tracked(empty)) {
// on failre next segment might have extra files which will fail to get refs
IR_FRMT_ERROR("Failed to swap list of tracked files in: %s", __FUNCTION__);
}
docs_context_.clear();
docs_mask_.clear();
fields_.reset();
}
void segment_writer::reset(const segment_meta& meta) {
reset();
seg_name_ = meta.name;
if (!field_writer_) {
field_writer_ = meta.codec->get_field_writer(false);
}
if (!col_meta_writer_) {
col_meta_writer_ = meta.codec->get_column_meta_writer();
}
if (!col_writer_) {
col_writer_ = meta.codec->get_columnstore_writer();
}
col_writer_->prepare(dir_, meta);
initialized_ = true;
}
NS_END