////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 by EMC Corporation, All Rights Reserved
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is EMC Corporation
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////

#include "shared.hpp"
|
|
#include "segment_writer.hpp"
|
|
#include "store/store_utils.hpp"
|
|
#include "index_meta.hpp"
|
|
#include "analysis/token_stream.hpp"
|
|
#include "analysis/token_attributes.hpp"
|
|
#include "utils/log.hpp"
|
|
#include "utils/map_utils.hpp"
|
|
#include "utils/timer_utils.hpp"
|
|
#include "utils/type_limits.hpp"
|
|
#include "utils/version_utils.hpp"
|
|
|
|
#include <math.h>
|
|
#include <set>
|
|
|
|
NS_ROOT

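// Informal usage sketch (not from the original source; inferred from the
// methods defined below, exact call sites live in the index writer):
//
//   segment_writer::ptr writer = segment_writer::make(dir);
//   writer->reset(meta);                     // bind to a segment + codec
//   auto doc = writer->begin(ctx);           // start a new document
//   writer->index(field, tokens, features);  // invert a field's token stream
//   writer->stream(doc, column);             // obtain a columnstore output
//   writer->finish();                        // write norms for the document
//   writer->flush(filename, meta);           // persist the segment
//   writer->reset();                         // prepare the writer for reuse
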
segment_writer::column::column(
    const string_ref& name,
    columnstore_writer& columnstore) {
  this->name.assign(name.c_str(), name.size());
  this->handle = columnstore.push_column();
}

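// Starts a new document: clears the per-document norm field tracking, grows
// the document mask and update-context buffers in power-of-2 blocks
// (reserving 'reserve_rollback_extra' extra slots for a potential rollback),
// records 'ctx' and returns the id the document will occupy (doc ids are
// 1-based, the internal buffers are 0-based).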
doc_id_t segment_writer::begin(
    const update_context& ctx,
    size_t reserve_rollback_extra /*= 0*/
) {
  assert(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1 < type_limits<type_t::doc_id_t>::eof());
  valid_ = true;
  norm_fields_.clear(); // clear norm fields

  if (docs_mask_.capacity() <= docs_mask_.size() + 1 + reserve_rollback_extra) {
    docs_mask_.reserve(
      math::roundup_power2(docs_mask_.size() + 1 + reserve_rollback_extra) // reserve in blocks of power-of-2
    ); // reserve space for potential rollback
  }

  if (docs_context_.size() >= docs_context_.capacity()) {
    docs_context_.reserve(math::roundup_power2(docs_context_.size() + 1)); // reserve in blocks of power-of-2
  }

  docs_context_.emplace_back(ctx);

  return doc_id_t(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1); // -1 for 0-based offset
}

segment_writer::ptr segment_writer::make(directory& dir) {
  // can't use make_unique because of the private constructor
  return memory::maker<segment_writer>::make(dir);
}

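// Rough accounting of the heap memory held by this writer: the cached
// per-document update contexts, the document mask bitvector and the
// in-memory field index.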
size_t segment_writer::memory() const NOEXCEPT {
  return sizeof(segment_writer)
    + (sizeof(update_contexts::value_type) * docs_context_.size())
    + (sizeof(bitvector) + docs_mask_.count() * sizeof(bitvector::word_t))
    + fields_.memory();
}

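// Marks a document of this in-flight segment as removed by setting its
// 0-based bit in 'docs_mask_'; returns false if 'doc_id' is invalid, outside
// the range of cached documents or already masked.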
bool segment_writer::remove(doc_id_t doc_id) {
  if (!type_limits<type_t::doc_id_t>::valid(doc_id)
      || (doc_id - type_limits<type_t::doc_id_t>::min()) >= docs_cached()
      || docs_mask_.test(doc_id - type_limits<type_t::doc_id_t>::min())) {
    return false;
  }

  docs_mask_.set(doc_id - type_limits<type_t::doc_id_t>::min());

  return true;
}

segment_writer::segment_writer(directory& dir) NOEXCEPT
  : dir_(dir), initialized_(false) {
}

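// Inverts 'tokens' for field 'name' into the in-memory field index of the
// current document; indexing is refused when the requested 'features' are
// not a subset of the features already registered for the field slot.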
bool segment_writer::index(
    const hashed_string_ref& name,
    token_stream& tokens,
    const flags& features) {
  REGISTER_TIMER_DETAILED();

  assert(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1 < type_limits<type_t::doc_id_t>::eof()); // user should check return of begin() != eof()
  auto doc_id =
    doc_id_t(docs_cached() + type_limits<type_t::doc_id_t>::min() - 1); // -1 for 0-based offset
  auto& slot = fields_.get(name);
  auto& slot_features = slot.meta().features;

  // invert only if new field features are a subset of slot features
  if ((slot.empty() || features.is_subset_of(slot_features)) &&
      slot.invert(tokens, slot.empty() ? features : slot_features, doc_id)) {
    if (features.check<norm>()) {
      norm_fields_.insert(&slot);
    }

    fields_ += features; // accumulate segment features
    return true;
  }

  return false;
}

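// Returns the columnstore output stream for column 'name' and document
// 'doc_id', creating and caching the column on first use.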
columnstore_writer::column_output& segment_writer::stream(
    doc_id_t doc_id, const hashed_string_ref& name) {
  REGISTER_TIMER_DETAILED();

  static auto generator = [](
      const hashed_string_ref& key,
      const column& value) NOEXCEPT {
    // reuse hash but point ref at value
    return hashed_string_ref(key.hash(), value.name);
  };

  // replace the original reference to 'name' provided by the caller
  // with a reference to the cached copy in 'value'
  return map_utils::try_emplace_update_key(
    columns_,          // container
    generator,         // key generator
    name,              // key
    name, *col_writer_ // value
  ).first->second.handle.second(doc_id);
}

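// NOTE: the norm factor is computed as 1/sqrt(field->size()), presumably the
// number of tokens indexed for the field in this document, and is stored only
// when it differs from norm::DEFAULT().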
void segment_writer::finish() {
  REGISTER_TIMER_DETAILED();

  // write document normalization factors (for each field marked for normalization)
  float_t value;
  for (auto* field : norm_fields_) {
    value = 1.f / float_t(std::sqrt(double_t(field->size())));
    if (value != norm::DEFAULT()) {
      auto& stream = field->norms(*col_writer_);
      write_zvfloat(stream, value);
    }
  }
}

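// Persists the segment: the columnstore and column meta (columns sorted by
// name), the inverted field index, a non-empty document mask, the set of
// tracked files (swapped into 'meta.files') and finally the segment metadata;
// on success 'filename' receives the name of the segment meta file.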
bool segment_writer::flush(std::string& filename, segment_meta& meta) {
  REGISTER_TIMER_DETAILED();

  // flush columnstore and column indices
  if (col_writer_->flush() && !columns_.empty()) {
    struct less_t {
      bool operator()(const column* lhs, const column* rhs) const NOEXCEPT {
        return lhs->name < rhs->name;
      }
    };

    std::set<const column*, less_t> columns;

    // ensure columns are sorted by name
    for (auto& entry : columns_) {
      columns.emplace(&entry.second);
    }

    // flush column meta
    col_meta_writer_->prepare(dir_, meta);

    for (auto& column : columns) {
      col_meta_writer_->write(column->name, column->handle.first);
    }

    col_meta_writer_->flush();
    columns_.clear();
    meta.column_store = true;
  }

  // flush fields metadata & inverted data
  if (docs_cached()) {
    flush_state state;
    state.dir = &dir_;
    state.doc_count = docs_cached();
    state.name = seg_name_;
    state.ver = IRESEARCH_VERSION;

    fields_.flush(*field_writer_, state);
  }

  size_t docs_mask_count = 0;

  // write non-empty document mask
  if (docs_mask_.any()) {
    document_mask docs_mask;
    auto writer = meta.codec->get_document_mask_writer();

    docs_mask.reserve(docs_mask_.size());

    for (size_t doc_id = 0, doc_id_end = docs_mask_.size();
         doc_id < doc_id_end;
         ++doc_id) {
      if (docs_mask_.test(doc_id)) {
        assert(size_t(integer_traits<doc_id_t>::const_max) >= doc_id + type_limits<type_t::doc_id_t>::min());
        docs_mask.emplace(
          doc_id_t(doc_id + type_limits<type_t::doc_id_t>::min())
        );
      }
    }

    writer->write(dir_, meta, docs_mask);
    docs_mask_count = docs_mask.size();
  }

  assert(docs_cached() >= docs_mask_count);
  meta.docs_count = docs_cached();
  meta.live_docs_count = meta.docs_count - docs_mask_count;
  meta.files.clear(); // prepare an empty set to be swapped into dir_

  if (!dir_.swap_tracked(meta.files)) {
    IR_FRMT_ERROR("Failed to swap list of tracked files in: %s", __FUNCTION__);

    return false;
  }

  // flush segment metadata
  {
    segment_meta_writer::ptr writer = meta.codec->get_segment_meta_writer();
    writer->write(dir_, meta);

    filename = writer->filename(meta);
  }

  return true;
}

void segment_writer::reset() {
  initialized_ = false;

  tracking_directory::file_set empty;

  if (!dir_.swap_tracked(empty)) {
    // on failure the next segment might have extra files which will fail to get refs
    IR_FRMT_ERROR("Failed to swap list of tracked files in: %s", __FUNCTION__);
  }

  docs_context_.clear();
  docs_mask_.clear();
  fields_.reset();
}

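// Binds the writer to a new segment: records the segment name, lazily
// instantiates the codec-specific field and column writers on first use and
// prepares the columnstore for writing.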
void segment_writer::reset(const segment_meta& meta) {
  reset();

  seg_name_ = meta.name;

  if (!field_writer_) {
    field_writer_ = meta.codec->get_field_writer(false);
  }

  if (!col_meta_writer_) {
    col_meta_writer_ = meta.codec->get_column_meta_writer();
  }

  if (!col_writer_) {
    col_writer_ = meta.codec->get_columnstore_writer();
  }

  col_writer_->prepare(dir_, meta);
  initialized_ = true;
}

NS_END