// arangodb/3rdParty/iresearch/tests/index/merge_writer_tests.cpp

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 by EMC Corporation, All Rights Reserved
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is EMC Corporation
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////
#include "index_tests.hpp"
#include "formats/formats_10.hpp"
#include "iql/query_builder.hpp"
#include "store/memory_directory.hpp"
#include "utils/type_limits.hpp"
#include "utils/lz4compression.hpp"
#include "index/merge_writer.hpp"
namespace tests {
class merge_writer_tests: public ::testing::Test {
virtual void SetUp() {
// Code here will be called immediately after the constructor (right before each test).
}
virtual void TearDown() {
// Code here will be called immediately after each test (right before the destructor).
}
};
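// Helper: validates a term dictionary against the expected set of terms.
// For every term it walks the (mask-filtered) postings and checks the matching
// doc ids, and optionally the 'frequency' and 'position' attributes.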
template<typename T>
void validate_terms(
const iresearch::sub_reader& segment,
const iresearch::term_reader& terms,
uint64_t doc_count,
const iresearch::bytes_ref& min,
const iresearch::bytes_ref& max,
size_t term_size,
const iresearch::flags& term_features,
std::unordered_map<T, std::unordered_set<iresearch::doc_id_t>>& expected_terms,
size_t* frequency = nullptr,
std::vector<uint32_t>* position = nullptr
) {
ASSERT_EQ(doc_count, terms.docs_count());
ASSERT_EQ((max), (terms.max)());
ASSERT_EQ((min), (terms.min)());
ASSERT_EQ(term_size, terms.size());
ASSERT_EQ(term_features, terms.meta().features);
for (auto term_itr = terms.iterator(); term_itr->next();) {
auto itr = expected_terms.find(term_itr->value());
ASSERT_NE(expected_terms.end(), itr);
for (auto docs_itr = segment.mask(term_itr->postings(term_features)); docs_itr->next();) {
auto& attrs = docs_itr->attributes();
ASSERT_EQ(1, itr->second.erase(docs_itr->value()));
ASSERT_EQ(1 + (frequency ? 1 : 0) + (position ? 1 : 0), attrs.size());
ASSERT_TRUE(attrs.contains(iresearch::document::type()));
if (frequency) {
ASSERT_TRUE(attrs.contains(iresearch::frequency::type()));
ASSERT_EQ(*frequency, attrs.get<iresearch::frequency>()->value);
}
if (position) {
ASSERT_TRUE(attrs.contains(iresearch::position::type()));
for (auto pos: *position) {
ASSERT_TRUE(attrs.get<iresearch::position>()->next());
ASSERT_EQ(pos, attrs.get<iresearch::position>()->value());
}
ASSERT_FALSE(attrs.get<iresearch::position>()->next());
}
}
ASSERT_TRUE(itr->second.empty());
expected_terms.erase(itr);
}
ASSERT_TRUE(expected_terms.empty());
}
}
using namespace tests;
// -----------------------------------------------------------------------------
// --SECTION-- test suite
// -----------------------------------------------------------------------------
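// Merge two segments where one document (doc4) has been removed: the removed
// document and the column only it contributed ('another_column') must not
// appear in the merged segment.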
TEST_F(merge_writer_tests, test_merge_writer_columns_remove) {
iresearch::flags STRING_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type() };
iresearch::flags TEXT_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type(), iresearch::offset::type(), iresearch::payload::type() };
std::string string1;
std::string string2;
std::string string3;
std::string string4;
string1.append("string1_data");
string2.append("string2_data");
string3.append("string3_data");
string4.append("string4_data");
tests::document doc1; // doc_int, doc_string
tests::document doc2; // doc_string, doc_int
tests::document doc3; // doc_string, doc_int
tests::document doc4; // doc_string, another_column
doc1.insert(std::make_shared<tests::int_field>()); {
auto& field = doc1.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 1);
}
doc1.insert(
std::make_shared<tests::templates::string_field>("doc_string", string1)
);
doc2.insert(std::make_shared<tests::templates::string_field>("doc_string", string2));
doc2.insert(std::make_shared<tests::int_field>());
{
auto& field = doc2.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 2);
}
doc3.insert(std::make_shared<tests::templates::string_field>("doc_string", string3));
doc3.insert(std::make_shared<tests::int_field>()); {
auto& field = doc3.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 3);
}
doc4.insert(std::make_shared<tests::templates::string_field>("doc_string", string4));
doc4.insert(std::make_shared<tests::templates::string_field>("another_column", "another_value"));
auto codec_ptr = irs::formats::get("1_0");
irs::memory_directory dir;
// populate directory
{
auto query_doc4 = iresearch::iql::query_builder().build("doc_string==string4_data", std::locale::classic());
auto writer = iresearch::index_writer::make(dir, codec_ptr, iresearch::OM_CREATE);
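// doc1-doc3 are inserted with an empty indexed-field range, so only their
// stored columns are written; doc4 is fully indexed so the
// 'doc_string==string4_data' filter can match it and mark it as removed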
ASSERT_TRUE(insert(*writer, doc1.indexed.end(), doc1.indexed.end(), doc1.stored.begin(), doc1.stored.end()));
ASSERT_TRUE(insert(*writer, doc3.indexed.end(), doc3.indexed.end(), doc3.stored.begin(), doc3.stored.end()));
writer->commit();
ASSERT_TRUE(insert(*writer, doc2.indexed.end(), doc2.indexed.end(), doc2.stored.begin(), doc2.stored.end()));
ASSERT_TRUE(insert(*writer, doc4.indexed.begin(), doc4.indexed.end(), doc4.stored.begin(), doc4.stored.end()));
writer->commit();
writer->documents().remove(std::move(query_doc4.filter));
writer->commit();
}
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
auto reader = iresearch::directory_reader::open(dir, codec_ptr);
irs::merge_writer writer(dir, column_info);
ASSERT_EQ(2, reader.size());
ASSERT_EQ(2, reader[0].docs_count());
ASSERT_EQ(2, reader[1].docs_count());
// check for columns segment 0
{
auto& segment = reader[0];
auto columns = segment.columns();
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_int", columns->value().name);
ASSERT_EQ(0, columns->value().id);
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_string", columns->value().name);
ASSERT_EQ(1, columns->value().id);
ASSERT_FALSE(columns->next());
// check 'doc_int' column
{
std::unordered_map<int, iresearch::doc_id_t> expected_values{
{ 1 * 42, 1 },
{ 3 * 42, 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvint(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_int'
auto* meta = segment.column("doc_int");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'doc_string' column
{
std::unordered_map <irs::string_ref, iresearch::doc_id_t > expected_values{
{ "string1_data", 1 },
{ "string3_data", 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (irs::doc_id_t doc, const irs::bytes_ref& actual_value) {
++calls_count;
const auto actual_value_string = irs::to_string<irs::string_ref>(actual_value.c_str());
auto it = expected_values.find(actual_value_string);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_string'
auto* meta = segment.column("doc_string");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check wrong column
{
ASSERT_EQ(nullptr, segment.column("invalid_column"));
ASSERT_EQ(nullptr, segment.column_reader("invalid_column"));
}
}
// check for columns segment 1
{
auto& segment = reader[1];
auto columns = segment.columns();
ASSERT_TRUE(columns->next());
ASSERT_EQ("another_column", columns->value().name);
ASSERT_EQ(2, columns->value().id);
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_int", columns->value().name);
ASSERT_EQ(1, columns->value().id);
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_string", columns->value().name);
ASSERT_EQ(0, columns->value().id);
ASSERT_FALSE(columns->next());
// check 'doc_int' column
{
std::unordered_map<int, iresearch::doc_id_t> expected_values{
{ 2 * 42, 1 },
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (irs::doc_id_t doc, const irs::bytes_ref& in) {
++calls_count;
irs::bytes_ref_input stream(in);
const auto actual_value = iresearch::read_zvint(stream);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_int'
auto* meta = segment.column("doc_int");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'doc_string' column
{
std::unordered_map <irs::string_ref, iresearch::doc_id_t > expected_values{
{ "string2_data", 1 },
{ "string4_data", 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& in) {
++calls_count;
irs::bytes_ref_input stream(in);
const auto actual_value = iresearch::read_string<std::string>(stream);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_string'
auto* meta = segment.column("doc_string");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'another_column' column
{
std::unordered_map <std::string, iresearch::doc_id_t > expected_values{
{ "another_value", 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& in) {
++calls_count;
irs::bytes_ref_input stream(in);
const auto actual_value = iresearch::read_string<std::string>(stream);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'another_column'
auto* meta = segment.column("another_column");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check invalid column
{
ASSERT_EQ(nullptr, segment.column("invalid_column"));
ASSERT_EQ(nullptr, segment.column_reader("invalid_column"));
}
}
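// merge both source segments into a single new segment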
writer.add(reader[0]);
writer.add(reader[1]);
irs::index_meta::index_segment_t index_segment;
index_segment.meta.codec = codec_ptr;
writer.flush(index_segment);
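// verify the merged segment: doc4 is gone (3 docs) and column ids are
// re-assigned in column-name order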
{
auto segment = irs::segment_reader::open(dir, index_segment.meta);
ASSERT_EQ(3, segment.docs_count());
auto columns = segment.columns();
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_int", columns->value().name);
ASSERT_EQ(0, columns->value().id); // 0 since 'doc_int' < 'doc_string'
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_string", columns->value().name);
ASSERT_EQ(1, columns->value().id);
ASSERT_FALSE(columns->next());
// check 'doc_int' column
{
std::unordered_map<int, iresearch::doc_id_t> expected_values{
// segment 0
{ 1 * 42, 1 },
{ 3 * 42, 2 },
// segment 1
{ 2 * 42, 3 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvint(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_int'
auto* meta = segment.column("doc_int");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'doc_string' column
{
std::unordered_map <irs::string_ref, iresearch::doc_id_t > expected_values{
// segment 0
{ "string1_data", 1 },
{ "string3_data", 2 },
// segment 1
{ "string2_data", 3 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_string<std::string>(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_string'
auto* meta = segment.column("doc_string");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check that 'another_column' has been removed
{
ASSERT_EQ(nullptr, segment.column("another_column"));
ASSERT_EQ(nullptr, segment.column_reader("another_column"));
}
}
}
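// Merge two segments without removals: all four documents and both columns
// ('doc_int', 'doc_string') must survive, with column ids re-assigned in
// column-name order.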
TEST_F(merge_writer_tests, test_merge_writer_columns) {
iresearch::flags STRING_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type() };
iresearch::flags TEXT_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type(), iresearch::offset::type(), iresearch::payload::type() };
std::string string1;
std::string string2;
std::string string3;
std::string string4;
string1.append("string1_data");
string2.append("string2_data");
string3.append("string3_data");
string4.append("string4_data");
tests::document doc1; // doc_string, doc_int
tests::document doc2; // doc_string, doc_int
tests::document doc3; // doc_string, doc_int
tests::document doc4; // doc_string
doc1.insert(std::make_shared<tests::int_field>());
{
auto& field = doc1.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 1);
}
doc1.insert(std::make_shared<tests::templates::string_field>("doc_string", string1));
doc2.insert(std::make_shared<tests::templates::string_field>("doc_string", string2));
doc2.insert(std::make_shared<tests::int_field>()); {
auto& field = doc2.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 2);
}
doc3.insert(std::make_shared<tests::templates::string_field>("doc_string", string3));
doc3.insert(std::make_shared<tests::int_field>());
{
auto& field = doc3.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 3);
}
doc4.insert(std::make_shared<tests::templates::string_field>("doc_string", string4));
auto codec_ptr = irs::formats::get("1_0");
ASSERT_NE(nullptr, codec_ptr);
irs::memory_directory dir;
// populate directory
{
auto writer = iresearch::index_writer::make(dir, codec_ptr, iresearch::OM_CREATE);
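// all documents are inserted with an empty indexed-field range; this test only
// exercises merging of the stored columns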
ASSERT_TRUE(insert(*writer, doc1.indexed.end(), doc1.indexed.end(), doc1.stored.begin(), doc1.stored.end()));
ASSERT_TRUE(insert(*writer, doc3.indexed.end(), doc3.indexed.end(), doc3.stored.begin(), doc3.stored.end()));
writer->commit();
ASSERT_TRUE(insert(*writer, doc2.indexed.end(), doc2.indexed.end(), doc2.stored.begin(), doc2.stored.end()));
ASSERT_TRUE(insert(*writer, doc4.indexed.end(), doc4.indexed.end(), doc4.stored.begin(), doc4.stored.end()));
writer->commit();
}
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
auto reader = iresearch::directory_reader::open(dir, codec_ptr);
irs::merge_writer writer(dir, column_info);
ASSERT_EQ(2, reader.size());
ASSERT_EQ(2, reader[0].docs_count());
ASSERT_EQ(2, reader[1].docs_count());
// check for columns segment 0
{
auto& segment = reader[0];
auto columns = segment.columns();
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_int", columns->value().name);
ASSERT_EQ(0, columns->value().id);
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_string", columns->value().name);
ASSERT_EQ(1, columns->value().id);
ASSERT_FALSE(columns->next());
ASSERT_FALSE(columns->next());
// check 'doc_int' column
{
std::unordered_map<int, iresearch::doc_id_t> expected_values{
{ 1 * 42, 1 },
{ 3 * 42, 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvint(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_int'
auto* meta = segment.column("doc_int");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'doc_string' column
{
std::unordered_map <irs::string_ref, iresearch::doc_id_t > expected_values{
{ "string1_data", 1 },
{ "string3_data", 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_string<std::string>(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_string'
auto* meta = segment.column("doc_string");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check wrong column
{
ASSERT_EQ(nullptr, segment.column("invalid_column"));
ASSERT_EQ(nullptr, segment.column_reader("invalid_column"));
}
}
// check for columns segment 1
{
auto& segment = reader[1];
auto columns = segment.columns();
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_int", columns->value().name);
ASSERT_EQ(1, columns->value().id);
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_string", columns->value().name);
ASSERT_EQ(0, columns->value().id);
ASSERT_FALSE(columns->next());
// check 'doc_int' column
{
std::unordered_map<int, iresearch::doc_id_t> expected_values{
{ 2 * 42, 1 },
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (irs::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvint(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_int'
auto* meta = segment.column("doc_int");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'doc_string' column
{
std::unordered_map <irs::string_ref, iresearch::doc_id_t > expected_values{
{ "string2_data", 1 },
{ "string4_data", 2 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_string<std::string>(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_string'
auto* meta = segment.column("doc_string");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check wrong column
{
ASSERT_EQ(nullptr, segment.column("invalid_column"));
ASSERT_EQ(nullptr, segment.column_reader("invalid_column"));
}
}
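// merge both source segments into a single new segment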
writer.add(reader[0]);
writer.add(reader[1]);
irs::index_meta::index_segment_t index_segment;
index_segment.meta.codec = codec_ptr;
writer.flush(index_segment);
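// verify the merged segment: all 4 documents are present and columns are
// renumbered in name order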
{
auto segment = irs::segment_reader::open(dir, index_segment.meta);
ASSERT_EQ(4, segment.docs_count());
auto columns = segment.columns();
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_int", columns->value().name);
ASSERT_EQ(0, columns->value().id); // 0 since 'doc_int' < 'doc_string'
ASSERT_TRUE(columns->next());
ASSERT_EQ("doc_string", columns->value().name);
ASSERT_EQ(1, columns->value().id);
ASSERT_FALSE(columns->next());
// check 'doc_int' column
{
std::unordered_map<int, iresearch::doc_id_t> expected_values{
// segment 0
{ 1 * 42, 1 },
{ 3 * 42, 2 },
// segment 1
{ 2 * 42, 3 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvint(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_int'
auto* meta = segment.column("doc_int");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
// check 'doc_string' column
{
std::unordered_map <irs::string_ref, iresearch::doc_id_t > expected_values{
// segment 0
{ "string1_data", 1 },
{ "string3_data", 2 },
// segment 1
{ "string2_data", 3 },
{ "string4_data", 4 }
};
size_t calls_count = 0;
auto reader = [&calls_count, &expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
++calls_count;
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_string<std::string>(in);
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
return true;
};
// read values for 'doc_string'
auto* meta = segment.column("doc_string");
ASSERT_NE(nullptr, meta);
auto* column = segment.column_reader(meta->id);
ASSERT_NE(nullptr, column);
ASSERT_EQ(column, segment.column_reader(meta->name));
ASSERT_TRUE(column->visit(reader));
ASSERT_EQ(expected_values.size(), calls_count);
}
}
}
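// Full merge test: two segments with indexed fields (binary with norms, double,
// float, int, long, string, text) plus stored columns; doc4 is removed before
// the merge, so the merged segment must contain 3 documents with doc ids
// re-assigned sequentially.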
TEST_F(merge_writer_tests, test_merge_writer) {
auto codec_ptr = irs::formats::get("1_0");
ASSERT_NE(nullptr, codec_ptr);
irs::memory_directory dir;
iresearch::bstring bytes1;
iresearch::bstring bytes2;
iresearch::bstring bytes3;
bytes1.append(iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes1_data")));
bytes2.append(iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes2_data")));
bytes3.append(iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes3_data")));
iresearch::flags STRING_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type() };
iresearch::flags TEXT_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type(), iresearch::offset::type(), iresearch::payload::type() };
std::string string1;
std::string string2;
std::string string3;
std::string string4;
string1.append("string1_data");
string2.append("string2_data");
string3.append("string3_data");
string4.append("string4_data");
std::string text1;
std::string text2;
std::string text3;
text1.append("text1_data");
text2.append("text2_data");
text3.append("text3_data");
tests::document doc1;
tests::document doc2;
tests::document doc3;
tests::document doc4;
// norm for 'doc_bytes' in 'doc1' : 1/sqrt(4)
doc1.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc1.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes1);
field.features().add<iresearch::norm>();
}
doc1.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc1.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes1);
field.features().add<iresearch::norm>();
}
doc1.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc1.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes1);
field.features().add<iresearch::norm>();
}
doc1.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc1.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes1);
field.features().add<iresearch::norm>();
}
// do not track norms for 'doc_bytes' in 'doc2'
doc2.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc2.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes2);
}
doc2.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc2.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes2);
}
// norm for 'doc_bytes' in 'doc3' : 1/sqrt(2)
doc3.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc3.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes3);
field.features().add<iresearch::norm>();
}
doc3.insert(std::make_shared<tests::binary_field>()); {
auto& field = doc3.indexed.back<tests::binary_field>();
field.name(iresearch::string_ref("doc_bytes"));
field.value(bytes3);
field.features().add<iresearch::norm>();
}
doc1.insert(std::make_shared<tests::double_field>()); {
auto& field = doc1.indexed.back<tests::double_field>();
field.name(iresearch::string_ref("doc_double"));
field.value(2.718281828 * 1);
}
doc2.insert(std::make_shared<tests::double_field>()); {
auto& field = doc2.indexed.back<tests::double_field>();
field.name(iresearch::string_ref("doc_double"));
field.value(2.718281828 * 2);
}
doc3.insert(std::make_shared<tests::double_field>()); {
auto& field = doc3.indexed.back<tests::double_field>();
field.name(iresearch::string_ref("doc_double"));
field.value(2.718281828 * 3);
}
doc1.insert(std::make_shared<tests::float_field>()); {
auto& field = doc1.indexed.back<tests::float_field>();
field.name(iresearch::string_ref("doc_float"));
field.value(3.1415926535f * 1);
}
doc2.insert(std::make_shared<tests::float_field>()); {
auto& field = doc2.indexed.back<tests::float_field>();
field.name(iresearch::string_ref("doc_float"));
field.value(3.1415926535f * 2);
}
doc3.insert(std::make_shared<tests::float_field>()); {
auto& field = doc3.indexed.back<tests::float_field>();
field.name(iresearch::string_ref("doc_float"));
field.value(3.1415926535f * 3);
}
doc1.insert(std::make_shared<tests::int_field>()); {
auto& field = doc1.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 1);
}
doc2.insert(std::make_shared<tests::int_field>()); {
auto& field = doc2.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 2);
}
doc3.insert(std::make_shared<tests::int_field>()); {
auto& field = doc3.indexed.back<tests::int_field>();
field.name(iresearch::string_ref("doc_int"));
field.value(42 * 3);
}
doc1.insert(std::make_shared<tests::long_field>()); {
auto& field = doc1.indexed.back<tests::long_field>();
field.name(iresearch::string_ref("doc_long"));
field.value(12345 * 1);
}
doc2.insert(std::make_shared<tests::long_field>()); {
auto& field = doc2.indexed.back<tests::long_field>();
field.name(iresearch::string_ref("doc_long"));
field.value(12345 * 2);
}
doc3.insert(std::make_shared<tests::long_field>()); {
auto& field = doc3.indexed.back<tests::long_field>();
field.name(iresearch::string_ref("doc_long"));
field.value(12345 * 3);
}
doc1.insert(std::make_shared<tests::templates::string_field>("doc_string", string1));
doc2.insert(std::make_shared<tests::templates::string_field>("doc_string", string2));
doc3.insert(std::make_shared<tests::templates::string_field>("doc_string", string3));
doc4.insert(std::make_shared<tests::templates::string_field>("doc_string", string4));
doc1.indexed.push_back(std::make_shared<tests::templates::text_field<iresearch::string_ref>>("doc_text", text1));
doc2.indexed.push_back(std::make_shared<tests::templates::text_field<iresearch::string_ref>>("doc_text", text2));
doc3.indexed.push_back(std::make_shared<tests::templates::text_field<iresearch::string_ref>>("doc_text", text3));
// populate directory
{
auto query_doc4 = iresearch::iql::query_builder().build("doc_string==string4_data", std::locale::classic());
auto writer = iresearch::index_writer::make(dir, codec_ptr, iresearch::OM_CREATE);
ASSERT_TRUE(insert(*writer,
doc1.indexed.begin(), doc1.indexed.end(),
doc1.stored.begin(), doc1.stored.end()
));
ASSERT_TRUE(insert(*writer,
doc2.indexed.begin(), doc2.indexed.end(),
doc2.stored.begin(), doc2.stored.end()
));
writer->commit();
ASSERT_TRUE(insert(*writer,
doc3.indexed.begin(), doc3.indexed.end(),
doc3.stored.begin(), doc3.stored.end()
));
ASSERT_TRUE(insert(*writer,
doc4.indexed.begin(), doc4.indexed.end(),
doc4.stored.begin(), doc4.stored.end()
));
writer->commit();
writer->documents().remove(std::move(query_doc4.filter));
writer->commit();
}
auto docs_count = [](const irs::sub_reader& segment, const irs::string_ref& field) {
auto* reader = segment.field(field);
return reader ? reader->docs_count() : 0;
};
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
auto reader = iresearch::directory_reader::open(dir, codec_ptr);
irs::merge_writer writer(dir, column_info);
ASSERT_EQ(2, reader.size());
ASSERT_EQ(2, reader[0].docs_count());
ASSERT_EQ(2, reader[1].docs_count());
// validate initial data (segment 0)
{
auto& segment = reader[0];
ASSERT_EQ(2, segment.docs_count());
{
auto fields = segment.fields();
size_t size = 0;
while (fields->next()) {
++size;
}
ASSERT_EQ(7, size);
}
// validate bytes field
{
auto terms = segment.field("doc_bytes");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::binary_field().features();
features.add<iresearch::norm>();
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes1_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes2_data"))].emplace(2);
ASSERT_EQ(2, docs_count(segment, "doc_bytes"));
ASSERT_TRUE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // 'norm' attribute has been specified
ASSERT_EQ(features, field.features);
validate_terms(
segment,
*terms,
2,
bytes1,
bytes2,
2,
features,
expected_terms
);
std::unordered_map<float_t, iresearch::doc_id_t> expected_values{
{ 0.5f, 1 }, // norm for 'doc_bytes' in 'doc1': 4 tokens => 1/sqrt(4)
};
auto reader = [&expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvfloat(in); // read norm value
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
expected_values.erase(it);
return true;
};
auto* column = segment.column_reader(field.norm);
ASSERT_NE(nullptr, column);
ASSERT_TRUE(column->visit(reader));
ASSERT_TRUE(expected_values.empty());
}
// validate double field
{
auto terms = segment.field("doc_double");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::double_field().features();
iresearch::numeric_token_stream max;
max.reset((double_t) (2.718281828 * 2));
iresearch::numeric_token_stream min;
min.reset((double_t) (2.718281828 * 1));
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((double_t) (2.718281828 * 1));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset((double_t) (2.718281828 * 2));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
ASSERT_EQ(2, docs_count(segment, "doc_double"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next() && max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
2,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
8,
features,
expected_terms
);
}
// validate float field
{
auto terms = segment.field("doc_float");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::float_field().features();
iresearch::numeric_token_stream max;
max.reset((float_t) (3.1415926535 * 2));
iresearch::numeric_token_stream min;
min.reset((float_t) (3.1415926535 * 1));
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((float_t) (3.1415926535 * 1));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset((float_t) (3.1415926535 * 2));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
ASSERT_EQ(2, docs_count(segment, "doc_float"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
2,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
4,
features,
expected_terms
);
}
// validate int field
{
auto terms = segment.field("doc_int");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::int_field().features();
iresearch::numeric_token_stream max;
max.reset(42 * 2);
iresearch::numeric_token_stream min;
min.reset(42 * 1);
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset(42 * 1);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset(42 * 2);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
ASSERT_EQ(2, docs_count(segment, "doc_int"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
2,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
3,
features,
expected_terms
);
}
// validate long field
{
auto terms = segment.field("doc_long");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::long_field().features();
iresearch::numeric_token_stream max;
max.reset((int64_t) 12345 * 2);
iresearch::numeric_token_stream min;
min.reset((int64_t) 12345 * 1);
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((int64_t) 12345 * 1);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset((int64_t) 12345 * 2);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
ASSERT_EQ(2, docs_count(segment, "doc_long"));
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next() && max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
2,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
5,
features,
expected_terms
);
}
// validate string field
{
auto terms = segment.field("doc_string");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto& features = STRING_FIELD_FEATURES;
size_t frequency = 1;
std::vector<uint32_t> position = { 0 };
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string1_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string2_data"))].emplace(2);
ASSERT_EQ(2, docs_count(segment, "doc_string"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
2,
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(string1)),
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(string2)),
2,
features,
expected_terms,
&frequency,
&position
);
}
// validate text field
{
auto terms = segment.field("doc_text");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto& features = TEXT_FIELD_FEATURES;
size_t frequency = 1;
std::vector<uint32_t> position = { 0 };
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("text1_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("text2_data"))].emplace(2);
ASSERT_EQ(2, docs_count(segment, "doc_text"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
2,
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(text1)),
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(text2)),
2,
features,
expected_terms,
&frequency,
&position
);
}
// ...........................................................................
// validate documents
// ...........................................................................
std::unordered_set<iresearch::bytes_ref> expected_bytes;
auto column = segment.column_reader("doc_bytes");
ASSERT_NE(nullptr, column);
auto bytes_values = column->values();
std::unordered_set<double> expected_double;
column = segment.column_reader("doc_double");
ASSERT_NE(nullptr, column);
auto double_values = column->values();
std::unordered_set<float> expected_float;
column = segment.column_reader("doc_float");
ASSERT_NE(nullptr, column);
auto float_values = column->values();
std::unordered_set<int> expected_int;
column = segment.column_reader("doc_int");
ASSERT_NE(nullptr, column);
auto int_values = column->values();
std::unordered_set<int64_t> expected_long;
column = segment.column_reader("doc_long");
ASSERT_NE(nullptr, column);
auto long_values = column->values();
std::unordered_set<std::string> expected_string;
column = segment.column_reader("doc_string");
ASSERT_NE(nullptr, column);
auto string_values = column->values();
expected_bytes = { iresearch::bytes_ref(bytes1), iresearch::bytes_ref(bytes2) };
expected_double = { 2.718281828 * 1, 2.718281828 * 2 };
expected_float = { (float)(3.1415926535 * 1), (float)(3.1415926535 * 2) };
expected_int = { 42 * 1, 42 * 2 };
expected_long = { 12345 * 1, 12345 * 2 };
expected_string = { string1, string2 };
// can't have more docs than the highest doc_id
irs::bytes_ref value;
irs::bytes_ref_input in;
for (size_t i = 0, count = segment.docs_count(); i < count; ++i) {
const auto doc = iresearch::doc_id_t((iresearch::type_limits<iresearch::type_t::doc_id_t>::min)() + i);
ASSERT_TRUE(bytes_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_bytes.erase(irs::read_string<irs::bstring>(in)));
ASSERT_TRUE(double_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_double.erase(irs::read_zvdouble(in)));
ASSERT_TRUE(float_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_float.erase(irs::read_zvfloat(in)));
ASSERT_TRUE(int_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_int.erase(irs::read_zvint(in)));
ASSERT_TRUE(long_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_long.erase(irs::read_zvlong(in)));
ASSERT_TRUE(string_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_string.erase(irs::read_string<std::string>(in)));
}
ASSERT_TRUE(expected_bytes.empty());
ASSERT_TRUE(expected_double.empty());
ASSERT_TRUE(expected_float.empty());
ASSERT_TRUE(expected_int.empty());
ASSERT_TRUE(expected_long.empty());
ASSERT_TRUE(expected_string.empty());
}
// validate initial data (segment 1)
{
auto& segment = reader[1];
ASSERT_EQ(2, segment.docs_count());
{
auto fields = segment.fields();
size_t size = 0;
while (fields->next()) {
++size;
}
ASSERT_EQ(7, size);
}
// validate bytes field
{
auto terms = segment.field("doc_bytes");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::binary_field().features();
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
features.add<iresearch::norm>();
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes3_data"))].emplace(1);
ASSERT_EQ(1, docs_count(segment, "doc_bytes"));
ASSERT_TRUE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
1,
bytes3,
bytes3,
1,
features,
expected_terms
);
std::unordered_map<float_t, iresearch::doc_id_t> expected_values{
{ float(1./std::sqrt(2)), 1 }, // norm for 'doc_bytes' in 'doc3': 2 tokens => 1/sqrt(2)
};
auto reader = [&expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvfloat(in); // read norm value
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
expected_values.erase(it);
return true;
};
auto* column = segment.column_reader(field.norm);
ASSERT_NE(nullptr, column);
ASSERT_TRUE(column->visit(reader));
ASSERT_TRUE(expected_values.empty());
}
// validate double field
{
auto terms = segment.field("doc_double");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::double_field().features();
iresearch::numeric_token_stream max;
max.reset((double_t) (2.718281828 * 3));
iresearch::numeric_token_stream min;
min.reset((double_t) (2.718281828 * 3));
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((double_t) (2.718281828 * 3));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
ASSERT_EQ(1, docs_count(segment, "doc_double"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next() && max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
1,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
4,
features,
expected_terms
);
}
// validate float field
{
auto terms = segment.field("doc_float");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::float_field().features();
iresearch::numeric_token_stream max;
max.reset((float_t) (3.1415926535 * 3));
iresearch::numeric_token_stream min;
min.reset((float_t) (3.1415926535 * 3));
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((float_t) (3.1415926535 * 3));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
ASSERT_EQ(1, docs_count(segment, "doc_float"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
1,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
2,
features,
expected_terms
);
}
// validate int field
{
auto terms = segment.field("doc_int");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::int_field().features();
iresearch::numeric_token_stream max;
max.reset(42 * 3);
iresearch::numeric_token_stream min;
min.reset(42 * 3);
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset(42 * 3);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
ASSERT_EQ(1, docs_count(segment, "doc_int"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
1,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
2,
features,
expected_terms
);
}
// validate long field
{
auto terms = segment.field("doc_long");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::long_field().features();
iresearch::numeric_token_stream max;
max.reset((int64_t) 12345 * 3);
iresearch::numeric_token_stream min;
min.reset((int64_t) 12345 * 3);
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((int64_t) 12345 * 3);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
ASSERT_EQ(1, docs_count(segment, "doc_long"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next() && max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
validate_terms(
segment,
*terms,
1,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
4,
features,
expected_terms
);
}
// validate string field
{
auto terms = segment.field("doc_string");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto& features = STRING_FIELD_FEATURES;
size_t frequency = 1;
std::vector<uint32_t> position = { 0 };
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string3_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string4_data"))]; // term exists but doc4 is masked, so no live documents are expected
ASSERT_EQ(2, docs_count(segment, "doc_string"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
2,
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(string3)),
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(string4)),
2,
features,
expected_terms,
&frequency,
&position
);
}
// validate text field
{
auto terms = segment.field("doc_text");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto& features = TEXT_FIELD_FEATURES;
size_t frequency = 1;
std::vector<uint32_t> position = { 0 };
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("text3_data"))].emplace(1);
ASSERT_EQ(1, docs_count(segment, "doc_text"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
1,
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(text3)),
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(text3)),
1,
features,
expected_terms,
&frequency,
&position
);
}
// ...........................................................................
// validate documents
// ...........................................................................
std::unordered_set<iresearch::bytes_ref> expected_bytes;
auto column = segment.column_reader("doc_bytes");
ASSERT_NE(nullptr, column);
auto bytes_values = column->values();
std::unordered_set<double> expected_double;
column = segment.column_reader("doc_double");
ASSERT_NE(nullptr, column);
auto double_values = column->values();
std::unordered_set<float> expected_float;
column = segment.column_reader("doc_float");
ASSERT_NE(nullptr, column);
auto float_values = column->values();
std::unordered_set<int> expected_int;
column = segment.column_reader("doc_int");
ASSERT_NE(nullptr, column);
auto int_values = column->values();
std::unordered_set<int64_t> expected_long;
column = segment.column_reader("doc_long");
ASSERT_NE(nullptr, column);
auto long_values = column->values();
std::unordered_set<std::string> expected_string;
column = segment.column_reader("doc_string");
ASSERT_NE(nullptr, column);
auto string_values = column->values();
expected_bytes = { iresearch::bytes_ref(bytes3) };
expected_double = { 2.718281828 * 3 };
expected_float = { (float)(3.1415926535 * 3) };
expected_int = { 42 * 3 };
expected_long = { 12345 * 3 };
expected_string = { string3, string4 };
// can't have more docs than the highest doc_id
irs::bytes_ref value;
irs::bytes_ref_input in;
for (size_t i = 0, count = segment.docs_count(); i < count; ++i) {
const auto doc = iresearch::doc_id_t((iresearch::type_limits<iresearch::type_t::doc_id_t>::min)() + i);
ASSERT_EQ(!expected_bytes.empty(), bytes_values(doc, value)); in.reset(value);
expected_bytes.erase(irs::read_string<irs::bstring>(in));
ASSERT_EQ(!expected_double.empty(), double_values(doc, value)); in.reset(value);
expected_double.erase(irs::read_zvdouble(in));
ASSERT_EQ(!expected_float.empty(), float_values(doc, value)); in.reset(value);
expected_float.erase(irs::read_zvfloat(in));
ASSERT_EQ(!expected_int.empty(), int_values(doc, value)); in.reset(value);
expected_int.erase(irs::read_zvint(in));
ASSERT_EQ(!expected_long.empty(), long_values(doc, value)); in.reset(value);
expected_long.erase(irs::read_zvlong(in));
ASSERT_TRUE(string_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_string.erase(irs::read_string<std::string>(in)));
}
ASSERT_TRUE(expected_bytes.empty());
ASSERT_TRUE(expected_double.empty());
ASSERT_TRUE(expected_float.empty());
ASSERT_TRUE(expected_int.empty());
ASSERT_TRUE(expected_long.empty());
ASSERT_TRUE(expected_string.empty());
}
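// merge both segments; the removed doc4 must be filtered out during the merge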
writer.add(reader[0]);
writer.add(reader[1]);
irs::index_meta::index_segment_t index_segment;
index_segment.meta.codec = codec_ptr;
writer.flush(index_segment);
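// open the merged segment and validate fields, terms, norms and stored columns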
auto segment = irs::segment_reader::open(dir, index_segment.meta);
ASSERT_EQ(3, segment.docs_count()); // doc4 removed during merge
{
auto fields = segment.fields();
size_t size = 0;
while (fields->next()) {
++size;
}
ASSERT_EQ(7, size);
}
// validate bytes field
{
auto terms = segment.field("doc_bytes");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::binary_field().features();
features.add<iresearch::norm>();
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes1_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes2_data"))].emplace(2);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("bytes3_data"))].emplace(3);
ASSERT_EQ(3, docs_count(segment, "doc_bytes"));
ASSERT_TRUE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
3,
bytes1,
bytes3,
3,
features,
expected_terms
);
std::unordered_map<float_t, iresearch::doc_id_t> expected_values{
{ 0.5f, 1 }, // norm value for 'doc_bytes' in 'doc1'
{ float_t(1/std::sqrt(2)), 3 }, // norm value for 'doc_bytes' in 'doc3'
};
auto reader = [&expected_values] (iresearch::doc_id_t doc, const irs::bytes_ref& value) {
irs::bytes_ref_input in(value);
const auto actual_value = iresearch::read_zvfloat(in); // read norm value
auto it = expected_values.find(actual_value);
if (it == expected_values.end()) {
// can't find value
return false;
}
if (it->second != doc) {
// wrong document
return false;
}
expected_values.erase(it);
return true;
};
auto* column = segment.column_reader(field.norm);
ASSERT_NE(nullptr, column);
ASSERT_TRUE(column->visit(reader));
ASSERT_TRUE(expected_values.empty());
}
// validate double field
{
auto terms = segment.field("doc_double");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::double_field().features();
iresearch::numeric_token_stream max;
max.reset((double_t) (2.718281828 * 3));
iresearch::numeric_token_stream min;
min.reset((double_t) (2.718281828 * 1));
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((double_t) (2.718281828 * 1));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset((double_t) (2.718281828 * 2));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
{
iresearch::numeric_token_stream itr;
itr.reset((double_t) (2.718281828 * 3));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(3));
}
ASSERT_EQ(3, docs_count(segment, "doc_double"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next() && max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
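// a 64-bit double is apparently tokenized into 4 precision-stepped terms
// (hence the 4 next() calls above); 3 values * 4 distinct terms == 12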
validate_terms(
segment,
*terms,
3,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
12,
features,
expected_terms
);
}
// validate float field
{
auto terms = segment.field("doc_float");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::float_field().features();
iresearch::numeric_token_stream max;
max.reset((float_t) (3.1415926535 * 3));
iresearch::numeric_token_stream min;
min.reset((float_t) (3.1415926535 * 1));
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((float_t) (3.1415926535 * 1));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset((float_t) (3.1415926535 * 2));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
{
iresearch::numeric_token_stream itr;
itr.reset((float_t) (3.1415926535 * 3));
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(3));
}
ASSERT_EQ(3, docs_count(segment, "doc_float"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
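// a 32-bit float is apparently tokenized into 2 precision-stepped terms
// (hence the 2 next() calls above); 3 values * 2 distinct terms == 6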
validate_terms(
segment,
*terms,
3,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
6,
features,
expected_terms
);
}
// validate int field
{
auto terms = segment.field("doc_int");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::int_field().features();
iresearch::numeric_token_stream max;
max.reset(42 * 3);
iresearch::numeric_token_stream min;
min.reset(42 * 1);
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset(42 * 1);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset(42 * 2);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
{
iresearch::numeric_token_stream itr;
itr.reset(42 * 3);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(3));
}
ASSERT_EQ(3, docs_count(segment, "doc_int"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
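// a 32-bit int also yields 2 precision-stepped terms per value, but the
// high-order term presumably coincides for 42, 84 and 126, leaving 4 distinct terms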
validate_terms(
segment,
*terms,
3,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
4,
features,
expected_terms
);
}
// validate long field
{
auto terms = segment.field("doc_long");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto features = tests::long_field().features();
iresearch::numeric_token_stream max;
max.reset((int64_t) 12345 * 3);
iresearch::numeric_token_stream min;
min.reset((int64_t) 12345 * 1);
std::unordered_map<iresearch::bstring, std::unordered_set<iresearch::doc_id_t>> expected_terms;
{
iresearch::numeric_token_stream itr;
itr.reset((int64_t) 12345 * 1);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(1));
}
{
iresearch::numeric_token_stream itr;
itr.reset((int64_t) 12345 * 2);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(2));
}
{
iresearch::numeric_token_stream itr;
itr.reset((int64_t) 12345 * 3);
for (; itr.next(); expected_terms[iresearch::bstring(itr.attributes().get<iresearch::term_attribute>()->value())].emplace(3));
}
ASSERT_EQ(3, docs_count(segment, "doc_long"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
ASSERT_TRUE(max.next() && max.next() && max.next() && max.next()); // skip to last value
ASSERT_TRUE(min.next()); // skip to first value
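// a 64-bit long yields 4 precision-stepped terms per value, but the higher-order
// terms presumably coincide for 12345, 24690 and 37035, leaving 6 distinct terms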
validate_terms(
segment,
*terms,
3,
min.attributes().get<iresearch::term_attribute>()->value(),
max.attributes().get<iresearch::term_attribute>()->value(),
6,
features,
expected_terms
);
}
// validate string field
{
auto terms = segment.field("doc_string");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto& features = STRING_FIELD_FEATURES;
size_t frequency = 1;
std::vector<uint32_t> position = { 0 };
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string1_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string2_data"))].emplace(2);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("string3_data"))].emplace(3);
ASSERT_EQ(3, docs_count(segment, "doc_string"));
ASSERT_FALSE(iresearch::type_limits<iresearch::type_t::field_id_t>::valid(field.norm)); // norm attribute has not been specified
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
3,
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(string1)),
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(string3)),
3,
features,
expected_terms,
&frequency,
&position
);
}
// validate text field
{
auto terms = segment.field("doc_text");
ASSERT_NE(nullptr, terms);
auto& field = terms->meta();
auto& features = TEXT_FIELD_FEATURES;
size_t frequency = 1;
std::vector<uint32_t> position = { 0 };
std::unordered_map<iresearch::bytes_ref, std::unordered_set<iresearch::doc_id_t>> expected_terms;
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("text1_data"))].emplace(1);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("text2_data"))].emplace(2);
expected_terms[iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref("text3_data"))].emplace(3);
ASSERT_EQ(3, docs_count(segment, "doc_text"));
ASSERT_EQ(features, field.features);
ASSERT_NE(nullptr, terms);
validate_terms(
segment,
*terms,
3,
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(text1)),
iresearch::ref_cast<iresearch::byte_type>(iresearch::string_ref(text3)),
3,
features,
expected_terms,
&frequency,
&position
);
}
// ...........................................................................
// validate documents
// ...........................................................................
std::unordered_set<iresearch::bytes_ref> expected_bytes;
auto column = segment.column_reader("doc_bytes");
ASSERT_NE(nullptr, column);
auto bytes_values = column->values();
std::unordered_set<double> expected_double;
column = segment.column_reader("doc_double");
ASSERT_NE(nullptr, column);
auto double_values = column->values();
std::unordered_set<float> expected_float;
column = segment.column_reader("doc_float");
ASSERT_NE(nullptr, column);
auto float_values = column->values();
std::unordered_set<int> expected_int;
column = segment.column_reader("doc_int");
ASSERT_NE(nullptr, column);
auto int_values = column->values();
std::unordered_set<int64_t> expected_long;
column = segment.column_reader("doc_long");
ASSERT_NE(nullptr, column);
auto long_values = column->values();
std::unordered_set<std::string> expected_string;
column = segment.column_reader("doc_string");
ASSERT_NE(nullptr, column);
auto string_values = column->values();
expected_bytes = { iresearch::bytes_ref(bytes1), iresearch::bytes_ref(bytes2), iresearch::bytes_ref(bytes3) };
expected_double = { 2.718281828 * 1, 2.718281828 * 2, 2.718281828 * 3 };
expected_float = { (float)(3.1415926535 * 1), (float)(3.1415926535 * 2), (float)(3.1415926535 * 3) };
expected_int = { 42 * 1, 42 * 2, 42 * 3 };
expected_long = { 12345 * 1, 12345 * 2, 12345 * 3 };
expected_string = { string1, string2, string3 };
// can't have more docs than the highest doc_id
irs::bytes_ref value;
irs::bytes_ref_input in;
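// the merged segment is expected to assign contiguous doc ids starting at the
// minimum valid doc id, so every live document is addressable as min + i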
for (size_t i = 0, count = segment.docs_count(); i < count; ++i) {
const auto doc = iresearch::doc_id_t((iresearch::type_limits<iresearch::type_t::doc_id_t>::min)() + i);
ASSERT_TRUE(bytes_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_bytes.erase(irs::read_string<irs::bstring>(in)));
ASSERT_TRUE(double_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_double.erase(irs::read_zvdouble(in)));
ASSERT_TRUE(float_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_float.erase(irs::read_zvfloat(in)));
ASSERT_TRUE(int_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_int.erase(irs::read_zvint(in)));
ASSERT_TRUE(long_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_long.erase(irs::read_zvlong(in)));
ASSERT_TRUE(string_values(doc, value)); in.reset(value);
ASSERT_EQ(1, expected_string.erase(irs::read_string<std::string>(in)));
}
ASSERT_TRUE(expected_bytes.empty());
ASSERT_TRUE(expected_double.empty());
ASSERT_TRUE(expected_float.empty());
ASSERT_TRUE(expected_int.empty());
ASSERT_TRUE(expected_long.empty());
ASSERT_TRUE(expected_string.empty());
}
TEST_F(merge_writer_tests, test_merge_writer_add_segments) {
auto codec_ptr = irs::formats::get("1_0");
ASSERT_NE(nullptr, codec_ptr);
irs::memory_directory data_dir;
// populate directory
{
tests::json_doc_generator gen(
test_base::resource("simple_sequential_33.json"),
&tests::generic_json_field_factory
);
std::vector<const tests::document*> docs;
docs.reserve(33);
for (size_t i = 0; i < 33; ++i) {
docs.emplace_back(gen.next());
}
auto writer = irs::index_writer::make(data_dir, codec_ptr, irs::OM_CREATE);
for (auto* doc: docs) {
ASSERT_NE(nullptr, doc);
ASSERT_TRUE(insert(
*writer,
doc->indexed.begin(), doc->indexed.end(),
doc->stored.begin(), doc->stored.end()
));
writer->commit(); // create segmentN
}
}
auto reader = irs::directory_reader::open(data_dir, codec_ptr);
ASSERT_EQ(33, reader.size());
// merge 33 segments to writer (segments > 32 to trigger GCC 8.2.0 optimizer bug)
{
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
irs::memory_directory dir;
irs::index_meta::index_segment_t index_segment;
irs::merge_writer writer(dir, column_info);
for (auto& sub_reader: reader) {
writer.add(sub_reader);
}
index_segment.meta.codec = codec_ptr;
ASSERT_TRUE(writer.flush(index_segment));
auto segment = irs::segment_reader::open(dir, index_segment.meta);
ASSERT_EQ(33, segment.docs_count());
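// 'name', 'seq' and 'same' occur in every merged document; 'duplicated' in only 13 of the 33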
ASSERT_EQ(33, segment.field("name")->docs_count());
ASSERT_EQ(33, segment.field("seq")->docs_count());
ASSERT_EQ(33, segment.field("same")->docs_count());
ASSERT_EQ(13, segment.field("duplicated")->docs_count());
}
}
TEST_F(merge_writer_tests, test_merge_writer_flush_progress) {
auto codec_ptr = irs::formats::get("1_0");
ASSERT_NE(nullptr, codec_ptr);
irs::memory_directory data_dir;
// populate directory
{
tests::json_doc_generator gen(
test_base::resource("simple_sequential.json"),
&tests::generic_json_field_factory
);
auto* doc1 = gen.next();
auto* doc2 = gen.next();
auto writer = irs::index_writer::make(data_dir, codec_ptr, irs::OM_CREATE);
ASSERT_TRUE(insert(
*writer,
doc1->indexed.begin(), doc1->indexed.end(),
doc1->stored.begin(), doc1->stored.end()
));
writer->commit(); // create segment0
ASSERT_TRUE(insert(
*writer,
doc2->indexed.begin(), doc2->indexed.end(),
doc2->stored.begin(), doc2->stored.end()
));
writer->commit(); // create segment1
}
auto reader = irs::directory_reader::open(data_dir, codec_ptr);
ASSERT_EQ(2, reader.size());
ASSERT_EQ(1, reader[0].docs_count());
ASSERT_EQ(1, reader[1].docs_count());
// test default progress (empty callback, merge proceeds)
{
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
irs::memory_directory dir;
irs::index_meta::index_segment_t index_segment;
irs::merge_writer::flush_progress_t progress;
irs::merge_writer writer(dir, column_info);
index_segment.meta.codec = codec_ptr;
writer.add(reader[0]);
writer.add(reader[1]);
ASSERT_TRUE(writer.flush(index_segment, progress));
ASSERT_FALSE(index_segment.meta.files.empty());
ASSERT_EQ(2, index_segment.meta.docs_count);
ASSERT_EQ(2, index_segment.meta.live_docs_count);
ASSERT_EQ(0, index_segment.meta.version);
ASSERT_EQ(true, index_segment.meta.column_store);
auto segment = irs::segment_reader::open(dir, index_segment.meta);
ASSERT_EQ(2, segment.docs_count());
}
// test always-false progress
{
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
irs::memory_directory dir;
irs::index_meta::index_segment_t index_segment;
irs::merge_writer::flush_progress_t progress = []()->bool { return false; };
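// flush() is expected to poll the progress callback periodically; once it
// returns false the merge is aborted and the segment meta stays untouched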
irs::merge_writer writer(dir, column_info);
index_segment.meta.codec = codec_ptr;
writer.add(reader[0]);
writer.add(reader[1]);
ASSERT_FALSE(writer.flush(index_segment, progress));
ASSERT_TRUE(index_segment.filename.empty());
ASSERT_TRUE(index_segment.meta.name.empty());
ASSERT_TRUE(index_segment.meta.files.empty());
ASSERT_FALSE(index_segment.meta.column_store);
ASSERT_EQ(0, index_segment.meta.version);
ASSERT_EQ(0, index_segment.meta.docs_count);
ASSERT_EQ(0, index_segment.meta.live_docs_count);
ASSERT_EQ(0, index_segment.meta.size);
ASSERT_ANY_THROW(irs::segment_reader::open(dir, index_segment.meta));
}
size_t progress_call_count = 0;
// test always-true progress
{
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
irs::memory_directory dir;
irs::index_meta::index_segment_t index_segment;
irs::merge_writer::flush_progress_t progress =
[&progress_call_count]()->bool { ++progress_call_count; return true; };
irs::merge_writer writer(dir, column_info);
index_segment.meta.codec = codec_ptr;
writer.add(reader[0]);
writer.add(reader[1]);
ASSERT_TRUE(writer.flush(index_segment, progress));
ASSERT_FALSE(index_segment.meta.files.empty());
ASSERT_EQ(2, index_segment.meta.docs_count);
ASSERT_EQ(2, index_segment.meta.live_docs_count);
ASSERT_EQ(0, index_segment.meta.version);
ASSERT_EQ(true, index_segment.meta.column_store);
auto segment = irs::segment_reader::open(dir, index_segment.meta);
ASSERT_EQ(2, segment.docs_count());
}
ASSERT_TRUE(progress_call_count); // there should have been at least some calls
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
// test limited-true progress
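// re-run the merge, aborting after every possible number of progress callbacks
// observed during the successful run above; each abort must leave the meta untouched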
for (size_t i = 1; i < progress_call_count; ++i) { // i starts at 1 to account for the pre-decrement in 'progress'
size_t call_count = i;
irs::memory_directory dir;
irs::index_meta::index_segment_t index_segment;
irs::merge_writer::flush_progress_t progress =
[&call_count]()->bool { return --call_count; };
irs::merge_writer writer(dir, column_info);
index_segment.meta.codec = codec_ptr;
index_segment.meta.name = "merged";
writer.add(reader[0]);
writer.add(reader[1]);
ASSERT_FALSE(writer.flush(index_segment, progress));
ASSERT_EQ(0, call_count);
ASSERT_TRUE(index_segment.filename.empty());
ASSERT_TRUE(index_segment.meta.name.empty());
ASSERT_TRUE(index_segment.meta.files.empty());
ASSERT_FALSE(index_segment.meta.column_store);
ASSERT_EQ(0, index_segment.meta.version);
ASSERT_EQ(0, index_segment.meta.docs_count);
ASSERT_EQ(0, index_segment.meta.live_docs_count);
ASSERT_EQ(0, index_segment.meta.size);
ASSERT_ANY_THROW(irs::segment_reader::open(dir, index_segment.meta));
}
}
TEST_F(merge_writer_tests, test_merge_writer_field_features) {
//iresearch::flags STRING_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type() };
//iresearch::flags TEXT_FIELD_FEATURES{ iresearch::frequency::type(), iresearch::position::type(), iresearch::offset::type(), iresearch::payload::type() };
std::string field("doc_string");
std::string data("string_data");
tests::document doc1; // string
tests::document doc2; // text
doc1.insert(std::make_shared<tests::templates::string_field>(field, data));
doc2.indexed.push_back(std::make_shared<tests::templates::text_field<iresearch::string_ref>>(field, data, true));
ASSERT_TRUE(doc1.indexed.get(field)->features().is_subset_of(doc2.indexed.get(field)->features()));
ASSERT_FALSE(doc2.indexed.get(field)->features().is_subset_of(doc1.indexed.get(field)->features()));
auto codec_ptr = irs::formats::get("1_0");
ASSERT_NE(nullptr, codec_ptr);
irs::memory_directory dir;
// populate directory
{
auto writer = iresearch::index_writer::make(dir, codec_ptr, iresearch::OM_CREATE);
ASSERT_TRUE(insert(*writer,
doc1.indexed.begin(), doc1.indexed.end(),
doc1.stored.begin(), doc1.stored.end()
));
writer->commit();
ASSERT_TRUE(insert(*writer,
doc2.indexed.begin(), doc2.indexed.end(),
doc2.stored.begin(), doc2.stored.end()
));
writer->commit();
}
auto reader = iresearch::directory_reader::open(dir, codec_ptr);
ASSERT_EQ(2, reader.size());
ASSERT_EQ(1, reader[0].docs_count());
ASSERT_EQ(1, reader[1].docs_count());
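// field features appear to be accumulated in add() order: the merge succeeds when a
// later-added segment uses a feature subset of what is already registered for the
// field and fails when it would widen those features (see the two orderings below)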
// test merge existing with feature subset (success)
{
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
irs::merge_writer writer(dir, column_info);
writer.add(reader[1]); // assume 1 is segment with text field
writer.add(reader[0]); // assume 0 is segment with string field
irs::index_meta::index_segment_t index_segment;
index_segment.meta.codec = codec_ptr;
ASSERT_TRUE(writer.flush(index_segment));
}
// test merge existing with feature superset (fail)
{
irs::column_info_provider_t column_info = [](const irs::string_ref&) {
return irs::column_info(irs::compression::lz4::type(), irs::compression::options{}, true );
};
irs::merge_writer writer(dir, column_info);
writer.add(reader[0]); // assume 0 is segment with text field
writer.add(reader[1]); // assume 1 is segment with string field
irs::index_meta::index_segment_t index_segment;
index_segment.meta.codec = codec_ptr;
ASSERT_FALSE(writer.flush(index_segment));
}
}
// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------