mirror of https://gitee.com/bigwinds/arangodb
259 lines
7.1 KiB
C++
259 lines
7.1 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
/// DISCLAIMER
|
|
///
|
|
/// Copyright 2016 by EMC Corporation, All Rights Reserved
|
|
///
|
|
/// Licensed under the Apache License, Version 2.0 (the "License");
|
|
/// you may not use this file except in compliance with the License.
|
|
/// You may obtain a copy of the License at
|
|
///
|
|
/// http://www.apache.org/licenses/LICENSE-2.0
|
|
///
|
|
/// Unless required by applicable law or agreed to in writing, software
|
|
/// distributed under the License is distributed on an "AS IS" BASIS,
|
|
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
/// See the License for the specific language governing permissions and
|
|
/// limitations under the License.
|
|
///
|
|
/// Copyright holder is EMC Corporation
|
|
///
|
|
/// @author Andrey Abramov
|
|
/// @author Vasiliy Nabatchikov
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "utils/register.hpp"
|
|
|
|
// list of statically loaded analyzers via init()
|
|
#ifndef IRESEARCH_DLL
|
|
#include "delimited_token_stream.hpp"
|
|
#include "ngram_token_stream.hpp"
|
|
#include "text_token_normalizing_stream.hpp"
|
|
#include "text_token_stemming_stream.hpp"
|
|
#include "text_token_stream.hpp"
|
|
#include "token_masking_stream.hpp"
|
|
#endif
|
|
|
|
#include "analyzers.hpp"
|
|
|
|
NS_LOCAL
|
|
|
|
struct key {
|
|
key(const irs::string_ref& name,
|
|
const irs::text_format::type_id& args_format)
|
|
: args_format(args_format),
|
|
name(name) {
|
|
}
|
|
|
|
bool operator==(const key& other) const NOEXCEPT {
|
|
return &args_format == &other.args_format && name == other.name;
|
|
}
|
|
|
|
bool operator!=(const key& other) const NOEXCEPT {
|
|
return !(*this == other);
|
|
}
|
|
|
|
const irs::text_format::type_id& args_format;
|
|
const irs::string_ref name;
|
|
};
|
|
|
|
struct value{
|
|
explicit value(
|
|
irs::analysis::factory_f factory = nullptr,
|
|
irs::analysis::normalizer_f normalizer = nullptr)
|
|
: factory(factory),
|
|
normalizer(normalizer) {
|
|
}
|
|
|
|
bool empty() const NOEXCEPT { return nullptr == factory; }
|
|
|
|
bool operator==(const value& other) const NOEXCEPT {
|
|
return factory == other.factory && normalizer == other.normalizer;
|
|
}
|
|
|
|
bool operator!=(const value& other) const NOEXCEPT {
|
|
return !(*this == other);
|
|
}
|
|
|
|
const irs::analysis::factory_f factory;
|
|
const irs::analysis::normalizer_f normalizer;
|
|
};
|
|
|
|
NS_END
|
|
|
|
NS_BEGIN(std)
|
|
|
|
template<>
|
|
struct hash<::key> {
|
|
size_t operator()(const ::key& value) const NOEXCEPT {
|
|
return std::hash<irs::string_ref>()(value.name);
|
|
}
|
|
}; // hash
|
|
|
|
NS_END // std
|
|
|
|
NS_LOCAL
|
|
|
|
// prefix prepended to an analyzer name by key_to_filename() and passed to
// load_libraries() by analyzers::load_all()
const std::string FILENAME_PREFIX = "libanalyzer-";
|
|
|
|
class analyzer_register
|
|
: public irs::tagged_generic_register<::key, ::value, irs::string_ref, analyzer_register> {
|
|
protected:
|
|
virtual std::string key_to_filename(const key_type& key) const override {
|
|
auto& name = key.name;
|
|
std::string filename(FILENAME_PREFIX.size() + name.size(), 0);
|
|
|
|
std::memcpy(
|
|
&filename[0],
|
|
FILENAME_PREFIX.c_str(),
|
|
FILENAME_PREFIX.size()
|
|
);
|
|
|
|
irs::string_ref::traits_type::copy(
|
|
&filename[0] + FILENAME_PREFIX.size(),
|
|
name.c_str(),
|
|
name.size()
|
|
);
|
|
|
|
return filename;
|
|
}
|
|
};
|
|
|
|
NS_END
|
|
|
|
NS_ROOT
|
|
NS_BEGIN(analysis)
|
|
|
|
/*static*/ bool analyzers::exists(
|
|
const string_ref& name,
|
|
const irs::text_format::type_id& args_format,
|
|
bool load_library /*= true*/
|
|
) {
|
|
return !analyzer_register::instance().get(::key(name, args_format), load_library).empty();
|
|
}
|
|
|
|
/*static*/ bool analyzers::normalize(
|
|
std::string& out,
|
|
const string_ref& name,
|
|
const irs::text_format::type_id& args_format,
|
|
const string_ref& args,
|
|
bool load_library /*= true*/
|
|
) NOEXCEPT {
|
|
try {
|
|
auto* normalizer = analyzer_register::instance().get(
|
|
::key(name, args_format),
|
|
load_library
|
|
).normalizer;
|
|
|
|
return normalizer ? normalizer(args, out) : false;
|
|
} catch (...) {
|
|
IR_FRMT_ERROR("Caught exception while normalizing analyzer '%s' arguments", name.c_str());
|
|
IR_LOG_EXCEPTION();
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief create an analyzer through the factory registered for the given
///        name and arguments format
/// @param name analyzer name to look up
/// @param args_format format of the analyzer arguments, part of the lookup key
/// @param args arguments handed to the factory
/// @param load_library forwarded unchanged to the registry lookup
/// @return the factory's result, or nullptr if no factory is registered or an
///         exception was caught (the exception is logged, not rethrown)
////////////////////////////////////////////////////////////////////////////////
/*static*/ analyzer::ptr analyzers::get(
    const string_ref& name,
    const irs::text_format::type_id& args_format,
    const string_ref& args,
    bool load_library /*= true*/
) NOEXCEPT {
  try {
    const ::key lookup_key(name, args_format);
    auto* const factory_fn =
      analyzer_register::instance().get(lookup_key, load_library).factory;

    if (factory_fn) {
      return factory_fn(args);
    }
  } catch (...) {
    // the function is NOEXCEPT: report the failure and fall through
    IR_FRMT_ERROR("Caught exception while getting an analyzer instance");
    IR_LOG_EXCEPTION();
  }

  // no factory registered or the lookup/factory threw
  return nullptr;
}
|
|
|
|
/*static*/ void analyzers::init() {
|
|
#ifndef IRESEARCH_DLL
|
|
irs::analysis::delimited_token_stream::init();
|
|
irs::analysis::ngram_token_stream_base::init();
|
|
irs::analysis::text_token_normalizing_stream::init();
|
|
irs::analysis::text_token_stemming_stream::init();
|
|
irs::analysis::text_token_stream::init();
|
|
irs::analysis::token_masking_stream::init();
|
|
#endif
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief forward 'path' to load_libraries() together with FILENAME_PREFIX
/// @param path passed through unchanged as the first argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ void analyzers::load_all(const std::string& path) {
  load_libraries(
    path,
    FILENAME_PREFIX,
    "" // NOTE(review): presumably the file name suffix - confirm against load_libraries()
  );
}
|
|
|
|
/*static*/ bool analyzers::visit(
|
|
const std::function<bool(const string_ref&, const irs::text_format::type_id&)>& visitor
|
|
) {
|
|
analyzer_register::visitor_t wrapper = [&visitor](const ::key& key)->bool {
|
|
return visitor(key.name, key.args_format);
|
|
};
|
|
|
|
return analyzer_register::instance().visit(wrapper);
|
|
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- analyzer registration
|
|
// -----------------------------------------------------------------------------
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief register an analyzer factory and normalizer under the key
///        (type.name(), args_format)
/// @param type analyzer type, its name() forms the key and appears in warnings
/// @param args_format format of the analyzer arguments, part of the key
/// @param factory factory function stored in the registry entry
/// @param normalizer normalizer function stored in the registry entry
/// @param source optional description handed to the registry together with
///        the entry and echoed in collision warnings, may be nullptr
/// @note 'registered_' receives the second element of the pair returned by
///       the registry; a warning and a stack trace are logged only when the
///       registration did not succeed and the returned entry differs from
///       the one being registered
////////////////////////////////////////////////////////////////////////////////
analyzer_registrar::analyzer_registrar(
    const analyzer::type_id& type,
    const irs::text_format::type_id& args_format,
    analyzer::ptr(*factory)(const irs::string_ref& args),
    bool(*normalizer)(const irs::string_ref& args, std::string& config),
    const char* source /*= nullptr*/
) {
  irs::string_ref source_ref(source);
  const auto candidate = ::value(factory, normalizer);
  auto result = analyzer_register::instance().set(
    ::key(type.name(), args_format),
    candidate,
    source_ref.null() ? nullptr : &source_ref
  );

  registered_ = result.second;

  const bool collision = !registered_ && candidate != result.first;

  if (!collision) {
    return; // either registered or the very same entry is already present
  }

  auto* previous_source =
    analyzer_register::instance().tag(::key(type.name(), args_format));

  // pick the message variant according to which sources are known
  if (source) {
    if (previous_source) {
      IR_FRMT_WARN(
        "type name collision detected while registering analyzer, ignoring: type '%s' from %s, previously from %s",
        type.name().c_str(),
        source,
        previous_source->c_str()
      );
    } else {
      IR_FRMT_WARN(
        "type name collision detected while registering analyzer, ignoring: type '%s' from %s",
        type.name().c_str(),
        source
      );
    }
  } else if (previous_source) {
    IR_FRMT_WARN(
      "type name collision detected while registering analyzer, ignoring: type '%s', previously from %s",
      type.name().c_str(),
      previous_source->c_str()
    );
  } else {
    IR_FRMT_WARN(
      "type name collision detected while registering analyzer, ignoring: type '%s'",
      type.name().c_str()
    );
  }

  IR_LOG_STACK_TRACE();
}
|
|
|
|
NS_END // analysis
|
|
NS_END
|