From 2de95100513837a91e151efdbd7082ecd12a79f1 Mon Sep 17 00:00:00 2001 From: Vasiliy Date: Sat, 20 Apr 2019 23:20:15 +0300 Subject: [PATCH] issue 539.1: backport from iresearch upstream: add support for text normalizing analyzer, make loading shared object ptional in the registry (#8813) --- 3rdParty/iresearch/core/CMakeLists.txt | 131 +++++++ .../iresearch/core/analysis/analyzers.cpp | 15 +- .../iresearch/core/analysis/analyzers.hpp | 7 +- .../text_token_normalizing_stream.cpp | 355 ++++++++++++++++++ .../text_token_normalizing_stream.hpp | 80 ++++ 3rdParty/iresearch/core/formats/formats.cpp | 16 +- 3rdParty/iresearch/core/formats/formats.hpp | 9 +- 3rdParty/iresearch/core/search/scorers.cpp | 13 +- 3rdParty/iresearch/core/search/scorers.hpp | 7 +- 3rdParty/iresearch/core/utils/attributes.cpp | 12 +- 3rdParty/iresearch/core/utils/attributes.hpp | 7 +- 3rdParty/iresearch/core/utils/register.hpp | 15 +- .../IResearch/IResearchAnalyzerFeature.cpp | 4 +- arangod/IResearch/IResearchOrderFactory.cpp | 18 +- 14 files changed, 652 insertions(+), 37 deletions(-) create mode 100644 3rdParty/iresearch/core/analysis/text_token_normalizing_stream.cpp create mode 100644 3rdParty/iresearch/core/analysis/text_token_normalizing_stream.hpp diff --git a/3rdParty/iresearch/core/CMakeLists.txt b/3rdParty/iresearch/core/CMakeLists.txt index 9d416a31e5..966f5e7f56 100644 --- a/3rdParty/iresearch/core/CMakeLists.txt +++ b/3rdParty/iresearch/core/CMakeLists.txt @@ -367,6 +367,7 @@ add_dependencies(${IResearch_TARGET_NAME}-static ${IResearch_TARGET_NAME}-analyzer-delimited-static ${IResearch_TARGET_NAME}-analyzer-ngram-static ${IResearch_TARGET_NAME}-analyzer-text-static + ${IResearch_TARGET_NAME}-analyzer-text-normalizing-static ${IResearch_TARGET_NAME}-analyzer-text-stemming-static ${IResearch_TARGET_NAME}-analyzer-token-masking-static ${IResearch_TARGET_NAME}-format-1_0-static @@ -387,6 +388,7 @@ if(MSVC) ${IResearch_TARGET_NAME}-analyzer-delimited-static-scrt 
${IResearch_TARGET_NAME}-analyzer-ngram-static-scrt ${IResearch_TARGET_NAME}-analyzer-text-static-scrt + ${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt ${IResearch_TARGET_NAME}-analyzer-text-stemming-static-scrt ${IResearch_TARGET_NAME}-analyzer-token-masking-static-scrt ${IResearch_TARGET_NAME}-format-1_0-static-scrt @@ -505,6 +507,7 @@ target_link_libraries(${IResearch_TARGET_NAME}-static ${IResearch_TARGET_NAME}-analyzer-delimited-static ${IResearch_TARGET_NAME}-analyzer-ngram-static ${IResearch_TARGET_NAME}-analyzer-text-static + ${IResearch_TARGET_NAME}-analyzer-text-normalizing-static ${IResearch_TARGET_NAME}-analyzer-text-stemming-static ${IResearch_TARGET_NAME}-analyzer-token-masking-static ${IResearch_TARGET_NAME}-format-1_0-static @@ -537,6 +540,7 @@ if(MSVC) ${IResearch_TARGET_NAME}-analyzer-delimited-static-scrt ${IResearch_TARGET_NAME}-analyzer-ngram-static-scrt ${IResearch_TARGET_NAME}-analyzer-text-static-scrt + ${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt ${IResearch_TARGET_NAME}-analyzer-text-stemming-static-scrt ${IResearch_TARGET_NAME}-analyzer-token-masking-static-scrt ${IResearch_TARGET_NAME}-format-1_0-static-scrt @@ -566,6 +570,7 @@ if (MSVC) "$" "$" "$" + "$" "$" "$" "$" @@ -624,6 +629,7 @@ if (MSVC) "$" "$" "$" + "$" "$" "$" "$" @@ -683,6 +689,7 @@ elseif (APPLE) "$" "$" "$" + "$" "$" "$" "$" @@ -741,6 +748,7 @@ else() "$" "$" "$" + "$" "$" "$" "$" @@ -1021,6 +1029,129 @@ if(MSVC) ) endif() +################################################################################ +### analysis plugin : text token normalizing +################################################################################ + +add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared + SHARED + ./analysis/text_token_normalizing_stream.cpp + ./analysis/text_token_normalizing_stream.hpp +) + +add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static + STATIC + ./analysis/text_token_normalizing_stream.cpp +) + +# 
setup CRT +if(MSVC) + add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt + SHARED + ./analysis/text_token_normalizing_stream.cpp + ) + + add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt + STATIC + ./analysis/text_token_normalizing_stream.cpp + ) +endif() + +target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared + PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries +) + +target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static + PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries +) + +# setup CRT +if(MSVC) + target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt + PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries + ) + + target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt + PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries + ) +endif() + +# setup CRT +if(MSVC) + target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared + PRIVATE "$<$:/MDd>$<$>:/MD>" + ) + + target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static + PRIVATE "$<$:/MDd>$<$>:/MD>" + ) + + target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt + PRIVATE "$<$:/MTd>$<$>:/MT>" + ) + + target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt + PRIVATE "$<$:/MTd>$<$>:/MT>" + ) +endif() + +set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared + PROPERTIES + PREFIX lib + IMPORT_PREFIX lib + OUTPUT_NAME analyzer-text-token-normalize + DEBUG_POSTFIX "" # otherwise library names will not match expected dynamically loaded value + COMPILE_DEFINITIONS 
"$<$:IRESEARCH_DEBUG>;IRESEARCH_DLL;IRESEARCH_DLL_EXPORTS;IRESEARCH_DLL_PLUGIN" + CXX_VISIBILITY_PRESET hidden +) + +set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static + PROPERTIES + PREFIX lib + IMPORT_PREFIX lib + OUTPUT_NAME analyzer-text-token-normalize-s + COMPILE_DEFINITIONS "$<$:IRESEARCH_DEBUG>" +) + +# setup CRT +if(MSVC) + set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt + PROPERTIES + PREFIX lib + IMPORT_PREFIX lib + OUTPUT_NAME analyzer-text-token-normalize-scrt + COMPILE_DEFINITIONS "$<$:IRESEARCH_DEBUG>;IRESEARCH_DLL;IRESEARCH_DLL_EXPORTS;IRESEARCH_DLL_PLUGIN" + CXX_VISIBILITY_PRESET hidden + ) + + set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt + PROPERTIES + PREFIX lib + IMPORT_PREFIX lib + OUTPUT_NAME analyzer-text-token-normalize-scrt-s + COMPILE_DEFINITIONS "$<$:IRESEARCH_DEBUG>" + ) +endif() + +target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared + ${IResearch_TARGET_NAME}-shared +) + +target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static + ${IResearch_TARGET_NAME}-static +) + +# setup CRT +if(MSVC) + target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt + ${IResearch_TARGET_NAME}-shared-scrt + ) + + target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt + ${IResearch_TARGET_NAME}-static-scrt + ) +endif() + ################################################################################ ### analysis plugin : text token stemming ################################################################################ diff --git a/3rdParty/iresearch/core/analysis/analyzers.cpp b/3rdParty/iresearch/core/analysis/analyzers.cpp index ec31ae11fb..a12047b084 100644 --- a/3rdParty/iresearch/core/analysis/analyzers.cpp +++ b/3rdParty/iresearch/core/analysis/analyzers.cpp @@ -27,6 +27,7 @@ #ifndef IRESEARCH_DLL #include "delimited_token_stream.hpp" 
#include "ngram_token_stream.hpp" + #include "text_token_normalizing_stream.hpp" #include "text_token_stemming_stream.hpp" #include "text_token_stream.hpp" #include "token_masking_stream.hpp" @@ -94,19 +95,22 @@ NS_BEGIN(analysis) /*static*/ bool analyzers::exists( const string_ref& name, - const irs::text_format::type_id& args_format + const irs::text_format::type_id& args_format, + bool load_library /*= true*/ ) { - return nullptr != analyzer_register::instance().get(entry_key_t(name, args_format)); + return nullptr != analyzer_register::instance().get(entry_key_t(name, args_format), load_library); } /*static*/ analyzer::ptr analyzers::get( const string_ref& name, const irs::text_format::type_id& args_format, - const string_ref& args + const string_ref& args, + bool load_library /*= true*/ ) NOEXCEPT { try { - auto* factory = - analyzer_register::instance().get(entry_key_t(name, args_format)); + auto* factory = analyzer_register::instance().get( + entry_key_t(name, args_format), load_library + ); return factory ? factory(args) : nullptr; } catch (...) 
{ @@ -121,6 +125,7 @@ NS_BEGIN(analysis) #ifndef IRESEARCH_DLL irs::analysis::delimited_token_stream::init(); irs::analysis::ngram_token_stream::init(); + irs::analysis::text_token_normalizing_stream::init(); irs::analysis::text_token_stemming_stream::init(); irs::analysis::text_token_stream::init(); irs::analysis::token_masking_stream::init(); diff --git a/3rdParty/iresearch/core/analysis/analyzers.hpp b/3rdParty/iresearch/core/analysis/analyzers.hpp index 4e340d99e0..f4d45c919f 100644 --- a/3rdParty/iresearch/core/analysis/analyzers.hpp +++ b/3rdParty/iresearch/core/analysis/analyzers.hpp @@ -78,7 +78,9 @@ class IRESEARCH_API analyzers { /// @brief checks whether an analyzer with the specified name is registered //////////////////////////////////////////////////////////////////////////////// static bool exists( - const string_ref& name, const irs::text_format::type_id& args_format + const string_ref& name, + const irs::text_format::type_id& args_format, + bool load_library = true ); //////////////////////////////////////////////////////////////////////////////// @@ -90,7 +92,8 @@ class IRESEARCH_API analyzers { static analyzer::ptr get( const string_ref& name, const irs::text_format::type_id& args_format, - const string_ref& args + const string_ref& args, + bool load_library = true ) NOEXCEPT; //////////////////////////////////////////////////////////////////////////////// diff --git a/3rdParty/iresearch/core/analysis/text_token_normalizing_stream.cpp b/3rdParty/iresearch/core/analysis/text_token_normalizing_stream.cpp new file mode 100644 index 0000000000..c32bd20b4f --- /dev/null +++ b/3rdParty/iresearch/core/analysis/text_token_normalizing_stream.cpp @@ -0,0 +1,355 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2019 ArangoDB GmbH, Cologne, Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the 
License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +/// @author Vasiliy Nabatchikov +//////////////////////////////////////////////////////////////////////////////// + +#include // for rapidjson::Document +#include // for icu::Locale + +#if defined(_MSC_VER) + #pragma warning(disable: 4512) +#endif + + #include // for icu::Normalizer2 + +#if defined(_MSC_VER) + #pragma warning(default: 4512) +#endif + +#include // for icu::Transliterator + +#if defined(_MSC_VER) + #pragma warning(disable: 4229) +#endif + + #include // for u_cleanup + +#if defined(_MSC_VER) + #pragma warning(default: 4229) +#endif + +#include "utils/locale_utils.hpp" + +#include "text_token_normalizing_stream.hpp" + +NS_ROOT +NS_BEGIN(analysis) + +// ----------------------------------------------------------------------------- +// --SECTION-- private types +// ----------------------------------------------------------------------------- + +struct text_token_normalizing_stream::state_t { + icu::UnicodeString data; + icu::Locale icu_locale; + std::locale locale; + std::shared_ptr normalizer; + const options_t options; + std::string term_buf; // used by reset() + std::shared_ptr transliterator; + state_t(const options_t& opts): icu_locale("C"), options(opts) { + // NOTE: use of the default constructor for Locale() or + // use of Locale::createFromName(nullptr) + // causes a memory leak with Boost 1.58, as detected by valgrind + icu_locale.setToBogus(); // set to uninitialized + } +}; + +NS_END // analysis +NS_END // ROOT + 
+NS_LOCAL + +//////////////////////////////////////////////////////////////////////////////// +/// @brief args is a jSON encoded object with the following attributes: +/// "locale"(string): the locale to use for normalizing +/// "case_convert"(string enum): modify token case using "locale" +/// "no_accent"(bool): remove accents +//////////////////////////////////////////////////////////////////////////////// +irs::analysis::analyzer::ptr make_json(const irs::string_ref& args) { + rapidjson::Document json; + + if (json.Parse(args.c_str(), args.size()).HasParseError()) { + IR_FRMT_ERROR( + "Invalid jSON arguments passed while constructing text_token_normalizing_stream, arguments: %s", + args.c_str() + ); + + return nullptr; + } + + try { + typedef irs::analysis::text_token_normalizing_stream::options_t options_t; + options_t options; + + switch (json.GetType()) { + case rapidjson::kStringType: + options.locale = json.GetString(); // required + + return irs::memory::make_shared( + std::move(options) + ); + case rapidjson::kObjectType: + if (json.HasMember("locale") && json["locale"].IsString()) { + options.locale = json["locale"].GetString(); // required + + if (json.HasMember("case_convert")) { + auto& case_convert = json["case_convert"]; // optional string enum + + if (!case_convert.IsString()) { + IR_FRMT_WARN("Non-string value in 'case_convert' while constructing text_token_normalizing_stream from jSON arguments: %s", args.c_str()); + + return nullptr; + } + + static const std::unordered_map case_convert_map = { + { "lower", options_t::case_convert_t::LOWER }, + { "none", options_t::case_convert_t::NONE }, + { "upper", options_t::case_convert_t::UPPER }, + }; + auto itr = case_convert_map.find(case_convert.GetString()); + + if (itr == case_convert_map.end()) { + IR_FRMT_WARN("Invalid value in 'case_convert' while constructing text_token_normalizing_stream from jSON arguments: %s", args.c_str()); + + return nullptr; + } + + options.case_convert = itr->second; + } + + 
if (json.HasMember("no_accent")) { + auto& no_accent = json["no_accent"]; // optional bool + + if (!no_accent.IsBool()) { + IR_FRMT_WARN("Non-boolean value in 'no_accent' while constructing text_token_normalizing_stream from jSON arguments: %s", args.c_str()); + + return nullptr; + } + + options.no_accent = no_accent.GetBool(); + } + + return irs::memory::make_shared( + std::move(options) + ); + } + default: // fall through + IR_FRMT_ERROR( + "Missing 'locale' while constructing text_token_normalizing_stream from jSON arguments: %s", + args.c_str() + ); + } + } catch (...) { + IR_FRMT_ERROR( + "Caught error while constructing text_token_normalizing_stream from jSON arguments: %s", + args.c_str() + ); + IR_LOG_EXCEPTION(); + } + + return nullptr; +} + +//////////////////////////////////////////////////////////////////////////////// +/// @brief args is a language to use for normalizing +//////////////////////////////////////////////////////////////////////////////// +irs::analysis::analyzer::ptr make_text(const irs::string_ref& args) { + try { + irs::analysis::text_token_normalizing_stream::options_t options; + + options.locale = args; // interpret 'args' as a locale name + + return irs::memory::make_shared( + std::move(options) + ); + } catch (...) 
{ + IR_FRMT_ERROR( + "Caught error while constructing text_token_normalizing_stream TEXT arguments: %s", + args.c_str() + ); + IR_LOG_EXCEPTION(); + } + + return nullptr; +} + +REGISTER_ANALYZER_JSON(irs::analysis::text_token_normalizing_stream, make_json); +REGISTER_ANALYZER_TEXT(irs::analysis::text_token_normalizing_stream, make_text); + +NS_END + +NS_ROOT +NS_BEGIN(analysis) + +DEFINE_ANALYZER_TYPE_NAMED(text_token_normalizing_stream, "text-token-normalize") + +text_token_normalizing_stream::text_token_normalizing_stream( + const options_t& options +): analyzer(text_token_normalizing_stream::type()), + attrs_(4), // increment + offset + payload + term + state_(memory::make_unique(options)), + term_eof_(true) { + attrs_.emplace(inc_); + attrs_.emplace(offset_); + attrs_.emplace(payload_); + attrs_.emplace(term_); +} + +/*static*/ void text_token_normalizing_stream::init() { + REGISTER_ANALYZER_JSON(text_token_normalizing_stream, make_json); // match registration above + REGISTER_ANALYZER_TEXT(text_token_normalizing_stream, make_text); // match registration above +} + +/*static*/ analyzer::ptr text_token_normalizing_stream::make( + const string_ref& locale +) { + return make_text(locale); +} + +bool text_token_normalizing_stream::next() { + if (term_eof_) { + return false; + } + + term_eof_ = true; + + return true; +} + +bool text_token_normalizing_stream::reset(const irs::string_ref& data) { + if (state_->icu_locale.isBogus()) { + state_->locale = irs::locale_utils::locale( + state_->options.locale, irs::string_ref::NIL, true // true == convert to unicode, required for ICU and Snowball + ); + state_->icu_locale = icu::Locale( + std::string(irs::locale_utils::language(state_->locale)).c_str(), + std::string(irs::locale_utils::country(state_->locale)).c_str() + ); + + if (state_->icu_locale.isBogus()) { + return false; + } + } + + auto err = UErrorCode::U_ZERO_ERROR; // a value that passes the U_SUCCESS() test + + if (!state_->normalizer) { + // reusable object 
owned by ICU + state_->normalizer.reset( + icu::Normalizer2::getNFCInstance(err), [](const icu::Normalizer2*)->void{} + ); + + if (!U_SUCCESS(err) || !state_->normalizer) { + state_->normalizer.reset(); + + return false; + } + } + + if (state_->options.no_accent && !state_->transliterator) { + // transliteration rule taken verbatim from: http://userguide.icu-project.org/transforms/general + icu::UnicodeString collationRule("NFD; [:Nonspacing Mark:] Remove; NFC"); // do not allocate statically since it causes memory leaks in ICU + + // reusable object owned by *this + state_->transliterator.reset(icu::Transliterator::createInstance( + collationRule, UTransDirection::UTRANS_FORWARD, err + )); + + if (!U_SUCCESS(err) || !state_->transliterator) { + state_->transliterator.reset(); + + return false; + } + } + + // ........................................................................... + // convert encoding to UTF8 for use with ICU + // ........................................................................... + std::string data_utf8; + + // valid conversion since 'locale_' was created with internal unicode encoding + if (!irs::locale_utils::append_internal(data_utf8, data, state_->locale)) { + return false; // UTF8 conversion failure + } + + if (data_utf8.size() > irs::integer_traits::const_max) { + return false; // ICU UnicodeString signatures can handle at most INT32_MAX + } + + state_->data = icu::UnicodeString::fromUTF8( + icu::StringPiece(data_utf8.c_str(), (int32_t)(data_utf8.size())) + ); + + // ........................................................................... + // normalize unicode + // ........................................................................... + icu::UnicodeString term_icu; + + state_->normalizer->normalize(state_->data, term_icu, err); + + if (!U_SUCCESS(err)) { + term_icu = state_->data; // use non-normalized value if normalization failure + } + + // ........................................................................... 
+ // case-convert unicode + // ........................................................................... + switch (state_->options.case_convert) { + case options_t::case_convert_t::LOWER: + term_icu.toLower(state_->icu_locale); // inplace case-conversion + break; + case options_t::case_convert_t::UPPER: + term_icu.toUpper(state_->icu_locale); // inplace case-conversion + break; + default: + {} // NOOP + }; + + // ........................................................................... + // collate value, e.g. remove accents + // ........................................................................... + if (state_->transliterator) { + state_->transliterator->transliterate(term_icu); // inplace transliteration + } + + state_->term_buf.clear(); + term_icu.toUTF8String(state_->term_buf); + + // ........................................................................... + // use the normalized value + // ........................................................................... + static_assert(sizeof(irs::byte_type) == sizeof(char), "sizeof(irs::byte_type) != sizeof(char)"); + term_.value(irs::ref_cast(irs::string_ref(state_->term_buf))); + offset_.start = 0; + offset_.end = data.size(); + payload_.value = ref_cast(data); + term_eof_ = false; + + return true; +} + +NS_END // analysis +NS_END // ROOT + +// ----------------------------------------------------------------------------- +// --SECTION-- END-OF-FILE +// ----------------------------------------------------------------------------- \ No newline at end of file diff --git a/3rdParty/iresearch/core/analysis/text_token_normalizing_stream.hpp b/3rdParty/iresearch/core/analysis/text_token_normalizing_stream.hpp new file mode 100644 index 0000000000..b1bd874f91 --- /dev/null +++ b/3rdParty/iresearch/core/analysis/text_token_normalizing_stream.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2019 ArangoDB GmbH, Cologne, 
Germany +/// +/// Licensed under the Apache License, Version 2.0 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// http://www.apache.org/licenses/LICENSE-2.0 +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Andrey Abramov +/// @author Vasiliy Nabatchikov +//////////////////////////////////////////////////////////////////////////////// + +#ifndef IRESEARCH_TEXT_TOKEN_NORMALIZING_STREAM_H +#define IRESEARCH_TEXT_TOKEN_NORMALIZING_STREAM_H + +#include "analyzers.hpp" +#include "token_attributes.hpp" + +NS_ROOT +NS_BEGIN(analysis) + +//////////////////////////////////////////////////////////////////////////////// +/// @brief an analyser capable of normalizing the text, treated as a single +/// token, i.e. 
case conversion and accent removal +//////////////////////////////////////////////////////////////////////////////// +class text_token_normalizing_stream: public analyzer, util::noncopyable { + public: + struct options_t { + enum case_convert_t { LOWER, NONE, UPPER }; + case_convert_t case_convert{case_convert_t::NONE}; // no extra normalization + std::string locale; + bool no_accent{false}; // no extra normalization + }; + + struct state_t; + + DECLARE_ANALYZER_TYPE(); + + // for use with irs::order::add() and default args (static build) + DECLARE_FACTORY(const string_ref& locale); + + text_token_normalizing_stream(const options_t& options); + virtual const irs::attribute_view& attributes() const NOEXCEPT override { + return attrs_; + } + static void init(); // for triggering registration in a static build + virtual bool next() override; + virtual bool reset(const irs::string_ref& data) override; + + private: + class term_attribute final: public irs::term_attribute { + public: + using irs::term_attribute::value; + void value(const irs::bytes_ref& value) { value_ = value; } + }; + + irs::attribute_view attrs_; + irs::increment inc_; + irs::offset offset_; + irs::payload payload_; // raw token value + std::shared_ptr state_; + term_attribute term_; // normalized token value + bool term_eof_; +}; + +NS_END // analysis +NS_END // ROOT + +#endif \ No newline at end of file diff --git a/3rdParty/iresearch/core/formats/formats.cpp b/3rdParty/iresearch/core/formats/formats.cpp index e0739ae478..5742c182f8 100644 --- a/3rdParty/iresearch/core/formats/formats.cpp +++ b/3rdParty/iresearch/core/formats/formats.cpp @@ -90,13 +90,19 @@ DEFINE_ATTRIBUTE_TYPE(iresearch::term_meta) meta.segments_ = std::move(segments); } -/*static*/ bool formats::exists(const string_ref& name) { - return nullptr != format_register::instance().get(name); +/*static*/ bool formats::exists( + const string_ref& name, + bool load_library /*= true*/ +) { + return nullptr != 
format_register::instance().get(name, load_library); } -/*static*/ format::ptr formats::get(const string_ref& name) NOEXCEPT { +/*static*/ format::ptr formats::get( + const string_ref& name, + bool load_library /*= true*/ +) NOEXCEPT { try { - auto* factory = format_register::instance().get(name); + auto* factory = format_register::instance().get(name, load_library); return factory ? factory() : nullptr; } catch (...) { @@ -182,4 +188,4 @@ NS_END // ----------------------------------------------------------------------------- // --SECTION-- END-OF-FILE -// ----------------------------------------------------------------------------- +// ----------------------------------------------------------------------------- \ No newline at end of file diff --git a/3rdParty/iresearch/core/formats/formats.hpp b/3rdParty/iresearch/core/formats/formats.hpp index 28a1f46a68..0f0ee8fb7d 100644 --- a/3rdParty/iresearch/core/formats/formats.hpp +++ b/3rdParty/iresearch/core/formats/formats.hpp @@ -525,7 +525,7 @@ class IRESEARCH_API formats { //////////////////////////////////////////////////////////////////////////////// /// @brief checks whether a format with the specified name is registered //////////////////////////////////////////////////////////////////////////////// - static bool exists(const string_ref& name); + static bool exists(const string_ref& name, bool load_library = true); ////////////////////////////////////////////////////////////////////////////// /// @brief find a format by name, or nullptr if not found @@ -533,7 +533,10 @@ class IRESEARCH_API formats { /// requires use of DECLARE_FACTORY() in class definition /// NOTE: make(...) 
MUST be defined in CPP to ensire proper code scope ////////////////////////////////////////////////////////////////////////////// - static format::ptr get(const string_ref& name) NOEXCEPT; + static format::ptr get( + const string_ref& name, + bool load_library = true + ) NOEXCEPT; //////////////////////////////////////////////////////////////////////////////// /// @brief for static lib reference all known formats in lib @@ -589,4 +592,4 @@ class IRESEARCH_API format_registrar { NS_END -#endif +#endif \ No newline at end of file diff --git a/3rdParty/iresearch/core/search/scorers.cpp b/3rdParty/iresearch/core/search/scorers.cpp index f670173818..32a1808cdb 100644 --- a/3rdParty/iresearch/core/search/scorers.cpp +++ b/3rdParty/iresearch/core/search/scorers.cpp @@ -89,19 +89,22 @@ NS_ROOT /*static*/ bool scorers::exists( const string_ref& name, - const irs::text_format::type_id& args_format + const irs::text_format::type_id& args_format, + bool load_library /*= true*/ ) { - return nullptr != scorer_register::instance().get(entry_key_t(name, args_format)); + return nullptr != scorer_register::instance().get(entry_key_t(name, args_format),load_library); } /*static*/ sort::ptr scorers::get( const string_ref& name, const irs::text_format::type_id& args_format, - const string_ref& args + const string_ref& args, + bool load_library /*= true*/ ) NOEXCEPT { try { - auto* factory = - scorer_register::instance().get(entry_key_t(name, args_format)); + auto* factory = scorer_register::instance().get( + entry_key_t(name, args_format), load_library + ); return factory ? factory(args) : nullptr; } catch (...) 
{ diff --git a/3rdParty/iresearch/core/search/scorers.hpp b/3rdParty/iresearch/core/search/scorers.hpp index 9f3f428ded..f5a5030a37 100644 --- a/3rdParty/iresearch/core/search/scorers.hpp +++ b/3rdParty/iresearch/core/search/scorers.hpp @@ -77,7 +77,9 @@ class IRESEARCH_API scorers { /// @brief checks whether scorer with a specified name is registered //////////////////////////////////////////////////////////////////////////////// static bool exists( - const string_ref& name, const irs::text_format::type_id& args_format + const string_ref& name, + const irs::text_format::type_id& args_format, + bool load_library = true ); //////////////////////////////////////////////////////////////////////////////// @@ -89,7 +91,8 @@ class IRESEARCH_API scorers { static sort::ptr get( const string_ref& name, const irs::text_format::type_id& args_format, - const string_ref& args + const string_ref& args, + bool load_library = true ) NOEXCEPT; //////////////////////////////////////////////////////////////////////////////// diff --git a/3rdParty/iresearch/core/utils/attributes.cpp b/3rdParty/iresearch/core/utils/attributes.cpp index 45015c4d3b..e06f362712 100644 --- a/3rdParty/iresearch/core/utils/attributes.cpp +++ b/3rdParty/iresearch/core/utils/attributes.cpp @@ -44,15 +44,19 @@ NS_ROOT // --SECTION-- attribute::type_id // ----------------------------------------------------------------------------- -/*static*/ bool attribute::type_id::exists(const string_ref& name) { - return nullptr != attribute_register::instance().get(name); +/*static*/ bool attribute::type_id::exists( + const string_ref& name, + bool load_library /*= true*/ +) { + return nullptr != attribute_register::instance().get(name, load_library); } /*static*/ const attribute::type_id* attribute::type_id::get( - const string_ref& name + const string_ref& name, + bool load_library /*= true*/ ) NOEXCEPT { try { - return attribute_register::instance().get(name); + return attribute_register::instance().get(name, 
load_library); } catch (...) { IR_FRMT_ERROR("Caught exception while getting an attribute instance"); IR_LOG_EXCEPTION(); diff --git a/3rdParty/iresearch/core/utils/attributes.hpp b/3rdParty/iresearch/core/utils/attributes.hpp index 1aa672ab2a..62824ad7ca 100644 --- a/3rdParty/iresearch/core/utils/attributes.hpp +++ b/3rdParty/iresearch/core/utils/attributes.hpp @@ -56,8 +56,11 @@ struct IRESEARCH_API attribute { public: type_id(const string_ref& name): name_(name) {} operator const type_id*() const { return this; } - static bool exists(const string_ref& name); - static const type_id* get(const string_ref& name) NOEXCEPT; + static bool exists(const string_ref& name, bool load_library = true); + static const type_id* get( + const string_ref& name, + bool load_library = true + ) NOEXCEPT; const string_ref& name() const { return name_; } private: diff --git a/3rdParty/iresearch/core/utils/register.hpp b/3rdParty/iresearch/core/utils/register.hpp index 9553b2852f..be3a8f2573 100644 --- a/3rdParty/iresearch/core/utils/register.hpp +++ b/3rdParty/iresearch/core/utils/register.hpp @@ -74,9 +74,20 @@ class generic_register: public singleton { return std::make_pair(itr.first->second, itr.second); } - entry_type get(const key_type& key) const { + entry_type get(const key_type& key, bool load_library) const { const entry_type* entry = lookup(key); - return entry ? 
*entry : load_entry_from_so(key); + + if (entry) { + return *entry; + } + + if (load_library) { + return load_entry_from_so(key); + } + + IR_FRMT_ERROR("%s : key not found", __FUNCTION__); + + return entry_type(); } bool visit(const visitor_t& visitor) { diff --git a/arangod/IResearch/IResearchAnalyzerFeature.cpp b/arangod/IResearch/IResearchAnalyzerFeature.cpp index add1a49ca1..d425ffc97a 100644 --- a/arangod/IResearch/IResearchAnalyzerFeature.cpp +++ b/arangod/IResearch/IResearchAnalyzerFeature.cpp @@ -682,7 +682,9 @@ IResearchAnalyzerFeature::AnalyzerPool::Builder::make(irs::string_ref const& typ // ArangoDB, for API consistency, only supports analyzers configurable via // jSON - return irs::analysis::analyzers::get(type, irs::text_format::json, properties); + return irs::analysis::analyzers::get( // get analyzer + type, irs::text_format::json, properties, false // args + ); } IResearchAnalyzerFeature::AnalyzerPool::AnalyzerPool(irs::string_ref const& name) diff --git a/arangod/IResearch/IResearchOrderFactory.cpp b/arangod/IResearch/IResearchOrderFactory.cpp index c38b3cebbc..9bb72fe1c4 100644 --- a/arangod/IResearch/IResearchOrderFactory.cpp +++ b/arangod/IResearch/IResearchOrderFactory.cpp @@ -90,11 +90,13 @@ bool makeScorer(irs::sort::ptr& scorer, irs::string_ref const& name, break; case 1: { // ArangoDB, for API consistency, only supports scorers configurable via jSON - scorer = irs::scorers::get(name, irs::text_format::json, irs::string_ref::NIL); + scorer = irs::scorers::get( // get scorer + name, irs::text_format::json, irs::string_ref::NIL, false // args + ); if (!scorer) { // ArangoDB, for API consistency, only supports scorers configurable via jSON - scorer = irs::scorers::get(name, irs::text_format::json, "[]"); // pass arg as json array + scorer = irs::scorers::get(name, irs::text_format::json, "[]", false); // pass arg as json array } } break; default: { // fall through @@ -123,7 +125,9 @@ bool makeScorer(irs::sort::ptr& scorer, irs::string_ref 
const& name, builder.close(); // ArangoDB, for API consistency, only supports scorers configurable via jSON - scorer = irs::scorers::get(name, irs::text_format::json, builder.toJson()); // pass arg as json + scorer = irs::scorers::get( // get scorer + name, irs::text_format::json, builder.toJson(), false // pass arg as json + ); } } @@ -143,7 +147,7 @@ bool fromFCall(irs::sort::ptr* scorer, irs::string_ref const& scorerName, if (!scorer) { // cheap shallow check // ArangoDB, for API consistency, only supports scorers configurable via jSON - return irs::scorers::exists(scorerName, irs::text_format::json); + return irs::scorers::exists(scorerName, irs::text_format::json, false); } // we don't support non-constant arguments for scorers now, if it @@ -330,12 +334,14 @@ void ScorerReplacer::extract(aql::Variable const& var, std::vector& scor if (!comparer) { // cheap shallow check // ArangoDB, for API consistency, only supports scorers configurable via jSON - return irs::scorers::exists(scorerName, irs::text_format::json); + return irs::scorers::exists(scorerName, irs::text_format::json, false); } // create scorer with default arguments // ArangoDB, for API consistency, only supports scorers configurable via jSON - *comparer = irs::scorers::get(scorerName, irs::text_format::json, irs::string_ref::NIL); + *comparer = irs::scorers::get( // get scorer + scorerName, irs::text_format::json, irs::string_ref::NIL, false // args + ); return bool(*comparer); }