mirror of https://gitee.com/bigwinds/arangodb
issue 539.1: backport from iresearch upstream: add support for text normalizing analyzer, make loading shared object optional in the registry (#8813)
This commit is contained in:
parent 0375218ca2
commit 2de9510051
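In broad terms, the hunks below register a new "text-token-normalize" analyzer and thread an optional load_library flag through the registry lookups (analyzers, scorers, formats, attributes) so callers can skip loading shared-object plugins. A minimal sketch of the resulting call shapes, assuming only the signatures changed in this diff (the include paths and the wrapper function are illustrative, not part of the commit):

// Sketch only: signatures follow the hunks below; include paths, the wrapper
// function, and error handling are assumptions for illustration.
#include "analysis/analyzers.hpp"  // assumed location of irs::analysis::analyzers
#include "utils/text_format.hpp"   // assumed location of irs::text_format::json

void lookup_without_loading_shared_objects() {
  // load_library == false: consult only the in-process registry and never
  // fall back to load_entry_from_so(...) in generic_register::get(...)
  bool registered = irs::analysis::analyzers::exists(
    "text-token-normalize", irs::text_format::json, false
  );

  if (registered) {
    auto analyzer = irs::analysis::analyzers::get(
      "text-token-normalize", irs::text_format::json, R"({"locale":"en"})", false
    );
    (void)analyzer; // still nullptr if the JSON arguments are rejected
  }
}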
@@ -367,6 +367,7 @@ add_dependencies(${IResearch_TARGET_NAME}-static
${IResearch_TARGET_NAME}-analyzer-delimited-static
${IResearch_TARGET_NAME}-analyzer-ngram-static
${IResearch_TARGET_NAME}-analyzer-text-static
${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
${IResearch_TARGET_NAME}-analyzer-text-stemming-static
${IResearch_TARGET_NAME}-analyzer-token-masking-static
${IResearch_TARGET_NAME}-format-1_0-static
@@ -387,6 +388,7 @@ if(MSVC)
${IResearch_TARGET_NAME}-analyzer-delimited-static-scrt
${IResearch_TARGET_NAME}-analyzer-ngram-static-scrt
${IResearch_TARGET_NAME}-analyzer-text-static-scrt
${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
${IResearch_TARGET_NAME}-analyzer-text-stemming-static-scrt
${IResearch_TARGET_NAME}-analyzer-token-masking-static-scrt
${IResearch_TARGET_NAME}-format-1_0-static-scrt
@@ -505,6 +507,7 @@ target_link_libraries(${IResearch_TARGET_NAME}-static
${IResearch_TARGET_NAME}-analyzer-delimited-static
${IResearch_TARGET_NAME}-analyzer-ngram-static
${IResearch_TARGET_NAME}-analyzer-text-static
${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
${IResearch_TARGET_NAME}-analyzer-text-stemming-static
${IResearch_TARGET_NAME}-analyzer-token-masking-static
${IResearch_TARGET_NAME}-format-1_0-static
@@ -537,6 +540,7 @@ if(MSVC)
${IResearch_TARGET_NAME}-analyzer-delimited-static-scrt
${IResearch_TARGET_NAME}-analyzer-ngram-static-scrt
${IResearch_TARGET_NAME}-analyzer-text-static-scrt
${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
${IResearch_TARGET_NAME}-analyzer-text-stemming-static-scrt
${IResearch_TARGET_NAME}-analyzer-token-masking-static-scrt
${IResearch_TARGET_NAME}-format-1_0-static-scrt
@@ -566,6 +570,7 @@ if (MSVC)
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-delimited-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-ngram-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-normalizing-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-stemming-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-token-masking-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-format-1_0-static>"
@@ -624,6 +629,7 @@ if (MSVC)
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-delimited-static-scrt>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-ngram-static-scrt>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-static-scrt>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-normalizing-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-stemming-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-token-masking-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-format-1_0-static-scrt>"
@@ -683,6 +689,7 @@ elseif (APPLE)
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-delimited-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-ngram-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-normalizing-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-stemming-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-token-masking-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-format-1_0-static>"
@@ -741,6 +748,7 @@ else()
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-delimited-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-ngram-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-normalizing-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-text-stemming-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-analyzer-token-masking-static>"
"$<TARGET_FILE:${IResearch_TARGET_NAME}-format-1_0-static>"
@@ -1021,6 +1029,129 @@ if(MSVC)
)
endif()

################################################################################
### analysis plugin : text token normalizing
################################################################################

add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared
SHARED
./analysis/text_token_normalizing_stream.cpp
./analysis/text_token_normalizing_stream.hpp
)

add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
STATIC
./analysis/text_token_normalizing_stream.cpp
)

# setup CRT
if(MSVC)
add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt
SHARED
./analysis/text_token_normalizing_stream.cpp
)

add_library(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
STATIC
./analysis/text_token_normalizing_stream.cpp
)
endif()

target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared
PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries
)

target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries
)

# setup CRT
if(MSVC)
target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt
PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries
)

target_include_directories(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
PRIVATE ${ICU_INCLUDE_DIR} # cmake on MSVC does not properly expand dependencies for linked libraries
)
endif()

# setup CRT
if(MSVC)
target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared
PRIVATE "$<$<CONFIG:Debug>:/MDd>$<$<NOT:$<CONFIG:Debug>>:/MD>"
)

target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
PRIVATE "$<$<CONFIG:Debug>:/MDd>$<$<NOT:$<CONFIG:Debug>>:/MD>"
)

target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt
PRIVATE "$<$<CONFIG:Debug>:/MTd>$<$<NOT:$<CONFIG:Debug>>:/MT>"
)

target_compile_options(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
PRIVATE "$<$<CONFIG:Debug>:/MTd>$<$<NOT:$<CONFIG:Debug>>:/MT>"
)
endif()

set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared
PROPERTIES
PREFIX lib
IMPORT_PREFIX lib
OUTPUT_NAME analyzer-text-token-normalize
DEBUG_POSTFIX "" # otherwise library names will not match expected dynamically loaded value
COMPILE_DEFINITIONS "$<$<CONFIG:Debug>:IRESEARCH_DEBUG>;IRESEARCH_DLL;IRESEARCH_DLL_EXPORTS;IRESEARCH_DLL_PLUGIN"
CXX_VISIBILITY_PRESET hidden
)

set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
PROPERTIES
PREFIX lib
IMPORT_PREFIX lib
OUTPUT_NAME analyzer-text-token-normalize-s
COMPILE_DEFINITIONS "$<$<CONFIG:Debug>:IRESEARCH_DEBUG>"
)

# setup CRT
if(MSVC)
set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt
PROPERTIES
PREFIX lib
IMPORT_PREFIX lib
OUTPUT_NAME analyzer-text-token-normalize-scrt
COMPILE_DEFINITIONS "$<$<CONFIG:Debug>:IRESEARCH_DEBUG>;IRESEARCH_DLL;IRESEARCH_DLL_EXPORTS;IRESEARCH_DLL_PLUGIN"
CXX_VISIBILITY_PRESET hidden
)

set_target_properties(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
PROPERTIES
PREFIX lib
IMPORT_PREFIX lib
OUTPUT_NAME analyzer-text-token-normalize-scrt-s
COMPILE_DEFINITIONS "$<$<CONFIG:Debug>:IRESEARCH_DEBUG>"
)
endif()

target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared
${IResearch_TARGET_NAME}-shared
)

target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static
${IResearch_TARGET_NAME}-static
)

# setup CRT
if(MSVC)
target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-shared-scrt
${IResearch_TARGET_NAME}-shared-scrt
)

target_link_libraries(${IResearch_TARGET_NAME}-analyzer-text-normalizing-static-scrt
${IResearch_TARGET_NAME}-static-scrt
)
endif()

################################################################################
### analysis plugin : text token stemming
################################################################################

@@ -27,6 +27,7 @@
#ifndef IRESEARCH_DLL
#include "delimited_token_stream.hpp"
#include "ngram_token_stream.hpp"
#include "text_token_normalizing_stream.hpp"
#include "text_token_stemming_stream.hpp"
#include "text_token_stream.hpp"
#include "token_masking_stream.hpp"
@@ -94,19 +95,22 @@ NS_BEGIN(analysis)

/*static*/ bool analyzers::exists(
const string_ref& name,
const irs::text_format::type_id& args_format
const irs::text_format::type_id& args_format,
bool load_library /*= true*/
) {
return nullptr != analyzer_register::instance().get(entry_key_t(name, args_format));
return nullptr != analyzer_register::instance().get(entry_key_t(name, args_format), load_library);
}

/*static*/ analyzer::ptr analyzers::get(
const string_ref& name,
const irs::text_format::type_id& args_format,
const string_ref& args
const string_ref& args,
bool load_library /*= true*/
) NOEXCEPT {
try {
auto* factory =
analyzer_register::instance().get(entry_key_t(name, args_format));
auto* factory = analyzer_register::instance().get(
entry_key_t(name, args_format), load_library
);

return factory ? factory(args) : nullptr;
} catch (...) {
@@ -121,6 +125,7 @@ NS_BEGIN(analysis)
#ifndef IRESEARCH_DLL
irs::analysis::delimited_token_stream::init();
irs::analysis::ngram_token_stream::init();
irs::analysis::text_token_normalizing_stream::init();
irs::analysis::text_token_stemming_stream::init();
irs::analysis::text_token_stream::init();
irs::analysis::token_masking_stream::init();

@@ -78,7 +78,9 @@ class IRESEARCH_API analyzers {
/// @brief checks whether an analyzer with the specified name is registered
////////////////////////////////////////////////////////////////////////////////
static bool exists(
const string_ref& name, const irs::text_format::type_id& args_format
const string_ref& name,
const irs::text_format::type_id& args_format,
bool load_library = true
);

////////////////////////////////////////////////////////////////////////////////
@@ -90,7 +92,8 @@ class IRESEARCH_API analyzers {
static analyzer::ptr get(
const string_ref& name,
const irs::text_format::type_id& args_format,
const string_ref& args
const string_ref& args,
bool load_library = true
) NOEXCEPT;

////////////////////////////////////////////////////////////////////////////////

@@ -0,0 +1,355 @@
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2019 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////

#include <rapidjson/rapidjson/document.h> // for rapidjson::Document
#include <unicode/locid.h> // for icu::Locale

#if defined(_MSC_VER)
#pragma warning(disable: 4512)
#endif

#include <unicode/normalizer2.h> // for icu::Normalizer2

#if defined(_MSC_VER)
#pragma warning(default: 4512)
#endif

#include <unicode/translit.h> // for icu::Transliterator

#if defined(_MSC_VER)
#pragma warning(disable: 4229)
#endif

#include <unicode/uclean.h> // for u_cleanup

#if defined(_MSC_VER)
#pragma warning(default: 4229)
#endif

#include "utils/locale_utils.hpp"

#include "text_token_normalizing_stream.hpp"

NS_ROOT
NS_BEGIN(analysis)

// -----------------------------------------------------------------------------
// --SECTION-- private types
// -----------------------------------------------------------------------------

struct text_token_normalizing_stream::state_t {
icu::UnicodeString data;
icu::Locale icu_locale;
std::locale locale;
std::shared_ptr<const icu::Normalizer2> normalizer;
const options_t options;
std::string term_buf; // used by reset()
std::shared_ptr<icu::Transliterator> transliterator;
state_t(const options_t& opts): icu_locale("C"), options(opts) {
// NOTE: use of the default constructor for Locale() or
// use of Locale::createFromName(nullptr)
// causes a memory leak with Boost 1.58, as detected by valgrind
icu_locale.setToBogus(); // set to uninitialized
}
};

NS_END // analysis
NS_END // ROOT

NS_LOCAL

////////////////////////////////////////////////////////////////////////////////
/// @brief args is a jSON encoded object with the following attributes:
/// "locale"(string): the locale to use for stemming <required>
/// "case_convert"(string enum): modify token case using "locale"
/// "no_accent"(bool): remove accents
////////////////////////////////////////////////////////////////////////////////
irs::analysis::analyzer::ptr make_json(const irs::string_ref& args) {
rapidjson::Document json;

if (json.Parse(args.c_str(), args.size()).HasParseError()) {
IR_FRMT_ERROR(
"Invalid jSON arguments passed while constructing text_token_normalizing_stream, arguments: %s",
args.c_str()
);

return nullptr;
}

try {
typedef irs::analysis::text_token_normalizing_stream::options_t options_t;
options_t options;

switch (json.GetType()) {
case rapidjson::kStringType:
options.locale = json.GetString(); // required

return irs::memory::make_shared<irs::analysis::text_token_normalizing_stream>(
std::move(options)
);
case rapidjson::kObjectType:
if (json.HasMember("locale") && json["locale"].IsString()) {
options.locale = json["locale"].GetString(); // required

if (json.HasMember("case_convert")) {
auto& case_convert = json["case_convert"]; // optional string enum

if (!case_convert.IsString()) {
IR_FRMT_WARN("Non-string value in 'case_convert' while constructing text_token_normalizing_stream from jSON arguments: %s", args.c_str());

return nullptr;
}

static const std::unordered_map<std::string, options_t::case_convert_t> case_convert_map = {
{ "lower", options_t::case_convert_t::LOWER },
{ "none", options_t::case_convert_t::NONE },
{ "upper", options_t::case_convert_t::UPPER },
};
auto itr = case_convert_map.find(case_convert.GetString());

if (itr == case_convert_map.end()) {
IR_FRMT_WARN("Invalid value in 'case_convert' while constructing text_token_normalizing_stream from jSON arguments: %s", args.c_str());

return nullptr;
}

options.case_convert = itr->second;
}

if (json.HasMember("no_accent")) {
auto& no_accent = json["no_accent"]; // optional bool

if (!no_accent.IsBool()) {
IR_FRMT_WARN("Non-boolean value in 'no_accent' while constructing text_token_normalizing_stream from jSON arguments: %s", args.c_str());

return nullptr;
}

options.no_accent = no_accent.GetBool();
}

return irs::memory::make_shared<irs::analysis::text_token_normalizing_stream>(
std::move(options)
);
}
default: // fall through
IR_FRMT_ERROR(
"Missing 'locale' while constructing text_token_normalizing_stream from jSON arguments: %s",
args.c_str()
);
}
} catch (...) {
IR_FRMT_ERROR(
"Caught error while constructing text_token_normalizing_stream from jSON arguments: %s",
args.c_str()
);
IR_LOG_EXCEPTION();
}

return nullptr;
}

////////////////////////////////////////////////////////////////////////////////
/// @brief args is a language to use for normalizing
////////////////////////////////////////////////////////////////////////////////
irs::analysis::analyzer::ptr make_text(const irs::string_ref& args) {
try {
irs::analysis::text_token_normalizing_stream::options_t options;

options.locale = args; // interpret 'args' as a locale name

return irs::memory::make_shared<irs::analysis::text_token_normalizing_stream>(
std::move(options)
);
} catch (...) {
IR_FRMT_ERROR(
"Caught error while constructing text_token_normalizing_stream TEXT arguments: %s",
args.c_str()
);
IR_LOG_EXCEPTION();
}

return nullptr;
}

REGISTER_ANALYZER_JSON(irs::analysis::text_token_normalizing_stream, make_json);
REGISTER_ANALYZER_TEXT(irs::analysis::text_token_normalizing_stream, make_text);

NS_END

NS_ROOT
NS_BEGIN(analysis)

DEFINE_ANALYZER_TYPE_NAMED(text_token_normalizing_stream, "text-token-normalize")

text_token_normalizing_stream::text_token_normalizing_stream(
const options_t& options
): analyzer(text_token_normalizing_stream::type()),
attrs_(4), // increment + offset + payload + term
state_(memory::make_unique<state_t>(options)),
term_eof_(true) {
attrs_.emplace(inc_);
attrs_.emplace(offset_);
attrs_.emplace(payload_);
attrs_.emplace(term_);
}

/*static*/ void text_token_normalizing_stream::init() {
REGISTER_ANALYZER_JSON(text_token_normalizing_stream, make_json); // match registration above
REGISTER_ANALYZER_TEXT(text_token_normalizing_stream, make_text); // match registration above
}

/*static*/ analyzer::ptr text_token_normalizing_stream::make(
const string_ref& locale
) {
return make_text(locale);
}

bool text_token_normalizing_stream::next() {
if (term_eof_) {
return false;
}

term_eof_ = true;

return true;
}

bool text_token_normalizing_stream::reset(const irs::string_ref& data) {
if (state_->icu_locale.isBogus()) {
state_->locale = irs::locale_utils::locale(
state_->options.locale, irs::string_ref::NIL, true // true == convert to unicode, required for ICU and Snowball
);
state_->icu_locale = icu::Locale(
std::string(irs::locale_utils::language(state_->locale)).c_str(),
std::string(irs::locale_utils::country(state_->locale)).c_str()
);

if (state_->icu_locale.isBogus()) {
return false;
}
}

auto err = UErrorCode::U_ZERO_ERROR; // a value that passes the U_SUCCESS() test

if (!state_->normalizer) {
// reusable object owned by ICU
state_->normalizer.reset(
icu::Normalizer2::getNFCInstance(err), [](const icu::Normalizer2*)->void{}
);

if (!U_SUCCESS(err) || !state_->normalizer) {
state_->normalizer.reset();

return false;
}
}

if (state_->options.no_accent && !state_->transliterator) {
// transliteration rule taken verbatim from: http://userguide.icu-project.org/transforms/general
icu::UnicodeString collationRule("NFD; [:Nonspacing Mark:] Remove; NFC"); // do not allocate statically since it causes memory leaks in ICU

// reusable object owned by *this
state_->transliterator.reset(icu::Transliterator::createInstance(
collationRule, UTransDirection::UTRANS_FORWARD, err
));

if (!U_SUCCESS(err) || !state_->transliterator) {
state_->transliterator.reset();

return false;
}
}

// ...........................................................................
// convert encoding to UTF8 for use with ICU
// ...........................................................................
std::string data_utf8;

// valid conversion since 'locale_' was created with internal unicode encoding
if (!irs::locale_utils::append_internal(data_utf8, data, state_->locale)) {
return false; // UTF8 conversion failure
}

if (data_utf8.size() > irs::integer_traits<int32_t>::const_max) {
return false; // ICU UnicodeString signatures can handle at most INT32_MAX
}

state_->data = icu::UnicodeString::fromUTF8(
icu::StringPiece(data_utf8.c_str(), (int32_t)(data_utf8.size()))
);

// ...........................................................................
// normalize unicode
// ...........................................................................
icu::UnicodeString term_icu;

state_->normalizer->normalize(state_->data, term_icu, err);

if (!U_SUCCESS(err)) {
term_icu = state_->data; // use non-normalized value if normalization failure
}

// ...........................................................................
// case-convert unicode
// ...........................................................................
switch (state_->options.case_convert) {
case options_t::case_convert_t::LOWER:
term_icu.toLower(state_->icu_locale); // inplace case-conversion
break;
case options_t::case_convert_t::UPPER:
term_icu.toUpper(state_->icu_locale); // inplace case-conversion
break;
default:
{} // NOOP
};

// ...........................................................................
// collate value, e.g. remove accents
// ...........................................................................
if (state_->transliterator) {
state_->transliterator->transliterate(term_icu); // inplace translitiration
}

state_->term_buf.clear();
term_icu.toUTF8String(state_->term_buf);

// ...........................................................................
// use the normalized value
// ...........................................................................
static_assert(sizeof(irs::byte_type) == sizeof(char), "sizeof(irs::byte_type) != sizeof(char)");
term_.value(irs::ref_cast<irs::byte_type>(irs::string_ref(state_->term_buf)));
offset_.start = 0;
offset_.end = data.size();
payload_.value = ref_cast<uint8_t>(data);
term_eof_ = false;

return true;
}

NS_END // analysis
NS_END // ROOT

// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------

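As a usage note for the JSON factory above: a hedged sketch of constructing this analyzer through the registry. The option keys mirror make_json(...) and the analyzer name comes from DEFINE_ANALYZER_TYPE_NAMED(...); the include path and wrapper function are assumptions, not part of this commit.

// Illustration only: keys mirror make_json(...) above; the include path and
// wrapper function are assumed, not taken from the commit.
#include "analysis/analyzers.hpp"  // assumed location of irs::analysis::analyzers

irs::analysis::analyzer::ptr make_german_normalizer() {
  // "locale" is required; "case_convert" is one of "lower"/"none"/"upper";
  // "no_accent" enables accent removal via the ICU transliterator.
  return irs::analysis::analyzers::get(
    "text-token-normalize",
    irs::text_format::json,
    R"({ "locale": "de_DE.UTF-8", "case_convert": "lower", "no_accent": true })"
  );  // nullptr on a JSON parse error, missing "locale", or invalid option value
}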
@@ -0,0 +1,80 @@
////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2019 ArangoDB GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////

#ifndef IRESEARCH_TEXT_TOKEN_NORMALIZING_STREAM_H
#define IRESEARCH_TEXT_TOKEN_NORMALIZING_STREAM_H

#include "analyzers.hpp"
#include "token_attributes.hpp"

NS_ROOT
NS_BEGIN(analysis)

////////////////////////////////////////////////////////////////////////////////
/// @brief an analyser capable of normalizing the text, treated as a single
/// token, i.e. case conversion and accent removal
////////////////////////////////////////////////////////////////////////////////
class text_token_normalizing_stream: public analyzer, util::noncopyable {
public:
struct options_t {
enum case_convert_t { LOWER, NONE, UPPER };
case_convert_t case_convert{case_convert_t::NONE}; // no extra normalization
std::string locale;
bool no_accent{false}; // no extra normalization
};

struct state_t;

DECLARE_ANALYZER_TYPE();

// for use with irs::order::add<T>() and default args (static build)
DECLARE_FACTORY(const string_ref& locale);

text_token_normalizing_stream(const options_t& options);
virtual const irs::attribute_view& attributes() const NOEXCEPT override {
return attrs_;
}
static void init(); // for trigering registration in a static build
virtual bool next() override;
virtual bool reset(const irs::string_ref& data) override;

private:
class term_attribute final: public irs::term_attribute {
public:
using irs::term_attribute::value;
void value(const irs::bytes_ref& value) { value_ = value; }
};

irs::attribute_view attrs_;
irs::increment inc_;
irs::offset offset_;
irs::payload payload_; // raw token value
std::shared_ptr<state_t> state_;
term_attribute term_; // token value with evaluated quotes
bool term_eof_;
};

NS_END // analysis
NS_END // ROOT

#endif

@@ -90,13 +90,19 @@ DEFINE_ATTRIBUTE_TYPE(iresearch::term_meta)
meta.segments_ = std::move(segments);
}

/*static*/ bool formats::exists(const string_ref& name) {
return nullptr != format_register::instance().get(name);
/*static*/ bool formats::exists(
const string_ref& name,
bool load_library /*= true*/
) {
return nullptr != format_register::instance().get(name, load_library);
}

/*static*/ format::ptr formats::get(const string_ref& name) NOEXCEPT {
/*static*/ format::ptr formats::get(
const string_ref& name,
bool load_library /*= true*/
) NOEXCEPT {
try {
auto* factory = format_register::instance().get(name);
auto* factory = format_register::instance().get(name, load_library);

return factory ? factory() : nullptr;
} catch (...) {
@@ -182,4 +188,4 @@ NS_END

// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------

@@ -525,7 +525,7 @@ class IRESEARCH_API formats {
////////////////////////////////////////////////////////////////////////////////
/// @brief checks whether a format with the specified name is registered
////////////////////////////////////////////////////////////////////////////////
static bool exists(const string_ref& name);
static bool exists(const string_ref& name, bool load_library = true);

//////////////////////////////////////////////////////////////////////////////
/// @brief find a format by name, or nullptr if not found
@@ -533,7 +533,10 @@ class IRESEARCH_API formats {
/// requires use of DECLARE_FACTORY() in class definition
/// NOTE: make(...) MUST be defined in CPP to ensire proper code scope
//////////////////////////////////////////////////////////////////////////////
static format::ptr get(const string_ref& name) NOEXCEPT;
static format::ptr get(
const string_ref& name,
bool load_library = true
) NOEXCEPT;

////////////////////////////////////////////////////////////////////////////////
/// @brief for static lib reference all known formats in lib
@@ -589,4 +592,4 @@ class IRESEARCH_API format_registrar {

NS_END

#endif
#endif

@@ -89,19 +89,22 @@ NS_ROOT

/*static*/ bool scorers::exists(
const string_ref& name,
const irs::text_format::type_id& args_format
const irs::text_format::type_id& args_format,
bool load_library /*= true*/
) {
return nullptr != scorer_register::instance().get(entry_key_t(name, args_format));
return nullptr != scorer_register::instance().get(entry_key_t(name, args_format),load_library);
}

/*static*/ sort::ptr scorers::get(
const string_ref& name,
const irs::text_format::type_id& args_format,
const string_ref& args
const string_ref& args,
bool load_library /*= true*/
) NOEXCEPT {
try {
auto* factory =
scorer_register::instance().get(entry_key_t(name, args_format));
auto* factory = scorer_register::instance().get(
entry_key_t(name, args_format), load_library
);

return factory ? factory(args) : nullptr;
} catch (...) {

@@ -77,7 +77,9 @@ class IRESEARCH_API scorers {
/// @brief checks whether scorer with a specified name is registered
////////////////////////////////////////////////////////////////////////////////
static bool exists(
const string_ref& name, const irs::text_format::type_id& args_format
const string_ref& name,
const irs::text_format::type_id& args_format,
bool load_library = true
);

////////////////////////////////////////////////////////////////////////////////
@@ -89,7 +91,8 @@ class IRESEARCH_API scorers {
static sort::ptr get(
const string_ref& name,
const irs::text_format::type_id& args_format,
const string_ref& args
const string_ref& args,
bool load_library = true
) NOEXCEPT;

////////////////////////////////////////////////////////////////////////////////

@@ -44,15 +44,19 @@ NS_ROOT
// --SECTION-- attribute::type_id
// -----------------------------------------------------------------------------

/*static*/ bool attribute::type_id::exists(const string_ref& name) {
return nullptr != attribute_register::instance().get(name);
/*static*/ bool attribute::type_id::exists(
const string_ref& name,
bool load_library /*= true*/
) {
return nullptr != attribute_register::instance().get(name, load_library);
}

/*static*/ const attribute::type_id* attribute::type_id::get(
const string_ref& name
const string_ref& name,
bool load_library /*= true*/
) NOEXCEPT {
try {
return attribute_register::instance().get(name);
return attribute_register::instance().get(name, load_library);
} catch (...) {
IR_FRMT_ERROR("Caught exception while getting an attribute instance");
IR_LOG_EXCEPTION();

@@ -56,8 +56,11 @@ struct IRESEARCH_API attribute {
public:
type_id(const string_ref& name): name_(name) {}
operator const type_id*() const { return this; }
static bool exists(const string_ref& name);
static const type_id* get(const string_ref& name) NOEXCEPT;
static bool exists(const string_ref& name, bool load_library = true);
static const type_id* get(
const string_ref& name,
bool load_library = true
) NOEXCEPT;
const string_ref& name() const { return name_; }

private:

@@ -74,9 +74,20 @@ class generic_register: public singleton<RegisterType> {
return std::make_pair(itr.first->second, itr.second);
}

entry_type get(const key_type& key) const {
entry_type get(const key_type& key, bool load_library) const {
const entry_type* entry = lookup(key);
return entry ? *entry : load_entry_from_so(key);

if (entry) {
return *entry;
}

if (load_library) {
return load_entry_from_so(key);
}

IR_FRMT_ERROR("%s : key not found", __FUNCTION__);

return entry_type();
}

bool visit(const visitor_t& visitor) {

@@ -682,7 +682,9 @@ IResearchAnalyzerFeature::AnalyzerPool::Builder::make(irs::string_ref const& typ

// ArangoDB, for API consistency, only supports analyzers configurable via
// jSON
return irs::analysis::analyzers::get(type, irs::text_format::json, properties);
return irs::analysis::analyzers::get( // get analyzer
type, irs::text_format::json, properties, false // args
);
}

IResearchAnalyzerFeature::AnalyzerPool::AnalyzerPool(irs::string_ref const& name)

@@ -90,11 +90,13 @@ bool makeScorer(irs::sort::ptr& scorer, irs::string_ref const& name,
break;
case 1: {
// ArangoDB, for API consistency, only supports scorers configurable via jSON
scorer = irs::scorers::get(name, irs::text_format::json, irs::string_ref::NIL);
scorer = irs::scorers::get( // get scorer
name, irs::text_format::json, irs::string_ref::NIL, false // args
);

if (!scorer) {
// ArangoDB, for API consistency, only supports scorers configurable via jSON
scorer = irs::scorers::get(name, irs::text_format::json, "[]"); // pass arg as json array
scorer = irs::scorers::get(name, irs::text_format::json, "[]", false); // pass arg as json array
}
} break;
default: { // fall through
@@ -123,7 +125,9 @@ bool makeScorer(irs::sort::ptr& scorer, irs::string_ref const& name,
builder.close();

// ArangoDB, for API consistency, only supports scorers configurable via jSON
scorer = irs::scorers::get(name, irs::text_format::json, builder.toJson()); // pass arg as json
scorer = irs::scorers::get( // get scorer
name, irs::text_format::json, builder.toJson(), false // pass arg as json
);
}
}

@@ -143,7 +147,7 @@ bool fromFCall(irs::sort::ptr* scorer, irs::string_ref const& scorerName,
if (!scorer) {
// cheap shallow check
// ArangoDB, for API consistency, only supports scorers configurable via jSON
return irs::scorers::exists(scorerName, irs::text_format::json);
return irs::scorers::exists(scorerName, irs::text_format::json, false);
}

// we don't support non-constant arguments for scorers now, if it
@@ -330,12 +334,14 @@ void ScorerReplacer::extract(aql::Variable const& var, std::vector<Scorer>& scor
if (!comparer) {
// cheap shallow check
// ArangoDB, for API consistency, only supports scorers configurable via jSON
return irs::scorers::exists(scorerName, irs::text_format::json);
return irs::scorers::exists(scorerName, irs::text_format::json, false);
}

// create scorer with default arguments
// ArangoDB, for API consistency, only supports scorers configurable via jSON
*comparer = irs::scorers::get(scorerName, irs::text_format::json, irs::string_ref::NIL);
*comparer = irs::scorers::get( // get scorer
scorerName, irs::text_format::json, irs::string_ref::NIL, false // args
);

return bool(*comparer);
}