////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2017 EMC Corporation
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
///     http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is EMC Corporation
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////

// NOTE(review): in this copy of the file the two directives below carry no
// header name, and several template argument lists further down appear to be
// missing (e.g. 'std::vector const&', 'static_cast(analyzer)',
// 'server.getFeature()'). Presumably they were lost during extraction; the
// tokens are left untouched here and must be restored before building.
#include
#include

#include "analysis/analyzers.hpp"
#include "analysis/token_attributes.hpp"
#include "utils/hash_utils.hpp"
#include "utils/locale_utils.hpp"

#include "Basics/StringUtils.h"
#include "Cluster/ServerState.h"
#include "RestServer/SystemDatabaseFeature.h"
#include "VelocyPackHelper.h"
#include "Basics/VelocyPackHelper.h"
#include "velocypack/Builder.h"
#include "velocypack/Iterator.h"

#include "IResearchLinkMeta.h"
#include "Misc.h"

namespace {

/// @brief order-insensitive comparison of two analyzer lists
///
/// Both lists are reduced to the multiset of their pool names (an entry with a
/// null '_pool' contributes irs::string_ref::NIL). The lists are equal when
/// they have the same size and every name occurs the same number of times.
///
/// NOTE(review): the function is declared 'noexcept' although it inserts into a
/// local container; confirm that an allocation failure terminating the process
/// is acceptable here.
bool equalAnalyzers(std::vector const& lhs,
                    std::vector const& rhs) noexcept {
  if (lhs.size() != rhs.size()) {
    return false;
  }

  std::unordered_multiset expected;

  // collect the names found in 'lhs', duplicates included
  for (auto& entry : lhs) {
    expected.emplace(  // expected name
        entry._pool ? irs::string_ref(entry._pool->name()) : irs::string_ref::NIL  // args
    );
  }

  // consume one matching name per 'rhs' entry
  for (auto& entry : rhs) {
    auto itr = expected.find(  // expected name
        entry._pool ? irs::string_ref(entry._pool->name()) : irs::string_ref::NIL  // args
    );

    if (itr == expected.end()) {
      return false;  // values do not match
    }

    expected.erase(itr);  // ensure same count of duplicates
  }

  return true;
}

}  // namespace

/// @brief compare an analyzer (by the name of its pool) against a plain name
/// @note '_pool' is dereferenced without a null check
bool operator<(arangodb::iresearch::FieldMeta::Analyzer const& lhs,
               irs::string_ref const& rhs) noexcept {
  return lhs._pool->name() < rhs;
}

/// @brief compare a plain name against an analyzer (by the name of its pool)
/// @note '_pool' is dereferenced without a null check
bool operator<(irs::string_ref const& lhs,
               arangodb::iresearch::FieldMeta::Analyzer const& rhs) noexcept {
  return lhs < rhs._pool->name();
}

namespace arangodb {
namespace iresearch {

// -----------------------------------------------------------------------------
// --SECTION--                                               FieldMeta::Analyzer
// -----------------------------------------------------------------------------

/// @brief default analyzer: wraps IResearchAnalyzerFeature::identity() and, if
///        that pool is non-null, uses the pool name as the short name
FieldMeta::Analyzer::Analyzer()
    : _pool(IResearchAnalyzerFeature::identity()) {
  if (_pool) {
    _shortName = _pool->name();  // static analyzers are used verbatim
  }
}

// -----------------------------------------------------------------------------
// --SECTION--                                      IResearchLinkMeta::FieldMeta
// -----------------------------------------------------------------------------

/// @brief shared default-constructed FieldMeta instance (function-local static)
/*static*/ const FieldMeta& FieldMeta::DEFAULT() {
  static const FieldMeta meta;

  return meta;
}

/// @brief default meta: the analyzer list holds the default-constructed
///        Analyzer, provided that analyzer evaluates to true
FieldMeta::FieldMeta() {
  Analyzer analyzer;  // identity analyzer

  // identity-only tokenization
  if (analyzer) {
    _analyzers.emplace_back(std::move(analyzer));
  }
}

/// @brief compare analyzers (order-insensitive), nested fields and the
///        'includeAllFields' / 'trackListPositions' / 'storeValues' settings
///
/// NOTE(review): the nested fields are compared by walking both '_fields'
/// containers in lockstep, i.e. equality relies on both containers iterating
/// in the same order — confirm the container guarantees this for equal content.
bool FieldMeta::operator==(FieldMeta const& rhs) const noexcept {
  if (!equalAnalyzers(_analyzers, rhs._analyzers)) {
    return false;  // values do not match
  }

  if (_fields.size() != rhs._fields.size()) {
    return false;  // values do not match
  }

  auto itr = rhs._fields.begin();

  // sizes are equal at this point, hence 'itr' stays within 'rhs._fields'
  for (auto& entry : _fields) {
    if (itr.key() != entry.key() || itr.value() != entry.value()) {
      return false;  // values do not match
    }

    ++itr;
  }

  if (_includeAllFields != rhs._includeAllFields) {
    return false;  // values do not match
  }

  if (_trackListPositions != rhs._trackListPositions) {
    return false;  // values do not match
  }

  if (_storeValues != rhs._storeValues) {
    return false;  // values do not match
  }

  return true;
}

/// @brief initialize this meta from a VelocyPack object
///
/// For each of 'analyzers', 'includeAllFields', 'trackListPositions',
/// 'storeValues' and 'fields': if the attribute is absent the value is copied
/// from 'defaults', otherwise it is validated and read from 'slice'.
///
/// @param slice               definition to read, must be an object
/// @param errorField          set to the path of the offending attribute on
///                            failure (left untouched if 'slice' is not an
///                            object)
/// @param defaultVocbase      if set (and the system vocbase is available)
///                            analyzer names are normalized relative to it
/// @param defaults            source of values for absent attributes
/// @param mask                if set, receives per attribute whether it was
///                            present in 'slice'
/// @param referencedAnalyzers if set, analyzers are looked up in this set
///                            before asking the analyzer feature, and analyzers
///                            obtained from the feature are added to it
/// @return false on the first validation failure; members processed before the
///         failure have already been modified
bool FieldMeta::init(velocypack::Slice const& slice, std::string& errorField,
                     TRI_vocbase_t const* defaultVocbase /*= nullptr*/,
                     FieldMeta const& defaults /*= DEFAULT()*/,
                     Mask* mask /*= nullptr*/,
                     std::set* referencedAnalyzers /*= nullptr*/) {
  if (!slice.isObject()) {
    return false;
  }

  // allows writing through 'mask' unconditionally below
  Mask tmpMask;

  if (!mask) {
    mask = &tmpMask;
  }

  {
    // optional string list
    static const std::string fieldName("analyzers");

    mask->_analyzers = slice.hasKey(fieldName);

    if (!mask->_analyzers) {
      _analyzers = defaults._analyzers;
    } else {
      auto& server = application_features::ApplicationServer::server();
      auto& analyzers = server.getFeature();
      auto& sysDatabase = server.getFeature();

      auto field = slice.get(fieldName);

      if (!field.isArray()) {
        errorField = fieldName;

        return false;
      }

      _analyzers.clear();  // reset to match read values exactly
      std::unordered_set uniqueGuard;  // deduplicate analyzers

      for (velocypack::ArrayIterator itr(field); itr.valid(); ++itr) {
        auto value = *itr;

        if (!value.isString()) {
          errorField = fieldName + "[" + std::to_string(itr.index()) + "]";

          return false;
        }

        auto name = value.copyString();
        auto shortName = name;

        if (defaultVocbase) {
          auto sysVocbase = sysDatabase.use();

          if (sysVocbase) {
            name = IResearchAnalyzerFeature::normalize(
                name, *defaultVocbase, *sysVocbase);
            // note: derived from the already normalized 'name'
            shortName = IResearchAnalyzerFeature::normalize(
                name, *defaultVocbase, *sysVocbase, false);
          }
        }

        AnalyzerPool::ptr analyzer;
        bool found = false;

        // first try the analyzers supplied by the caller
        if (referencedAnalyzers) {
          auto it = referencedAnalyzers->find(irs::string_ref(name));

          if (it != referencedAnalyzers->end()) {
            analyzer = *it;
            found = static_cast(analyzer);

            if (ADB_UNLIKELY(!found)) {
              TRI_ASSERT(false);  // should not happen
              referencedAnalyzers->erase(it);  // remove null analyzer
            }
          }
        }

        if (!found) {
          // for cluster only check cache to avoid ClusterInfo locking issues
          // analyzer should have been populated via 'analyzerDefinitions' above
          analyzer = analyzers.get(name, ServerState::instance()->isClusterRole());
        }

        if (!analyzer) {
          errorField = fieldName + "." + value.copyString();  // original (non-normalized) 'name' value

          return false;
        }

        if (!found && referencedAnalyzers) {
          // save in referencedAnalyzers
          referencedAnalyzers->emplace(analyzer);
        }

        // avoid adding same analyzer twice
        if (uniqueGuard.emplace(analyzer->name()).second) {
          _analyzers.emplace_back(analyzer, std::move(shortName));
        }
      }
    }
  }

  {
    // optional bool
    static const std::string fieldName("includeAllFields");

    mask->_includeAllFields = slice.hasKey(fieldName);

    if (!mask->_includeAllFields) {
      _includeAllFields = defaults._includeAllFields;
    } else {
      auto field = slice.get(fieldName);

      if (!field.isBool()) {
        errorField = fieldName;

        return false;
      }

      _includeAllFields = field.getBool();
    }
  }

  {
    // optional bool
    static const std::string fieldName("trackListPositions");

    mask->_trackListPositions = slice.hasKey(fieldName);

    if (!mask->_trackListPositions) {
      _trackListPositions = defaults._trackListPositions;
    } else {
      auto field = slice.get(fieldName);

      if (!field.isBool()) {
        errorField = fieldName;

        return false;
      }

      _trackListPositions = field.getBool();
    }
  }

  {
    // optional string enum
    static const std::string fieldName("storeValues");

    mask->_storeValues = slice.hasKey(fieldName);

    if (!mask->_storeValues) {
      _storeValues = defaults._storeValues;
    } else {
      auto field = slice.get(fieldName);

      if (!field.isString()) {
        errorField = fieldName;

        return false;
      }

      // accepted spellings of the value storage policy
      static const std::unordered_map policies = {
        {"none", ValueStorage::NONE},
        {"id", ValueStorage::ID},
        {"value", ValueStorage::VALUE}
      };

      auto name = field.copyString();
      auto itr = policies.find(name);

      if (itr == policies.end()) {
        errorField = fieldName + "." + name;

        return false;
      }

      _storeValues = itr->second;
    }
  }

  // .............................................................................
  // process fields last since children inherit from parent
  // .............................................................................

  {
    // optional string map
    static const std::string fieldName("fields");

    mask->_fields = slice.hasKey(fieldName);

    if (!mask->_fields) {
      _fields = defaults._fields;
    } else {
      auto field = slice.get(fieldName);

      if (!field.isObject()) {
        errorField = fieldName;

        return false;
      }

      // children use the values read above (copy of '*this') as their defaults
      auto subDefaults = *this;

      subDefaults._fields.clear();  // do not inherit fields and overrides from this field
      _fields.clear();  // reset to match either defaults or read values exactly

      for (velocypack::ObjectIterator itr(field); itr.valid(); ++itr) {
        auto key = itr.key();
        auto value = itr.value();

        if (!key.isString()) {
          errorField = fieldName + "[" +
                       basics::StringUtils::itoa(itr.index()) + "]";

          return false;
        }

        auto name = key.copyString();

        if (!value.isObject()) {
          errorField = fieldName + "." + name;

          return false;
        }

        std::string childErrorField;

        // '_fields[name]' yields the child entry that is initialized in place;
        // no mask is passed on to children
        if (!_fields[name]->init(value, childErrorField, defaultVocbase,
                                 subDefaults, nullptr, referencedAnalyzers)) {
          errorField = fieldName + "." + name + "." + childErrorField;

          return false;
        }
      }
    }
  }

  return true;
}

/// @brief append the attributes of this meta to an already open object
///
/// 'analyzers', 'includeAllFields', 'trackListPositions' and 'storeValues' are
/// written only if they differ from 'ignoreEqual' (when given) and are enabled
/// in 'mask' (when given). 'fields' is only subject to 'mask'.
///
/// @param builder        target, must have an open object
/// @param ignoreEqual    if set, attributes equal to this meta are skipped
/// @param defaultVocbase if set, analyzer names are normalized relative to it
///                       without expanding the vocbase prefix
/// @param mask           if set, selects the attributes to output
/// @return false if 'builder' has no open object, the system vocbase cannot be
///         obtained, a nested field fails to serialize or '_storeValues' has
///         no known string representation; the builder may already contain
///         partial output in that case
bool FieldMeta::json(velocypack::Builder& builder,
                     FieldMeta const* ignoreEqual /*= nullptr*/,
                     TRI_vocbase_t const* defaultVocbase /*= nullptr*/,
                     Mask const* mask /*= nullptr*/) const {
  if (!builder.isOpenObject()) {
    return false;
  }

  // NOTE(review): populated below but never read within this function
  std::map analyzers;

  if ((!ignoreEqual || !equalAnalyzers(_analyzers, ignoreEqual->_analyzers)) &&
      (!mask || mask->_analyzers)) {
    velocypack::Builder analyzersBuilder;

    analyzersBuilder.openArray();

    for (auto& entry : _analyzers) {
      if (!entry._pool) {
        continue;  // skip null analyzers
      }

      std::string name;

      if (defaultVocbase) {
        auto& server = application_features::ApplicationServer::server();
        auto sysVocbase = server.getFeature().use();

        if (!sysVocbase) {
          return false;
        }

        // @note: DBServerAgencySync::getLocalCollections(...) generates
        //        'forPersistence' definitions that are then compared in
        //        Maintenance.cpp:compareIndexes(...) via
        //        arangodb::Index::Compare(...) without access to
        //        'defaultVocbase', hence the generated definitions must not
        //        rely on 'defaultVocbase'
        //        hence must use 'expandVocbasePrefix==true' if
        //        'writeAnalyzerDefinition==true' for normalize
        //        for 'writeAnalyzerDefinition==false' must use
        //        'expandVocbasePrefix==false' so that dump/restore can restore
        //        definitions into differently named databases
        name = IResearchAnalyzerFeature::normalize(  // normalize
            entry._pool->name(),  // analyzer name
            *defaultVocbase,  // active vocbase
            *sysVocbase,  // system vocbase
            false  // expand vocbase prefix
        );
      } else {
        name = entry._pool->name();  // verbatim (assume already normalized)
      }

      analyzers.emplace(name, entry._pool);
      analyzersBuilder.add(velocypack::Value(std::move(name)));
    }

    analyzersBuilder.close();
    builder.add("analyzers", analyzersBuilder.slice());
  }

  if (!mask || mask->_fields) {  // fields are not inherited from parent
    velocypack::Builder fieldsBuilder;
    Mask fieldMask(true);  // output all non-matching fields
    auto subDefaults = *this;  // make modifiable copy

    subDefaults._fields.clear();  // do not inherit fields and overrides from this field
    fieldsBuilder.openObject();

    for (auto& entry : _fields) {
      fieldMask._fields = !entry.value()->_fields.empty();  // do not output empty fields on subobjects
      fieldsBuilder.add(  // add sub-object
          entry.key(),  // field name
          velocypack::Value(velocypack::ValueType::Object)
      );

      // children only output what differs from this (parent) meta
      if (!entry.value()->json(fieldsBuilder, &subDefaults, defaultVocbase, &fieldMask)) {
        return false;
      }

      fieldsBuilder.close();
    }

    fieldsBuilder.close();
    builder.add("fields", fieldsBuilder.slice());
  }

  if ((!ignoreEqual || _includeAllFields != ignoreEqual->_includeAllFields) &&
      (!mask || mask->_includeAllFields)) {
    builder.add("includeAllFields", velocypack::Value(_includeAllFields));
  }

  if ((!ignoreEqual || _trackListPositions != ignoreEqual->_trackListPositions) &&
      (!mask || mask->_trackListPositions)) {
    builder.add("trackListPositions", velocypack::Value(_trackListPositions));
  }

  if ((!ignoreEqual || _storeValues != ignoreEqual->_storeValues) &&
      (!mask || mask->_storeValues)) {
    // the table below is indexed by the numeric value of the enum
    static_assert(adjacencyChecker::checkAdjacency(),
                  "Values are not adjacent");

    static const std::string policies[]{
        "none",  // ValueStorage::NONE
        "id",  // ValueStorage::ID
        "value"  // ValueStorage::VALUE
    };

    auto const policyIdx =
        static_cast::type>(_storeValues);

    if (policyIdx >= IRESEARCH_COUNTOF(policies)) {
      return false;  // unsupported value storage policy
    }

    builder.add("storeValues", velocypack::Value(policies[policyIdx]));
  }

  return true;
}

/// @brief estimate of the memory used by this meta: the object itself, one
///        element size per analyzer and per nested field, plus the key length
///        and the recursive estimate of every nested field
size_t FieldMeta::memory() const noexcept {
  auto size = sizeof(FieldMeta);

  size += _analyzers.size() * sizeof(decltype(_analyzers)::value_type);
  size += _fields.size() * sizeof(decltype(_fields)::value_type);

  for (auto& entry : _fields) {
    size += entry.key().size();
    size += entry.value()->memory();
  }

  return size;
}

// -----------------------------------------------------------------------------
// --SECTION--                                                 IResearchLinkMeta
// -----------------------------------------------------------------------------

/// @brief default link meta: registers the pool of every analyzer present after
///        base-class construction as an analyzer definition
IResearchLinkMeta::IResearchLinkMeta() {
  // add default analyzers
  for (auto& analyzer : _analyzers) {
    _analyzerDefinitions.emplace(analyzer._pool);
  }
}

/// @brief compare the FieldMeta part and the primary sort
/// @note '_analyzerDefinitions' does not take part in the comparison
bool IResearchLinkMeta::operator==(IResearchLinkMeta const& other) const noexcept {
  if (FieldMeta::operator!=(other)) {
    return false;
  }

  if (_sort != other._sort) {
    return false;
  }

  return true;
}

/// @brief shared default-constructed IResearchLinkMeta instance
///        (function-local static)
/*static*/ const IResearchLinkMeta& IResearchLinkMeta::DEFAULT() {
  static const IResearchLinkMeta meta;

  return meta;
}

/// @brief initialize this link meta from a VelocyPack object
///
/// Reads 'primarySort' and 'analyzerDefinitions' (both only if
/// 'readAnalyzerDefinition' is set) and then delegates the remaining
/// attributes to FieldMeta::init(...), passing the definitions read here as
/// the set of referenced analyzers.
///
/// @param slice                  definition to read, must be an object
/// @param readAnalyzerDefinition whether 'primarySort' and
///                               'analyzerDefinitions' should be parsed
/// @param errorField             set to the path of the offending attribute on
///                               failure
/// @param defaultVocbase         if set (and the system vocbase is available)
///                               analyzer names are normalized relative to it
/// @param defaults               source of values for absent FieldMeta
///                               attributes
/// @param mask                   if set, receives per attribute whether it was
///                               present in 'slice'
/// @return false on the first validation failure; members processed before the
///         failure have already been modified
bool IResearchLinkMeta::init(velocypack::Slice const& slice,
                             bool readAnalyzerDefinition,
                             std::string& errorField,
                             TRI_vocbase_t const* defaultVocbase /*= nullptr*/,
                             FieldMeta const& defaults /*= DEFAULT()*/,
                             Mask* mask /*= nullptr*/) {
  if (!slice.isObject()) {
    return false;
  }

  // allows writing through 'mask' unconditionally below
  Mask tmpMask;

  if (!mask) {
    mask = &tmpMask;
  }

  {
    // optional sort
    static VPackStringRef const fieldName("primarySort");
    auto const field = slice.get(fieldName);
    // the mask is set for any array value, even if it is not parsed below
    mask->_sort = field.isArray();

    if (readAnalyzerDefinition && mask->_sort) {
      if (!_sort.fromVelocyPack(field, errorField)) {
        return false;
      }
    }
  }

  {
    // clear existing definitions
    _analyzerDefinitions.clear();

    // optional object list
    static const std::string fieldName("analyzerDefinitions");

    mask->_analyzerDefinitions = slice.hasKey(fieldName);

    // load analyzer definitions if requested (used on cluster)
    // @note must load definitions before loading 'analyzers' to ensure presence
    if (readAnalyzerDefinition && mask->_analyzerDefinitions) {
      auto& server = application_features::ApplicationServer::server();
      auto& sysDatabase = server.getFeature();
      auto field = slice.get(fieldName);

      if (!field.isArray()) {
        errorField = fieldName;

        return false;
      }

      for (velocypack::ArrayIterator itr(field); itr.valid(); ++itr) {
        auto value = *itr;

        if (!value.isObject()) {
          errorField = fieldName + "[" + std::to_string(itr.index()) + "]";

          return false;
        }

        std::string name;

        {
          // required string value
          static const std::string subFieldName("name");

          if (!value.hasKey(subFieldName)  // missing required field
              || !value.get(subFieldName).isString()) {
            errorField = fieldName + "[" + std::to_string(itr.index()) + "]." + subFieldName;

            return false;
          }

          name = value.get(subFieldName).copyString();

          if (defaultVocbase) {
            auto sysVocbase = sysDatabase.use();

            if (sysVocbase) {
              name = IResearchAnalyzerFeature::normalize(  // normalize
                  name, *defaultVocbase, *sysVocbase, true  // args
              );
            }
          }
        }

        irs::string_ref type;

        {
          // required string value
          static const std::string subFieldName("type");

          if (!value.hasKey(subFieldName)  // missing required field
              || !value.get(subFieldName).isString()) {
            errorField = fieldName + "[" + std::to_string(itr.index()) + "]." + subFieldName;

            return false;
          }

          type = getStringRef(value.get(subFieldName));
        }

        VPackSlice properties;

        {
          // optional value, must be an object or null if present
          static const std::string subFieldName("properties");

          if (value.hasKey(subFieldName)) {
            auto subField = value.get(subFieldName);

            if (!subField.isObject() && !subField.isNull()) {
              errorField = fieldName + "[" + std::to_string(itr.index()) + "]." + subFieldName;

              return false;
            }

            properties = subField;
          }
        }

        irs::flags features;

        {
          // optional string list
          static const std::string subFieldName("features");

          if (value.hasKey(subFieldName)) {
            auto subField = value.get(subFieldName);

            if (!subField.isArray()) {
              errorField = fieldName + "[" + std::to_string(itr.index()) + "]." + subFieldName;

              return false;
            }

            for (velocypack::ArrayIterator subItr(subField);
                 subItr.valid(); ++subItr) {
              auto subValue = *subItr;

              // entries may be strings or null
              if (!subValue.isString() && !subValue.isNull()) {
                // NOTE(review): the doubled '+ +' below looks like a typo (the
                // second '+' acts as a unary plus on the string literal)
                errorField = fieldName + "[" + std::to_string(itr.index()) + "]." + subFieldName + "[" + std::to_string(subItr.index()) + + "]";

                return false;
              }

              auto featureName = getStringRef(subValue);
              auto* feature = irs::attribute::type_id::get(featureName);

              // names without a registered attribute type are rejected
              if (!feature) {
                errorField = fieldName + "[" + std::to_string(itr.index()) + "]." + subFieldName + "." + std::string(featureName);

                return false;
              }

              features.add(*feature);
            }
          }
        }

        AnalyzerPool::ptr analyzer;
        auto const res = IResearchAnalyzerFeature::createAnalyzerPool(analyzer, name, type, properties, features);

        if (res.fail() || !analyzer) {
          errorField = fieldName + "[" + std::to_string(itr.index()) + "]";

          if (res.fail()) {
            errorField.append(": ").append(res.errorMessage());
          }

          return false;
        }

        _analyzerDefinitions.emplace(analyzer);
      }
    }
  }

  // the definitions collected above serve as lookup source for 'analyzers'
  return FieldMeta::init(slice, errorField, defaultVocbase, defaults, mask, &_analyzerDefinitions);
}

/// @brief append the attributes of this link meta to an already open object
///
/// 'primarySort' and 'analyzerDefinitions' are written only if
/// 'writeAnalyzerDefinition' is set (and subject to 'ignoreEqual' / 'mask' as
/// coded below); the remaining attributes are written by FieldMeta::json(...).
///
/// @return false if 'builder' has no open object, the sort fails to serialize
///         or FieldMeta::json(...) fails
bool IResearchLinkMeta::json(velocypack::Builder& builder,
                             bool writeAnalyzerDefinition,
                             IResearchLinkMeta const* ignoreEqual /*= nullptr*/,
                             TRI_vocbase_t const* defaultVocbase /*= nullptr*/,
                             Mask const* mask /*= nullptr*/) const {
  if (!builder.isOpenObject()) {
    return false;
  }

  if (writeAnalyzerDefinition
      && (!ignoreEqual || _sort != ignoreEqual->_sort)
      && (!mask || mask->_sort)) {
    velocypack::ArrayBuilder arrayScope(&builder, "primarySort");
    if (!_sort.toVelocyPack(builder)) {
      return false;
    }
  }

  // output definitions if 'writeAnalyzerDefinition' requested and not masked
  // this should be the case for the default top-most call
  if (writeAnalyzerDefinition && (!mask || mask->_analyzerDefinitions)) {
    VPackArrayBuilder arrayScope(&builder, "analyzerDefinitions");

    for (auto& entry: _analyzerDefinitions) {
      TRI_ASSERT(entry);  // ensured by emplace into 'analyzers' above
      entry->toVelocyPack(builder, defaultVocbase);
    }
  }

  return FieldMeta::json(builder, ignoreEqual, defaultVocbase, mask);
}

/// @brief estimate of the memory used by this link meta
///
/// NOTE(review): '_analyzers' is accounted for here and once more inside
/// FieldMeta::memory(), which additionally adds sizeof(FieldMeta); the result
/// therefore looks like an over-estimate — confirm whether this is intended.
size_t IResearchLinkMeta::memory() const noexcept {
  auto size = sizeof(IResearchLinkMeta);

  size += _analyzers.size() * sizeof(decltype(_analyzers)::value_type);
  size += _sort.memory();
  size += FieldMeta::memory();

  return size;
}

}  // namespace iresearch
}  // namespace arangodb