1
0
Fork 0
arangodb/arangod/Aql/RegexCache.cpp

245 lines
7.2 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Jan Steemann
////////////////////////////////////////////////////////////////////////////////
#include "RegexCache.h"
#include "Basics/Utf8Helper.h"
#include <Basics/StringUtils.h>
#include <velocypack/Collection.h>
#include <velocypack/Dumper.h>
#include <velocypack/Iterator.h>
#include <velocypack/velocypack-aliases.h>
using namespace arangodb::aql;
RegexCache::~RegexCache() { clear(); }
void RegexCache::clear() noexcept {
_regexCache.clear();
_likeCache.clear();
}
icu::RegexMatcher* RegexCache::buildRegexMatcher(char const* ptr, size_t length,
bool caseInsensitive) {
buildRegexPattern(_temp, ptr, length, caseInsensitive);
return fromCache(_temp, _regexCache);
}
icu::RegexMatcher* RegexCache::buildLikeMatcher(char const* ptr, size_t length,
bool caseInsensitive) {
buildLikePattern(_temp, ptr, length, caseInsensitive);
return fromCache(_temp, _likeCache);
}
icu::RegexMatcher* RegexCache::buildSplitMatcher(AqlValue const& splitExpression,
arangodb::transaction::Methods* trx,
bool& isEmptyExpression) {
std::string rx;
AqlValueMaterializer materializer(trx);
VPackSlice slice = materializer.slice(splitExpression, false);
if (splitExpression.isArray()) {
for (auto const& it : VPackArrayIterator(slice)) {
if (!it.isString() || it.getStringLength() == 0) {
// one empty string rules them all
isEmptyExpression = true;
rx = "";
break;
}
if (rx.size() != 0) {
rx += '|';
}
arangodb::velocypack::ValueLength length;
char const* str = it.getString(length);
basics::StringUtils::escapeRegexParams(rx, str, length);
}
} else if (splitExpression.isString()) {
arangodb::velocypack::ValueLength length;
char const* str = slice.getString(length);
basics::StringUtils::escapeRegexParams(rx, str, length);
if (rx.empty()) {
isEmptyExpression = true;
}
} else {
rx.clear();
}
return fromCache(rx, _likeCache);
}
/// @brief get matcher from cache, or insert a new matcher for the specified
/// pattern
icu::RegexMatcher* RegexCache::fromCache(
std::string const& pattern,
std::unordered_map<std::string, std::unique_ptr<icu::RegexMatcher>>& cache) {
auto it = cache.find(pattern);
if (it != cache.end()) {
return (*it).second.get();
}
auto matcher = std::unique_ptr<icu::RegexMatcher>(
arangodb::basics::Utf8Helper::DefaultUtf8Helper.buildMatcher(pattern));
auto p = matcher.get();
// insert into cache, no matter if pattern is valid or not
cache.emplace(pattern, std::move(matcher));
return p;
}
/// @brief compile a REGEX pattern from a string
void RegexCache::buildRegexPattern(std::string& out, char const* ptr,
size_t length, bool caseInsensitive) {
out.clear();
if (caseInsensitive) {
out.reserve(length + 4);
out.append("(?i)");
}
out.append(ptr, length);
}
/// @brief compile a LIKE pattern from a string
void RegexCache::buildLikePattern(std::string& out, char const* ptr,
size_t length, bool caseInsensitive) {
out.clear();
out.reserve(length + 8); // reserve some room
// pattern is always anchored
out.push_back('^');
if (caseInsensitive) {
out.append("(?i)");
}
bool escaped = false;
for (size_t i = 0; i < length; ++i) {
char const c = ptr[i];
if (c == '\\') {
if (escaped) {
// literal backslash
out.append("\\\\");
}
escaped = !escaped;
} else {
if (c == '%') {
if (escaped) {
// literal %
out.push_back('%');
} else {
// wildcard
out.append("(.|[\r\n])*");
}
} else if (c == '_') {
if (escaped) {
// literal underscore
out.push_back('_');
} else {
// wildcard character
out.append("(.|[\r\n])");
}
} else if (c == '?' || c == '+' || c == '[' || c == '(' || c == ')' ||
c == '{' || c == '}' || c == '^' || c == '$' || c == '|' ||
c == '\\' || c == '.' || c == '*') {
// character with special meaning in a regex
out.push_back('\\');
out.push_back(c);
} else {
if (escaped) {
// found a backslash followed by no special character
out.append("\\\\");
}
// literal character
out.push_back(c);
}
escaped = false;
}
}
// always anchor the pattern
out.push_back('$');
}
/// @brief inspect a LIKE pattern from a string, and remove all
/// of its escape characters. will stop at the first wildcards found.
/// returns a pair with the following meaning:
/// - first: true if the inspection aborted prematurely because a
/// wildcard was found, and false if the inspection analyzed at the
/// complete string
/// - second: true if the found wildcard is the last byte in the pattern,
/// false otherwise. can only be true if first is also true
std::pair<bool, bool> RegexCache::inspectLikePattern(std::string& out,
char const* ptr, size_t length) {
out.reserve(length);
bool escaped = false;
for (size_t i = 0; i < length; ++i) {
char const c = ptr[i];
if (c == '\\') {
if (escaped) {
// literal backslash
out.push_back('\\');
}
escaped = !escaped;
} else {
if (c == '%' || c == '_') {
if (escaped) {
// literal %
out.push_back(c);
} else {
// wildcard found
bool wildcardIsLastChar = (i == length - 1);
return std::make_pair(true, wildcardIsLastChar);
}
} else if (c == '?' || c == '+' || c == '[' || c == '(' || c == ')' ||
c == '{' || c == '}' || c == '^' || c == '$' || c == '|' ||
c == '\\' || c == '.' || c == '*') {
// character with special meaning in a regex
out.push_back(c);
} else {
if (escaped) {
// found a backslash followed by no special character
out.push_back('\\');
}
// literal character
out.push_back(c);
}
escaped = false;
}
}
return std::make_pair(false, false);
}