mirror of https://gitee.com/bigwinds/arangodb
Levenshtein Distance function implementation with documentation and i… (#5922)
This commit is contained in:
parent
d4d079c014
commit
7db28da251
|
@ -239,6 +239,31 @@ LENGTH("foobar") // 6
|
|||
LENGTH("电脑坏了") // 4
|
||||
```
|
||||
|
||||
LEVENSHTEIN_DISTANCE()
|
||||
------
|
||||
|
||||
`LEVENSHTEIN_DISTANCE(value1, value2) → levenshteinDistance`

Return the calculated Levenshtein distance between the input strings *value1* and *value2*.

- **value1** (string): a string
- **value2** (string): a string
- returns **levenshteinDistance** (number): calculated Levenshtein distance between the input strings *value1* and *value2*
|
||||
|
||||
```js
|
||||
LEVENSHTEIN_DISTANCE("foobar", "bar") // 3
|
||||
LEVENSHTEIN_DISTANCE(" ", "") // 1
|
||||
LEVENSHTEIN_DISTANCE("The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox") // 13
|
||||
LEVENSHTEIN_DISTANCE("der mötör trötet", "der trötet") // 6
|
||||
```
|
||||
|
||||
*LENGTH()* can also determine the [number of elements](Array.md#length) in an array,
|
||||
the [number of attribute keys](Document.md#length) of an object / document and
|
||||
the [amount of documents](Miscellaneous.md#length) in a collection.
|
||||
|
|
|
@ -469,8 +469,9 @@ The following AQL functions have been added in ArangoDB 3.4:
|
|||
comparison order
|
||||
* `SORTED_UNIQUE`: same as `SORTED`, but additionally removes duplicates
|
||||
* `COUNT_DISTINCT`: counts the number of distinct / unique items in an array
|
||||
|
||||
The following AQL functions have been added to make working with geographical
|
||||
* `LEVENSHTEIN_DISTANCE`: calculates the Levenshtein distance between two string values
|
||||
|
||||
The following AQL functions have been added to make working with geographical
|
||||
data easier:
|
||||
|
||||
* `GEO_POINT`
|
||||
|
|
|
@ -202,6 +202,7 @@ void AqlFunctionFeature::addStringFunctions() {
|
|||
add({"ENCODE_URI_COMPONENT", ".", true, false, true, &Functions::EncodeURIComponent});
|
||||
add({"UUID", "", true, false, true, &Functions::UUID});
|
||||
add({"SOUNDEX", ".", true, false, true, &Functions::Soundex});
|
||||
add({"LEVENSHTEIN_DISTANCE", ".,.", true, false, true, &Functions::LevenshteinDistance});
|
||||
// FULLTEXT is replaced by the AQL optimizer with an index lookup
|
||||
add({"FULLTEXT", ".h,.,.|." , false, true, false, &Functions::NotImplemented});
|
||||
}
|
||||
|
|
|
@ -1509,6 +1509,29 @@ AqlValue Functions::Soundex(arangodb::aql::Query*,
|
|||
return AqlValue(encoded);
|
||||
}
|
||||
|
||||
|
||||
/// @brief function LEVENSHTEIN_DISTANCE
|
||||
AqlValue Functions::LevenshteinDistance(arangodb::aql::Query*,
|
||||
transaction::Methods* trx,
|
||||
VPackFunctionParameters const& parameters) {
|
||||
ValidateParameters(parameters, "LEVENSHTEIN_DISTANCE", 2, 2);
|
||||
AqlValue value1 = ExtractFunctionParameterValue(parameters, 0);
|
||||
AqlValue value2 = ExtractFunctionParameterValue(parameters, 1);
|
||||
|
||||
transaction::StringBufferLeaser buffer1(trx);
|
||||
transaction::StringBufferLeaser buffer2(trx);
|
||||
|
||||
arangodb::basics::VPackStringBufferAdapter adapter1(buffer1->stringBuffer());
|
||||
arangodb::basics::VPackStringBufferAdapter adapter2(buffer2->stringBuffer());
|
||||
|
||||
::appendAsString(trx, adapter1, value1);
|
||||
::appendAsString(trx, adapter2, value2);
|
||||
|
||||
int encoded = basics::StringUtils::levenshteinDistance(std::string(buffer1->begin(), buffer1->length()), std::string(buffer2->begin(), buffer2->length()));
|
||||
|
||||
return AqlValue(AqlValueHintInt(encoded));
|
||||
}
|
||||
|
||||
/// @brief function TO_BOOL
|
||||
AqlValue Functions::ToBool(arangodb::aql::Query*,
|
||||
transaction::Methods* trx,
|
||||
|
|
|
@ -161,6 +161,8 @@ struct Functions {
|
|||
VPackFunctionParameters const&);
|
||||
static AqlValue Soundex(arangodb::aql::Query*, transaction::Methods*,
|
||||
VPackFunctionParameters const&);
|
||||
static AqlValue LevenshteinDistance(arangodb::aql::Query*, transaction::Methods*,
|
||||
VPackFunctionParameters const&);
|
||||
// Date
|
||||
static AqlValue DateFromParameters(arangodb::aql::Query* query,
|
||||
transaction::Methods* trx,
|
||||
|
|
|
@ -202,6 +202,51 @@ function ahuacatlStringFunctionsTestSuite () {
|
|||
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN SOUNDEX("test", "meow", "foo", "bar")');
|
||||
},
|
||||
|
||||
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
// / @brief test LevenshteinDistance
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
testToLevenshteinDistanceValues: function () {
|
||||
[
|
||||
[ null, "", 0 ],
|
||||
[ null, null, 0 ],
|
||||
[ "", "", 0 ],
|
||||
[ "", "foobar", 6 ],
|
||||
[ "foobar", "", 6 ],
|
||||
[ "foobar", "foo", 3 ],
|
||||
[ "foo", "foobar", 3 ],
|
||||
[ "or", "of", 1 ],
|
||||
[ "or", "", 2 ],
|
||||
[ "or", "The quick brown fox jumps over the lazy dog", 41 ],
|
||||
[ true, "foobar", 6 ],
|
||||
[ false, "foobar", 5 ],
|
||||
[ "foobar", true, 6 ],
|
||||
[ "foobar", false, 5 ],
|
||||
[ true, true, 0 ],
|
||||
[ false, false, 0 ],
|
||||
[ true, false, 4 ],
|
||||
[ false, true, 4 ],
|
||||
[ "", "", 0 ],
|
||||
[ " ", "", 1 ],
|
||||
[ "", " ", 1 ],
|
||||
[ "der mötör trötet", "der mötör trötet", 0 ],
|
||||
[ "der mötör trötet", "der trötet", 6 ],
|
||||
[ "der mötör trötet", "dertrötet", 7 ],
|
||||
[ "Öööööö", "öö", 4 ],
|
||||
[ "The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox", 13 ],
|
||||
[ "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, 'Lorem ipsum dolor sit amet..', comes from a line in section 1.10.32. The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from 'de Finibus Bonorum et Malorum' by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham..", "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, 'Lorem ipsum dolor sit amet..', comes from a line in section 1.10.32. The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. 
Sections 1.10.32 and 1.10.33 from 'de Finibus Bonorum et Malorum' by Cicero are also reproduced in their exact original form.", 74 ],
|
||||
].forEach(function(test) {
|
||||
assertEqual([ test[2] ], getQueryResults('RETURN LEVENSHTEIN_DISTANCE(' + JSON.stringify(test[0]) + ', ' + JSON.stringify(test[1]) + ')'), test);
|
||||
});
|
||||
},
|
||||
|
||||
testLevenshteinDistanceInvalidNumberOfParameters: function () {
|
||||
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE()');
|
||||
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test")');
|
||||
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test", "meow", "foo")');
|
||||
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test", "meow", "foo", "bar")');
|
||||
},
|
||||
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
// / @brief test JSON_STRINGIFY
|
||||
// //////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -25,6 +25,9 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
|
@ -235,6 +238,71 @@ bool isLowSurrugate(uint32_t number) {
|
|||
return (number >= 0xDC00) && (number <= 0xDFFF);
|
||||
}
|
||||
|
||||
/// @brief returns the byte *s* currently points at, as an unsigned value,
/// and moves *s* forward by one byte. performs no bounds checking
unsigned char consume(char const*& s) {
  unsigned char const* current = reinterpret_cast<unsigned char const*>(s);
  ++s;
  return *current;
}
|
||||
|
||||
/// @brief equality predicate for two sequence elements; defers to
/// InputType's operator==
template<typename InputType>
inline bool isEqual(InputType const& c1, InputType const& c2) {
  bool const same = (c1 == c2);
  return same;
}
|
||||
|
||||
template<typename InputType, typename LengthType>
|
||||
LengthType levenshtein(InputType const* lhs,
|
||||
InputType const* rhs,
|
||||
LengthType lhsSize,
|
||||
LengthType rhsSize) {
|
||||
TRI_ASSERT(lhsSize >= rhsSize);
|
||||
|
||||
std::vector<LengthType> costs;
|
||||
costs.resize(rhsSize + 1);
|
||||
|
||||
for (LengthType i = 0; i < rhsSize; ++i) {
|
||||
costs[i] = i;
|
||||
}
|
||||
|
||||
LengthType next = 0;
|
||||
|
||||
for (LengthType i = 0; i < lhsSize; ++i) {
|
||||
LengthType current = i + 1;
|
||||
|
||||
for (LengthType j = 0; j < rhsSize; ++j) {
|
||||
LengthType cost = !(::isEqual<InputType>(lhs[i], rhs[j]) ||
|
||||
(i && j && ::isEqual<InputType>(lhs[i - 1], rhs[j]) && ::isEqual<InputType>(lhs[i], rhs[j - 1])));
|
||||
next = std::min(std::min(costs[j + 1] + 1, current + 1), costs[j] + cost);
|
||||
costs[j] = current;
|
||||
current = next;
|
||||
}
|
||||
costs[rhsSize] = next;
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
size_t levenshteinDistance(std::vector<uint32_t>& vect1, std::vector<uint32_t>& vect2) {
|
||||
if (vect1.empty() || vect2.empty()) {
|
||||
return vect1.size() ? vect1.size() : vect2.size();
|
||||
}
|
||||
|
||||
if (vect1.size() < vect2.size()) {
|
||||
vect1.swap(vect2);
|
||||
}
|
||||
|
||||
size_t lhsSize = vect1.size();
|
||||
size_t rhsSize = vect2.size();
|
||||
|
||||
uint32_t const* l = vect1.data();
|
||||
uint32_t const* r = vect2.data();
|
||||
|
||||
if (lhsSize < std::numeric_limits<uint8_t>::max()) {
|
||||
return static_cast<size_t>(::levenshtein<uint32_t, uint8_t>(l, r, lhsSize, rhsSize));
|
||||
} else if (lhsSize < std::numeric_limits<uint16_t>::max()) {
|
||||
return static_cast<size_t>(::levenshtein<uint32_t, uint16_t>(l, r, lhsSize, rhsSize));
|
||||
} else if (lhsSize < std::numeric_limits<uint32_t>::max()) {
|
||||
return static_cast<size_t>(::levenshtein<uint32_t, uint32_t>(l, r, lhsSize, rhsSize));
|
||||
}
|
||||
return static_cast<size_t>(::levenshtein<uint32_t, uint64_t>(l, r, lhsSize, rhsSize));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace arangodb {
|
||||
|
@ -1406,17 +1474,70 @@ std::string soundex(char const* src, size_t const len) {
|
|||
return result;
|
||||
}
|
||||
|
||||
unsigned int levenshteinDistance(std::string const& str1, std::string const& str2) {
|
||||
// convert input strings to vectors of (multi-byte) character numbers
|
||||
std::vector<uint32_t> vect1 = characterCodes(str1);
|
||||
std::vector<uint32_t> vect2 = characterCodes(str2);
|
||||
|
||||
// calculate levenshtein distance on vectors of character numbers
|
||||
return static_cast<unsigned int>(::levenshteinDistance(vect1, vect2));
|
||||
}
|
||||
|
||||
std::vector<uint32_t> characterCodes(std::string const& str) {
|
||||
char const* s = str.data();
|
||||
char const* e = s + str.size();
|
||||
|
||||
std::vector<uint32_t> charNums;
|
||||
// be conservative, and reserve space for one number of input
|
||||
// string byte. this may be too much, but it avoids later
|
||||
// reallocation of the vector
|
||||
charNums.reserve(str.size());
|
||||
|
||||
while (s < e) {
|
||||
// note: consume advances the *s* pointer by one byte
|
||||
unsigned char c = ::consume(s);
|
||||
uint32_t n = uint32_t(c);
|
||||
|
||||
if ((c & 0x80U) == 0U) {
|
||||
// single-byte character
|
||||
charNums.push_back(n);
|
||||
} else if ((c & 0xE0U) == 0xC0U) {
|
||||
// two-byte character
|
||||
if (s >= e) {
|
||||
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence");
|
||||
}
|
||||
charNums.push_back((n << 8U) + uint32_t(::consume(s)));
|
||||
} else if ((c & 0xF0U) == 0xE0U) {
|
||||
// three-byte character
|
||||
if (s + 1 >= e) {
|
||||
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence");
|
||||
}
|
||||
charNums.push_back((n << 16U) + (uint32_t(::consume(s)) << 8U) + (uint32_t(::consume(s))));
|
||||
} else if ((c & 0xF8U) == 0XF0U){
|
||||
// four-byte character
|
||||
if (s + 2 >= e) {
|
||||
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence");
|
||||
}
|
||||
charNums.push_back((n << 24U) + (uint32_t(::consume(s)) << 16U) + (uint32_t(::consume(s)) << 8U) + (uint32_t(::consume(s))));
|
||||
} else {
|
||||
THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence");
|
||||
}
|
||||
}
|
||||
|
||||
return charNums;
|
||||
}
|
||||
|
||||
// .............................................................................
|
||||
// CONVERT TO STRING
|
||||
// .............................................................................
|
||||
|
||||
std::string itoa(int16_t attr) {
|
||||
char buffer[7];
|
||||
char* p = buffer;
|
||||
|
||||
if (attr == INT16_MIN) {
|
||||
return "-32768";
|
||||
}
|
||||
|
||||
char buffer[7];
|
||||
char* p = buffer;
|
||||
|
||||
if (attr < 0) {
|
||||
*p++ = '-';
|
||||
|
@ -1466,12 +1587,12 @@ std::string itoa(uint16_t attr) {
|
|||
}
|
||||
|
||||
std::string itoa(int32_t attr) {
|
||||
char buffer[12];
|
||||
char* p = buffer;
|
||||
|
||||
if (attr == INT32_MIN) {
|
||||
return "-2147483648";
|
||||
}
|
||||
|
||||
char buffer[12];
|
||||
char* p = buffer;
|
||||
|
||||
if (attr < 0) {
|
||||
*p++ = '-';
|
||||
|
@ -1551,12 +1672,12 @@ std::string itoa(uint32_t attr) {
|
|||
}
|
||||
|
||||
std::string itoa(int64_t attr) {
|
||||
char buffer[21];
|
||||
char* p = buffer;
|
||||
|
||||
if (attr == INT64_MIN) {
|
||||
return "-9223372036854775808";
|
||||
}
|
||||
|
||||
char buffer[21];
|
||||
char* p = buffer;
|
||||
|
||||
if (attr < 0) {
|
||||
*p++ = '-';
|
||||
|
@ -1776,9 +1897,8 @@ bool boolean(std::string const& str) {
|
|||
if (lower == "true" || lower == "yes" || lower == "on" || lower == "y" ||
|
||||
lower == "1" || lower == "✓") {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int64_t int64(std::string const& str) {
|
||||
|
@ -2064,9 +2184,7 @@ float floatDecimal(char const* value, size_t size) {
|
|||
bool unicodeToUTF8(char const* inputStr, size_t const& len,
|
||||
std::string& outputStr) {
|
||||
uint32_t outputInt = 0;
|
||||
bool ok;
|
||||
|
||||
ok = parseHexanumber(inputStr, len, &outputInt);
|
||||
bool ok = parseHexanumber(inputStr, len, &outputInt);
|
||||
if (ok == false) {
|
||||
outputStr = std::string(inputStr, len);
|
||||
return false;
|
||||
|
|
|
@ -27,6 +27,8 @@
|
|||
|
||||
#include "Basics/Common.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace arangodb {
|
||||
namespace basics {
|
||||
|
||||
|
@ -220,7 +222,13 @@ std::string soundex(std::string const& str);
|
|||
|
||||
/// @brief converts input string to soundex code
|
||||
std::string soundex(char const* src, size_t const len);
|
||||
|
||||
|
||||
/// @brief converts input string to vector of character codes
|
||||
std::vector<uint32_t> characterCodes(std::string const& str);
|
||||
|
||||
/// @brief calculates the levenshtein distance between the input strings
|
||||
unsigned int levenshteinDistance(std::string const& str1, std::string const& str2);
|
||||
|
||||
/// @brief unicode hexadecimal characters to utf8
|
||||
bool unicodeToUTF8(char const* inputStr, size_t const& len,
|
||||
std::string& outputStr);
|
||||
|
|
Loading…
Reference in New Issue