1
0
Fork 0

Levenshtein Distance function implementation with documentation and i… (#5922)

This commit is contained in:
shivamdave24 2018-07-25 03:48:58 -07:00 committed by Jan
parent d4d079c014
commit 7db28da251
8 changed files with 240 additions and 17 deletions

View File

@ -239,6 +239,31 @@ LENGTH("foobar") // 6
LENGTH("电脑坏了") // 4
```
LEVENSHTEIN_DISTANCE()
------
`LEVENSHTEIN_DISTANCE(value1, value2) → levenshteinDistance`
Return the calculated Levenshtein distance between the input strings *value1* and *value2*.
- **value1** (string): a string
- **value2** (string): a string
- returns **levenshteinDistance** (number): calculated Levenshtein distance between the input strings *value1* and *value2*
```js
LEVENSHTEIN_DISTANCE("foobar", "bar") // 3
LEVENSHTEIN_DISTANCE(" ", "") // 1
LEVENSHTEIN_DISTANCE("The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox") // 13
LEVENSHTEIN_DISTANCE("der mötör trötet", "der trötet") // 6
```
*LENGTH()* can also determine the [number of elements](Array.md#length) in an array,
the [number of attribute keys](Document.md#length) of an object / document and
the [amount of documents](Miscellaneous.md#length) in a collection.

View File

@ -469,8 +469,9 @@ The following AQL functions have been added in ArangoDB 3.4:
comparison order
* `SORTED_UNIQUE`: same as `SORTED`, but additionally removes duplicates
* `COUNT_DISTINCT`: counts the number of distinct / unique items in an array
* `LEVENSHTEIN_DISTANCE`: calculates the Levenshtein distance between two string values
The following AQL functions have been added to make working with geographical
data easier:
* `GEO_POINT`

View File

@ -202,6 +202,7 @@ void AqlFunctionFeature::addStringFunctions() {
add({"ENCODE_URI_COMPONENT", ".", true, false, true, &Functions::EncodeURIComponent});
add({"UUID", "", true, false, true, &Functions::UUID});
add({"SOUNDEX", ".", true, false, true, &Functions::Soundex});
add({"LEVENSHTEIN_DISTANCE", ".,.", true, false, true, &Functions::LevenshteinDistance});
// FULLTEXT is replaced by the AQL optimizer with an index lookup
add({"FULLTEXT", ".h,.,.|." , false, true, false, &Functions::NotImplemented});
}

View File

@ -1509,6 +1509,29 @@ AqlValue Functions::Soundex(arangodb::aql::Query*,
return AqlValue(encoded);
}
/// @brief function LEVENSHTEIN_DISTANCE
/// expects exactly two parameters. both values are rendered into leased
/// string buffers via appendAsString, and the distance between the two
/// resulting strings is returned as an integer AqlValue
AqlValue Functions::LevenshteinDistance(arangodb::aql::Query*,
                                        transaction::Methods* trx,
                                        VPackFunctionParameters const& parameters) {
  ValidateParameters(parameters, "LEVENSHTEIN_DISTANCE", 2, 2);

  AqlValue lhsValue = ExtractFunctionParameterValue(parameters, 0);
  AqlValue rhsValue = ExtractFunctionParameterValue(parameters, 1);

  // we use two string buffers that we'll populate with the
  // string representations of both input values
  transaction::StringBufferLeaser lhsBuffer(trx);
  transaction::StringBufferLeaser rhsBuffer(trx);
  arangodb::basics::VPackStringBufferAdapter lhsAdapter(lhsBuffer->stringBuffer());
  arangodb::basics::VPackStringBufferAdapter rhsAdapter(rhsBuffer->stringBuffer());

  ::appendAsString(trx, lhsAdapter, lhsValue);
  ::appendAsString(trx, rhsAdapter, rhsValue);

  // copy the buffer contents into std::strings, as required by the
  // StringUtils function
  std::string const lhsString(lhsBuffer->begin(), lhsBuffer->length());
  std::string const rhsString(rhsBuffer->begin(), rhsBuffer->length());

  int distance = basics::StringUtils::levenshteinDistance(lhsString, rhsString);

  return AqlValue(AqlValueHintInt(distance));
}
/// @brief function TO_BOOL
AqlValue Functions::ToBool(arangodb::aql::Query*,
transaction::Methods* trx,

View File

@ -161,6 +161,8 @@ struct Functions {
VPackFunctionParameters const&);
static AqlValue Soundex(arangodb::aql::Query*, transaction::Methods*,
VPackFunctionParameters const&);
/// @brief AQL function LEVENSHTEIN_DISTANCE: takes exactly two parameters
/// and returns the distance between their string representations as a number
static AqlValue LevenshteinDistance(arangodb::aql::Query*, transaction::Methods*,
VPackFunctionParameters const&);
// Date
static AqlValue DateFromParameters(arangodb::aql::Query* query,
transaction::Methods* trx,

View File

@ -202,6 +202,51 @@ function ahuacatlStringFunctionsTestSuite () {
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN SOUNDEX("test", "meow", "foo", "bar")');
},
// //////////////////////////////////////////////////////////////////////////////
// / @brief test LEVENSHTEIN_DISTANCE return values
// / every table row is [ value1, value2, expected distance ]. both values are
// / serialized with JSON.stringify into a RETURN LEVENSHTEIN_DISTANCE(...) query,
// / and the query result is compared against the expected distance
// //////////////////////////////////////////////////////////////////////////////
testToLevenshteinDistanceValues: function () {
[
// null values produce the same distance as empty strings
[ null, "", 0 ],
[ null, null, 0 ],
// plain ASCII strings, in both argument orders
[ "", "", 0 ],
[ "", "foobar", 6 ],
[ "foobar", "", 6 ],
[ "foobar", "foo", 3 ],
[ "foo", "foobar", 3 ],
[ "or", "of", 1 ],
[ "or", "", 2 ],
[ "or", "The quick brown fox jumps over the lazy dog", 41 ],
// boolean inputs: the expected values are consistent with the booleans
// being compared as the strings "true" / "false"
[ true, "foobar", 6 ],
[ false, "foobar", 5 ],
[ "foobar", true, 6 ],
[ "foobar", false, 5 ],
[ true, true, 0 ],
[ false, false, 0 ],
[ true, false, 4 ],
[ false, true, 4 ],
// whitespace counts as a regular character
[ "", "", 0 ],
[ " ", "", 1 ],
[ "", " ", 1 ],
// multi-byte characters: each one counts as a single character, not as
// the number of bytes it occupies
[ "der mötör trötet", "der mötör trötet", 0 ],
[ "der mötör trötet", "der trötet", 6 ],
[ "der mötör trötet", "dertrötet", 7 ],
[ "Öööööö", "öö", 4 ],
// longer inputs
[ "The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox", 13 ],
[ "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, 'Lorem ipsum dolor sit amet..', comes from a line in section 1.10.32. The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from 'de Finibus Bonorum et Malorum' by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham..", "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, 'Lorem ipsum dolor sit amet..', comes from a line in section 1.10.32. The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. 
Sections 1.10.32 and 1.10.33 from 'de Finibus Bonorum et Malorum' by Cicero are also reproduced in their exact original form.", 74 ],
].forEach(function(test) {
// the query returns a single row, so the result is a one-element array;
// the table row is passed as the assertion message to identify failures
assertEqual([ test[2] ], getQueryResults('RETURN LEVENSHTEIN_DISTANCE(' + JSON.stringify(test[0]) + ', ' + JSON.stringify(test[1]) + ')'), test);
});
},
testLevenshteinDistanceInvalidNumberOfParameters: function () {
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE()');
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test")');
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test", "meow", "foo")');
assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test", "meow", "foo", "bar")');
},
// //////////////////////////////////////////////////////////////////////////////
// / @brief test JSON_STRINGIFY
// //////////////////////////////////////////////////////////////////////////////

View File

@ -25,6 +25,9 @@
#include <stdio.h>
#include <ctype.h>
#include <vector>
#include <algorithm>
#include <limits>
#include <math.h>
#include <time.h>
@ -235,6 +238,71 @@ bool isLowSurrugate(uint32_t number) {
return (number >= 0xDC00) && (number <= 0xDFFF);
}
/// @brief returns the byte that *s* currently points to, interpreted as an
/// unsigned char, and advances *s* by one byte. the caller must make sure
/// that *s* points to a readable byte
unsigned char consume(char const*& s) {
  unsigned char const* current = reinterpret_cast<unsigned char const*>(s);
  ++s;
  return *current;
}
/// @brief equality predicate used by the distance calculation to compare
/// one element of the left-hand input with one element of the right-hand
/// input. simply forwards to operator== of the element type
template<typename ElementType>
inline bool isEqual(ElementType const& left, ElementType const& right) {
  bool const same = (left == right);
  return same;
}
/// @brief calculates the edit distance between the sequences *lhs* (with
/// *lhsSize* elements) and *rhs* (with *rhsSize* elements), using a single
/// row of costs that is updated in place for every element of *lhs*.
/// *lhs* must not be shorter than *rhs* (asserted).
/// LengthType is the integer type used for all counters and costs; the
/// caller selects it depending on the input length.
/// NOTE(review): a diagonal step is also free when the current and the
/// previous elements of both inputs are swapped, so an adjacent
/// transposition is cheaper than in a strict Levenshtein distance
/// (e.g. "aba" vs. "bab" yields 1)
template<typename InputType, typename LengthType>
LengthType levenshtein(InputType const* lhs,
                       InputType const* rhs,
                       LengthType lhsSize,
                       LengthType rhsSize) {
  TRI_ASSERT(lhsSize >= rhsSize);

  // cost row with rhsSize + 1 slots, all value-initialized to 0
  std::vector<LengthType> row(rhsSize + 1);
  // only the first rhsSize slots are seeded with their own index; the last
  // slot remains 0 until the first outer iteration has written it.
  // NOTE(review): this can only offer an extra candidate of lhsSize for the
  // final result, which the distance never exceeds when lhsSize >= rhsSize
  for (LengthType column = 0; column < rhsSize; ++column) {
    row[column] = column;
  }

  LengthType result = 0;

  for (LengthType i = 0; i < lhsSize; ++i) {
    // cost of the cell to the left of the one being computed
    LengthType left = i + 1;

    for (LengthType j = 0; j < rhsSize; ++j) {
      // the diagonal step is free if both elements are equal...
      bool freeStep = ::isEqual<InputType>(lhs[i], rhs[j]);
      if (!freeStep && i > 0 && j > 0) {
        // ...or if the current and the previous elements are swapped
        freeStep = ::isEqual<InputType>(lhs[i - 1], rhs[j]) &&
                   ::isEqual<InputType>(lhs[i], rhs[j - 1]);
      }
      LengthType const diagonalCost = freeStep ? 0 : 1;

      // row[j + 1] and row[j] still hold the values of the previous outer
      // iteration at this point
      auto const fromAbove = row[j + 1] + 1;
      auto const fromLeft = left + 1;
      auto const fromDiagonal = row[j] + diagonalCost;
      result = std::min(fromDiagonal, std::min(fromAbove, fromLeft));

      // slot j is not read again in this iteration, so it can take over the
      // value for the current outer iteration
      row[j] = left;
      left = result;
    }

    row[rhsSize] = result;
  }

  return result;
}
/// @brief calculates the distance between two vectors of character codes.
/// note: the input vectors are swapped in place if *vect1* is shorter than
/// *vect2*, because ::levenshtein expects the longer input first
size_t levenshteinDistance(std::vector<uint32_t>& vect1, std::vector<uint32_t>& vect2) {
  // if at least one input is empty, the distance is the length of the
  // other input (which is 0 if both are empty)
  if (vect1.empty() || vect2.empty()) {
    return std::max(vect1.size(), vect2.size());
  }

  if (vect2.size() > vect1.size()) {
    vect1.swap(vect2);
  }

  size_t const longer = vect1.size();
  size_t const shorter = vect2.size();
  uint32_t const* lhs = vect1.data();
  uint32_t const* rhs = vect2.data();

  // use the narrowest counter type that is still above the longer input's
  // length. this keeps the cost vector inside ::levenshtein small
  if (longer < std::numeric_limits<uint8_t>::max()) {
    return static_cast<size_t>(::levenshtein<uint32_t, uint8_t>(lhs, rhs, longer, shorter));
  }
  if (longer < std::numeric_limits<uint16_t>::max()) {
    return static_cast<size_t>(::levenshtein<uint32_t, uint16_t>(lhs, rhs, longer, shorter));
  }
  if (longer < std::numeric_limits<uint32_t>::max()) {
    return static_cast<size_t>(::levenshtein<uint32_t, uint32_t>(lhs, rhs, longer, shorter));
  }
  return static_cast<size_t>(::levenshtein<uint32_t, uint64_t>(lhs, rhs, longer, shorter));
}
} // namespace
namespace arangodb {
@ -1406,17 +1474,70 @@ std::string soundex(char const* src, size_t const len) {
return result;
}
/// @brief calculates the levenshtein distance between the input strings.
/// both strings are first split into their (multi-byte) characters, so the
/// distance is counted in characters and not in bytes
unsigned int levenshteinDistance(std::string const& str1, std::string const& str2) {
  // one number per (multi-byte) character of each input string
  std::vector<uint32_t> lhsCodes = characterCodes(str1);
  std::vector<uint32_t> rhsCodes = characterCodes(str2);

  // the actual calculation works on the character numbers. it may swap the
  // two vectors, which is fine because they are local copies
  size_t const distance = ::levenshteinDistance(lhsCodes, rhsCodes);
  return static_cast<unsigned int>(distance);
}
/// @brief converts input string to vector of character codes.
/// the input is scanned as UTF-8. every sequence of one to four bytes is
/// packed into a single uint32_t, with the bytes in input order (lead byte
/// in the most significant position). the numbers are the packed raw bytes,
/// not decoded Unicode code points - they are only meant to be compared for
/// equality.
/// raises an "invalid UTF-8 sequence" error (TRI_ERROR_INTERNAL) if a lead
/// byte is invalid or if the input ends in the middle of a sequence.
/// continuation bytes are not validated any further
std::vector<uint32_t> characterCodes(std::string const& str) {
  char const* s = str.data();
  char const* e = s + str.size();

  std::vector<uint32_t> charNums;
  // be conservative, and reserve space for one number per input
  // string byte. this may be too much, but it avoids later
  // reallocation of the vector
  charNums.reserve(str.size());

  while (s < e) {
    // note: consume advances the *s* pointer by one byte
    unsigned char const lead = ::consume(s);
    uint32_t code = uint32_t(lead);

    // number of continuation bytes announced by the lead byte
    size_t trailing = 0;
    if ((lead & 0x80U) == 0U) {
      // single-byte character
      trailing = 0;
    } else if ((lead & 0xE0U) == 0xC0U) {
      // two-byte character
      trailing = 1;
    } else if ((lead & 0xF0U) == 0xE0U) {
      // three-byte character
      trailing = 2;
    } else if ((lead & 0xF8U) == 0xF0U) {
      // four-byte character
      trailing = 3;
    } else {
      THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence");
    }

    // compare against the remaining length instead of forming pointers
    // such as s + 2, which may point beyond the end of the input
    if (static_cast<size_t>(e - s) < trailing) {
      THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence");
    }

    // consume the continuation bytes one statement at a time. calling
    // consume() several times inside a single arithmetic expression would
    // leave the order in which the bytes are read unspecified
    for (size_t i = 0; i < trailing; ++i) {
      code = (code << 8U) + uint32_t(::consume(s));
    }

    charNums.push_back(code);
  }

  return charNums;
}
// .............................................................................
// CONVERT TO STRING
// .............................................................................
std::string itoa(int16_t attr) {
char buffer[7];
char* p = buffer;
if (attr == INT16_MIN) {
return "-32768";
}
char buffer[7];
char* p = buffer;
if (attr < 0) {
*p++ = '-';
@ -1466,12 +1587,12 @@ std::string itoa(uint16_t attr) {
}
std::string itoa(int32_t attr) {
char buffer[12];
char* p = buffer;
if (attr == INT32_MIN) {
return "-2147483648";
}
char buffer[12];
char* p = buffer;
if (attr < 0) {
*p++ = '-';
@ -1551,12 +1672,12 @@ std::string itoa(uint32_t attr) {
}
std::string itoa(int64_t attr) {
char buffer[21];
char* p = buffer;
if (attr == INT64_MIN) {
return "-9223372036854775808";
}
char buffer[21];
char* p = buffer;
if (attr < 0) {
*p++ = '-';
@ -1776,9 +1897,8 @@ bool boolean(std::string const& str) {
if (lower == "true" || lower == "yes" || lower == "on" || lower == "y" ||
lower == "1" || lower == "") {
return true;
} else {
return false;
}
return false;
}
int64_t int64(std::string const& str) {
@ -2064,9 +2184,7 @@ float floatDecimal(char const* value, size_t size) {
bool unicodeToUTF8(char const* inputStr, size_t const& len,
std::string& outputStr) {
uint32_t outputInt = 0;
bool ok;
ok = parseHexanumber(inputStr, len, &outputInt);
bool ok = parseHexanumber(inputStr, len, &outputInt);
if (ok == false) {
outputStr = std::string(inputStr, len);
return false;

View File

@ -27,6 +27,8 @@
#include "Basics/Common.h"
#include <vector>
namespace arangodb {
namespace basics {
@ -220,7 +222,13 @@ std::string soundex(std::string const& str);
/// @brief converts input string to soundex code
std::string soundex(char const* src, size_t const len);
/// @brief converts input string to vector of character codes
/// the input is scanned as UTF-8, and every sequence of one to four bytes is
/// packed into one uint32_t (lead byte in the most significant position).
/// the numbers are packed raw bytes, not decoded Unicode code points.
/// raises an "invalid UTF-8 sequence" error (TRI_ERROR_INTERNAL) for an
/// invalid lead byte or a sequence that is cut off by the end of the input
std::vector<uint32_t> characterCodes(std::string const& str);
/// @brief calculates the levenshtein distance between the input strings
/// both strings are converted with characterCodes() first, so the distance is
/// counted in (multi-byte) characters, not in bytes. the internally computed
/// size_t result is narrowed to unsigned int
unsigned int levenshteinDistance(std::string const& str1, std::string const& str2);
/// @brief unicode hexadecimal characters to utf8
bool unicodeToUTF8(char const* inputStr, size_t const& len,
std::string& outputStr);