From 7db28da2512e122023e754436e46837f05189ccc Mon Sep 17 00:00:00 2001 From: shivamdave24 Date: Wed, 25 Jul 2018 03:48:58 -0700 Subject: [PATCH] =?UTF-8?q?Levenshtein=20Distance=20function=20implementat?= =?UTF-8?q?ion=20with=20documentation=20and=20i=E2=80=A6=20(#5922)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Documentation/Books/AQL/Functions/String.md | 25 +++ .../Manual/ReleaseNotes/NewFeatures34.md | 5 +- arangod/Aql/AqlFunctionFeature.cpp | 1 + arangod/Aql/Functions.cpp | 23 +++ arangod/Aql/Functions.h | 2 + js/server/tests/aql/aql-functions-string.js | 45 ++++++ lib/Basics/StringUtils.cpp | 146 ++++++++++++++++-- lib/Basics/StringUtils.h | 10 +- 8 files changed, 240 insertions(+), 17 deletions(-) diff --git a/Documentation/Books/AQL/Functions/String.md b/Documentation/Books/AQL/Functions/String.md index b7dea10f92..bb631c53e9 100644 --- a/Documentation/Books/AQL/Functions/String.md +++ b/Documentation/Books/AQL/Functions/String.md @@ -239,6 +239,31 @@ LENGTH("foobar") // 6 LENGTH("电脑坏了") // 4 ``` +LEVENSHTEIN_DISTANCE() +------ + +`LEVENSHTEIN_DISTANCE(value1, value2) → levenshteinDistance` + +Return the calculated Levenshtein distance between the input strings *value1* and *value2*. + +- **value1** (string): a string +- **value2** (string): a string + +`LEVENSHTEIN_DISTANCE(value1, value2) → levenshteinDistance` + +Return the calculated Levenshtein distance between the input strings *value1* and *value2*. + +- **value1** (string): a string +- **value2** (string): a string +- returns **levenshteinDistance** (number): calculated Levenshtein distance between the input strings *value1* and *value2* + +```js +LEVENSHTEIN_DISTANCE("foobar", "bar") // 3 +LEVENSHTEIN_DISTANCE(" ", "") // 1 +LEVENSHTEIN_DISTANCE("The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox") // 13 +LEVENSHTEIN_DISTANCE("der mötör trötet", "der trötet") // 6 +``` + *LENGTH()* can also determine the [number of elements](Array.md#length) in an array, the [number of attribute keys](Document.md#length) of an object / document and the [amount of documents](Miscellaneous.md#length) in a collection. diff --git a/Documentation/Books/Manual/ReleaseNotes/NewFeatures34.md b/Documentation/Books/Manual/ReleaseNotes/NewFeatures34.md index ea1305c6da..26ef17b6ca 100644 --- a/Documentation/Books/Manual/ReleaseNotes/NewFeatures34.md +++ b/Documentation/Books/Manual/ReleaseNotes/NewFeatures34.md @@ -469,8 +469,9 @@ The following AQL functions have been added in ArangoDB 3.4: comparison order * `SORTED_UNIQUE`: same as `SORTED`, but additionally removes duplicates * `COUNT_DISTINCT`: counts the number of distinct / unique items in an array - -The following AQL functions have been added to make working with geographical +* `LEVENSHTEIN_DISTANCE`: calculates the Levenshtein distance between two string values + +The following AQL functions have been added to make working with geographical data easier: * `GEO_POINT` diff --git a/arangod/Aql/AqlFunctionFeature.cpp b/arangod/Aql/AqlFunctionFeature.cpp index 14da671138..d5f4b7184c 100644 --- a/arangod/Aql/AqlFunctionFeature.cpp +++ b/arangod/Aql/AqlFunctionFeature.cpp @@ -202,6 +202,7 @@ void AqlFunctionFeature::addStringFunctions() { add({"ENCODE_URI_COMPONENT", ".", true, false, true, &Functions::EncodeURIComponent}); add({"UUID", "", true, false, true, &Functions::UUID}); add({"SOUNDEX", ".", true, false, true, &Functions::Soundex}); + add({"LEVENSHTEIN_DISTANCE", ".,.", true, false, true, &Functions::LevenshteinDistance}); // FULLTEXT is replaced by the AQL optimizer with an index lookup add({"FULLTEXT", ".h,.,.|." , false, true, false, &Functions::NotImplemented}); } diff --git a/arangod/Aql/Functions.cpp b/arangod/Aql/Functions.cpp index 8d5f557e57..033d8c09d3 100644 --- a/arangod/Aql/Functions.cpp +++ b/arangod/Aql/Functions.cpp @@ -1509,6 +1509,29 @@ AqlValue Functions::Soundex(arangodb::aql::Query*, return AqlValue(encoded); } + +/// @brief function LEVENSHTEIN_DISTANCE +AqlValue Functions::LevenshteinDistance(arangodb::aql::Query*, + transaction::Methods* trx, + VPackFunctionParameters const& parameters) { + ValidateParameters(parameters, "LEVENSHTEIN_DISTANCE", 2, 2); + AqlValue value1 = ExtractFunctionParameterValue(parameters, 0); + AqlValue value2 = ExtractFunctionParameterValue(parameters, 1); + + transaction::StringBufferLeaser buffer1(trx); + transaction::StringBufferLeaser buffer2(trx); + + arangodb::basics::VPackStringBufferAdapter adapter1(buffer1->stringBuffer()); + arangodb::basics::VPackStringBufferAdapter adapter2(buffer2->stringBuffer()); + + ::appendAsString(trx, adapter1, value1); + ::appendAsString(trx, adapter2, value2); + + int encoded = basics::StringUtils::levenshteinDistance(std::string(buffer1->begin(), buffer1->length()), std::string(buffer2->begin(), buffer2->length())); + + return AqlValue(AqlValueHintInt(encoded)); +} + /// @brief function TO_BOOL AqlValue Functions::ToBool(arangodb::aql::Query*, transaction::Methods* trx, diff --git a/arangod/Aql/Functions.h b/arangod/Aql/Functions.h index 83ee678f4f..961b47eb51 100644 --- a/arangod/Aql/Functions.h +++ b/arangod/Aql/Functions.h @@ -161,6 +161,8 @@ struct Functions { VPackFunctionParameters const&); static AqlValue Soundex(arangodb::aql::Query*, transaction::Methods*, VPackFunctionParameters const&); + static AqlValue LevenshteinDistance(arangodb::aql::Query*, transaction::Methods*, + VPackFunctionParameters const&); // Date static AqlValue DateFromParameters(arangodb::aql::Query* query, transaction::Methods* trx, diff --git a/js/server/tests/aql/aql-functions-string.js b/js/server/tests/aql/aql-functions-string.js index 2cd4531eae..b0ff26f9ba 100644 --- a/js/server/tests/aql/aql-functions-string.js +++ b/js/server/tests/aql/aql-functions-string.js @@ -202,6 +202,51 @@ function ahuacatlStringFunctionsTestSuite () { assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN SOUNDEX("test", "meow", "foo", "bar")'); }, + + // ////////////////////////////////////////////////////////////////////////////// +// / @brief test LevenshteinDistance +// ////////////////////////////////////////////////////////////////////////////// + testToLevenshteinDistanceValues: function () { + [ + [ null, "", 0 ], + [ null, null, 0 ], + [ "", "", 0 ], + [ "", "foobar", 6 ], + [ "foobar", "", 6 ], + [ "foobar", "foo", 3 ], + [ "foo", "foobar", 3 ], + [ "or", "of", 1 ], + [ "or", "", 2 ], + [ "or", "The quick brown fox jumps over the lazy dog", 41 ], + [ true, "foobar", 6 ], + [ false, "foobar", 5 ], + [ "foobar", true, 6 ], + [ "foobar", false, 5 ], + [ true, true, 0 ], + [ false, false, 0 ], + [ true, false, 4 ], + [ false, true, 4 ], + [ "", "", 0 ], + [ " ", "", 1 ], + [ "", " ", 1 ], + [ "der mötör trötet", "der mötör trötet", 0 ], + [ "der mötör trötet", "der trötet", 6 ], + [ "der mötör trötet", "dertrötet", 7 ], + [ "Öööööö", "öö", 4 ], + [ "The quick brown fox jumps over the lazy dog", "The quick black dog jumps over the brown fox", 13 ], + [ "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, 'Lorem ipsum dolor sit amet..', comes from a line in section 1.10.32. The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from 'de Finibus Bonorum et Malorum' by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham..", "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, 'Lorem ipsum dolor sit amet..', comes from a line in section 1.10.32. The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from 'de Finibus Bonorum et Malorum' by Cicero are also reproduced in their exact original form.", 74 ], + ].forEach(function(test) { + assertEqual([ test[2] ], getQueryResults('RETURN LEVENSHTEIN_DISTANCE(' + JSON.stringify(test[0]) + ', ' + JSON.stringify(test[1]) + ')'), test); + }); + }, + + testLevenshteinDistanceInvalidNumberOfParameters: function () { + assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE()'); + assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test")'); + assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test", "meow", "foo")'); + assertQueryError(errors.ERROR_QUERY_FUNCTION_ARGUMENT_NUMBER_MISMATCH.code, 'RETURN LEVENSHTEIN_DISTANCE("test", "meow", "foo", "bar")'); + }, + // ////////////////////////////////////////////////////////////////////////////// // / @brief test JSON_STRINGIFY // ////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Basics/StringUtils.cpp b/lib/Basics/StringUtils.cpp index b6bdac1360..24b09a8b83 100644 --- a/lib/Basics/StringUtils.cpp +++ b/lib/Basics/StringUtils.cpp @@ -25,6 +25,9 @@ #include #include +#include +#include +#include #include #include @@ -235,6 +238,71 @@ bool isLowSurrugate(uint32_t number) { return (number >= 0xDC00) && (number <= 0xDFFF); } +unsigned char consume(char const*& s) { + return *reinterpret_cast(s++); +} + +template +inline bool isEqual(InputType const& c1, InputType const& c2) { + return c1 == c2; +} + +template +LengthType levenshtein(InputType const* lhs, + InputType const* rhs, + LengthType lhsSize, + LengthType rhsSize) { + TRI_ASSERT(lhsSize >= rhsSize); + + std::vector costs; + costs.resize(rhsSize + 1); + + for (LengthType i = 0; i < rhsSize; ++i) { + costs[i] = i; + } + + LengthType next = 0; + + for (LengthType i = 0; i < lhsSize; ++i) { + LengthType current = i + 1; + + for (LengthType j = 0; j < rhsSize; ++j) { + LengthType cost = !(::isEqual(lhs[i], rhs[j]) || + (i && j && ::isEqual(lhs[i - 1], rhs[j]) && ::isEqual(lhs[i], rhs[j - 1]))); + next = std::min(std::min(costs[j + 1] + 1, current + 1), costs[j] + cost); + costs[j] = current; + current = next; + } + costs[rhsSize] = next; + } + return next; +} + +size_t levenshteinDistance(std::vector& vect1, std::vector& vect2) { + if (vect1.empty() || vect2.empty()) { + return vect1.size() ? vect1.size() : vect2.size(); + } + + if (vect1.size() < vect2.size()) { + vect1.swap(vect2); + } + + size_t lhsSize = vect1.size(); + size_t rhsSize = vect2.size(); + + uint32_t const* l = vect1.data(); + uint32_t const* r = vect2.data(); + + if (lhsSize < std::numeric_limits::max()) { + return static_cast(::levenshtein(l, r, lhsSize, rhsSize)); + } else if (lhsSize < std::numeric_limits::max()) { + return static_cast(::levenshtein(l, r, lhsSize, rhsSize)); + } else if (lhsSize < std::numeric_limits::max()) { + return static_cast(::levenshtein(l, r, lhsSize, rhsSize)); + } + return static_cast(::levenshtein(l, r, lhsSize, rhsSize)); +} + } // namespace namespace arangodb { @@ -1406,17 +1474,70 @@ std::string soundex(char const* src, size_t const len) { return result; } +unsigned int levenshteinDistance(std::string const& str1, std::string const& str2) { + // convert input strings to vectors of (multi-byte) character numbers + std::vector vect1 = characterCodes(str1); + std::vector vect2 = characterCodes(str2); + + // calculate levenshtein distance on vectors of character numbers + return static_cast(::levenshteinDistance(vect1, vect2)); +} + +std::vector characterCodes(std::string const& str) { + char const* s = str.data(); + char const* e = s + str.size(); + + std::vector charNums; + // be conservative, and reserve space for one number of input + // string byte. this may be too much, but it avoids later + // reallocation of the vector + charNums.reserve(str.size()); + + while (s < e) { + // note: consume advances the *s* pointer by one byte + unsigned char c = ::consume(s); + uint32_t n = uint32_t(c); + + if ((c & 0x80U) == 0U) { + // single-byte character + charNums.push_back(n); + } else if ((c & 0xE0U) == 0xC0U) { + // two-byte character + if (s >= e) { + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence"); + } + charNums.push_back((n << 8U) + uint32_t(::consume(s))); + } else if ((c & 0xF0U) == 0xE0U) { + // three-byte character + if (s + 1 >= e) { + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence"); + } + charNums.push_back((n << 16U) + (uint32_t(::consume(s)) << 8U) + (uint32_t(::consume(s)))); + } else if ((c & 0xF8U) == 0XF0U){ + // four-byte character + if (s + 2 >= e) { + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence"); + } + charNums.push_back((n << 24U) + (uint32_t(::consume(s)) << 16U) + (uint32_t(::consume(s)) << 8U) + (uint32_t(::consume(s)))); + } else { + THROW_ARANGO_EXCEPTION_MESSAGE(TRI_ERROR_INTERNAL, "invalid UTF-8 sequence"); + } + } + + return charNums; +} + // ............................................................................. // CONVERT TO STRING // ............................................................................. std::string itoa(int16_t attr) { - char buffer[7]; - char* p = buffer; - if (attr == INT16_MIN) { return "-32768"; } + + char buffer[7]; + char* p = buffer; if (attr < 0) { *p++ = '-'; @@ -1466,12 +1587,12 @@ std::string itoa(uint16_t attr) { } std::string itoa(int32_t attr) { - char buffer[12]; - char* p = buffer; - if (attr == INT32_MIN) { return "-2147483648"; } + + char buffer[12]; + char* p = buffer; if (attr < 0) { *p++ = '-'; @@ -1551,12 +1672,12 @@ std::string itoa(uint32_t attr) { } std::string itoa(int64_t attr) { - char buffer[21]; - char* p = buffer; - if (attr == INT64_MIN) { return "-9223372036854775808"; } + + char buffer[21]; + char* p = buffer; if (attr < 0) { *p++ = '-'; @@ -1776,9 +1897,8 @@ bool boolean(std::string const& str) { if (lower == "true" || lower == "yes" || lower == "on" || lower == "y" || lower == "1" || lower == "✓") { return true; - } else { - return false; } + return false; } int64_t int64(std::string const& str) { @@ -2064,9 +2184,7 @@ float floatDecimal(char const* value, size_t size) { bool unicodeToUTF8(char const* inputStr, size_t const& len, std::string& outputStr) { uint32_t outputInt = 0; - bool ok; - - ok = parseHexanumber(inputStr, len, &outputInt); + bool ok = parseHexanumber(inputStr, len, &outputInt); if (ok == false) { outputStr = std::string(inputStr, len); return false; diff --git a/lib/Basics/StringUtils.h b/lib/Basics/StringUtils.h index 08e7271986..28021dc075 100644 --- a/lib/Basics/StringUtils.h +++ b/lib/Basics/StringUtils.h @@ -27,6 +27,8 @@ #include "Basics/Common.h" +#include + namespace arangodb { namespace basics { @@ -220,7 +222,13 @@ std::string soundex(std::string const& str); /// @brief converts input string to soundex code std::string soundex(char const* src, size_t const len); - + +/// @brief converts input string to vector of character codes +std::vector characterCodes(std::string const& str); + +/// @brief calculates the levenshtein distance between the input strings +unsigned int levenshteinDistance(std::string const& str1, std::string const& str2); + /// @brief unicode hexadecimal characters to utf8 bool unicodeToUTF8(char const* inputStr, size_t const& len, std::string& outputStr);