diff --git a/UnitTests/Philadelphia/string-utf8-normalize-test.cpp b/UnitTests/Philadelphia/string-utf8-normalize-test.cpp
index bc28dc7df7..63bd2aabb1 100644
--- a/UnitTests/Philadelphia/string-utf8-normalize-test.cpp
+++ b/UnitTests/Philadelphia/string-utf8-normalize-test.cpp
@@ -161,6 +161,69 @@ BOOST_AUTO_TEST_CASE (tst_3) {
   BOOST_CHECK_EQUAL(expectString, resultString);
 }
 
+BOOST_AUTO_TEST_CASE (tst_4) {
+  // word splitting with lower-casing; the third argument is the minimal word length
+  std::string testString = "Der Müller geht in die Post.";
+
+  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, true);
+  BOOST_REQUIRE(words != NULL);
+
+  BOOST_CHECK_EQUAL(5, words->_length);
+  BOOST_CHECK_EQUAL("der", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("müller", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
+  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
+  BOOST_CHECK_EQUAL("post", words->_buffer[4]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, true);
+  BOOST_REQUIRE(words != NULL);
+
+  BOOST_CHECK_EQUAL(3, words->_length);
+  BOOST_CHECK_EQUAL("müller", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("post", words->_buffer[2]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, true);
+  BOOST_CHECK(words == NULL);
+}
+
+BOOST_AUTO_TEST_CASE (tst_5) {
+  // word splitting without lower-casing; the original case must be preserved
+  std::string testString = "Der Müller geht in die Post.";
+
+  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, false);
+  BOOST_REQUIRE(words != NULL);
+
+  BOOST_CHECK_EQUAL(5, words->_length);
+  BOOST_CHECK_EQUAL("Der", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("Müller", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
+  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
+  BOOST_CHECK_EQUAL("Post", words->_buffer[4]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, false);
+  BOOST_REQUIRE(words != NULL);
+
+  BOOST_CHECK_EQUAL(3, words->_length);
+  BOOST_CHECK_EQUAL("Müller", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("Post", words->_buffer[2]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, false);
+  BOOST_CHECK(words == NULL);
+}
+
+
 #endif
 
 BOOST_AUTO_TEST_SUITE_END ()
diff --git a/arangod/VocBase/index.c b/arangod/VocBase/index.c
index 540fa3d966..4d191ddfc8 100644
--- a/arangod/VocBase/index.c
+++ b/arangod/VocBase/index.c
@@ -34,6 +34,7 @@
 #include "BasicsC/logging.h"
 #include "BasicsC/string-buffer.h"
 #include "BasicsC/strings.h"
+#include "BasicsC/utf8-helper.h"
 #include "ShapedJson/shape-accessor.h"
 #include "ShapedJson/shaped-json.h"
 #include "VocBase/document-collection.h"
@@ -4059,143 +4060,6 @@ void TRI_FreeSkiplistIndex (TRI_index_t* idx) {
 /// @{
 ////////////////////////////////////////////////////////////////////////////////
 
-////////////////////////////////////////////////////////////////////////////////
-/// @brief add an identified word to the word vector
-////////////////////////////////////////////////////////////////////////////////
-
-static bool AddWord (TRI_vector_string_t* const words,
-                     const char* const wordStart,
-                     const size_t wordLength,
-                     const bool containsUtf8) {
-  char* copy;
-
-  if (containsUtf8) {
-    // UTF-8 string
-    copy = TRI_NormaliseWordFulltextIndex(wordStart, wordLength);
-  }
-  else {
-    // ASCII string
-    char* src;
-    char* end;
-    char* dst;
-
-    copy = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (wordLength + 1) * sizeof(char), false);
-    if (copy == NULL) {
-      return false;
-    }
-
-    src = (char*) wordStart;
-    end = src + wordLength;
-    dst = copy;
-
-    for (; src < end; ++src, ++dst) {
-      char c = *src;
-
-      // lower case the text so it is normalised in the index
-      if (c >= 'A' && c <= 'Z') {
-        *dst = (char) (((unsigned char) c) + 32);
-      }
-      else {
-        *dst = c;
-      }
-    }
-
-    *dst = '\0';
-  }
-
-  TRI_PushBackVectorString(words, copy);
-  LOG_DEBUG("found word '%s'", copy);
-
-  return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// @brief parse a document string value into the individual words that should
-/// be indexed
-/// words returned are all lower cased
-///
-/// This function is very naive and currently does not handle lower-casing of
-/// unicode characters, normalisation of unicode characters, and exclusion of
-/// unicode punctuation characters
-////////////////////////////////////////////////////////////////////////////////
-
-static TRI_vector_string_t* ParseWordsFulltextIndex (const char* const text,
-                                                     const size_t textLength) {
-  TRI_vector_string_t* words;
-  char* ptr;
-  char* end;
-  char* wordStart;
-  bool containsUtf8;
-
-  words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
-  if (words == NULL) {
-    return NULL;
-  }
-
-  TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);
-
-  ptr = (char*) text;
-  end = ptr + textLength;
-  wordStart = NULL;
-  containsUtf8 = false;
-
-  while (ptr < end) {
-    char c = *ptr;
-
-    if ((c >= 'A' && c <= 'Z') ||
-        (c >= 'a' && c <= 'z')) {
-      if (wordStart == NULL) {
-        wordStart = ptr;
-      }
-    }
-    else if ((unsigned char) c >= 128) {
-      // UTF-8
-      if (wordStart == NULL) {
-        wordStart = ptr;
-      }
-      containsUtf8 = true;
-    }
-    else {
-      if (wordStart != NULL) {
-        size_t wordLength = ptr - wordStart;
-
-        // check the length of the word
-        if (wordLength >= 2) {
-          if (! AddWord(words, wordStart, wordLength, containsUtf8)) {
-            TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-            return NULL;
-          }
-        }
-        wordStart = NULL;
-        containsUtf8 = false;
-      }
-    }
-
-    ++ptr;
-  }
-
-  // check if we have something left to index
-  if (wordStart != NULL) {
-    size_t wordLength = ptr - wordStart;
-
-    // check the length of the word
-    if (wordLength >= 2) {
-      if (! AddWord(words, wordStart, wordLength, containsUtf8)) {
-        TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-        return NULL;
-      }
-    }
-  }
-
-  if (words->_length == 0) {
-    // no words found
-    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-    return NULL;
-  }
-
-  return words;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 /// @brief free function for word list, fulltext index
 ////////////////////////////////////////////////////////////////////////////////
@@ -4251,7 +4115,7 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document,
   }
 
   // parse the document text
-  words = ParseWordsFulltextIndex(text, textLength);
+  words = TRI_get_words(text, textLength, 2, true);
   if (words == NULL) {
     return NULL;
   }
diff --git a/lib/Basics/Utf8Helper.cpp b/lib/Basics/Utf8Helper.cpp
index d9a6852a6e..6a7ee0466c 100644
--- a/lib/Basics/Utf8Helper.cpp
+++ b/lib/Basics/Utf8Helper.cpp
@@ -32,12 +32,14 @@
 #include "unicode/normalizer2.h"
 #include "unicode/ucasemap.h"
 #include "unicode/brkiter.h"
+#include "unicode/ustdio.h"
 #else
 #include "string.h"
 #endif
 
 #include "Logger/Logger.h"
 #include "BasicsC/strings.h"
+#include "BasicsC/utf8-helper.h"
 
 using namespace triagens::basics;
 using namespace std;
@@ -373,6 +375,137 @@ char* Utf8Helper::toupper (TRI_memory_zone_t* zone, const char *src, int32_t src
   return utf8_dest;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+/// @brief splits a UTF-8 string into words
+///
+/// If lowerCase is set, the text is lower-cased first. The text is converted to
+/// UTF-16 and segmented by an ICU word break iterator created for the locale
+/// of the collator. Each segment of at least minimalWordLength UTF-16 code
+/// units is converted back to UTF-8 and appended to the result vector.
+///
+/// Returns NULL if the text is NULL or empty, if an error occurs, or if no
+/// segment is long enough. A non-NULL result must be freed by the caller with
+/// TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, ...).
+///
+/// NOTE(review): per the ICU documentation a word break iterator also delimits
+/// runs of whitespace and punctuation; such segments pass the length filter
+/// when they are long enough. Consider filtering on the iterator's rule status
+/// (UBRK_WORD_NONE) - confirm availability in the ICU version in use.
+////////////////////////////////////////////////////////////////////////////////
+
+TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
+                                           const size_t textLength,
+                                           uint8_t minimalWordLength,
+                                           bool lowerCase) {
+  if (text == NULL || textLength == 0) {
+    // nothing to split
+    return NULL;
+  }
+
+  TRI_vector_string_t* words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
+  if (words == NULL) {
+    return NULL;
+  }
+
+  TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);
+
+#ifdef TRI_HAVE_ICU
+
+  // ICU-only state lives inside the guard so that builds without ICU compile
+  UErrorCode status = U_ZERO_ERROR;
+  size_t textUtf16Length = 0;
+  UChar* textUtf16 = NULL;
+
+  if (lowerCase) {
+    int32_t lowerLength = 0;
+    char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);
+
+    if (lower != NULL) {
+      if (lowerLength > 0) {
+        textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, (size_t) lowerLength, &textUtf16Length);
+      }
+      // the lower-cased copy is only needed for the conversion; release it
+      // even if it turned out to be empty
+      TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
+    }
+  }
+  else {
+    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, textLength, &textUtf16Length);
+  }
+
+  if (textUtf16 == NULL) {
+    // lower-casing or UTF-16 conversion produced nothing
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  ULocDataLocaleType type = ULOC_VALID_LOCALE;
+  const Locale& locale = _coll->getLocale(type, status);
+  if (U_FAILURE(status)) {
+    LOGGER_ERROR << "error in Collator::getLocale(...): " << u_errorName(status);
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  BreakIterator* wordIterator = BreakIterator::createWordInstance(locale, status);
+  if (U_FAILURE(status) || wordIterator == NULL) {
+    LOGGER_ERROR << "error in BreakIterator::createWordInstance(...): " << u_errorName(status);
+    delete wordIterator;
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  // scratch buffer for one segment; no segment is longer than the whole text
+  UChar* tempUtf16 = (UChar*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);
+  if (tempUtf16 == NULL) {
+    delete wordIterator;
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  UnicodeString utext(textUtf16);
+  wordIterator->setText(utext);
+
+  int32_t start = wordIterator->first();
+  for (int32_t end = wordIterator->next(); end != BreakIterator::DONE;
+       start = end, end = wordIterator->next()) {
+    // length of the current segment in UTF-16 code units
+    const size_t segmentLength = (size_t) (end - start);
+
+    if (segmentLength < minimalWordLength) {
+      continue;
+    }
+
+    utext.extractBetween(start, end, tempUtf16, 0);
+
+    size_t utf8WordLength = 0;
+    char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, segmentLength, &utf8WordLength);
+    if (utf8Word != NULL) {
+      // never push a NULL entry if the conversion did not yield a string
+      TRI_PushBackVectorString(words, utf8Word);
+    }
+  }
+
+  delete wordIterator;
+  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
+  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+
+#else
+  // TODO
+#endif
+
+  if (words->_length == 0) {
+    // no words found
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  return words;
+}
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -409,6 +542,17 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone, const char *src, int32_t srcLen
   return Utf8Helper::DefaultUtf8Helper.toupper(zone, src, srcLength, *dstLength);
 }
 
+////////////////////////////////////////////////////////////////////////////////
+/// @brief Get words of a UTF-8 string (implemented in Basics/Utf8Helper.cpp)
+////////////////////////////////////////////////////////////////////////////////
+
+TRI_vector_string_t* TRI_get_words (const char* const text,
+                                    const size_t textLength,
+                                    uint8_t minimalWordLength,
+                                    bool lowerCase) {
+  return Utf8Helper::DefaultUtf8Helper.getWords(text, textLength, minimalWordLength, lowerCase);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/Basics/Utf8Helper.h b/lib/Basics/Utf8Helper.h
index 3cd32a5cef..1cad62711a 100644
--- a/lib/Basics/Utf8Helper.h
+++ b/lib/Basics/Utf8Helper.h
@@ -30,6 +30,7 @@
 #define TRIAGENS_BASICS_UTF8_HELPER_H 1
 
 #include "Basics/Common.h"
+#include "BasicsC/vector.h"
 
 #ifdef TRI_HAVE_ICU
 #include "unicode/coll.h"
@@ -148,6 +149,15 @@ namespace triagens {
 
         char* toupper (TRI_memory_zone_t* zone, const char *src, int32_t srcLength, int32_t& dstLength);
 
+////////////////////////////////////////////////////////////////////////////////
+/// @brief returns the words of a UTF-8 string.
+////////////////////////////////////////////////////////////////////////////////
+
+        TRI_vector_string_t* getWords (const char* const text,
+                                       const size_t textLength,
+                                       uint8_t minimalWordLength,
+                                       bool lowerCase);
+
       private:
 #ifdef TRI_HAVE_ICU
         Collator* _coll;
diff --git a/lib/BasicsC/utf8-helper.c b/lib/BasicsC/utf8-helper.c
index 781d25b9e2..870d60669b 100644
--- a/lib/BasicsC/utf8-helper.c
+++ b/lib/BasicsC/utf8-helper.c
@@ -25,12 +25,10 @@
 /// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
 ////////////////////////////////////////////////////////////////////////////////
 
-
 #include "utf8-helper.h"
 
 #ifdef TRI_HAVE_ICU
 
-#include "unicode/ustring.h"
 #include "unicode/unorm2.h"
 
 // -----------------------------------------------------------------------------
@@ -46,7 +44,7 @@
 /// @brief convert a utf-8 string to a uchar (utf-16)
 ////////////////////////////////////////////////////////////////////////////////
 
-static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
+UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone,
                            const char* utf8,
                            const size_t inLength,
                            size_t* outLength) {
@@ -85,7 +83,7 @@ static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
 /// @brief convert a uchar (utf-16) to a utf-8 string
 ////////////////////////////////////////////////////////////////////////////////
 
-static char* UCharToUtf8 (TRI_memory_zone_t* zone,
+char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
                           const UChar* uchar,
                           const size_t inLength,
                           size_t* outLength) {
@@ -154,7 +152,7 @@ char* TRI_normalize_utf8_to_NFC (TRI_memory_zone_t* zone,
     return utf8Dest;
   }
 
-  utf16 = Utf8ToUChar(zone, utf8, inLength, &utf16Length);
+  utf16 = TRI_Utf8ToUChar(zone, utf8, inLength, &utf16Length);
   if (utf16 == NULL) {
     return NULL;
   }
@@ -211,7 +209,7 @@ char* TRI_normalize_utf16_to_NFC (TRI_memory_zone_t* zone,
   }
 
   // Convert data back from UChar (UTF-16) to UTF-8
-  utf8Dest = UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
+  utf8Dest = TRI_UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
   TRI_Free(zone, utf16Dest);
 
   return utf8Dest;
diff --git a/lib/BasicsC/utf8-helper.h b/lib/BasicsC/utf8-helper.h
index 054d1cc7e2..c009657351 100644
--- a/lib/BasicsC/utf8-helper.h
+++ b/lib/BasicsC/utf8-helper.h
@@ -29,6 +29,11 @@
 #define TRIAGENS_BASICS_C_UTF8_HELPER_H 1
 
 #include "BasicsC/common.h"
+#include "BasicsC/vector.h"
+
+#ifdef TRI_HAVE_ICU
+#include "unicode/ustring.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -45,6 +50,24 @@ extern "C" {
 
 #ifdef TRI_HAVE_ICU
 
+////////////////////////////////////////////////////////////////////////////////
+/// @brief convert a utf-8 string to a uchar (utf-16)
+////////////////////////////////////////////////////////////////////////////////
+
+UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone,
+                        const char* utf8,
+                        const size_t inLength,
+                        size_t* outLength);
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief convert a uchar (utf-16) to a utf-8 string
+////////////////////////////////////////////////////////////////////////////////
+
+char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
+                       const UChar* uchar,
+                       const size_t inLength,
+                       size_t* outLength);
+
 ////////////////////////////////////////////////////////////////////////////////
 /// @brief normalize an utf8 string (NFC)
 ////////////////////////////////////////////////////////////////////////////////
@@ -98,6 +121,15 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone,
                         int32_t srcLength,
                         int32_t* dstLength);
 
+////////////////////////////////////////////////////////////////////////////////
+/// @brief Get words of a UTF-8 string (implemented in Basics/Utf8Helper.cpp)
+////////////////////////////////////////////////////////////////////////////////
+
+TRI_vector_string_t* TRI_get_words (const char* const text,
+                                    const size_t textLength,
+                                    uint8_t minimalWordLength,
+                                    bool lowerCase);
+
 ////////////////////////////////////////////////////////////////////////////////
 /// @}
 ////////////////////////////////////////////////////////////////////////////////