mirror of https://gitee.com/bigwinds/arangodb
Merge branch 'devel' of https://github.com/triAGENS/ArangoDB into devel
This commit is contained in:
commit
7261465139
|
@ -161,6 +161,67 @@ BOOST_AUTO_TEST_CASE (tst_3) {
|
|||
BOOST_CHECK_EQUAL(expectString, resultString);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
/// @brief test word extraction with lower-casing enabled
////////////////////////////////////////////////////////////////////////////////

BOOST_AUTO_TEST_CASE (tst_4) {
  std::string testString = "Der Müller geht in die Post.";

  // minimal word length 3: the expected result does not contain "in"
  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, true);
  // REQUIRE (not CHECK) aborts the test case on failure, so `words` is never
  // dereferenced when it is NULL
  BOOST_REQUIRE(words != NULL);

  // the length must match before the fixed indexes below are accessed
  BOOST_REQUIRE_EQUAL(5, words->_length);
  BOOST_CHECK_EQUAL("der", words->_buffer[0]);
  BOOST_CHECK_EQUAL("müller", words->_buffer[1]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
  BOOST_CHECK_EQUAL("post", words->_buffer[4]);

  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);


  // minimal word length 4: "der" and "die" are expected to be dropped as well
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, true);
  BOOST_REQUIRE(words != NULL);

  BOOST_REQUIRE_EQUAL(3, words->_length);
  BOOST_CHECK_EQUAL("müller", words->_buffer[0]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
  BOOST_CHECK_EQUAL("post", words->_buffer[2]);

  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);

  // NULL input must not produce a word list
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, true);
  BOOST_CHECK(words == NULL);
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
/// @brief test word extraction with the original case preserved
////////////////////////////////////////////////////////////////////////////////

BOOST_AUTO_TEST_CASE (tst_5) {
  std::string testString = "Der Müller geht in die Post.";

  // minimal word length 3: the expected result does not contain "in"
  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, false);
  // REQUIRE (not CHECK) aborts the test case on failure, so `words` is never
  // dereferenced when it is NULL
  BOOST_REQUIRE(words != NULL);

  // the length must match before the fixed indexes below are accessed
  BOOST_REQUIRE_EQUAL(5, words->_length);
  BOOST_CHECK_EQUAL("Der", words->_buffer[0]);
  BOOST_CHECK_EQUAL("Müller", words->_buffer[1]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
  BOOST_CHECK_EQUAL("Post", words->_buffer[4]);

  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);


  // minimal word length 4: "Der" and "die" are expected to be dropped as well
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, false);
  BOOST_REQUIRE(words != NULL);

  BOOST_REQUIRE_EQUAL(3, words->_length);
  BOOST_CHECK_EQUAL("Müller", words->_buffer[0]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
  BOOST_CHECK_EQUAL("Post", words->_buffer[2]);

  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);

  // NULL input must not produce a word list
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, false);
  BOOST_CHECK(words == NULL);
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END ()
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#include "BasicsC/logging.h"
|
||||
#include "BasicsC/string-buffer.h"
|
||||
#include "BasicsC/strings.h"
|
||||
#include "BasicsC/utf8-helper.h"
|
||||
#include "ShapedJson/shape-accessor.h"
|
||||
#include "ShapedJson/shaped-json.h"
|
||||
#include "VocBase/document-collection.h"
|
||||
|
@ -4059,143 +4060,6 @@ void TRI_FreeSkiplistIndex (TRI_index_t* idx) {
|
|||
/// @{
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief add an identified word to the word vector
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static bool AddWord (TRI_vector_string_t* const words,
|
||||
const char* const wordStart,
|
||||
const size_t wordLength,
|
||||
const bool containsUtf8) {
|
||||
char* copy;
|
||||
|
||||
if (containsUtf8) {
|
||||
// UTF-8 string
|
||||
copy = TRI_NormaliseWordFulltextIndex(wordStart, wordLength);
|
||||
}
|
||||
else {
|
||||
// ASCII string
|
||||
char* src;
|
||||
char* end;
|
||||
char* dst;
|
||||
|
||||
copy = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (wordLength + 1) * sizeof(char), false);
|
||||
if (copy == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
src = (char*) wordStart;
|
||||
end = src + wordLength;
|
||||
dst = copy;
|
||||
|
||||
for (; src < end; ++src, ++dst) {
|
||||
char c = *src;
|
||||
|
||||
// lower case the text so it is normalised in the index
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
*dst = (char) (((unsigned char) c) + 32);
|
||||
}
|
||||
else {
|
||||
*dst = c;
|
||||
}
|
||||
}
|
||||
|
||||
*dst = '\0';
|
||||
}
|
||||
|
||||
TRI_PushBackVectorString(words, copy);
|
||||
LOG_DEBUG("found word '%s'", copy);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief parse a document string value into the individual words that should
|
||||
/// be indexed
|
||||
/// words returned are all lower cased
|
||||
///
|
||||
/// This function is very naive and currently does not handle lower-casing of
|
||||
/// unicode characters, normalisation of unicode characters, and exclusion of
|
||||
/// unicode punctuation characters
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static TRI_vector_string_t* ParseWordsFulltextIndex (const char* const text,
|
||||
const size_t textLength) {
|
||||
TRI_vector_string_t* words;
|
||||
char* ptr;
|
||||
char* end;
|
||||
char* wordStart;
|
||||
bool containsUtf8;
|
||||
|
||||
words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
|
||||
if (words == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);
|
||||
|
||||
ptr = (char*) text;
|
||||
end = ptr + textLength;
|
||||
wordStart = NULL;
|
||||
containsUtf8 = false;
|
||||
|
||||
while (ptr < end) {
|
||||
char c = *ptr;
|
||||
|
||||
if ((c >= 'A' && c <= 'Z') ||
|
||||
(c >= 'a' && c <= 'z')) {
|
||||
if (wordStart == NULL) {
|
||||
wordStart = ptr;
|
||||
}
|
||||
}
|
||||
else if ((unsigned char) c >= 128) {
|
||||
// UTF-8
|
||||
if (wordStart == NULL) {
|
||||
wordStart = ptr;
|
||||
}
|
||||
containsUtf8 = true;
|
||||
}
|
||||
else {
|
||||
if (wordStart != NULL) {
|
||||
size_t wordLength = ptr - wordStart;
|
||||
|
||||
// check the length of the word
|
||||
if (wordLength >= 2) {
|
||||
if (! AddWord(words, wordStart, wordLength, containsUtf8)) {
|
||||
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
wordStart = NULL;
|
||||
containsUtf8 = false;
|
||||
}
|
||||
}
|
||||
|
||||
++ptr;
|
||||
}
|
||||
|
||||
// check if we have something left to index
|
||||
if (wordStart != NULL) {
|
||||
size_t wordLength = ptr - wordStart;
|
||||
|
||||
// check the length of the word
|
||||
if (wordLength >= 2) {
|
||||
if (! AddWord(words, wordStart, wordLength, containsUtf8)) {
|
||||
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (words->_length == 0) {
|
||||
// no words found
|
||||
TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief free function for word list, fulltext index
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -4251,7 +4115,7 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document,
|
|||
}
|
||||
|
||||
// parse the document text
|
||||
words = ParseWordsFulltextIndex(text, textLength);
|
||||
words = TRI_get_words(text, textLength, 2, true);
|
||||
if (words == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -32,12 +32,14 @@
|
|||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/ustdio.h"
|
||||
#else
|
||||
#include "string.h"
|
||||
#endif
|
||||
|
||||
#include "Logger/Logger.h"
|
||||
#include "BasicsC/strings.h"
|
||||
#include "BasicsC/utf8-helper.h"
|
||||
|
||||
using namespace triagens::basics;
|
||||
using namespace std;
|
||||
|
@ -373,6 +375,89 @@ char* Utf8Helper::toupper (TRI_memory_zone_t* zone, const char *src, int32_t src
|
|||
return utf8_dest;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the words of a UTF-8 string
///
/// The text is optionally lower-cased, converted to UTF-16 and segmented with
/// a word break iterator created for the collator's locale. Every segment
/// whose length (in UTF-16 code units) is at least `minimalLength` is
/// converted back to UTF-8 and appended to the result.
///
/// @param text           UTF-8 input, may be NULL
/// @param textLength     length of `text` in bytes
/// @param minimalLength  minimal segment length (UTF-16 code units)
/// @param lowerCase      whether to lower-case the text before splitting
///
/// @return a word vector that the caller must release with
///         TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, ...), or NULL if the
///         input is empty, no word was found, or an error occurred
////////////////////////////////////////////////////////////////////////////////

TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
                                           const size_t textLength,
                                           uint8_t minimalLength,
                                           bool lowerCase) {
  // nothing to split: bail out before allocating anything
  if (text == NULL || textLength == 0) {
    return NULL;
  }

  TRI_vector_string_t* words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
  if (words == NULL) {
    return NULL;
  }

  TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);

#ifdef TRI_HAVE_ICU
  // all ICU-typed locals live inside this branch so that the function still
  // compiles when ICU is not available

  UErrorCode status = U_ZERO_ERROR;
  size_t textUtf16Length = 0;
  UChar* textUtf16 = NULL;

  if (lowerCase) {
    // lower case string
    int32_t lowerLength = 0;
    char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);

    if (lower == NULL || lowerLength == 0) {
      if (lower != NULL) {
        TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
      }
      TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
      return NULL;
    }

    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
  }
  else {
    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length);
  }

  if (textUtf16 == NULL) {
    // conversion failed
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  ULocDataLocaleType type = ULOC_VALID_LOCALE;
  const Locale& locale = _coll->getLocale(type, status);
  if (U_FAILURE(status)) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    LOGGER_ERROR << "error in Collator::getLocale(...): " << u_errorName(status);
    return NULL;
  }

  // scratch buffer large enough to hold any single segment of the text
  UChar* tempUtf16 = (UChar *) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);
  if (tempUtf16 == NULL) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  BreakIterator* wordIterator = BreakIterator::createWordInstance(locale, status);
  if (U_FAILURE(status) || wordIterator == NULL) {
    delete wordIterator;
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    LOGGER_ERROR << "error in BreakIterator::createWordInstance(...): " << u_errorName(status);
    return NULL;
  }

  UnicodeString utext(textUtf16);
  bool ok = true;

  wordIterator->setText(utext);

  // NOTE(review): the iterator reports every segment between two boundaries,
  // presumably including whitespace/punctuation runs; only the length filter
  // below excludes them - confirm whether a rule status filter is wanted
  int32_t start = wordIterator->first();
  for (int32_t end = wordIterator->next(); end != BreakIterator::DONE;
       start = end, end = wordIterator->next()) {
    // end - start = segment length
    const size_t segmentLength = (size_t) (end - start);

    if (segmentLength >= minimalLength) {
      size_t utf8WordLength = 0;

      utext.extractBetween(start, end, tempUtf16, 0);
      char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, segmentLength, &utf8WordLength);

      if (utf8Word == NULL) {
        // conversion failed: give up instead of storing a NULL entry
        ok = false;
        break;
      }

      TRI_PushBackVectorString(words, utf8Word);
    }
  }

  delete wordIterator;
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);

  if (! ok) {
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

#else
  // TODO: no implementation without ICU; the vector stays empty and NULL is
  // returned below
#endif

  if (words->_length == 0) {
    // no words found
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  return words;
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
@ -409,6 +494,17 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone, const char *src, int32_t srcLen
|
|||
return Utf8Helper::DefaultUtf8Helper.toupper(zone, src, srcLength, *dstLength);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Get words of an UTF-8 string (implemented in Basic/Utf8Helper.cpp)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TRI_vector_string_t* TRI_get_words (const char* const text,
|
||||
const size_t textLength,
|
||||
uint8_t minimalWordLength,
|
||||
bool lowerCase) {
|
||||
return Utf8Helper::DefaultUtf8Helper.getWords(text, textLength, minimalWordLength, lowerCase);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#define TRIAGENS_BASICS_UTF8_HELPER_H 1
|
||||
|
||||
#include "Basics/Common.h"
|
||||
#include "BasicsC/vector.h"
|
||||
|
||||
#ifdef TRI_HAVE_ICU
|
||||
#include "unicode/coll.h"
|
||||
|
@ -148,6 +149,15 @@ namespace triagens {
|
|||
|
||||
char* toupper (TRI_memory_zone_t* zone, const char *src, int32_t srcLength, int32_t& dstLength);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief returns the words of a UTF-8 string.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TRI_vector_string_t* getWords (const char* const text,
|
||||
const size_t textLength,
|
||||
uint8_t minimalWordLength,
|
||||
bool lowerCase);
|
||||
|
||||
private:
|
||||
#ifdef TRI_HAVE_ICU
|
||||
Collator* _coll;
|
||||
|
|
|
@ -25,12 +25,10 @@
|
|||
/// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#include "utf8-helper.h"
|
||||
|
||||
#ifdef TRI_HAVE_ICU
|
||||
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/unorm2.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
@ -46,7 +44,7 @@
|
|||
/// @brief convert a utf-8 string to a uchar (utf-16)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
|
||||
UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone,
|
||||
const char* utf8,
|
||||
const size_t inLength,
|
||||
size_t* outLength) {
|
||||
|
@ -85,7 +83,7 @@ static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
|
|||
/// @brief convert a uchar (utf-16) to a utf-8 string
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static char* UCharToUtf8 (TRI_memory_zone_t* zone,
|
||||
char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
|
||||
const UChar* uchar,
|
||||
const size_t inLength,
|
||||
size_t* outLength) {
|
||||
|
@ -154,7 +152,7 @@ char* TRI_normalize_utf8_to_NFC (TRI_memory_zone_t* zone,
|
|||
return utf8Dest;
|
||||
}
|
||||
|
||||
utf16 = Utf8ToUChar(zone, utf8, inLength, &utf16Length);
|
||||
utf16 = TRI_Utf8ToUChar(zone, utf8, inLength, &utf16Length);
|
||||
if (utf16 == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
@ -211,7 +209,7 @@ char* TRI_normalize_utf16_to_NFC (TRI_memory_zone_t* zone,
|
|||
}
|
||||
|
||||
// Convert data back from UChar (UTF-16) to UTF-8
|
||||
utf8Dest = UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
|
||||
utf8Dest = TRI_UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
|
||||
TRI_Free(zone, utf16Dest);
|
||||
|
||||
return utf8Dest;
|
||||
|
|
|
@ -29,6 +29,11 @@
|
|||
#define TRIAGENS_BASICS_C_UTF8_HELPER_H 1
|
||||
|
||||
#include "BasicsC/common.h"
|
||||
#include "BasicsC/vector.h"
|
||||
|
||||
#ifdef TRI_HAVE_ICU
|
||||
#include "unicode/ustring.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -45,6 +50,24 @@ extern "C" {
|
|||
|
||||
#ifdef TRI_HAVE_ICU
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief convert a utf-8 string to a uchar (utf-16)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone,
|
||||
const char* utf8,
|
||||
const size_t inLength,
|
||||
size_t* outLength);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief convert a uchar (utf-16) to a utf-8 string
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
|
||||
const UChar* uchar,
|
||||
const size_t inLength,
|
||||
size_t* outLength);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief normalize a UTF-8 string (NFC)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -98,6 +121,15 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone,
|
|||
int32_t srcLength,
|
||||
int32_t* dstLength);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @brief Get words of a UTF-8 string (implemented in Basics/Utf8Helper.cpp)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TRI_vector_string_t* TRI_get_words (const char* const text,
|
||||
const size_t textLength,
|
||||
uint8_t minimalWordLength,
|
||||
bool lowerCase);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/// @}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
Loading…
Reference in New Issue