1
0
Fork 0

Merge branch 'devel' of https://github.com/triAGENS/ArangoDB into devel

This commit is contained in:
Jan Steemann 2012-12-07 13:51:31 +01:00
commit 7261465139
6 changed files with 205 additions and 144 deletions

View File

@ -161,6 +161,67 @@ BOOST_AUTO_TEST_CASE (tst_3) {
BOOST_CHECK_EQUAL(expectString, resultString);
}
// Word splitting with lower-casing enabled: checks the minimal word length
// filter (3 and 4) and that a NULL/empty input produces no word list.
BOOST_AUTO_TEST_CASE (tst_4) {
  std::string testString = "Der Müller geht in die Post.";

  // minimal word length 3: "in" is dropped
  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, true);
  // REQUIRE instead of CHECK: the assertions below dereference `words` and
  // index into its buffer, so the test must stop here on failure
  BOOST_REQUIRE(words != NULL);
  BOOST_REQUIRE_EQUAL(5, words->_length);
  BOOST_CHECK_EQUAL("der", words->_buffer[0]);
  BOOST_CHECK_EQUAL("müller", words->_buffer[1]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
  BOOST_CHECK_EQUAL("post", words->_buffer[4]);
  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);

  // minimal word length 4: "der", "in" and "die" are dropped
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, true);
  BOOST_REQUIRE(words != NULL);
  BOOST_REQUIRE_EQUAL(3, words->_length);
  BOOST_CHECK_EQUAL("müller", words->_buffer[0]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
  BOOST_CHECK_EQUAL("post", words->_buffer[2]);
  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);

  // no input text: no word list is expected
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, true);
  BOOST_CHECK(words == NULL);
  if (words != NULL) {
    // do not leak the list if the expectation above failed
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
  }
}
// Word splitting with lower-casing disabled: the original case of each word
// must be preserved; otherwise the same expectations as the lower-casing test.
BOOST_AUTO_TEST_CASE (tst_5) {
  std::string testString = "Der Müller geht in die Post.";

  // minimal word length 3: "in" is dropped
  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, false);
  // REQUIRE instead of CHECK: the assertions below dereference `words` and
  // index into its buffer, so the test must stop here on failure
  BOOST_REQUIRE(words != NULL);
  BOOST_REQUIRE_EQUAL(5, words->_length);
  BOOST_CHECK_EQUAL("Der", words->_buffer[0]);
  BOOST_CHECK_EQUAL("Müller", words->_buffer[1]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
  BOOST_CHECK_EQUAL("Post", words->_buffer[4]);
  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);

  // minimal word length 4: "Der", "in" and "die" are dropped
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, false);
  BOOST_REQUIRE(words != NULL);
  BOOST_REQUIRE_EQUAL(3, words->_length);
  BOOST_CHECK_EQUAL("Müller", words->_buffer[0]);
  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
  BOOST_CHECK_EQUAL("Post", words->_buffer[2]);
  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);

  // no input text: no word list is expected
  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, false);
  BOOST_CHECK(words == NULL);
  if (words != NULL) {
    // do not leak the list if the expectation above failed
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
  }
}
#endif
BOOST_AUTO_TEST_SUITE_END ()

View File

@ -34,6 +34,7 @@
#include "BasicsC/logging.h"
#include "BasicsC/string-buffer.h"
#include "BasicsC/strings.h"
#include "BasicsC/utf8-helper.h"
#include "ShapedJson/shape-accessor.h"
#include "ShapedJson/shaped-json.h"
#include "VocBase/document-collection.h"
@ -4059,143 +4060,6 @@ void TRI_FreeSkiplistIndex (TRI_index_t* idx) {
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief add an identified word to the word vector
///
/// A normalised copy of the word is created and appended to `words`:
/// - words containing non-ASCII bytes are passed to
///   TRI_NormaliseWordFulltextIndex
/// - pure ASCII words are copied with 'A'-'Z' mapped to 'a'-'z'
///
/// @param words         vector the normalised copy is appended to
/// @param wordStart     first byte of the word (need not be NUL-terminated)
/// @param wordLength    length of the word in bytes
/// @param containsUtf8  true if the word contains at least one byte >= 128
/// @return true on success, false if no copy of the word could be created
////////////////////////////////////////////////////////////////////////////////

static bool AddWord (TRI_vector_string_t* const words,
                     const char* const wordStart,
                     const size_t wordLength,
                     const bool containsUtf8) {
  char* copy;

  if (containsUtf8) {
    // UTF-8 string
    copy = TRI_NormaliseWordFulltextIndex(wordStart, wordLength);

    if (copy == NULL) {
      // NOTE(review): presumably NULL signals a failed normalisation; never
      // push a NULL entry or hand it to the "%s" format below
      return false;
    }
  }
  else {
    // ASCII string
    const char* src = wordStart;
    const char* const end = wordStart + wordLength;
    char* dst;

    copy = (char*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (wordLength + 1) * sizeof(char), false);

    if (copy == NULL) {
      return false;
    }

    dst = copy;

    for (; src < end; ++src, ++dst) {
      const char c = *src;

      // lower case the text so it is normalised in the index
      if (c >= 'A' && c <= 'Z') {
        *dst = (char) (((unsigned char) c) + 32);
      }
      else {
        *dst = c;
      }
    }

    *dst = '\0';
  }

  TRI_PushBackVectorString(words, copy);
  LOG_DEBUG("found word '%s'", copy);

  return true;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief split a document string value into the words that should be indexed
///
/// A word is a maximal run of ASCII letters and/or bytes >= 128; every other
/// byte acts as a separator. Runs shorter than two bytes are ignored. Each
/// word is handed to AddWord, which stores a normalised copy.
///
/// The tokenisation is deliberately simple: it works on bytes, so unicode
/// punctuation is not recognised as a separator.
///
/// Returns NULL if the vector cannot be allocated, if adding a word fails, or
/// if the text contains no words.
////////////////////////////////////////////////////////////////////////////////

static TRI_vector_string_t* ParseWordsFulltextIndex (const char* const text,
                                                     const size_t textLength) {
  TRI_vector_string_t* result;
  const char* cursor;
  const char* limit;
  const char* tokenBegin;
  bool tokenHasUtf8;

  result = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);

  if (result == NULL) {
    return NULL;
  }

  TRI_InitVectorString(result, TRI_UNKNOWN_MEM_ZONE);

  cursor       = text;
  limit        = text + textLength;
  tokenBegin   = NULL;
  tokenHasUtf8 = false;

  // the end of the text is treated like a separator, so a trailing word is
  // flushed by the same code path as any other word
  for (;;) {
    bool partOfWord = false;

    if (cursor < limit) {
      const unsigned char byte = (unsigned char) *cursor;

      if (byte >= 128) {
        // UTF-8
        partOfWord   = true;
        tokenHasUtf8 = true;
      }
      else if ((byte >= 'A' && byte <= 'Z') || (byte >= 'a' && byte <= 'z')) {
        partOfWord = true;
      }
    }

    if (partOfWord) {
      if (tokenBegin == NULL) {
        tokenBegin = cursor;
      }
    }
    else if (tokenBegin != NULL) {
      const size_t tokenLength = (size_t) (cursor - tokenBegin);

      // only words of at least two bytes are indexed
      if (tokenLength >= 2 && ! AddWord(result, tokenBegin, tokenLength, tokenHasUtf8)) {
        TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, result);
        return NULL;
      }

      tokenBegin   = NULL;
      tokenHasUtf8 = false;
    }

    if (cursor >= limit) {
      break;
    }

    ++cursor;
  }

  if (result->_length == 0) {
    // no words found
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, result);
    return NULL;
  }

  return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief free function for word list, fulltext index
////////////////////////////////////////////////////////////////////////////////
@ -4251,7 +4115,7 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document,
}
// parse the document text
words = ParseWordsFulltextIndex(text, textLength);
words = TRI_get_words(text, textLength, 2, true);
if (words == NULL) {
return NULL;
}

View File

@ -32,12 +32,14 @@
#include "unicode/normalizer2.h"
#include "unicode/ucasemap.h"
#include "unicode/brkiter.h"
#include "unicode/ustdio.h"
#else
#include "string.h"
#endif
#include "Logger/Logger.h"
#include "BasicsC/strings.h"
#include "BasicsC/utf8-helper.h"
using namespace triagens::basics;
using namespace std;
@ -373,6 +375,89 @@ char* Utf8Helper::toupper (TRI_memory_zone_t* zone, const char *src, int32_t src
return utf8_dest;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the words of a UTF-8 string
///
/// The text is optionally lower-cased, converted to UTF-16 and split with an
/// ICU word break iterator created for the locale of the helper's collator.
/// Every segment whose length (in UTF-16 code units) is at least
/// `minimalLength` is converted back to UTF-8 and appended to the result.
///
/// @param text           UTF-8 input text
/// @param textLength     length of `text` in bytes
/// @param minimalLength  minimal segment length, in UTF-16 code units
/// @param lowerCase      if true, the text is lower-cased before splitting
/// @return a newly allocated string vector (to be released with
///         TRI_FreeVectorString on TRI_UNKNOWN_MEM_ZONE), or NULL if no
///         words were found or an error occurred. Without ICU support the
///         function always returns NULL.
////////////////////////////////////////////////////////////////////////////////

TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
                                           const size_t textLength,
                                           uint8_t minimalLength,
                                           bool lowerCase) {
  TRI_vector_string_t* words;

  words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);

  if (words == NULL) {
    return NULL;
  }

  TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);

#ifdef TRI_HAVE_ICU
  UErrorCode status = U_ZERO_ERROR;
  size_t textUtf16Length = 0;
  UChar* textUtf16 = NULL;

  if (lowerCase) {
    // lower case string
    int32_t lowerLength = 0;
    char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);

    if (lower == NULL || lowerLength == 0) {
      // nothing to split; release the (possibly empty) lower-cased copy
      if (lower != NULL) {
        TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
      }
      TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
      return NULL;
    }

    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
  }
  else {
    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length);
  }

  if (textUtf16 == NULL) {
    // conversion to UTF-16 failed
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  ULocDataLocaleType type = ULOC_VALID_LOCALE;
  const Locale& locale = _coll->getLocale(type, status);

  if (U_FAILURE(status)) {
    LOGGER_ERROR << "error in Collator::getLocale(...): " << u_errorName(status);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  // scratch buffer, large enough to hold any single segment of the text
  UChar* tempUtf16 = (UChar *) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);

  if (tempUtf16 == NULL) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  BreakIterator* wordIterator = BreakIterator::createWordInstance(locale, status);

  if (U_FAILURE(status) || wordIterator == NULL) {
    LOGGER_ERROR << "error in BreakIterator::createWordInstance(...): " << u_errorName(status);
    delete wordIterator;
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  UnicodeString utext(textUtf16);
  wordIterator->setText(utext);

  bool failed = false;
  int32_t start = wordIterator->first();

  for (int32_t end = wordIterator->next();
       end != BreakIterator::DONE;
       start = end, end = wordIterator->next()) {
    // end - start = segment length in UTF-16 code units
    const size_t segmentLength = (size_t) (end - start);

    if (segmentLength < minimalLength) {
      continue;
    }

    utext.extractBetween(start, end, tempUtf16, 0);

    size_t utf8WordLength = 0;
    char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, segmentLength, &utf8WordLength);

    if (utf8Word == NULL) {
      // never store a NULL entry in the result vector
      failed = true;
      break;
    }

    TRI_PushBackVectorString(words, utf8Word);
  }

  delete wordIterator;
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);

  if (failed) {
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }
#else
  // TODO: word splitting is not implemented without ICU; the vector stays
  // empty and NULL is returned below
#endif

  if (words->_length == 0) {
    // no words found
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  return words;
}
#ifdef __cplusplus
extern "C" {
#endif
@ -409,6 +494,17 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone, const char *src, int32_t srcLen
return Utf8Helper::DefaultUtf8Helper.toupper(zone, src, srcLength, *dstLength);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief C-callable entry point that returns the words of a UTF-8 string
///
/// All arguments are passed through unchanged to the getWords method of the
/// default Utf8Helper instance, and its result is returned as is.
////////////////////////////////////////////////////////////////////////////////

TRI_vector_string_t* TRI_get_words (const char* const text,
                                    const size_t textLength,
                                    uint8_t minimalWordLength,
                                    bool lowerCase) {
  Utf8Helper& helper = Utf8Helper::DefaultUtf8Helper;

  return helper.getWords(text, textLength, minimalWordLength, lowerCase);
}
#ifdef __cplusplus
}
#endif

View File

@ -30,6 +30,7 @@
#define TRIAGENS_BASICS_UTF8_HELPER_H 1
#include "Basics/Common.h"
#include "BasicsC/vector.h"
#ifdef TRI_HAVE_ICU
#include "unicode/coll.h"
@ -148,6 +149,15 @@ namespace triagens {
char* toupper (TRI_memory_zone_t* zone, const char *src, int32_t srcLength, int32_t& dstLength);
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the words of a UTF-8 string.
///
/// @param text               UTF-8 input text
/// @param textLength         length of text in bytes
/// @param minimalWordLength  shorter words are not returned; the length is
///                           measured on the UTF-16 form of the text
/// @param lowerCase          if true, the text is lower-cased before it is
///                           split into words
/// @return a newly allocated vector of UTF-8 words that the caller releases
///         with TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, ...), or NULL if
///         no words were found
////////////////////////////////////////////////////////////////////////////////
TRI_vector_string_t* getWords (const char* const text,
const size_t textLength,
uint8_t minimalWordLength,
bool lowerCase);
private:
#ifdef TRI_HAVE_ICU
Collator* _coll;

View File

@ -25,12 +25,10 @@
/// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
#include "utf8-helper.h"
#ifdef TRI_HAVE_ICU
#include "unicode/ustring.h"
#include "unicode/unorm2.h"
// -----------------------------------------------------------------------------
@ -46,7 +44,7 @@
/// @brief convert a utf-8 string to a uchar (utf-16)
////////////////////////////////////////////////////////////////////////////////
static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone,
const char* utf8,
const size_t inLength,
size_t* outLength) {
@ -85,7 +83,7 @@ static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
/// @brief convert a uchar (utf-16) to a utf-8 string
////////////////////////////////////////////////////////////////////////////////
static char* UCharToUtf8 (TRI_memory_zone_t* zone,
char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
const UChar* uchar,
const size_t inLength,
size_t* outLength) {
@ -154,7 +152,7 @@ char* TRI_normalize_utf8_to_NFC (TRI_memory_zone_t* zone,
return utf8Dest;
}
utf16 = Utf8ToUChar(zone, utf8, inLength, &utf16Length);
utf16 = TRI_Utf8ToUChar(zone, utf8, inLength, &utf16Length);
if (utf16 == NULL) {
return NULL;
}
@ -211,7 +209,7 @@ char* TRI_normalize_utf16_to_NFC (TRI_memory_zone_t* zone,
}
// Convert data back from UChar (UTF-16) to UTF-8
utf8Dest = UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
utf8Dest = TRI_UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
TRI_Free(zone, utf16Dest);
return utf8Dest;

View File

@ -29,6 +29,11 @@
#define TRIAGENS_BASICS_C_UTF8_HELPER_H 1
#include "BasicsC/common.h"
#include "BasicsC/vector.h"
#ifdef TRI_HAVE_ICU
#include "unicode/ustring.h"
#endif
#ifdef __cplusplus
extern "C" {
@ -45,6 +50,24 @@ extern "C" {
#ifdef TRI_HAVE_ICU
////////////////////////////////////////////////////////////////////////////////
/// @brief convert a utf-8 string to a uchar (utf-16)
////////////////////////////////////////////////////////////////////////////////
UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone,
const char* utf8,
const size_t inLength,
size_t* outLength);
////////////////////////////////////////////////////////////////////////////////
/// @brief convert a uchar (utf-16) to a utf-8 string
////////////////////////////////////////////////////////////////////////////////
char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
const UChar* uchar,
const size_t inLength,
size_t* outLength);
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize an utf8 string (NFC)
////////////////////////////////////////////////////////////////////////////////
@ -98,6 +121,15 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone,
int32_t srcLength,
int32_t* dstLength);
////////////////////////////////////////////////////////////////////////////////
/// @brief Get words of a UTF-8 string (implemented in Basic/Utf8Helper.cpp)
///
/// Forwards to the getWords method of the default Utf8Helper instance.
/// Returns a newly allocated string vector, or NULL if no words were found.
/// Words shorter than minimalWordLength are skipped; if lowerCase is true the
/// text is lower-cased before it is split.
////////////////////////////////////////////////////////////////////////////////
TRI_vector_string_t* TRI_get_words (const char* const text,
const size_t textLength,
uint8_t minimalWordLength,
bool lowerCase);
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////