Merge branch 'devel' of https://github.com/triAGENS/ArangoDB into devel

2012-12-07 13:51:31 +01:00 · 2012-12-07 13:51:31 +01:00 · 7261465139
parent c49ce0a52f c4cc85ab9f
commit 7261465139
6 changed files with 205 additions and 144 deletions
--- a/UnitTests/Philadelphia/string-utf8-normalize-test.cpp
+++ b/UnitTests/Philadelphia/string-utf8-normalize-test.cpp
@ -161,6 +161,67 @@ BOOST_AUTO_TEST_CASE (tst_3) {
  BOOST_CHECK_EQUAL(expectString, resultString);
 }

+////////////////////////////////////////////////////////////////////////////////
+/// @brief test word splitting with lower-casing enabled
+////////////////////////////////////////////////////////////////////////////////
+
+BOOST_AUTO_TEST_CASE (tst_4) {
+  std::string testString   = "Der Müller geht in die Post.";
+
+  // minimal length 3: "in" is too short and must not be returned
+  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, true);
+  // fatal checks: everything below dereferences words and indexes its buffer,
+  // so the test case must stop here instead of crashing the runner
+  BOOST_REQUIRE(words != NULL);
+  BOOST_REQUIRE_EQUAL(5, words->_length);
+
+  BOOST_CHECK_EQUAL("der", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("müller", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
+  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
+  BOOST_CHECK_EQUAL("post", words->_buffer[4]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+  // minimal length 4: "der", "in" and "die" are dropped as well
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, true);
+  BOOST_REQUIRE(words != NULL);
+  BOOST_REQUIRE_EQUAL(3, words->_length);
+
+  BOOST_CHECK_EQUAL("müller", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("post", words->_buffer[2]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+  // no input text: no word list is returned
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, true);
+  BOOST_CHECK(words == NULL);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// @brief test word splitting with the original case preserved
+////////////////////////////////////////////////////////////////////////////////
+
+BOOST_AUTO_TEST_CASE (tst_5) {
+  std::string testString   = "Der Müller geht in die Post.";
+
+  // minimal length 3: "in" is too short and must not be returned
+  TRI_vector_string_t* words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 3, false);
+  // fatal checks: everything below dereferences words and indexes its buffer,
+  // so the test case must stop here instead of crashing the runner
+  BOOST_REQUIRE(words != NULL);
+  BOOST_REQUIRE_EQUAL(5, words->_length);
+
+  BOOST_CHECK_EQUAL("Der", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("Müller", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[2]);
+  BOOST_CHECK_EQUAL("die", words->_buffer[3]);
+  BOOST_CHECK_EQUAL("Post", words->_buffer[4]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+  // minimal length 4: "Der", "in" and "die" are dropped as well
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(testString.c_str(), testString.length(), 4, false);
+  BOOST_REQUIRE(words != NULL);
+  BOOST_REQUIRE_EQUAL(3, words->_length);
+
+  BOOST_CHECK_EQUAL("Müller", words->_buffer[0]);
+  BOOST_CHECK_EQUAL("geht", words->_buffer[1]);
+  BOOST_CHECK_EQUAL("Post", words->_buffer[2]);
+
+  TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+
+  // no input text: no word list is returned
+  words = triagens::basics::Utf8Helper::DefaultUtf8Helper.getWords(NULL, 0, 4, false);
+  BOOST_CHECK(words == NULL);
+}
+
+
 #endif

 BOOST_AUTO_TEST_SUITE_END ()
--- a/arangod/VocBase/index.c
+++ b/arangod/VocBase/index.c
@ -34,6 +34,7 @@
 #include "BasicsC/logging.h"
 #include "BasicsC/string-buffer.h"
 #include "BasicsC/strings.h"
+#include "BasicsC/utf8-helper.h"
 #include "ShapedJson/shape-accessor.h"
 #include "ShapedJson/shaped-json.h"
 #include "VocBase/document-collection.h"
@ -4059,143 +4060,6 @@ void TRI_FreeSkiplistIndex (TRI_index_t* idx) {
 /// @{
 ////////////////////////////////////////////////////////////////////////////////

-////////////////////////////////////////////////////////////////////////////////
-/// @brief add an identified word to the word vector 
-////////////////////////////////////////////////////////////////////////////////
-
-static bool AddWord (TRI_vector_string_t* const words,
-                     const char* const wordStart,
-                     const size_t wordLength,
-                     const bool containsUtf8) {
-  char* copy;
-
-  if (containsUtf8) {
-    // UTF-8 string
-    copy = TRI_NormaliseWordFulltextIndex(wordStart, wordLength);
-  }
-  else {
-    // ASCII string 
-    char* src;
-    char* end;
-    char* dst;
-
-    copy = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (wordLength + 1) * sizeof(char), false);
-    if (copy == NULL) {
-      return false;
-    }
-
-    src = (char*) wordStart;
-    end = src + wordLength;
-    dst = copy;
-
-    for (; src < end; ++src, ++dst) {
-      char c = *src;
-
-      // lower case the text so it is normalised in the index
-      if (c >= 'A' && c <= 'Z') {
-        *dst = (char) (((unsigned char) c) + 32);
-      }
-      else {
-        *dst = c;
-      }
-    }
-
-    *dst = '\0';
-  }
-
-  TRI_PushBackVectorString(words, copy);
-  LOG_DEBUG("found word '%s'", copy);
-
-  return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// @brief parse a document string value into the individual words that should
-/// be indexed
-/// words returned are all lower cased
-///
-/// This function is very naive and currently does not handle lower-casing of 
-/// unicode characters, normalisation of unicode characters, and exclusion of
-/// unicode punctuation characters
-////////////////////////////////////////////////////////////////////////////////
-
-static TRI_vector_string_t* ParseWordsFulltextIndex (const char* const text, 
-                                                     const size_t textLength) {
-  TRI_vector_string_t* words;
-  char* ptr;
-  char* end;
-  char* wordStart;
-  bool containsUtf8;
-
-  words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
-  if (words == NULL) {
-    return NULL;
-  }
-
-  TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);
-
-  ptr = (char*) text;
-  end = ptr + textLength;
-  wordStart = NULL;
-  containsUtf8 = false;
-
-  while (ptr < end) {
-    char c = *ptr;
-
-    if ((c >= 'A' && c <= 'Z') ||
-        (c >= 'a' && c <= 'z')) {
-      if (wordStart == NULL) {
-        wordStart = ptr;
-      }
-    }
-    else if ((unsigned char) c >= 128) {
-      // UTF-8
-      if (wordStart == NULL) {
-        wordStart = ptr;
-      }
-      containsUtf8 = true;
-    }
-    else {
-      if (wordStart != NULL) {
-        size_t wordLength = ptr - wordStart;
-
-        // check the length of the word
-        if (wordLength >= 2) {
-          if (! AddWord(words, wordStart, wordLength, containsUtf8)) {
-            TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-            return NULL;
-          }
-        }
-        wordStart = NULL;
-        containsUtf8 = false;
-      }
-    }
-
-    ++ptr;
-  }
-
-  // check if we have something left to index
-  if (wordStart != NULL) {
-    size_t wordLength = ptr - wordStart;
-
-    // check the length of the word
-    if (wordLength >= 2) {
-      if (! AddWord(words, wordStart, wordLength, containsUtf8)) {
-        TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-        return NULL;
-      }
-    }
-  }
-
-  if (words->_length == 0) {
-    // no words found
-    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-    return NULL;
-  }
-
-  return words;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 /// @brief free function for word list, fulltext index
 ////////////////////////////////////////////////////////////////////////////////
@ -4251,7 +4115,7 @@ static FTS_texts_t* GetTextsFulltextIndex (FTS_document_id_t document,
  } 

  // parse the document text
-  words = ParseWordsFulltextIndex(text, textLength);
+  words = TRI_get_words(text, textLength, 2, true);
  if (words == NULL) {
    return NULL;
  }
--- a/lib/Basics/Utf8Helper.cpp
+++ b/lib/Basics/Utf8Helper.cpp
@ -32,12 +32,14 @@
 #include "unicode/normalizer2.h"
 #include "unicode/ucasemap.h"
 #include "unicode/brkiter.h"
+#include "unicode/ustdio.h"
 #else
 #include "string.h"
 #endif

 #include "Logger/Logger.h"
 #include "BasicsC/strings.h"
+#include "BasicsC/utf8-helper.h"

 using namespace triagens::basics;
 using namespace std;
@ -373,6 +375,89 @@ char* Utf8Helper::toupper (TRI_memory_zone_t* zone, const char *src, int32_t src
  return utf8_dest;
 }

+////////////////////////////////////////////////////////////////////////////////
+/// @brief splits a UTF-8 string into words
+///
+/// The text is optionally lower-cased, converted to UTF-16 and segmented with
+/// an ICU word BreakIterator created for the collator's locale. Every segment
+/// that is at least minimalLength UTF-16 code units long is converted back to
+/// UTF-8 and appended to the result vector.
+///
+/// @param text           UTF-8 input, may be NULL
+/// @param textLength     length of text in bytes
+/// @param minimalLength  minimal segment length, in UTF-16 code units
+/// @param lowerCase      if true, the text is lower-cased before splitting
+///
+/// @return a vector allocated in TRI_UNKNOWN_MEM_ZONE that the caller has to
+/// release with TRI_FreeVectorString, or NULL if the input is empty, an error
+/// occurred or no segment was long enough. Without ICU support the function
+/// always returns NULL.
+////////////////////////////////////////////////////////////////////////////////
+
+TRI_vector_string_t* Utf8Helper::getWords (const char* const text, 
+                                                const size_t textLength,
+                                                uint8_t minimalLength,
+                                                bool lowerCase) {
+  TRI_vector_string_t* words;
+
+  // nothing to split
+  if (text == NULL || textLength == 0) {
+    return NULL;
+  }
+
+  words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
+  if (words == NULL) {
+    return NULL;
+  }
+
+  TRI_InitVectorString(words, TRI_UNKNOWN_MEM_ZONE);
+
+#ifdef TRI_HAVE_ICU
+
+  // ICU types are only available in this branch, so all ICU locals live here
+  UErrorCode status = U_ZERO_ERROR;
+  size_t textUtf16Length = 0;
+  UChar* textUtf16 = NULL;
+
+  if (lowerCase) {
+    // lower case string
+    int32_t lowerLength = 0;
+    char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);
+
+    if (lower == NULL || lowerLength == 0) {
+      // release a possibly allocated (empty) result as well
+      if (lower != NULL) {
+        TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
+      }
+      TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+      return NULL;
+    }
+
+    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length);
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
+  }
+  else {
+    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length);
+  }
+
+  if (textUtf16 == NULL) {
+    // conversion to UTF-16 failed
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  ULocDataLocaleType type = ULOC_VALID_LOCALE;
+  const Locale& locale = _coll->getLocale(type, status);
+  if (U_FAILURE(status)) {
+    LOGGER_ERROR << "error in Collator::getLocale(...): " << u_errorName(status);
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  BreakIterator* wordIterator = BreakIterator::createWordInstance(locale, status);
+  if (U_FAILURE(status) || wordIterator == NULL) {
+    LOGGER_ERROR << "error in BreakIterator::createWordInstance(...): " << u_errorName(status);
+    delete wordIterator;
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  // scratch buffer, large enough for a segment that spans the whole text
+  UChar* tempUtf16 = (UChar *) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);
+  if (tempUtf16 == NULL) {
+    delete wordIterator;
+    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  // pass the known length instead of relying on a terminating NUL
+  UnicodeString utext(textUtf16, (int32_t) textUtf16Length);
+
+  // NOTE(review): the word break iterator presumably also reports runs of
+  // whitespace and punctuation as segments; they are only excluded by the
+  // length filter below. Consider filtering on the rule status - TODO confirm
+  // against the bundled ICU version.
+  wordIterator->setText(utext);
+  int32_t start = wordIterator->first();
+  for (int32_t end = wordIterator->next(); end != BreakIterator::DONE;
+       start = end, end = wordIterator->next()) {
+    // end - start = segment length in UTF-16 code units
+    const size_t segmentLength = (size_t) (end - start);
+
+    if (segmentLength >= minimalLength) {
+      size_t utf8WordLength = 0;
+
+      utext.extractBetween(start, end, tempUtf16, 0);
+      char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, segmentLength, &utf8WordLength);
+
+      // never store a failed conversion in the result vector
+      if (utf8Word != NULL) {
+        TRI_PushBackVectorString(words, utf8Word);
+      }
+    }
+  }
+
+  delete wordIterator;
+  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
+  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
+
+#else
+  // TODO
+#endif
+
+  if (words->_length == 0) {
+    // no words found
+    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
+    return NULL;
+  }
+
+  return words;
+}
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -409,6 +494,17 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone, const char *src, int32_t srcLen
  return Utf8Helper::DefaultUtf8Helper.toupper(zone, src, srcLength, *dstLength);  
 }

+////////////////////////////////////////////////////////////////////////////////
+/// @brief splits a UTF-8 string into words (C wrapper, implemented in
+/// Basics/Utf8Helper.cpp)
+///
+/// Forwards all arguments unchanged to getWords() of the default Utf8Helper
+/// instance and returns its result.
+////////////////////////////////////////////////////////////////////////////////
+
+TRI_vector_string_t* TRI_get_words (const char* const text, 
+                                    const size_t textLength,
+                                    uint8_t minimalWordLength,
+                                    bool lowerCase) {
+  Utf8Helper& helper = Utf8Helper::DefaultUtf8Helper;
+  TRI_vector_string_t* const result = helper.getWords(text, textLength, minimalWordLength, lowerCase);
+
+  return result;
+}
+
 #ifdef __cplusplus
 }
 #endif
--- a/lib/Basics/Utf8Helper.h
+++ b/lib/Basics/Utf8Helper.h
@ -30,6 +30,7 @@
 #define TRIAGENS_BASICS_UTF8_HELPER_H 1

 #include "Basics/Common.h"
+#include "BasicsC/vector.h"

 #ifdef TRI_HAVE_ICU
 #include "unicode/coll.h"
@ -148,6 +149,15 @@ namespace triagens {

        char* toupper (TRI_memory_zone_t* zone, const char *src, int32_t srcLength, int32_t& dstLength);
        
+////////////////////////////////////////////////////////////////////////////////
+/// @brief returns the words of a UTF-8 string that are at least
+/// minimalWordLength long, or NULL if there are none
+////////////////////////////////////////////////////////////////////////////////
+
+        TRI_vector_string_t* getWords (const char* const text, 
+                                                   const size_t textLength,
+                                                   uint8_t minimalWordLength,
+                                                   bool lowerCase);
+        
      private:
 #ifdef TRI_HAVE_ICU      
        Collator* _coll;
--- a/lib/BasicsC/utf8-helper.c
+++ b/lib/BasicsC/utf8-helper.c
@ -25,12 +25,10 @@
 /// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
 ////////////////////////////////////////////////////////////////////////////////

-
 #include "utf8-helper.h"

 #ifdef TRI_HAVE_ICU

-#include "unicode/ustring.h"
 #include "unicode/unorm2.h"

 // -----------------------------------------------------------------------------
@ -46,7 +44,7 @@
 /// @brief convert a utf-8 string to a uchar (utf-16)
 ////////////////////////////////////////////////////////////////////////////////

-static UChar* Utf8ToUChar (TRI_memory_zone_t* zone, 
+UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone, 
                           const char* utf8, 
                           const size_t inLength, 
                           size_t* outLength) {
@ -85,7 +83,7 @@ static UChar* Utf8ToUChar (TRI_memory_zone_t* zone,
 /// @brief convert a uchar (utf-16) to a utf-8 string
 ////////////////////////////////////////////////////////////////////////////////

-static char* UCharToUtf8 (TRI_memory_zone_t* zone,
+char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
                          const UChar* uchar,
                          const size_t inLength,
                          size_t* outLength) {
@ -154,7 +152,7 @@ char* TRI_normalize_utf8_to_NFC (TRI_memory_zone_t* zone,
    return utf8Dest;
  }

-  utf16 = Utf8ToUChar(zone, utf8, inLength, &utf16Length); 
+  utf16 = TRI_Utf8ToUChar(zone, utf8, inLength, &utf16Length); 
  if (utf16 == NULL) {
    return NULL;
  }
@ -211,7 +209,7 @@ char* TRI_normalize_utf16_to_NFC (TRI_memory_zone_t* zone,
  }
  
  // Convert data back from UChar (UTF-16) to UTF-8 
-  utf8Dest = UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
+  utf8Dest = TRI_UCharToUtf8(zone, utf16Dest, utf16DestLength, outLength);
  TRI_Free(zone, utf16Dest);
  
  return utf8Dest;  
--- a/lib/BasicsC/utf8-helper.h
+++ b/lib/BasicsC/utf8-helper.h
@ -29,6 +29,11 @@
 #define TRIAGENS_BASICS_C_UTF8_HELPER_H 1

 #include "BasicsC/common.h"
+#include "BasicsC/vector.h"
+
+#ifdef TRI_HAVE_ICU
+#include "unicode/ustring.h"
+#endif

 #ifdef __cplusplus
 extern "C" {
@ -45,6 +50,24 @@ extern "C" {

 #ifdef TRI_HAVE_ICU
  
+////////////////////////////////////////////////////////////////////////////////
+/// @brief convert a utf-8 string to a uchar (utf-16)
+////////////////////////////////////////////////////////////////////////////////
+
+UChar* TRI_Utf8ToUChar (TRI_memory_zone_t* zone, 
+                           const char* utf8, 
+                           const size_t inLength, 
+                           size_t* outLength);
+  
+////////////////////////////////////////////////////////////////////////////////
+/// @brief convert a uchar (utf-16) to a utf-8 string
+////////////////////////////////////////////////////////////////////////////////
+
+char* TRI_UCharToUtf8 (TRI_memory_zone_t* zone,
+                          const UChar* uchar,
+                          const size_t inLength,
+                          size_t* outLength);
+
 ////////////////////////////////////////////////////////////////////////////////
 /// @brief normalize an utf8 string (NFC)
 ////////////////////////////////////////////////////////////////////////////////
@ -98,6 +121,15 @@ char* TRI_toupper_utf8 (TRI_memory_zone_t* zone,
                        int32_t srcLength, 
                        int32_t* dstLength);

+////////////////////////////////////////////////////////////////////////////////
+/// @brief Get words of a UTF-8 string (implemented in Basics/Utf8Helper.cpp)
+////////////////////////////////////////////////////////////////////////////////
+
+TRI_vector_string_t* TRI_get_words (const char* const text, 
+                                    const size_t textLength,
+                                    uint8_t minimalWordLength,
+                                    bool lowerCase);
+
 ////////////////////////////////////////////////////////////////////////////////
 /// @}
 ////////////////////////////////////////////////////////////////////////////////