Replaced the TRI_vector_string_t version of the word tokenizer with a std::vector<std::string> version

2016-03-01 13:39:17 +01:00 · 2016-03-01 13:39:17 +01:00 · d3a53cd9bf
parent fa8b4ae0e3
commit d3a53cd9bf
2 changed files with 22 additions and 215 deletions
--- a/lib/Basics/Utf8Helper.cpp
+++ b/lib/Basics/Utf8Helper.cpp
@ -387,22 +387,20 @@ char* Utf8Helper::toupper(TRI_memory_zone_t* zone, char const* src,
 /// @brief Extract the words from a UTF-8 string.
 ////////////////////////////////////////////////////////////////////////////////

-TRI_vector_string_t* Utf8Helper::getWords(char const* text, size_t textLength,
-                                          size_t minimalLength,
-                                          size_t maximalLength,
-                                          bool lowerCase) {
-  TRI_vector_string_t* words;
+bool Utf8Helper::getWords(std::vector<std::string>& words,
+                          std::string const& text, size_t minimalLength,
+                          size_t maximalLength, bool lowerCase) {
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString word;

-  if (textLength == 0) {
-    // input text is empty
-    return nullptr;
+  if (text.empty()) {
+    return true;
  }
+  size_t textLength = text.size();

  if (textLength < minimalLength) {
    // input text is shorter than required minimum length
-    return nullptr;
+    return true;
  }

  size_t textUtf16Length = 0;
@ -412,28 +410,28 @@ TRI_vector_string_t* Utf8Helper::getWords(char const* text, size_t textLength,
    // lower case string
    int32_t lowerLength = 0;
    char* lower =
-        tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t)textLength, lowerLength);
+        tolower(TRI_UNKNOWN_MEM_ZONE, text.c_str(), (int32_t)textLength, lowerLength);

    if (lower == nullptr) {
      // out of memory
-      return nullptr;
+      return false;
    }

    if (lowerLength == 0) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
-      return nullptr;
+      return false;
    }

    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength,
                                &textUtf16Length);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
  } else {
-    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t)textLength,
+    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text.c_str(), (int32_t)textLength,
                                &textUtf16Length);
  }

  if (textUtf16 == nullptr) {
-    return nullptr;
+    return false;
  }

  ULocDataLocaleType type = ULOC_VALID_LOCALE;
@ -442,7 +440,7 @@ TRI_vector_string_t* Utf8Helper::getWords(char const* text, size_t textLength,
  if (U_FAILURE(status)) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    LOG(ERR) << "error in Collator::getLocale(...): " << u_errorName(status);
-    return nullptr;
+    return false;
  }

  UChar* tempUtf16 = (UChar*)TRI_Allocate(
@ -450,16 +448,7 @@ TRI_vector_string_t* Utf8Helper::getWords(char const* text, size_t textLength,

  if (tempUtf16 == nullptr) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
-    return nullptr;
-  }
-
-  words = (TRI_vector_string_t*)TRI_Allocate(
-      TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
-
-  if (words == nullptr) {
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
-    return nullptr;
+    return false;
  }

  // estimate an initial vector size. this is not accurate, but setting the
@ -474,8 +463,8 @@ TRI_vector_string_t* Utf8Helper::getWords(char const* text, size_t textLength,
    // alloc at most 8192 pointers (= 64kb)
    initialWordCount = 8192;
  }
-
-  TRI_InitVectorString2(words, TRI_UNKNOWN_MEM_ZONE, initialWordCount);
+  // Reserve initialWordCount additional words in the vector
+  words.reserve(words.size() + initialWordCount);

  BreakIterator* wordIterator =
      BreakIterator::createWordInstance(locale, status);
@ -498,145 +487,9 @@ TRI_vector_string_t* Utf8Helper::getWords(char const* text, size_t textLength,
      char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16,
                                       chunkLength, &utf8WordLength);
      if (utf8Word != nullptr) {
-        TRI_PushBackVectorString(words, utf8Word);
-      }
-    }
-  }
-
-  delete wordIterator;
-
-  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
-  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
-
-  if (words->_length == 0) {
-    // no words found
-    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
-    return nullptr;
-  }
-
-  return words;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// @brief Extract the words from a UTF-8 string.
-////////////////////////////////////////////////////////////////////////////////
-
-bool Utf8Helper::getWords(TRI_vector_string_t*& words, char const* text,
-                          size_t textLength, size_t minimalLength,
-                          size_t maximalLength, bool lowerCase) {
-  UErrorCode status = U_ZERO_ERROR;
-  UnicodeString word;
-
-  if (textLength == 0) {
-    // input text is empty
-    return true;
-  }
-
-  if (textLength < minimalLength) {
-    // input text is shorter than required minimum length
-    return true;
-  }
-
-  size_t textUtf16Length = 0;
-  UChar* textUtf16 = nullptr;
-
-  if (lowerCase) {
-    // lower case string
-    int32_t lowerLength = 0;
-    char* lower =
-        tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t)textLength, lowerLength);
-
-    if (lower == nullptr) {
-      // out of memory
-      return false;
-    }
-
-    if (lowerLength == 0) {
-      TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
-      return false;
-    }
-
-    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength,
-                                &textUtf16Length);
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
-  } else {
-    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t)textLength,
-                                &textUtf16Length);
-  }
-
-  if (textUtf16 == nullptr) {
-    return false;
-  }
-
-  ULocDataLocaleType type = ULOC_VALID_LOCALE;
-  const Locale& locale = _coll->getLocale(type, status);
-
-  if (U_FAILURE(status)) {
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
-    LOG(ERR) << "error in Collator::getLocale(...): " << u_errorName(status);
-    return false;
-  }
-
-  UChar* tempUtf16 = (UChar*)TRI_Allocate(
-      TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);
-
-  if (tempUtf16 == nullptr) {
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
-    return false;
-  }
-
-  bool created = false;
-
-  if (words == nullptr) {
-    words = (TRI_vector_string_t*)TRI_Allocate(
-        TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);
-    created = true;
-  }
-
-  if (words == nullptr) {
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
-    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
-    return false;
-  }
-
-  if (created) {
-    // estimate an initial vector size. this is not accurate, but setting the
-    // initial size to some
-    // value in the correct order of magnitude will save a lot of vector
-    // reallocations later
-    size_t initialWordCount = textLength / (2 * (minimalLength + 1));
-    if (initialWordCount < 32) {
-      // alloc at least 32 pointers (= 256b)
-      initialWordCount = 32;
-    } else if (initialWordCount > 8192) {
-      // alloc at most 8192 pointers (= 64kb)
-      initialWordCount = 8192;
-    }
-    TRI_InitVectorString2(words, TRI_UNKNOWN_MEM_ZONE, initialWordCount);
-  }
-
-  BreakIterator* wordIterator =
-      BreakIterator::createWordInstance(locale, status);
-  UnicodeString utext(textUtf16);
-
-  wordIterator->setText(utext);
-  int32_t start = wordIterator->first();
-  for (int32_t end = wordIterator->next(); end != BreakIterator::DONE;
-       start = end, end = wordIterator->next()) {
-    size_t tempUtf16Length = (size_t)(end - start);
-    // end - start = word length
-    if (tempUtf16Length >= minimalLength) {
-      size_t chunkLength = tempUtf16Length;
-      if (chunkLength > maximalLength) {
-        chunkLength = maximalLength;
-      }
-      utext.extractBetween(start, (int32_t)(start + chunkLength), tempUtf16, 0);
-
-      size_t utf8WordLength;
-      char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16,
-                                       chunkLength, &utf8WordLength);
-      if (utf8Word != nullptr) {
-        TRI_PushBackVectorString(words, utf8Word);
+        std::string word(utf8Word, utf8WordLength);
+        words.emplace_back(word);
+        TRI_Free(TRI_UNKNOWN_MEM_ZONE, utf8Word);
      }
    }
  }
@ -731,28 +584,6 @@ char* TRI_tolower_utf8(TRI_memory_zone_t* zone, char const* src,
                                               *dstLength);
 }

-////////////////////////////////////////////////////////////////////////////////
-/// @brief Get words of an UTF-8 string (implemented in Basic/Utf8Helper.cpp)
-////////////////////////////////////////////////////////////////////////////////
-
-TRI_vector_string_t* TRI_get_words(char const* text, size_t textLength,
-                                   size_t minimalWordLength,
-                                   size_t maximalWordLength, bool lowerCase) {
-  return Utf8Helper::DefaultUtf8Helper.getWords(
-      text, textLength, minimalWordLength, maximalWordLength, lowerCase);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// @brief Get words of an UTF-8 string (implemented in Basic/Utf8Helper.cpp)
-////////////////////////////////////////////////////////////////////////////////
-
-bool TRI_get_words(TRI_vector_string_t*& words, char const* text,
-                   size_t textLength, size_t minimalWordLength,
-                   size_t maximalWordLength, bool lowerCase) {
-  return Utf8Helper::DefaultUtf8Helper.getWords(
-      words, text, textLength, minimalWordLength, maximalWordLength, lowerCase);
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 /// @brief convert a utf-8 string to a uchar (utf-16)
 ////////////////////////////////////////////////////////////////////////////////
--- a/lib/Basics/Utf8Helper.h
+++ b/lib/Basics/Utf8Helper.h
@ -135,17 +135,9 @@ class Utf8Helper {
  /// @brief returns the words of a UTF-8 string.
  //////////////////////////////////////////////////////////////////////////////

-  TRI_vector_string_t* getWords(char const* text, size_t textLength,
-                                size_t minimalWordLength,
-                                size_t maximalWordLength, bool lowerCase);
-
-  //////////////////////////////////////////////////////////////////////////////
-  /// @brief returns the words of a UTF-8 string.
-  //////////////////////////////////////////////////////////////////////////////
-
-  bool getWords(TRI_vector_string_t*& words, char const* text,
-                size_t textLength, size_t minimalWordLength,
-                size_t maximalWordLength, bool lowerCase);
+  bool getWords(std::vector<std::string>& words, std::string const& text,
+                size_t minimalWordLength, size_t maximalWordLength,
+                bool lowerCase);

  //////////////////////////////////////////////////////////////////////////////
  /// @brief builds a regex matcher for the specified pattern
@ -220,20 +212,4 @@ int TRI_compare_utf8(char const* left, size_t leftLength, char const* right,
 char* TRI_tolower_utf8(TRI_memory_zone_t* zone, char const* src,
                       int32_t srcLength, int32_t* dstLength);

-////////////////////////////////////////////////////////////////////////////////
-/// @brief Get words of an UTF-8 string (implemented in Basic/Utf8Helper.cpp)
-////////////////////////////////////////////////////////////////////////////////
-
-TRI_vector_string_t* TRI_get_words(char const* text, size_t textLength,
-                                   size_t minimalWordLength,
-                                   size_t maximalWordLength, bool lowerCase);
-
-////////////////////////////////////////////////////////////////////////////////
-/// @brief Get words of an UTF-8 string (implemented in Basic/Utf8Helper.cpp)
-////////////////////////////////////////////////////////////////////////////////
-
-bool TRI_get_words(TRI_vector_string_t*& words, char const* text,
-                   size_t textLength, size_t minimalWordLength,
-                   size_t maximalWordLength, bool lowerCase);
-
 #endif