mirror of https://gitee.com/bigwinds/arangodb
210 lines
8.4 KiB
C++
210 lines
8.4 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
/// DISCLAIMER
|
|
///
|
|
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
|
|
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
|
|
///
|
|
/// Licensed under the Apache License, Version 2.0 (the "License");
|
|
/// you may not use this file except in compliance with the License.
|
|
/// You may obtain a copy of the License at
|
|
///
|
|
/// http://www.apache.org/licenses/LICENSE-2.0
|
|
///
|
|
/// Unless required by applicable law or agreed to in writing, software
|
|
/// distributed under the License is distributed on an "AS IS" BASIS,
|
|
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
/// See the License for the specific language governing permissions and
|
|
/// limitations under the License.
|
|
///
|
|
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
|
|
///
|
|
/// @author Frank Celler
|
|
/// @author Achim Brandt
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef ARANGODB_BASICS_UTF8HELPER_H
|
|
#define ARANGODB_BASICS_UTF8HELPER_H 1
|
|
|
|
#include "Basics/Common.h"
|
|
|
|
#include <unicode/coll.h>
|
|
#include <unicode/regex.h>
|
|
#include <unicode/ustring.h>
|
|
|
|
namespace arangodb {
|
|
namespace basics {
|
|
|
|
class Utf8Helper {
|
|
Utf8Helper(Utf8Helper const&) = delete;
|
|
Utf8Helper& operator=(Utf8Helper const&) = delete;
|
|
|
|
public:
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief a default helper
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
static Utf8Helper DefaultUtf8Helper;
|
|
|
|
public:
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief constructor
|
|
/// @param lang Lowercase two-letter or three-letter ISO-639 code.
|
|
/// This parameter can instead be an ICU style C locale (e.g. "en_US")
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
Utf8Helper(std::string const& lang, void *icuDataPtr);
|
|
|
|
explicit Utf8Helper(void *icuDataPtr);
|
|
|
|
~Utf8Helper();
|
|
|
|
public:
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// public functions
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief compare utf8 strings
|
|
/// -1 : left < right
|
|
/// 0 : left == right
|
|
/// 1 : left > right
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
int compareUtf8(char const* left, size_t leftLength, char const* right,
|
|
size_t rightLength) const;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief compare utf16 strings
|
|
/// -1 : left < right
|
|
/// 0 : left == right
|
|
/// 1 : left > right
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
int compareUtf16(uint16_t const* left, size_t leftLength,
|
|
uint16_t const* right, size_t rightLength) const;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief set collator by language
|
|
/// @param lang Lowercase two-letter or three-letter ISO-639 code.
|
|
/// This parameter can instead be an ICU style C locale (e.g. "en_US")
|
|
/// @param icuDataPointer data file to be loaded by the application
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
bool setCollatorLanguage(std::string const& lang, void* icuDataPointer);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief get collator language
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
std::string getCollatorLanguage();
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief get collator country
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
std::string getCollatorCountry();
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief Lowercase the characters in a UTF-8 string.
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
std::string toLowerCase(std::string const& src);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief Lowercase the characters in a UTF-8 string.
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
char* tolower(char const* src, int32_t srcLength,
|
|
int32_t& dstLength);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief Uppercase the characters in a UTF-8 string.
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
std::string toUpperCase(std::string const& src);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief Uppercase the characters in a UTF-8 string.
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
char* toupper(char const* src, int32_t srcLength,
|
|
int32_t& dstLength);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns the words of a UTF-8 string.
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
bool tokenize(std::set<std::string>& words, std::string const& text,
|
|
size_t minimalWordLength, size_t maximalWordLength,
|
|
bool lowerCase);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief builds a regex matcher for the specified pattern
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
RegexMatcher* buildMatcher(std::string const&);
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
/// @brief whether or not value matches a regex
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
bool matches(RegexMatcher*, char const* pattern, size_t patternLength,
|
|
bool partial, bool& error);
|
|
|
|
std::string replace(RegexMatcher*, char const* pattern, size_t patternLength,
|
|
char const* replacement, size_t replacementLength,
|
|
bool partial, bool& error);
|
|
|
|
private:
|
|
Collator* _coll;
|
|
};
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief convert a utf-8 string to a uchar (utf-16)
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
UChar* TRI_Utf8ToUChar(char const* utf8,
|
|
size_t inLength, size_t* outLength);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief convert a uchar (utf-16) to a utf-8 string
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
char* TRI_UCharToUtf8(UChar const* uchar,
|
|
size_t inLength, size_t* outLength);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief normalize an utf8 string (NFC)
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
char* TRI_normalize_utf8_to_NFC(char const* utf8,
|
|
size_t inLength, size_t* outLength);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief normalize an utf16 string (NFC) and export it to utf8
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
char* TRI_normalize_utf16_to_NFC(uint16_t const* utf16,
|
|
size_t inLength, size_t* outLength);
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief compare two utf8 strings
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static inline int TRI_compare_utf8(char const* left, size_t leftLength,
|
|
char const* right, size_t rightLength) {
|
|
return arangodb::basics::Utf8Helper::DefaultUtf8Helper.compareUtf8(left, leftLength,
|
|
right, rightLength);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief Lowercase the characters in a UTF-8 string
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
char* TRI_tolower_utf8(char const* src,
|
|
int32_t srcLength, int32_t* dstLength);
|
|
|
|
#endif
|