mirror of https://gitee.com/bigwinds/arangodb
173 lines
5.2 KiB
C
173 lines
5.2 KiB
C
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief utf8 helper functions
|
|
///
|
|
/// @file
|
|
///
|
|
/// DISCLAIMER
|
|
///
|
|
/// Copyright 2004-2012 triagens GmbH, Cologne, Germany
|
|
///
|
|
/// Licensed under the Apache License, Version 2.0 (the "License");
|
|
/// you may not use this file except in compliance with the License.
|
|
/// You may obtain a copy of the License at
|
|
///
|
|
/// http://www.apache.org/licenses/LICENSE-2.0
|
|
///
|
|
/// Unless required by applicable law or agreed to in writing, software
|
|
/// distributed under the License is distributed on an "AS IS" BASIS,
|
|
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
/// See the License for the specific language governing permissions and
|
|
/// limitations under the License.
|
|
///
|
|
/// Copyright holder is triAGENS GmbH, Cologne, Germany
|
|
///
|
|
/// @author Dr. Frank Celler
|
|
/// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
#include "utf8-helper.h"
|
|
|
|
#ifdef TRI_HAVE_ICU
|
|
|
|
#include "unicode/ustring.h"
|
|
#include "unicode/unorm2.h"
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- public functions
|
|
// -----------------------------------------------------------------------------
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @addtogroup Helper functions
|
|
/// @{
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief normalize a UTF-8 string (NFC)
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Converts the UTF-8 input to a temporary UTF-16 buffer and passes it to
// TR_normalize_utf16_to_NFC, which performs the NFC normalization and
// produces the result.
//
//   zone      - memory zone handed through to TR_normalize_utf16_to_NFC
//   utf8      - input bytes; the length is given explicitly by inLength
//   inLength  - number of bytes in utf8
//   outLength - must not be NULL; reset to 0 on entry and only filled in by
//               TR_normalize_utf16_to_NFC on success
//
// Returns the pointer returned by TR_normalize_utf16_to_NFC, or NULL if the
// input could not be converted (a message is printed in that case). The
// temporary UTF-16 buffer is always released before returning.
char * TR_normalize_utf8_to_NFC (TRI_memory_zone_t* zone, const char* utf8, size_t inLength, size_t* outLength) {
  UErrorCode status = U_ZERO_ERROR;
  UChar* utf16 = NULL;
  int32_t utf16_length = 0;
  char * utf8_dest = NULL;

  *outLength = 0;

  // ICU expresses all lengths as int32_t; refuse input that would be
  // truncated by the conversion (this also keeps "length + 1" in range)
  if (inLength >= (size_t) INT32_MAX) {
    printf("error in u_strFromUTF8 1: input too long\n");
    return NULL;
  }

  // 1. convert utf8 string to utf16

  // calculate utf16 string length (preflight with an empty buffer).
  // U_BUFFER_OVERFLOW_ERROR is the expected outcome for non-empty input;
  // empty input yields only a warning, which must not be treated as failure
  u_strFromUTF8(NULL, 0, &utf16_length, utf8, (int32_t) inLength, &status);
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
    printf("error in u_strFromUTF8 1: %s\n", u_errorName(status));
    return NULL;
  }

  status = U_ZERO_ERROR;
  utf16 = (UChar *) malloc(((size_t) utf16_length + 1) * sizeof(UChar));
  if (utf16 == NULL) {
    printf("malloc error\n");
    return NULL;
  }

  // now convert
  u_strFromUTF8(utf16, utf16_length + 1, NULL, utf8, (int32_t) inLength, &status);
  if (U_FAILURE(status)) {
    printf("error in u_strFromUTF8 2: %s\n", u_errorName(status));

    free(utf16);
    return NULL;
  }

  // continue in TR_normalize_utf16_to_NFC
  utf8_dest = TR_normalize_utf16_to_NFC(zone, (const uint16_t*) utf16, (size_t) utf16_length, outLength);
  free(utf16);

  return utf8_dest;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief normalize a UTF-16 string (NFC) and return the result as UTF-8
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Normalizes the UTF-16 input to NFC with ICU's UNormalizer2 and converts the
// normalized text to UTF-8.
//
//   zone      - memory zone used with TRI_Allocate for the returned buffer
//   utf16     - input code units; the length is given explicitly by inLength
//   inLength  - number of 16-bit code units in utf16
//   outLength - must not be NULL; set to 0 on entry and to the number of
//               UTF-8 bytes written (excluding the terminating NUL) on success
//
// Returns a NUL-terminated buffer obtained from TRI_Allocate(zone, ...), or
// NULL on any error (a message is printed in that case). The intermediate
// UTF-16 buffer is released on every path.
char * TR_normalize_utf16_to_NFC (TRI_memory_zone_t* zone, const uint16_t* utf16, size_t inLength, size_t* outLength) {
  UErrorCode status = U_ZERO_ERROR;
  UChar * utf16_dest = NULL;
  int32_t utf16_dest_capacity = 0;
  int32_t utf16_dest_length = 0;
  char * utf8_dest = NULL;
  int32_t out_length = 0;
  const UNormalizer2 * norm2 = NULL;

  *outLength = 0;

  // ICU expresses all lengths as int32_t; refuse input that would be
  // truncated by the conversion (this also keeps "length + 1" in range)
  if (inLength >= (size_t) INT32_MAX) {
    printf("error in unorm2_normalize: input too long\n");
    return NULL;
  }

  norm2 = unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, &status);
  if (U_FAILURE(status)) {
    printf("error in unorm2_getInstance: %s\n", u_errorName(status));
    return NULL;
  }

  // 2. normalize UChar (UTF-16)

  // first attempt: assume the normalized text is no longer than the input
  utf16_dest_capacity = (int32_t) inLength + 1;
  utf16_dest = (UChar *) malloc((size_t) utf16_dest_capacity * sizeof(UChar));
  if (utf16_dest == NULL) {
    printf("malloc error\n");

    return NULL;
  }

  utf16_dest_length = unorm2_normalize(norm2, (const UChar*) utf16, (int32_t) inLength, utf16_dest, utf16_dest_capacity, &status);

  // NFC output can be longer than the input (composition exclusions). On
  // overflow ICU returns the required length, so retry once with that size.
  if (status == U_BUFFER_OVERFLOW_ERROR && utf16_dest_length >= 0 && utf16_dest_length < INT32_MAX) {
    free(utf16_dest);

    utf16_dest_capacity = utf16_dest_length + 1;
    utf16_dest = (UChar *) malloc((size_t) utf16_dest_capacity * sizeof(UChar));
    if (utf16_dest == NULL) {
      printf("malloc error\n");

      return NULL;
    }

    status = U_ZERO_ERROR;
    utf16_dest_length = unorm2_normalize(norm2, (const UChar*) utf16, (int32_t) inLength, utf16_dest, utf16_dest_capacity, &status);
  }

  if (U_FAILURE(status)) {
    printf("error in unorm2_normalize: %s\n", u_errorName(status));

    free(utf16_dest);
    return NULL;
  }

  // 3. Convert data back from UChar (UTF-16) to UTF-8

  // calculate utf8 string length (preflight with an empty buffer).
  // U_BUFFER_OVERFLOW_ERROR is the expected outcome for non-empty text;
  // empty text yields only a warning, which must not be treated as failure
  u_strToUTF8(NULL, 0, &out_length, utf16_dest, utf16_dest_length, &status);
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
    printf("error in u_strToUTF8 1 %s\n", u_errorName(status));

    free(utf16_dest);
    return NULL;
  }

  // keep "out_length + 1" representable as int32_t for the capacity argument
  if (out_length < 0 || out_length == INT32_MAX) {
    printf("error in u_strToUTF8 1 result too long\n");

    free(utf16_dest);
    return NULL;
  }

  status = U_ZERO_ERROR;
  // one extra byte so that ICU can NUL-terminate the result
  utf8_dest = TRI_Allocate(zone, ((size_t) out_length + 1) * sizeof(char), false);
  if (utf8_dest == NULL) {
    printf("malloc error\n");

    free(utf16_dest);
    return NULL;
  }

  // convert to utf8
  u_strToUTF8(utf8_dest, out_length + 1, NULL, utf16_dest, utf16_dest_length, &status);
  if (U_FAILURE(status)) {
    printf("error in u_strToUTF8 2 %s\n", u_errorName(status));

    free(utf16_dest);
    TRI_Free(zone, utf8_dest);
    return NULL;
  }

  // out_length counts the UTF-8 bytes without the terminating NUL
  *outLength = (size_t) out_length;

  free(utf16_dest);

  return utf8_dest;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#endif
|
|
|
|
// Local Variables:
|
|
// mode: outline-minor
|
|
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
|
|
// End:
|
|
|