1
0
Fork 0
arangodb/lib/BasicsC/utf8-helper.c

173 lines
5.2 KiB
C

////////////////////////////////////////////////////////////////////////////////
/// @brief utf8 helper functions
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2004-2012 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
/// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
#include "utf8-helper.h"
#ifdef TRI_HAVE_ICU
#include "unicode/ustring.h"
#include "unicode/unorm2.h"
// -----------------------------------------------------------------------------
// --SECTION-- public functions
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Helper functions
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize a UTF-8 string (NFC)
///
/// The input is converted to UTF-16 with ICU and then handed to
/// TR_normalize_utf16_to_NFC, which performs the normalization and the
/// conversion back to UTF-8.
///
/// @param zone       memory zone passed through to TR_normalize_utf16_to_NFC
/// @param utf8       UTF-8 input
/// @param inLength   length of the input in bytes
/// @param outLength  set to 0 on entry; on success it holds the length
///                   reported by TR_normalize_utf16_to_NFC
///
/// @return the string returned by TR_normalize_utf16_to_NFC, or NULL if the
///         input is too long for ICU, cannot be converted, or memory runs out
////////////////////////////////////////////////////////////////////////////////
char * TR_normalize_utf8_to_NFC (TRI_memory_zone_t* zone, const char* utf8, size_t inLength, size_t* outLength) {
  UErrorCode status = U_ZERO_ERROR;
  UChar* utf16 = NULL;
  int32_t utf16_length = 0;
  char * utf8_dest = NULL;

  *outLength = 0;

  // ICU lengths are int32_t: a larger size_t would be truncated, and a value
  // that wraps to -1 would be interpreted by ICU as "NUL-terminated input"
  if (inLength > (size_t) INT32_MAX) {
    printf("error in u_strFromUTF8 1: %s\n", u_errorName(U_ILLEGAL_ARGUMENT_ERROR));
    return NULL;
  }

  // 1. convert utf8 string to utf16

  // calculate utf16 string length (preflight call without a destination)
  u_strFromUTF8(NULL, 0, &utf16_length, utf8, (int32_t) inLength, &status);

  // a non-empty result is reported as U_BUFFER_OVERFLOW_ERROR; an empty result
  // only produces a warning, so anything else that is a failure is a real error
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
    printf("error in u_strFromUTF8 1: %s\n", u_errorName(status));
    return NULL;
  }
  status = U_ZERO_ERROR;

  // one extra UChar so that ICU can NUL-terminate the buffer
  utf16 = (UChar *) malloc(((size_t) utf16_length + 1) * sizeof(UChar));
  if (utf16 == NULL) {
    printf("malloc error\n");
    return NULL;
  }

  // now convert
  u_strFromUTF8(utf16, utf16_length + 1, NULL, utf8, (int32_t) inLength, &status);
  if (U_FAILURE(status)) {
    printf("error in u_strFromUTF8 2: %s\n", u_errorName(status));
    free(utf16);
    return NULL;
  }

  // continue in TR_normalize_utf16_to_NFC
  utf8_dest = TR_normalize_utf16_to_NFC(zone, utf16, (size_t) utf16_length, outLength);

  // the temporary UTF-16 buffer is owned by this function
  free(utf16);

  return utf8_dest;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize a UTF-16 string (NFC) and return the result as UTF-8
///
/// @param zone       memory zone the returned buffer is allocated from
/// @param utf16      UTF-16 input
/// @param inLength   length of the input in UTF-16 code units
/// @param outLength  set to 0 on entry; on success it holds the length of the
///                   returned string in bytes, not counting the terminator
///
/// @return a NUL-terminated UTF-8 string allocated with TRI_Allocate from
///         zone (to be released with TRI_Free), or NULL on error
////////////////////////////////////////////////////////////////////////////////
char * TR_normalize_utf16_to_NFC (TRI_memory_zone_t* zone, const uint16_t* utf16, size_t inLength, size_t* outLength) {
  UErrorCode status = U_ZERO_ERROR;
  UChar * utf16_dest = NULL;
  int32_t utf16_dest_length = 0;
  int32_t utf16_dest_capacity = 0;
  char * utf8_dest = NULL;
  int32_t out_length = 0;
  const UNormalizer2 * norm2 = unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, &status);

  *outLength = 0;

  if (U_FAILURE(status)) {
    printf("error in unorm2_getInstance: %s\n", u_errorName(status));
    return NULL;
  }

  // ICU lengths are int32_t; reject input whose length (plus terminator)
  // cannot be represented instead of silently truncating it
  if (inLength >= (size_t) INT32_MAX) {
    printf("error in unorm2_normalize: %s\n", u_errorName(U_ILLEGAL_ARGUMENT_ERROR));
    return NULL;
  }

  // 2. normalize UChar (UTF-16)

  // first guess: the result is no longer than the input
  utf16_dest_capacity = (int32_t) inLength + 1;
  utf16_dest = (UChar *) malloc((size_t) utf16_dest_capacity * sizeof(UChar));
  if (utf16_dest == NULL) {
    printf("malloc error\n");
    return NULL;
  }

  utf16_dest_length = unorm2_normalize(norm2, (const UChar*) utf16, (int32_t) inLength, utf16_dest, utf16_dest_capacity, &status);

  if (status == U_BUFFER_OVERFLOW_ERROR) {
    // NFC can make a string longer (e.g. composition-excluded characters are
    // decomposed); ICU returned the required length, so grow and try again
    UChar * bigger;

    utf16_dest_capacity = utf16_dest_length + 1;
    bigger = (UChar *) realloc(utf16_dest, (size_t) utf16_dest_capacity * sizeof(UChar));
    if (bigger == NULL) {
      printf("malloc error\n");
      free(utf16_dest);
      return NULL;
    }
    utf16_dest = bigger;

    status = U_ZERO_ERROR;
    utf16_dest_length = unorm2_normalize(norm2, (const UChar*) utf16, (int32_t) inLength, utf16_dest, utf16_dest_capacity, &status);
  }

  // warnings (such as a missing NUL terminator) are harmless here because all
  // following calls receive the explicit length
  if (U_FAILURE(status)) {
    printf("error in unorm2_normalize: %s\n", u_errorName(status));
    free(utf16_dest);
    return NULL;
  }

  // 3. Convert data back from UChar (UTF-16) to UTF-8

  // calculate utf8 string length (preflight call without a destination)
  status = U_ZERO_ERROR;
  u_strToUTF8(NULL, 0, &out_length, utf16_dest, utf16_dest_length, &status);

  // a non-empty result is reported as U_BUFFER_OVERFLOW_ERROR; an empty result
  // only produces a warning, so anything else that is a failure is a real error
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
    printf("error in u_strToUTF8 1 %s\n", u_errorName(status));
    free(utf16_dest);
    return NULL;
  }
  status = U_ZERO_ERROR;

  // one extra byte so that ICU can NUL-terminate the result
  utf8_dest = TRI_Allocate(zone, ((size_t) out_length + 1) * sizeof(char), false);
  if (utf8_dest == NULL) {
    printf("malloc error\n");
    free(utf16_dest);
    return NULL;
  }

  // convert to utf8
  u_strToUTF8(utf8_dest, out_length + 1, NULL, utf16_dest, utf16_dest_length, &status);
  if (U_FAILURE(status)) {
    printf("error in u_strToUTF8 2 %s\n", u_errorName(status));
    free(utf16_dest);
    TRI_Free(zone, utf8_dest);
    return NULL;
  }

  // out_length counts the converted bytes only; the terminator is not included
  *outLength = (size_t) out_length;

  free(utf16_dest);

  return utf8_dest;
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
#endif
// Local Variables:
// mode: outline-minor
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
// End: