1
0
Fork 0

added utf8 normalization function (uses icu)

This commit is contained in:
a-brandt 2012-09-13 16:41:00 +02:00
parent 41bdeeecdd
commit 4dc64a546b
9 changed files with 463 additions and 1 deletions

View File

@ -160,6 +160,7 @@ UnitTests_basics_suite_SOURCES = \
UnitTests/Philadelphia/hashes-test.cpp \
UnitTests/Philadelphia/associative-pointer-test.cpp \
UnitTests/Philadelphia/string-buffer-test.cpp \
UnitTests/Philadelphia/string-utf8-normalize-test.cpp \
UnitTests/Philadelphia/string-utf8-test.cpp \
UnitTests/Philadelphia/string-test.cpp \
UnitTests/Philadelphia/vector-pointer-test.cpp \

View File

@ -0,0 +1,124 @@
////////////////////////////////////////////////////////////////////////////////
/// @brief test suite for the utf8 string normalization (NFC) function
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2012 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Jan Steemann
/// @author Copyright 2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
#include <boost/test/unit_test.hpp>
#include "BasicsC/utf8-helper.h"
#include "BasicsC/strings.h"
// -----------------------------------------------------------------------------
// --SECTION-- private macros
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// --SECTION-- private constants
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// --SECTION-- setup / tear-down
// -----------------------------------------------------------------------------
/// Fixture for the normalization suite: only announces setup and tear-down
/// through the Boost.Test message log.
struct CNormalizeStringTestSetup {
  CNormalizeStringTestSetup ();
  ~CNormalizeStringTestSetup ();
};

// emitted once when the fixture is constructed
inline CNormalizeStringTestSetup::CNormalizeStringTestSetup () {
  BOOST_TEST_MESSAGE("setup utf8 string normalize test");
}

// emitted once when the fixture is destroyed
inline CNormalizeStringTestSetup::~CNormalizeStringTestSetup () {
  BOOST_TEST_MESSAGE("tear-down utf8 string normalize test");
}
// -----------------------------------------------------------------------------
// --SECTION-- test suite
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @brief setup
////////////////////////////////////////////////////////////////////////////////
BOOST_FIXTURE_TEST_SUITE(CNormalizeStringTest, CNormalizeStringTestSetup)
#ifdef TRI_HAVE_ICU
////////////////////////////////////////////////////////////////////////////////
/// @brief test NFD to NFC
///
/// Feeds a decomposed UTF-8 byte sequence into TR_normalize_utf8_to_NFC and
/// expects the composed byte sequence, together with a matching output length.
////////////////////////////////////////////////////////////////////////////////

BOOST_AUTO_TEST_CASE (tst_1) {
  /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글z" */

  // expected result: precomposed characters (ü, й, Hangul syllables)
  static const unsigned char composed[] =
  { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
    ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
    0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
    0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
    's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
    '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
    0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
    0xED, 0x95, 0x9C,
    0xEA, 0xB8, 0x80, 'z', 0
  };

  // input: the same text with base character + combining mark / Hangul jamo
  static const unsigned char decomposed[] =
  { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
    ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
    0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
    0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
    's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
    '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
    0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
    0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
    0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, 'z', 0
  };

  size_t len = 0;
  char* result = TR_normalize_utf8_to_NFC(TRI_CORE_MEM_ZONE, (const char*) decomposed, strlen((const char*) decomposed), &len);

  // the normalizer returns a null pointer on failure; abort the test case
  // here instead of comparing against and freeing a null pointer
  BOOST_REQUIRE(result != 0);

  BOOST_CHECK_EQUAL((const char*) composed, (const char*) result);

  // the reported length must be the number of bytes in the result,
  // not counting the terminating NUL
  BOOST_CHECK_EQUAL(strlen((const char*) composed), len);

  TRI_FreeString(TRI_CORE_MEM_ZONE, result);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief generate tests
////////////////////////////////////////////////////////////////////////////////
#endif
BOOST_AUTO_TEST_SUITE_END ()
// Local Variables:
// mode: outline-minor
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
// End:

View File

@ -1608,6 +1608,22 @@ static void* UnwrapGeneralCursor (v8::Handle<v8::Object> cursorObject) {
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief test helper: converts the first argument with TRI_Utf8ValueNFC and
/// returns the resulting string
///
/// Returns null if the converted value has length 0.
////////////////////////////////////////////////////////////////////////////////

static v8::Handle<v8::Value> JS_test_normalizer (v8::Arguments const& argv) {
  v8::HandleScope scope;

  TRI_Utf8ValueNFC normalized(TRI_UNKNOWN_MEM_ZONE, argv[0]);

  if (normalized.length() != 0) {
    // hand the converted bytes back to V8 as a new string
    return scope.Close(v8::String::New(*normalized, normalized.length()));
  }

  // nothing was produced
  return scope.Close(v8::Null());
}
////////////////////////////////////////////////////////////////////////////////
/// @brief generates a general cursor from a list
////////////////////////////////////////////////////////////////////////////////
@ -5608,6 +5624,9 @@ TRI_v8_global_t* TRI_InitV8VocBridge (v8::Handle<v8::Context> context, TRI_vocba
v8::FunctionTemplate::New(JS_CreateCursor)->GetFunction(),
v8::ReadOnly);
context->Global()->Set(v8::String::New("TEST_NORMALIZER"),
v8::FunctionTemplate::New(JS_test_normalizer)->GetFunction(),
v8::ReadOnly);
// .............................................................................
// create the global variables
// .............................................................................

View File

@ -27,6 +27,8 @@
#include "strings.h"
#include "utf8-helper.h"
// -----------------------------------------------------------------------------
// --SECTION-- private functions
// -----------------------------------------------------------------------------
@ -1144,7 +1146,12 @@ char* TRI_UnescapeUtf8StringZ (TRI_memory_zone_t* zone, char const* in, size_t i
char * qtr;
char const * ptr;
char const * end;
#ifdef TRI_HAVE_ICU
char * utf8_nfc;
size_t tmpLength = 0;
#endif
buffer = TRI_Allocate(zone, inLength + 1, false);
if (buffer == NULL) {
@ -1240,6 +1247,14 @@ char* TRI_UnescapeUtf8StringZ (TRI_memory_zone_t* zone, char const* in, size_t i
*qtr = '\0';
*outLength = qtr - buffer;
#ifdef TRI_HAVE_ICU
utf8_nfc = TR_normalize_utf8_to_NFC(zone, buffer, *outLength, &tmpLength);
if (utf8_nfc) {
*outLength = tmpLength;
TRI_Free(zone, buffer);
return utf8_nfc;
}
#endif
// we might have wasted some space if the unescaped string is shorter than the
// escaped one. this is the case if the string contained escaped characters

172
lib/BasicsC/utf8-helper.c Normal file
View File

@ -0,0 +1,172 @@
////////////////////////////////////////////////////////////////////////////////
/// @brief utf8 helper functions
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2004-2012 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
/// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
#include "utf8-helper.h"
#ifdef TRI_HAVE_ICU
#include "unicode/ustring.h"
#include "unicode/unorm2.h"
// -----------------------------------------------------------------------------
// --SECTION-- public functions
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Helper functions
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize an utf8 string (NFC)
///
/// Converts the UTF-8 input to UTF-16 and passes it on to
/// TR_normalize_utf16_to_NFC, which normalizes it and converts it back to
/// UTF-8.
///
/// @param zone       memory zone the result is allocated from
/// @param utf8       UTF-8 input, inLength bytes long
/// @param inLength   length of the input in bytes
/// @param outLength  receives the length of the result in bytes; 0 on failure
///
/// @return the normalized string, or 0 if the conversion or normalization
/// failed. The result is allocated in zone and owned by the caller.
////////////////////////////////////////////////////////////////////////////////

char * TR_normalize_utf8_to_NFC (TRI_memory_zone_t* zone, const char* utf8, size_t inLength, size_t* outLength) {
  UErrorCode status = U_ZERO_ERROR;
  UChar* utf16 = NULL;
  int32_t utf16_length = 0;
  char * utf8_dest = NULL;

  *outLength = 0;

  // ICU takes lengths as int32_t: refuse input that would be truncated (the
  // UTF-16 form never has more code units than the UTF-8 form has bytes, so
  // this also keeps utf16_length + 1 from overflowing)
  if (inLength >= (size_t) INT32_MAX) {
    printf("error in u_strFromUTF8 1: input too long\n");
    return 0;
  }

  // 1. convert utf8 string to utf16

  // calculate utf16 string length (preflight call without a buffer)
  u_strFromUTF8(NULL, 0, &utf16_length, utf8, (int32_t) inLength, &status);

  // a non-empty result is reported as U_BUFFER_OVERFLOW_ERROR here; an empty
  // input only produces a warning, which is not a failure
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
    printf("error in u_strFromUTF8 1: %s\n", u_errorName(status));
    return 0;
  }

  status = U_ZERO_ERROR;
  utf16 = (UChar *) malloc(((size_t) utf16_length + 1) * sizeof(UChar));

  if (utf16 == NULL) {
    printf("malloc error\n");
    return 0;
  }

  // now convert
  u_strFromUTF8(utf16, utf16_length + 1, NULL, utf8, (int32_t) inLength, &status);

  if (U_FAILURE(status)) {
    printf("error in u_strFromUTF8 2: %s\n", u_errorName(status));
    free(utf16);
    return 0;
  }

  // continue in TR_normalize_utf16_to_NFC
  utf8_dest = TR_normalize_utf16_to_NFC(zone, utf16, (size_t) utf16_length, outLength);

  free(utf16);

  return utf8_dest;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize an utf16 string (NFC) and export it to utf8
///
/// @param zone       memory zone the result is allocated from
/// @param utf16      UTF-16 input, inLength code units long
/// @param inLength   length of the input in UTF-16 code units
/// @param outLength  receives the length of the result in bytes (without the
///                   terminating NUL); 0 on failure
///
/// @return the normalized, NUL-terminated UTF-8 string, or 0 on failure. The
/// result is allocated in zone and owned by the caller.
////////////////////////////////////////////////////////////////////////////////

char * TR_normalize_utf16_to_NFC (TRI_memory_zone_t* zone, const uint16_t* utf16, size_t inLength, size_t* outLength) {
  UErrorCode status = U_ZERO_ERROR;
  UChar * utf16_dest = NULL;
  int32_t utf16_dest_capacity = 0;
  int32_t utf16_dest_length = 0;
  char * utf8_dest = NULL;
  int32_t out_length = 0;
  const UNormalizer2 * norm2 = NULL;

  *outLength = 0;

  // ICU takes lengths as int32_t: refuse input that would be truncated
  if (inLength >= (size_t) INT32_MAX) {
    printf("error in unorm2_normalize: input too long\n");
    return 0;
  }

  norm2 = unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, &status);

  if (U_FAILURE(status)) {
    printf("error in unorm2_getInstance: %s\n", u_errorName(status));
    return 0;
  }

  // 2. normalize UChar (UTF-16)

  // first guess: the result is not longer than the input
  utf16_dest_capacity = (int32_t) inLength + 1;
  utf16_dest = (UChar *) malloc(((size_t) utf16_dest_capacity) * sizeof(UChar));

  if (utf16_dest == NULL) {
    printf("malloc error\n");
    return 0;
  }

  utf16_dest_length = unorm2_normalize(norm2, (const UChar*) utf16, (int32_t) inLength, utf16_dest, utf16_dest_capacity, &status);

  if (status == U_BUFFER_OVERFLOW_ERROR) {
    // the NFC form can be longer than the input (some characters decompose
    // and are excluded from recomposition); retry with the size ICU reported
    free(utf16_dest);
    utf16_dest = NULL;

    if (utf16_dest_length < 0 || utf16_dest_length == INT32_MAX) {
      printf("error in unorm2_normalize: %s\n", u_errorName(status));
      return 0;
    }

    utf16_dest_capacity = utf16_dest_length + 1;
    utf16_dest = (UChar *) malloc(((size_t) utf16_dest_capacity) * sizeof(UChar));

    if (utf16_dest == NULL) {
      printf("malloc error\n");
      return 0;
    }

    status = U_ZERO_ERROR;
    utf16_dest_length = unorm2_normalize(norm2, (const UChar*) utf16, (int32_t) inLength, utf16_dest, utf16_dest_capacity, &status);
  }

  if (U_FAILURE(status)) {
    printf("error in unorm2_normalize: %s\n", u_errorName(status));
    free(utf16_dest);
    return 0;
  }

  // 3. Convert data back from UChar (UTF-16) to UTF-8

  // calculate utf8 string length (preflight call without a buffer); exactly
  // utf16_dest_length code units are converted, so out_length is the number
  // of payload bytes and does not depend on a terminator in utf16_dest
  u_strToUTF8(NULL, 0, &out_length, utf16_dest, utf16_dest_length, &status);

  // a non-empty result is reported as U_BUFFER_OVERFLOW_ERROR here; an empty
  // result only produces a warning, which is not a failure
  if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
    printf("error in u_strToUTF8 1 %s\n", u_errorName(status));
    free(utf16_dest);
    return 0;
  }

  status = U_ZERO_ERROR;

  // one extra byte so that u_strToUTF8 can NUL-terminate the result
  utf8_dest = (char *) TRI_Allocate(zone, ((size_t) out_length + 1) * sizeof(char), false);

  if (utf8_dest == NULL) {
    printf("malloc error\n");
    free(utf16_dest);
    return 0;
  }

  // convert to utf8
  u_strToUTF8(utf8_dest, out_length + 1, NULL, utf16_dest, utf16_dest_length, &status);

  if (U_FAILURE(status)) {
    printf("error in u_strToUTF8 2 %s\n", u_errorName(status));
    free(utf16_dest);
    TRI_Free(zone, utf8_dest);
    return 0;
  }

  *outLength = (size_t) out_length;

  free(utf16_dest);

  return utf8_dest;
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
#endif
// Local Variables:
// mode: outline-minor
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
// End:

75
lib/BasicsC/utf8-helper.h Normal file
View File

@ -0,0 +1,75 @@
////////////////////////////////////////////////////////////////////////////////
/// @brief utf8 helper functions
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2004-2012 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
/// @author Copyright 2011-2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
#ifndef TRIAGENS_BASICS_C_UTF8_HELPER_H
#define TRIAGENS_BASICS_C_UTF8_HELPER_H 1
#include "BasicsC/common.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef TRI_HAVE_ICU
// -----------------------------------------------------------------------------
// --SECTION-- public functions
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Helper functions
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize an utf8 string (NFC)
///
/// @param zone       memory zone the result is allocated from
/// @param utf8       UTF-8 input, inLength bytes long
/// @param inLength   length of the input in bytes
/// @param outLength  receives the length of the result in bytes; set to 0 if
///                   the function fails
///
/// @return the normalized string, or 0 on failure. The result is allocated in
/// zone; callers release it with TRI_Free / TRI_FreeString on the same zone.
////////////////////////////////////////////////////////////////////////////////
char * TR_normalize_utf8_to_NFC (TRI_memory_zone_t* zone, const char* utf8, size_t inLength, size_t* outLength);
////////////////////////////////////////////////////////////////////////////////
/// @brief normalize an utf16 string (NFC) and export it to utf8
///
/// @param zone       memory zone the result is allocated from
/// @param utf16      UTF-16 input, inLength code units long
/// @param inLength   length of the input in UTF-16 code units
/// @param outLength  receives the length of the UTF-8 result in bytes; set to
///                   0 if the function fails
///
/// @return the normalized UTF-8 string, or 0 on failure. The result is
/// allocated in zone; callers release it with TRI_Free on the same zone.
////////////////////////////////////////////////////////////////////////////////
char * TR_normalize_utf16_to_NFC (TRI_memory_zone_t* zone, const uint16_t* utf16, size_t inLength, size_t* outLength);
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
#endif
#ifdef __cplusplus
}
#endif
#endif
// Local Variables:
// mode: outline-minor
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
// End:

View File

@ -53,6 +53,7 @@ lib_libarango_a_SOURCES = \
lib/BasicsC/terminal-utils-posix.c \
lib/BasicsC/terminal-utils.c \
lib/BasicsC/threads-posix.c \
lib/BasicsC/utf8-helper.c \
lib/BasicsC/vector.c \
lib/BasicsC/voc-errors.c \
lib/JsonParser/json-parser.c \

View File

@ -39,6 +39,7 @@
#include "BasicsC/process-utils.h"
#include "BasicsC/string-buffer.h"
#include "BasicsC/strings.h"
#include "BasicsC/utf8-helper.h"
#include "Rest/SslInterface.h"
#include "V8/v8-conv.h"
@ -1559,6 +1560,34 @@ void TRI_InitV8Utils (v8::Handle<v8::Context> context, string const& path) {
context->Global()->Set(v8::String::New("MODULES_PATH"), modulesPaths);
}
#ifdef TRI_HAVE_ICU
// ICU build: fetch the value as UTF-16 from V8 and let the helper produce
// the NFC-normalized UTF-8 copy, allocated in the given memory zone.
TRI_Utf8ValueNFC::TRI_Utf8ValueNFC(TRI_memory_zone_t* memoryZone, v8::Handle<v8::Value> obj) :
  _str(0), _length(0), _memoryZone(memoryZone) {
  v8::String::Value utf16Value(obj);
  size_t const numUnits = utf16Value.length();

  if (numUnits == 0) {
    // nothing to normalize: _str stays 0 and _length stays 0
    return;
  }

  _str = TR_normalize_utf16_to_NFC(_memoryZone, *utf16Value, numUnits, &_length);
}
// ICU build: the normalized copy belongs to this object; give it back to
// the zone it was allocated from.
TRI_Utf8ValueNFC::~TRI_Utf8ValueNFC() {
  if (_str == 0) {
    // no string was produced, nothing to free
    return;
  }

  TRI_Free(_memoryZone, _str);
}
#else
// Build without ICU: no normalization is performed; the object exposes the
// UTF-8 conversion held by the _utf8Value member.
TRI_Utf8ValueNFC::TRI_Utf8ValueNFC(TRI_memory_zone_t* memoryZone, v8::Handle<v8::Value> obj)
  : _str(0),
    _length(0),
    _memoryZone(memoryZone),
    _utf8Value(obj) {
  // assigned in the body, after _utf8Value has been constructed
  _length = _utf8Value.length();
  _str = *_utf8Value;
}
// Build without ICU: nothing is freed here. In this build _str is taken from
// the _utf8Value member rather than allocated by this class.
TRI_Utf8ValueNFC::~TRI_Utf8ValueNFC() {
}
#endif
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////

View File

@ -150,6 +150,32 @@ v8::Handle<v8::Object> TRI_CreateErrorObject (int errorNumber, std::string const
void TRI_InitV8Utils (v8::Handle<v8::Context>, std::string const&);
////////////////////////////////////////////////////////////////////////////////
/// @brief converts a V8 value to a UTF-8 encoded character array; the result
/// is NFC-normalized when the build has ICU support (TRI_HAVE_ICU)
////////////////////////////////////////////////////////////////////////////////

class TRI_Utf8ValueNFC {
  public:
    TRI_Utf8ValueNFC(TRI_memory_zone_t* memoryZone, v8::Handle<v8::Value> obj);
    ~TRI_Utf8ValueNFC();

    /// @brief converted characters
    char* operator*() {
      return _str;
    }

    /// @brief converted characters (read-only access)
    const char* operator*() const {
      return _str;
    }

    /// @brief length of the converted string
    size_t length() const {
      return _length;
    }

  private:
    // copying and assigning are disallowed
    TRI_Utf8ValueNFC(const TRI_Utf8ValueNFC&);
    void operator=(const TRI_Utf8ValueNFC&);

    char* _str;
    size_t _length;
    TRI_memory_zone_t* _memoryZone;

#ifndef TRI_HAVE_ICU
    // without ICU the V8 conversion object is kept as a member
    v8::String::Utf8Value _utf8Value;
#endif
};
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////