1
0
Fork 0
arangodb/lib/Basics/tri-strings.cpp

1106 lines
28 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2014-2016 ArangoDB GmbH, Cologne, Germany
/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is ArangoDB GmbH, Cologne, Germany
///
/// @author Dr. Frank Celler
////////////////////////////////////////////////////////////////////////////////
#include "tri-strings.h"
#include "Basics/conversions.h"
#include "Basics/Utf8Helper.h"
#include <openssl/sha.h>
////////////////////////////////////////////////////////////////////////////////
/// @brief hex values for all characters
////////////////////////////////////////////////////////////////////////////////
static char const HexValues[513] = {
"000102030405060708090a0b0c0d0e0f"
"101112131415161718191a1b1c1d1e1f"
"202122232425262728292a2b2c2d2e2f"
"303132333435363738393a3b3c3d3e3f"
"404142434445464748494a4b4c4d4e4f"
"505152535455565758595a5b5c5d5e5f"
"606162636465666768696a6b6c6d6e6f"
"707172737475767778797a7b7c7d7e7f"
"808182838485868788898a8b8c8d8e8f"
"909192939495969798999a9b9c9d9e9f"
"a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
"b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
"c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
"d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
"e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
"f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"};
////////////////////////////////////////////////////////////////////////////////
/// @brief integer values for all hex characters
////////////////////////////////////////////////////////////////////////////////
static uint8_t const HexDecodeLookup[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, // 0123456789
0, 0, 0, 0, 0, 0, 0, // :;<=>?@
10, 11, 12, 13, 14, 15, // ABCDEF
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // GHIJKLMNOPQRS
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // TUVWXYZ[/]^_`
10, 11, 12, 13, 14, 15, // abcdef
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes UTF-8 range U+0000 to U+007F
////////////////////////////////////////////////////////////////////////////////
static void EscapeUtf8Range0000T007F(char** dst, char const** src) {
uint8_t c;
uint16_t i1;
uint16_t i2;
c = (uint8_t) * (*src);
i1 = (((uint16_t)c) & 0xF0) >> 4;
i2 = (((uint16_t)c) & 0x0F);
*(*dst)++ = '\\';
*(*dst)++ = 'u';
*(*dst)++ = '0';
*(*dst)++ = '0';
*(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10);
*(*dst) = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes UTF-8 range U+0080 to U+07FF
////////////////////////////////////////////////////////////////////////////////
static void EscapeUtf8Range0080T07FF(char** dst, char const** src) {
uint8_t c;
uint8_t d;
c = (uint8_t) * ((*src) + 0);
d = (uint8_t) * ((*src) + 1);
// correct UTF-8
if ((d & 0xC0) == 0x80) {
uint16_t n;
uint16_t i1;
uint16_t i2;
uint16_t i3;
uint16_t i4;
n = ((c & 0x1F) << 6) | (d & 0x3F);
TRI_ASSERT(n >= 128);
i1 = (n & 0xF000) >> 12;
i2 = (n & 0x0F00) >> 8;
i3 = (n & 0x00F0) >> 4;
i4 = (n & 0x000F);
*(*dst)++ = '\\';
*(*dst)++ = 'u';
*(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10);
*(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10);
*(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10);
*(*dst) = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10);
(*src) += 1;
}
// corrupted UTF-8
else {
*(*dst) = *(*src);
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes UTF-8 range U+0800 to U+D7FF and U+E000 to U+FFFF
////////////////////////////////////////////////////////////////////////////////
static void EscapeUtf8Range0800TFFFF(char** dst, char const** src) {
uint8_t c;
uint8_t d;
uint8_t e;
c = (uint8_t) * ((*src) + 0);
d = (uint8_t) * ((*src) + 1);
e = (uint8_t) * ((*src) + 2);
// correct UTF-8 (3-byte sequence UTF-8 1110xxxx 10xxxxxx)
if ((d & 0xC0) == 0x80 && (e & 0xC0) == 0x80) {
uint16_t n;
uint16_t i1;
uint16_t i2;
uint16_t i3;
uint16_t i4;
n = ((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F);
TRI_ASSERT(n >= 2048 && (n < 55296 || n > 57343));
i1 = (n & 0xF000) >> 12;
i2 = (n & 0x0F00) >> 8;
i3 = (n & 0x00F0) >> 4;
i4 = (n & 0x000F);
*(*dst)++ = '\\';
*(*dst)++ = 'u';
*(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10);
*(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10);
*(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10);
*(*dst) = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10);
(*src) += 2;
}
// corrupted UTF-8
else {
*(*dst) = *(*src);
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes UTF-8 range U+10000 to U+10FFFF
////////////////////////////////////////////////////////////////////////////////
static void EscapeUtf8Range10000T10FFFF(char** dst, char const** src) {
uint8_t c;
uint8_t d;
uint8_t e;
uint8_t f;
c = (uint8_t) * ((*src) + 0);
d = (uint8_t) * ((*src) + 1);
e = (uint8_t) * ((*src) + 2);
f = (uint8_t) * ((*src) + 3);
// correct UTF-8 (4-byte sequence UTF-8 1110xxxx 10xxxxxx 10xxxxxx)
if ((d & 0xC0) == 0x80 && (e & 0xC0) == 0x80 && (f & 0xC0) == 0x80) {
uint32_t n;
uint32_t s1;
uint32_t s2;
uint16_t i1;
uint16_t i2;
uint16_t i3;
uint16_t i4;
n = ((c & 0x0F) << 18) | ((d & 0x3F) << 12) | ((e & 0x3F) << 6) |
(f & 0x3F);
TRI_ASSERT(n >= 65536 && n <= 1114111);
// construct the surrogate pairs
n -= 0x10000;
s1 = ((n & 0xFFC00) >> 10) + 0xD800;
s2 = (n & 0x3FF) + 0xDC00;
// encode high surrogate
i1 = (s1 & 0xF000) >> 12;
i2 = (s1 & 0x0F00) >> 8;
i3 = (s1 & 0x00F0) >> 4;
i4 = (s1 & 0x000F);
*(*dst)++ = '\\';
*(*dst)++ = 'u';
*(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10);
*(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10);
*(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10);
*(*dst)++ = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10);
// encode low surrogate
i1 = (s2 & 0xF000) >> 12;
i2 = (s2 & 0x0F00) >> 8;
i3 = (s2 & 0x00F0) >> 4;
i4 = (s2 & 0x000F);
*(*dst)++ = '\\';
*(*dst)++ = 'u';
*(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10);
*(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10);
*(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10);
*(*dst) = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10);
// advance src
(*src) += 3;
}
// corrupted UTF-8
else {
*(*dst) = *(*src);
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief decodes a unicode escape sequence
////////////////////////////////////////////////////////////////////////////////
static void DecodeUnicodeEscape(char** dst, char const* src) {
int i1;
int i2;
int i3;
int i4;
uint16_t n;
i1 = TRI_IntHex(src[0], 0);
i2 = TRI_IntHex(src[1], 0);
i3 = TRI_IntHex(src[2], 0);
i4 = TRI_IntHex(src[3], 0);
n = ((i1 & 0xF) << 12) | ((i2 & 0xF) << 8) | ((i3 & 0xF) << 4) | (i4 & 0xF);
if (n <= 0x7F) {
*(*dst) = n & 0x7F;
} else if (n <= 0x7FF) {
*(*dst)++ = 0xC0 + (n >> 6);
*(*dst) = 0x80 + (n & 0x3F);
} else {
*(*dst)++ = 0xE0 + (n >> 12);
*(*dst)++ = 0x80 + ((n >> 6) & 0x3F);
*(*dst) = 0x80 + (n & 0x3F);
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief decodes a unicode surrogate pair
////////////////////////////////////////////////////////////////////////////////
static void DecodeSurrogatePair(char** dst, char const* src1,
char const* src2) {
int i1;
int i2;
int i3;
int i4;
uint32_t n1;
uint32_t n2;
uint32_t n;
i1 = TRI_IntHex(src1[0], 0);
i2 = TRI_IntHex(src1[1], 0);
i3 = TRI_IntHex(src1[2], 0);
i4 = TRI_IntHex(src1[3], 0);
n1 = ((i1 & 0xF) << 12) | ((i2 & 0xF) << 8) | ((i3 & 0xF) << 4) | (i4 & 0xF);
n1 -= 0xD800;
i1 = TRI_IntHex(src2[0], 0);
i2 = TRI_IntHex(src2[1], 0);
i3 = TRI_IntHex(src2[2], 0);
i4 = TRI_IntHex(src2[3], 0);
n2 = ((i1 & 0xF) << 12) | ((i2 & 0xF) << 8) | ((i3 & 0xF) << 4) | (i4 & 0xF);
n2 -= 0xDC00;
n = 0x10000 + ((n1 << 10) | n2);
if (n <= 0x7F) {
*(*dst) = n & 0x7F;
} else if (n <= 0x7FF) {
*(*dst)++ = 0xC0 + (n >> 6);
*(*dst) = 0x80 + (n & 0x3F);
} else if (n <= 0xFFFF) {
*(*dst)++ = 0xE0 + (n >> 12);
*(*dst)++ = 0x80 + ((n >> 6) & 0x3F);
*(*dst) = 0x80 + (n & 0x3F);
} else {
*(*dst)++ = 0xF0 + (n >> 18);
*(*dst)++ = 0x80 + ((n >> 12) & 0x3F);
*(*dst)++ = 0x80 + ((n >> 6) & 0x3F);
*(*dst) = 0x80 + (n & 0x3F);
}
}
////////////////////////////////////////////////////////////////////////////////
/// @brief convert a string to lower case
////////////////////////////////////////////////////////////////////////////////
char* TRI_LowerAsciiString(char const* value) {
size_t length;
char* buffer;
char* p;
char* out;
char c;
if (value == nullptr) {
return nullptr;
}
length = strlen(value);
buffer = static_cast<char*>(
TRI_Allocate((sizeof(char) * length) + 1));
if (buffer == nullptr) {
return nullptr;
}
p = (char*)value;
out = buffer;
while ((c = *p++)) {
if (c >= 'A' && c <= 'Z') {
*out++ = c + 32;
} else {
*out++ = c;
}
}
*out = '\0';
return buffer;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief convert a string to upper case
////////////////////////////////////////////////////////////////////////////////
char* TRI_UpperAsciiString(char const* value) {
size_t length;
char* buffer;
char* p;
char* out;
char c;
if (value == nullptr) {
return nullptr;
}
length = strlen(value);
buffer = static_cast<char*>(
TRI_Allocate((sizeof(char) * length) + 1));
if (buffer == nullptr) {
return nullptr;
}
p = (char*)value;
out = buffer;
while ((c = *p++)) {
if (c >= 'a' && c <= 'z') {
*out++ = c - 32;
} else {
*out++ = c;
}
}
*out = '\0';
return buffer;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if strings are equal
////////////////////////////////////////////////////////////////////////////////
bool TRI_EqualString(char const* left, char const* right) {
return strcmp(left, right) == 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if strings are equal
////////////////////////////////////////////////////////////////////////////////
bool TRI_EqualString(char const* left, char const* right, size_t n) {
return strncmp(left, right, n) == 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if ASCII strings are equal ignoring case
////////////////////////////////////////////////////////////////////////////////
bool TRI_CaseEqualString(char const* left, char const* right) {
return strcasecmp(left, right) == 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if ASCII strings are equal ignoring case
////////////////////////////////////////////////////////////////////////////////
bool TRI_CaseEqualString(char const* left, char const* right, size_t n) {
return strncasecmp(left, right, n) == 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if second string is prefix of the first
////////////////////////////////////////////////////////////////////////////////
bool TRI_IsPrefixString(char const* full, char const* prefix) {
return strncmp(full, prefix, strlen(prefix)) == 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if second string is contained in the first
////////////////////////////////////////////////////////////////////////////////
bool TRI_IsContainedString(char const* full, char const* part) {
return strstr(full, part) != nullptr;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief tests if second string is contained in the first, byte-safe
////////////////////////////////////////////////////////////////////////////////
char* TRI_IsContainedMemory(char const* full, size_t fullLength,
char const* part, size_t partLength) {
if (fullLength == 0 || partLength == 0 || fullLength < partLength) {
return nullptr;
}
if (partLength == 1) {
return static_cast<char*>(const_cast<void*>(
memchr(static_cast<void const*>(full), (int)*part, fullLength)));
}
char const* end = full + fullLength - partLength;
for (char const* p = full; p <= end; ++p) {
if (*p == *part &&
memcmp(static_cast<void const*>(p), static_cast<void const*>(part),
partLength) == 0) {
return const_cast<char*>(p);
}
}
return nullptr;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief duplicates a string
////////////////////////////////////////////////////////////////////////////////
char* TRI_DuplicateString(char const* value) {
size_t n = strlen(value) + 1;
char* result = static_cast<char*>(TRI_Allocate(n));
if (result != nullptr) {
memcpy(result, value, n);
}
return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief duplicates a string of given length
////////////////////////////////////////////////////////////////////////////////
char* TRI_DuplicateString(char const* value,
size_t length) {
char* result = static_cast<char*>(TRI_Allocate(length + 1));
if (result != nullptr) {
memcpy(result, value, length);
result[length] = '\0';
}
return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief copies a string
////////////////////////////////////////////////////////////////////////////////
void TRI_CopyString(char* dst, char const* src, size_t length) {
*dst = '\0';
strncat(dst, src, length);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief concatenate three strings using a memory zone
////////////////////////////////////////////////////////////////////////////////
char* TRI_Concatenate3String(char const* a,
char const* b, char const* c) {
size_t na = strlen(a);
size_t nb = strlen(b);
size_t nc = strlen(c);
char* result =
static_cast<char*>(TRI_Allocate(na + nb + nc + 1));
if (result != nullptr) {
memcpy(result, a, na);
memcpy(result + na, b, nb);
memcpy(result + na + nb, c, nc);
result[na + nb + nc] = '\0';
}
return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief frees a string
////////////////////////////////////////////////////////////////////////////////
void TRI_FreeString(char* value) {
TRI_Free(value);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converts into hex representation
////////////////////////////////////////////////////////////////////////////////
char* TRI_EncodeHexString(char const* source, size_t sourceLen,
size_t* dstLen) {
char* result;
uint16_t* hex;
uint16_t* dst;
uint8_t* src;
size_t j;
*dstLen = (sourceLen * 2);
dst = static_cast<uint16_t*>(
TRI_Allocate((*dstLen) + 1));
if (dst == nullptr) {
return nullptr;
}
result = (char*)dst;
hex = (uint16_t*)HexValues;
src = (uint8_t*)source;
for (j = 0; j < sourceLen; j++) {
*dst = hex[*src];
dst++;
src++;
}
*((char*)dst) = 0; // terminate the string
return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converts from hex representation
////////////////////////////////////////////////////////////////////////////////
char* TRI_DecodeHexString(char const* source, size_t sourceLen,
size_t* dstLen) {
char* result;
uint8_t* dst;
uint8_t* src;
size_t j;
*dstLen = (sourceLen / 2);
dst = static_cast<uint8_t*>(
TRI_Allocate((*dstLen) + 1));
if (dst == nullptr) {
return nullptr;
}
result = (char*)dst;
src = (uint8_t*)source;
for (j = 0; j < sourceLen; j += 2) {
uint8_t d;
d = HexDecodeLookup[*src++] << 4;
d |= HexDecodeLookup[*src++];
*dst++ = d;
}
*dst = 0; // terminate the string
return result;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief sha256 of a string
////////////////////////////////////////////////////////////////////////////////
char* TRI_SHA256String(char const* source, size_t sourceLen, size_t* dstLen) {
unsigned char* dst = static_cast<unsigned char*>(
TRI_Allocate(SHA256_DIGEST_LENGTH));
if (dst == nullptr) {
return nullptr;
}
*dstLen = SHA256_DIGEST_LENGTH;
SHA256((unsigned char const*)source, sourceLen, dst);
return (char*)dst;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes special characters using C escapes
////////////////////////////////////////////////////////////////////////////////
char* TRI_EscapeControlsCString(char const* in,
size_t inLength, size_t* outLength,
bool appendNewline) {
char* buffer = static_cast<char*>(
TRI_Allocate(TRI_MaxLengthEscapeControlsCString(inLength)));
return TRI_EscapeControlsCString(in, inLength, buffer, outLength, appendNewline);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes special characters using C escapes
/// the target buffer must have been allocated already and big enough to hold
/// the result of at most (4 * inLength) + 2 bytes!
////////////////////////////////////////////////////////////////////////////////
char* TRI_EscapeControlsCString(char const* in, size_t inLength,
char* out, size_t* outLength,
bool appendNewline) {
if (out == nullptr) {
return nullptr;
}
char* qtr = out;
char const* ptr;
char const* end;
for (ptr = in, end = ptr + inLength; ptr < end; ptr++, qtr++) {
uint8_t n;
switch (*ptr) {
case '\n':
*qtr++ = '\\';
*qtr = 'n';
break;
case '\r':
*qtr++ = '\\';
*qtr = 'r';
break;
case '\t':
*qtr++ = '\\';
*qtr = 't';
break;
default:
n = (uint8_t)(*ptr);
if (n < 32) {
uint8_t n1 = n >> 4;
uint8_t n2 = n & 0x0F;
*qtr++ = '\\';
*qtr++ = 'x';
*qtr++ = (n1 < 10) ? ('0' + n1) : ('A' + n1 - 10);
*qtr = (n2 < 10) ? ('0' + n2) : ('A' + n2 - 10);
} else {
*qtr = *ptr;
}
break;
}
}
if (appendNewline) {
*qtr++ = '\n';
}
*qtr = '\0';
*outLength = static_cast<size_t>(qtr - out);
return out;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief escapes special characters using unicode escapes
////////////////////////////////////////////////////////////////////////////////
char* TRI_EscapeUtf8String(char const* in,
size_t inLength, bool escapeSlash, size_t* outLength,
bool compactResult) {
char* buffer = (char*)TRI_Allocate(6 * inLength + 1);
if (buffer == nullptr) {
return nullptr;
}
char* qtr = buffer;
char const* ptr;
char const* end;
for (ptr = in, end = ptr + inLength; ptr < end; ++ptr, ++qtr) {
switch (*ptr) {
case '/':
if (escapeSlash) {
*qtr++ = '\\';
}
*qtr = *ptr;
break;
case '\\':
case '"':
*qtr++ = '\\';
*qtr = *ptr;
break;
case '\b':
*qtr++ = '\\';
*qtr = 'b';
break;
case '\f':
*qtr++ = '\\';
*qtr = 'f';
break;
case '\n':
*qtr++ = '\\';
*qtr = 'n';
break;
case '\r':
*qtr++ = '\\';
*qtr = 'r';
break;
case '\t':
*qtr++ = '\\';
*qtr = 't';
break;
case '\0':
*qtr++ = '\\';
*qtr++ = 'u';
*qtr++ = '0';
*qtr++ = '0';
*qtr++ = '0';
*qtr = '0';
break;
default: {
uint8_t c;
// next character as unsigned char
c = (uint8_t)*ptr;
// character is in the normal latin1 range
if ((c & 0x80) == 0) {
// special character, escape
if (c < 32) {
EscapeUtf8Range0000T007F(&qtr, &ptr);
}
// normal latin1
else {
*qtr = *ptr;
}
}
// unicode range 0080 - 07ff (2-byte sequence UTF-8)
else if ((c & 0xE0) == 0xC0) {
// hopefully correct UTF-8
if (ptr + 1 < end) {
EscapeUtf8Range0080T07FF(&qtr, &ptr);
}
// corrupted UTF-8
else {
*qtr = *ptr;
}
}
// unicode range 0800 - ffff (3-byte sequence UTF-8)
else if ((c & 0xF0) == 0xE0) {
// hopefully correct UTF-8
if (ptr + 2 < end) {
EscapeUtf8Range0800TFFFF(&qtr, &ptr);
}
// corrupted UTF-8
else {
*qtr = *ptr;
}
}
// unicode range 10000 - 10ffff (4-byte sequence UTF-8)
else if ((c & 0xF8) == 0xF0) {
// hopefully correct UTF-8
if (ptr + 3 < end) {
EscapeUtf8Range10000T10FFFF(&qtr, &ptr);
}
// corrupted UTF-8
else {
*qtr = *ptr;
}
}
// unicode range above 10ffff -- NOT IMPLEMENTED
else {
*qtr = *ptr;
}
}
break;
}
}
*qtr = '\0';
*outLength = (size_t)(qtr - buffer);
if (!compactResult) {
return buffer;
}
qtr = static_cast<char*>(TRI_Allocate(*outLength + 1));
if (qtr != nullptr) {
memcpy(qtr, buffer, *outLength + 1);
}
TRI_Free(buffer);
return qtr;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief unescapes unicode escape sequences
////////////////////////////////////////////////////////////////////////////////
char* TRI_UnescapeUtf8String(char const* in,
size_t inLength, size_t* outLength, bool normalize) {
char const* ptr;
char const* end;
size_t tmpLength = 0;
char* buffer = static_cast<char*>(TRI_Allocate(inLength + 1));
if (buffer == nullptr) {
return nullptr;
}
char* qtr = buffer;
for (ptr = in, end = ptr + inLength; ptr < end; ++ptr, ++qtr) {
if (*ptr == '\\' && ptr + 1 < end) {
++ptr;
switch (*ptr) {
case 'b':
*qtr = '\b';
break;
case 'f':
*qtr = '\f';
break;
case 'n':
*qtr = '\n';
break;
case 'r':
*qtr = '\r';
break;
case 't':
*qtr = '\t';
break;
case 'u':
// expecting at least 6 characters: \uXXXX
if (ptr + 4 < end) {
// check, if we have a surrogate pair
if (ptr + 10 < end) {
bool sp;
char c1 = ptr[1];
sp = (c1 == 'd' || c1 == 'D');
if (sp) {
char c2 = ptr[2];
sp &= (c2 == '8' || c2 == '9' || c2 == 'A' || c2 == 'a' ||
c2 == 'B' || c2 == 'b');
}
if (sp) {
char c3 = ptr[7];
sp &= (ptr[5] == '\\' && ptr[6] == 'u');
sp &= (c3 == 'd' || c3 == 'D');
}
if (sp) {
char c4 = ptr[8];
sp &= (c4 == 'C' || c4 == 'c' || c4 == 'D' || c4 == 'd' ||
c4 == 'E' || c4 == 'e' || c4 == 'F' || c4 == 'f');
}
if (sp) {
DecodeSurrogatePair(&qtr, ptr + 1, ptr + 7);
ptr += 10;
} else {
DecodeUnicodeEscape(&qtr, ptr + 1);
ptr += 4;
}
} else {
DecodeUnicodeEscape(&qtr, ptr + 1);
ptr += 4;
}
}
// ignore wrong format
else {
*qtr = *ptr;
}
break;
default:
// this includes cases \/, \\, and \"
*qtr = *ptr;
break;
}
continue;
}
*qtr = *ptr;
}
*qtr = '\0';
*outLength = (size_t)(qtr - buffer);
if (normalize && *outLength > 0) {
char* utf8_nfc =
TRI_normalize_utf8_to_NFC(buffer, *outLength, &tmpLength);
if (utf8_nfc != nullptr) {
*outLength = tmpLength;
TRI_Free(buffer);
return utf8_nfc;
}
// intentional fall-through
}
return buffer;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief determine the number of characters in a UTF-8 string
/// the UTF-8 string must be well-formed and end with a NUL terminator
////////////////////////////////////////////////////////////////////////////////
size_t TRI_CharLengthUtf8String(char const* in) {
unsigned char const* p = reinterpret_cast<unsigned char const*>(in);
size_t chars = 0;
while (*p) {
unsigned char c = *p;
if (c < 128) {
// single byte
p++;
} else if (c < 224) {
p += 2;
} else if (c < 240) {
p += 3;
} else if (c < 248) {
p += 4;
} else {
// invalid UTF-8 sequence
break;
}
++chars;
}
return chars;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief determine the number of characters in a UTF-8 string
////////////////////////////////////////////////////////////////////////////////
size_t TRI_CharLengthUtf8String(char const* in, size_t length) {
unsigned char const* p = reinterpret_cast<unsigned char const*>(in);
unsigned char const* e = p + length;
size_t chars = 0;
while (p < e) {
unsigned char c = *p;
if (c < 128) {
// single byte
p++;
} else if (c < 224) {
p += 2;
} else if (c < 240) {
p += 3;
} else if (c < 248) {
p += 4;
} else {
// invalid UTF-8 sequence
break;
}
++chars;
}
return chars;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief get the string end position for a leftmost prefix of a UTF-8 string
/// eg. when specifying (müller, 2), the return value will be a pointer to the
/// first "l".
/// the UTF-8 string must be well-formed and end with a NUL terminator
////////////////////////////////////////////////////////////////////////////////
char* TRI_PrefixUtf8String(char const* in, const uint32_t maximumLength) {
uint32_t length;
unsigned char* p;
p = (unsigned char*)in;
length = 0;
while (*p && length < maximumLength) {
unsigned char c = *p;
if (c < 128) {
// single byte
p++;
} else if (c < 224) {
p += 2;
} else if (c < 240) {
p += 3;
} else if (c < 248) {
p += 4;
} else {
// invalid UTF-8 sequence
break;
}
++length;
}
return (char*)p;
}