//////////////////////////////////////////////////////////////////////////////// /// @brief basic string functions /// /// @file /// /// DISCLAIMER /// /// Copyright 2014 ArangoDB GmbH, Cologne, Germany /// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany /// /// Licensed under the Apache License, Version 2.0 (the "License"); /// you may not use this file except in compliance with the License. /// You may obtain a copy of the License at /// /// http://www.apache.org/licenses/LICENSE-2.0 /// /// Unless required by applicable law or agreed to in writing, software /// distributed under the License is distributed on an "AS IS" BASIS, /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. /// See the License for the specific language governing permissions and /// limitations under the License. /// /// Copyright holder is ArangoDB GmbH, Cologne, Germany /// /// @author Dr. Frank Celler /// @author Copyright 2014, ArangoDB GmbH, Cologne, Germany /// @author Copyright 2011-2013, triAGENS GmbH, Cologne, Germany //////////////////////////////////////////////////////////////////////////////// #include "tri-strings.h" #include "Basics/conversions.h" #include "Basics/Utf8Helper.h" #include // ----------------------------------------------------------------------------- // --SECTION-- private variables // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief hex values for all characters //////////////////////////////////////////////////////////////////////////////// static char const HexValues[513] = { "000102030405060708090a0b0c0d0e0f" "101112131415161718191a1b1c1d1e1f" "202122232425262728292a2b2c2d2e2f" "303132333435363738393a3b3c3d3e3f" "404142434445464748494a4b4c4d4e4f" "505152535455565758595a5b5c5d5e5f" "606162636465666768696a6b6c6d6e6f" "707172737475767778797a7b7c7d7e7f" "808182838485868788898a8b8c8d8e8f" 
"909192939495969798999a9b9c9d9e9f" "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf" "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf" "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf" "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf" "e0e1e2e3e4e5e6e7e8e9eaebecedeeef" "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff" }; //////////////////////////////////////////////////////////////////////////////// /// @brief integer values for all hex characters //////////////////////////////////////////////////////////////////////////////// static uint8_t const HexDecodeLookup[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9, // 0123456789 0,0,0,0,0,0,0, // :;<=>?@ 10,11,12,13,14,15, // ABCDEF 0,0,0,0,0,0,0,0,0,0,0,0,0, // GHIJKLMNOPQRS 0,0,0,0,0,0,0,0,0,0,0,0,0, // TUVWXYZ[/]^_` 10,11,12,13,14,15, // abcdef 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0 }; // ----------------------------------------------------------------------------- // --SECTION-- private functions // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief escapes UTF-8 range U+0000 to U+007F //////////////////////////////////////////////////////////////////////////////// static void EscapeUtf8Range0000T007F (char** dst, char const** src) { uint8_t c; uint16_t i1; uint16_t i2; c = (uint8_t) *(*src); i1 = (((uint16_t) c) & 0xF0) >> 4; i2 = (((uint16_t) c) & 0x0F); *(*dst)++ = '\\'; *(*dst)++ = 'u'; *(*dst)++ = '0'; *(*dst)++ = '0'; *(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10); *(*dst) = (i2 < 10) ? 
('0' + i2) : ('A' + i2 - 10); } //////////////////////////////////////////////////////////////////////////////// /// @brief escapes UTF-8 range U+0080 to U+07FF //////////////////////////////////////////////////////////////////////////////// static void EscapeUtf8Range0080T07FF (char** dst, char const** src) { uint8_t c; uint8_t d; c = (uint8_t) *((*src) + 0); d = (uint8_t) *((*src) + 1); // correct UTF-8 if ((d & 0xC0) == 0x80) { uint16_t n; uint16_t i1; uint16_t i2; uint16_t i3; uint16_t i4; n = ((c & 0x1F) << 6) | (d & 0x3F); TRI_ASSERT(n >= 128); i1 = (n & 0xF000) >> 12; i2 = (n & 0x0F00) >> 8; i3 = (n & 0x00F0) >> 4; i4 = (n & 0x000F); *(*dst)++ = '\\'; *(*dst)++ = 'u'; *(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10); *(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10); *(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10); *(*dst) = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10); (*src) += 1; } // corrupted UTF-8 else { *(*dst) = *(*src); } } //////////////////////////////////////////////////////////////////////////////// /// @brief escapes UTF-8 range U+0800 to U+D7FF and U+E000 to U+FFFF //////////////////////////////////////////////////////////////////////////////// static void EscapeUtf8Range0800TFFFF (char** dst, char const** src) { uint8_t c; uint8_t d; uint8_t e; c = (uint8_t) *((*src) + 0); d = (uint8_t) *((*src) + 1); e = (uint8_t) *((*src) + 2); // correct UTF-8 (3-byte sequence UTF-8 1110xxxx 10xxxxxx) if ((d & 0xC0) == 0x80 && (e & 0xC0) == 0x80) { uint16_t n; uint16_t i1; uint16_t i2; uint16_t i3; uint16_t i4; n = ((c & 0x0F) << 12) | ((d & 0x3F) << 6) | (e & 0x3F); TRI_ASSERT(n >= 2048 && (n < 55296 || n > 57343)); i1 = (n & 0xF000) >> 12; i2 = (n & 0x0F00) >> 8; i3 = (n & 0x00F0) >> 4; i4 = (n & 0x000F); *(*dst)++ = '\\'; *(*dst)++ = 'u'; *(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10); *(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10); *(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10); *(*dst) = (i4 < 10) ? 
('0' + i4) : ('A' + i4 - 10); (*src) += 2; } // corrupted UTF-8 else { *(*dst) = *(*src); } } //////////////////////////////////////////////////////////////////////////////// /// @brief escapes UTF-8 range U+10000 to U+10FFFF //////////////////////////////////////////////////////////////////////////////// static void EscapeUtf8Range10000T10FFFF (char** dst, char const** src) { uint8_t c; uint8_t d; uint8_t e; uint8_t f; c = (uint8_t) *((*src) + 0); d = (uint8_t) *((*src) + 1); e = (uint8_t) *((*src) + 2); f = (uint8_t) *((*src) + 3); // correct UTF-8 (4-byte sequence UTF-8 1110xxxx 10xxxxxx 10xxxxxx) if ((d & 0xC0) == 0x80 && (e & 0xC0) == 0x80 && (f & 0xC0) == 0x80) { uint32_t n; uint32_t s1; uint32_t s2; uint16_t i1; uint16_t i2; uint16_t i3; uint16_t i4; n = ((c & 0x0F) << 18) | ((d & 0x3F) << 12) | ((e & 0x3F) << 6) | (f & 0x3F); TRI_ASSERT(n >= 65536 && n <= 1114111); // construct the surrogate pairs n -= 0x10000; s1 = ((n & 0xFFC00) >> 10) + 0xD800; s2 = (n & 0x3FF) + 0xDC00; // encode high surrogate i1 = (s1 & 0xF000) >> 12; i2 = (s1 & 0x0F00) >> 8; i3 = (s1 & 0x00F0) >> 4; i4 = (s1 & 0x000F); *(*dst)++ = '\\'; *(*dst)++ = 'u'; *(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10); *(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10); *(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10); *(*dst)++ = (i4 < 10) ? ('0' + i4) : ('A' + i4 - 10); // encode low surrogate i1 = (s2 & 0xF000) >> 12; i2 = (s2 & 0x0F00) >> 8; i3 = (s2 & 0x00F0) >> 4; i4 = (s2 & 0x000F); *(*dst)++ = '\\'; *(*dst)++ = 'u'; *(*dst)++ = (i1 < 10) ? ('0' + i1) : ('A' + i1 - 10); *(*dst)++ = (i2 < 10) ? ('0' + i2) : ('A' + i2 - 10); *(*dst)++ = (i3 < 10) ? ('0' + i3) : ('A' + i3 - 10); *(*dst) = (i4 < 10) ? 
('0' + i4) : ('A' + i4 - 10); // advance src (*src) += 3; } // corrupted UTF-8 else { *(*dst) = *(*src); } } //////////////////////////////////////////////////////////////////////////////// /// @brief decodes a unicode escape sequence //////////////////////////////////////////////////////////////////////////////// static void DecodeUnicodeEscape (char** dst, char const* src) { int i1; int i2; int i3; int i4; uint16_t n; i1 = TRI_IntHex(src[0], 0); i2 = TRI_IntHex(src[1], 0); i3 = TRI_IntHex(src[2], 0); i4 = TRI_IntHex(src[3], 0); n = ((i1 & 0xF) << 12) | ((i2 & 0xF) << 8) | ((i3 & 0xF) << 4) | (i4 & 0xF); if (n <= 0x7F) { *(*dst) = n & 0x7F; } else if (n <= 0x7FF) { *(*dst)++ = 0xC0 + (n >> 6); *(*dst) = 0x80 + (n & 0x3F); } else { *(*dst)++ = 0xE0 + (n >> 12); *(*dst)++ = 0x80 + ((n >> 6) & 0x3F); *(*dst) = 0x80 + (n & 0x3F); } } //////////////////////////////////////////////////////////////////////////////// /// @brief decodes a unicode surrogate pair //////////////////////////////////////////////////////////////////////////////// static void DecodeSurrogatePair (char** dst, char const* src1, char const* src2) { int i1; int i2; int i3; int i4; uint32_t n1; uint32_t n2; uint32_t n; i1 = TRI_IntHex(src1[0], 0); i2 = TRI_IntHex(src1[1], 0); i3 = TRI_IntHex(src1[2], 0); i4 = TRI_IntHex(src1[3], 0); n1 = ((i1 & 0xF) << 12) | ((i2 & 0xF) << 8) | ((i3 & 0xF) << 4) | (i4 & 0xF); n1 -= 0xD800; i1 = TRI_IntHex(src2[0], 0); i2 = TRI_IntHex(src2[1], 0); i3 = TRI_IntHex(src2[2], 0); i4 = TRI_IntHex(src2[3], 0); n2 = ((i1 & 0xF) << 12) | ((i2 & 0xF) << 8) | ((i3 & 0xF) << 4) | (i4 & 0xF); n2 -= 0xDC00; n = 0x10000 + ((n1 << 10) | n2); if (n <= 0x7F) { *(*dst) = n & 0x7F; } else if (n <= 0x7FF) { *(*dst)++ = 0xC0 + (n >> 6); *(*dst) = 0x80 + (n & 0x3F); } else if (n <= 0xFFFF) { *(*dst)++ = 0xE0 + (n >> 12); *(*dst)++ = 0x80 + ((n >> 6) & 0x3F); *(*dst) = 0x80 + (n & 0x3F); } else { *(*dst)++ = 0xF0 + (n >> 18); *(*dst)++ = 0x80 + ((n >> 12) & 0x3F); *(*dst)++ = 0x80 + ((n >> 
6) & 0x3F); *(*dst) = 0x80 + (n & 0x3F); } } // ----------------------------------------------------------------------------- // --SECTION-- public functions // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief convert a string to lower case //////////////////////////////////////////////////////////////////////////////// char* TRI_LowerAsciiString (TRI_memory_zone_t* zone, char const* value) { size_t length; char* buffer; char* p; char* out; char c; if (value == nullptr) { return nullptr; } length = strlen(value); buffer = static_cast(TRI_Allocate(zone, (sizeof(char) * length) + 1, false)); if (buffer == nullptr) { return nullptr; } p = (char*) value; out = buffer; while ((c = *p++)) { if (c >= 'A' && c <= 'Z') { *out++ = c + 32; } else { *out++ = c; } } *out = '\0'; return buffer; } //////////////////////////////////////////////////////////////////////////////// /// @brief convert a string to upper case //////////////////////////////////////////////////////////////////////////////// char* TRI_UpperAsciiString (TRI_memory_zone_t* zone, char const* value) { size_t length; char* buffer; char* p; char* out; char c; if (value == nullptr) { return nullptr; } length = strlen(value); buffer = static_cast(TRI_Allocate(zone, (sizeof(char) * length) + 1, false)); if (buffer == nullptr) { return nullptr; } p = (char*) value; out = buffer; while ((c = *p++)) { if (c >= 'a' && c <= 'z') { *out++ = c - 32; } else { *out++ = c; } } *out = '\0'; return buffer; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if strings are equal //////////////////////////////////////////////////////////////////////////////// bool TRI_EqualString (char const* left, char const* right) { return strcmp(left, right) == 0; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if strings are 
equal //////////////////////////////////////////////////////////////////////////////// bool TRI_EqualString2 (char const* left, char const* right, size_t n) { return strncmp(left, right, n) == 0; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if ASCII strings are equal ignoring case //////////////////////////////////////////////////////////////////////////////// bool TRI_CaseEqualString (char const* left, char const* right) { return strcasecmp(left, right) == 0; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if ASCII strings are equal ignoring case //////////////////////////////////////////////////////////////////////////////// bool TRI_CaseEqualString2 (char const* left, char const* right, size_t n) { return strncasecmp(left, right, n) == 0; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if second string is prefix of the first //////////////////////////////////////////////////////////////////////////////// bool TRI_IsPrefixString (char const* full, char const* prefix) { return strncmp(full, prefix, strlen(prefix)) == 0; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if second string is contained in the first //////////////////////////////////////////////////////////////////////////////// bool TRI_IsContainedString (char const* full, char const* part) { return strstr(full, part) != nullptr; } //////////////////////////////////////////////////////////////////////////////// /// @brief tests if second string is contained in the first, byte-safe //////////////////////////////////////////////////////////////////////////////// char* TRI_IsContainedMemory (char const* full, size_t fullLength, char const* part, size_t partLength) { if (fullLength == 0 || partLength == 0 || fullLength < partLength) { return nullptr; } if (partLength == 1) { return 
static_cast(const_cast(memchr(static_cast(full), (int) *part, fullLength))); } char const* end = full + fullLength - partLength; for (char const* p = full; p <= end; ++p) { if (*p == *part && memcmp(static_cast(p), static_cast(part), partLength) == 0) { return const_cast(p); } } return nullptr; } //////////////////////////////////////////////////////////////////////////////// /// @brief duplicates a string //////////////////////////////////////////////////////////////////////////////// char* TRI_DuplicateString (char const* value) { size_t n = strlen(value) + 1; char* result = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, n, false)); memcpy(result, value, n); return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief duplicates a string //////////////////////////////////////////////////////////////////////////////// char* TRI_DuplicateStringZ (TRI_memory_zone_t* zone, char const* value) { size_t n = strlen(value) + 1; char* result = static_cast(TRI_Allocate(zone, n, false)); if (result != nullptr) { memcpy(result, value, n); } return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief duplicates a string of given length //////////////////////////////////////////////////////////////////////////////// char* TRI_DuplicateString2 (char const* value, size_t length) { char* result = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, length + 1, false)); memcpy(result, value, length); result[length] = '\0'; return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief duplicates a string of given length //////////////////////////////////////////////////////////////////////////////// char* TRI_DuplicateString2Z (TRI_memory_zone_t* zone, char const* value, size_t length) { char* result = static_cast(TRI_Allocate(zone, length + 1, false)); if (result != nullptr) { memcpy(result, value, length); result[length] = '\0'; } return result; } 
//////////////////////////////////////////////////////////////////////////////// /// @brief appends text to a string //////////////////////////////////////////////////////////////////////////////// void TRI_AppendString (char** dst, char const* src) { char* ptr; ptr = TRI_Concatenate2String(*dst, src); TRI_FreeString(TRI_CORE_MEM_ZONE, *dst); *dst = ptr; } //////////////////////////////////////////////////////////////////////////////// /// @brief copies a string //////////////////////////////////////////////////////////////////////////////// void TRI_CopyString (char* dst, char const* src, size_t length) { *dst = '\0'; strncat(dst, src, length); } //////////////////////////////////////////////////////////////////////////////// /// @brief concatenate two strings //////////////////////////////////////////////////////////////////////////////// char* TRI_Concatenate2String (char const* a, char const* b) { return TRI_Concatenate2StringZ(TRI_CORE_MEM_ZONE, a, b); } //////////////////////////////////////////////////////////////////////////////// /// @brief concatenate two strings using a memory zone //////////////////////////////////////////////////////////////////////////////// char* TRI_Concatenate2StringZ (TRI_memory_zone_t* zone, char const* a, char const* b) { char* result; size_t na; size_t nb; na = strlen(a); nb = strlen(b); result = static_cast(TRI_Allocate(zone, na + nb + 1, false)); if (result != nullptr) { memcpy(result, a, na); memcpy(result + na, b, nb); result[na + nb] = '\0'; } return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief concatenate three strings //////////////////////////////////////////////////////////////////////////////// char* TRI_Concatenate3String (char const* a, char const* b, char const* c) { return TRI_Concatenate3StringZ(TRI_CORE_MEM_ZONE, a, b, c); } //////////////////////////////////////////////////////////////////////////////// /// @brief concatenate three strings using a memory 
zone //////////////////////////////////////////////////////////////////////////////// char* TRI_Concatenate3StringZ (TRI_memory_zone_t* zone, char const* a, char const* b, char const* c) { char* result; size_t na; size_t nb; size_t nc; na = strlen(a); nb = strlen(b); nc = strlen(c); result = static_cast(TRI_Allocate(zone, na + nb + nc + 1, false)); if (result != nullptr) { memcpy(result, a, na); memcpy(result + na, b, nb); memcpy(result + na + nb, c, nc); result[na + nb + nc] = '\0'; } return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief concatenate four strings //////////////////////////////////////////////////////////////////////////////// char* TRI_Concatenate4String (char const* a, char const* b, char const* c, char const* d) { char* result; size_t na; size_t nb; size_t nc; size_t nd; na = strlen(a); nb = strlen(b); nc = strlen(c); nd = strlen(d); result = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, na + nb + nc + nd + 1, false)); memcpy(result, a, na); memcpy(result + na, b, nb); memcpy(result + na + nb, c, nc); memcpy(result + na + nb + nc, d, nd); result[na + nb + nc + nd] = '\0'; return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief splits a string //////////////////////////////////////////////////////////////////////////////// TRI_vector_string_t TRI_SplitString (char const* source, char delim) { TRI_vector_string_t result; char* buffer; char* p; char const* q; char const* e; size_t size; TRI_InitVectorString(&result, TRI_CORE_MEM_ZONE); if (source == nullptr || *source == '\0') { return result; } size = strlen(source); buffer = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, size + 1, false)); p = buffer; q = source; e = source + size; for (; q < e; ++q) { if (*q == delim) { *p = '\0'; TRI_PushBackVectorString(&result, TRI_DuplicateString2(buffer, (size_t) (p - buffer))); p = buffer; } else { *p++ = *q; } } *p = '\0'; 
TRI_PushBackVectorString(&result, TRI_DuplicateString2(buffer, (size_t) (p - buffer))); TRI_FreeString(TRI_CORE_MEM_ZONE, buffer); return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief splits a string, using more than one delimiter //////////////////////////////////////////////////////////////////////////////// TRI_vector_string_t TRI_Split2String (char const* source, char const* delim) { TRI_vector_string_t result; char* buffer; char* p; char const* q; char const* e; size_t size; size_t delimiterSize; TRI_InitVectorString(&result, TRI_CORE_MEM_ZONE); if (delim == nullptr || *delim == '\0') { return result; } if (source == nullptr || *source == '\0') { return result; } delimiterSize = strlen(delim); size = strlen(source); buffer = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, size + 1, false)); p = buffer; q = source; e = source + size; for (; q < e; ++q) { size_t i; bool found = false; for (i = 0; i < delimiterSize; ++i) { if (*q == delim[i]) { *p = '\0'; TRI_PushBackVectorString(&result, TRI_DuplicateString2(buffer, (size_t) (p - buffer))); p = buffer; found = true; break; } } if (! 
found) { *p++ = *q; } } *p = '\0'; TRI_PushBackVectorString(&result, TRI_DuplicateString2(buffer, p - buffer)); TRI_FreeString(TRI_CORE_MEM_ZONE, buffer); return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief frees a string //////////////////////////////////////////////////////////////////////////////// #ifdef TRI_ENABLE_MAINTAINER_MODE void TRI_FreeStringZ (TRI_memory_zone_t* zone, char* value, char const* file, int line) { TRI_FreeZ(zone, value, file, line); } #else void TRI_FreeString (TRI_memory_zone_t* zone, char* value) { TRI_Free(zone, value); } #endif // ----------------------------------------------------------------------------- // --SECTION-- public escape functions // ----------------------------------------------------------------------------- //////////////////////////////////////////////////////////////////////////////// /// @brief converts into printable representation //////////////////////////////////////////////////////////////////////////////// char* TRI_PrintableString (char const* source, size_t sourceLen) { unsigned char* result; unsigned char* p; unsigned char* end; p = result = (unsigned char*) TRI_Allocate(TRI_CORE_MEM_ZONE, sourceLen + 1, false); end = p + sourceLen; while (p < end) { if (*source >= ' ' && *source <= 'z') { *p = *source; } else { *p = '.'; } source++; p++; } *p = '\0'; return (char*) result; } //////////////////////////////////////////////////////////////////////////////// /// @brief converts into hex representation //////////////////////////////////////////////////////////////////////////////// char* TRI_EncodeHexString (char const* source, size_t sourceLen, size_t* dstLen) { char* result; uint16_t* hex; uint16_t* dst; uint8_t* src; size_t j; *dstLen = (sourceLen * 2); dst = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, (*dstLen) + 1, false)); result = (char*) dst; hex = (uint16_t*) HexValues; src = (uint8_t*) source; for (j = 0; j < sourceLen; j++) { *dst = hex[*src]; 
dst++; src++; } *((char*) dst) = 0; // terminate the string return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief converts from hex representation //////////////////////////////////////////////////////////////////////////////// char* TRI_DecodeHexString (char const* source, size_t sourceLen, size_t* dstLen) { char* result; uint8_t* dst; uint8_t* src; size_t j; *dstLen = (sourceLen / 2); dst = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, (*dstLen) + 1, false)); result = (char*) dst; src = (uint8_t*) source; for (j = 0; j < sourceLen; j += 2) { uint8_t d; d = HexDecodeLookup[*src++] << 4; d |= HexDecodeLookup[*src++]; *dst++ = d; } *dst = 0; // terminate the string return result; } //////////////////////////////////////////////////////////////////////////////// /// @brief sha256 of a string //////////////////////////////////////////////////////////////////////////////// char* TRI_SHA256String (char const* source, size_t sourceLen, size_t* dstLen) { unsigned char* dst; dst = static_cast(TRI_Allocate(TRI_CORE_MEM_ZONE, SHA256_DIGEST_LENGTH, false)); *dstLen = SHA256_DIGEST_LENGTH; SHA256((unsigned char const*) source, sourceLen, dst); return (char*) dst; } //////////////////////////////////////////////////////////////////////////////// /// @brief escapes special characters using C escapes //////////////////////////////////////////////////////////////////////////////// char* TRI_EscapeControlsCString (TRI_memory_zone_t* zone, char const* in, size_t inLength, size_t* outLength, bool appendNewline) { char * buffer; char * qtr; char const * ptr; char const * end; buffer = static_cast(TRI_Allocate(zone, (4 * inLength) + 1 + (appendNewline ? 
1 : 0), false)); if (buffer == nullptr) { return nullptr; } qtr = buffer; for (ptr = in, end = ptr + inLength; ptr < end; ptr++, qtr++) { uint8_t n; switch (*ptr) { case '\n': *qtr++ = '\\'; *qtr = 'n'; break; case '\r': *qtr++ = '\\'; *qtr = 'r'; break; default: n = (uint8_t)(*ptr); if (n < 32 || n > 127) { uint8_t n1 = n >> 4; uint8_t n2 = n & 0x0F; *qtr++ = '\\'; *qtr++ = 'x'; *qtr++ = (n1 < 10) ? ('0' + n1) : ('A' + n1 - 10); *qtr = (n2 < 10) ? ('0' + n2) : ('A' + n2 - 10); } else { *qtr = *ptr; } break; } } if (appendNewline) { *qtr++ = '\n'; } *qtr = '\0'; *outLength = (size_t) (qtr - buffer); qtr = static_cast(TRI_Allocate(zone, (*outLength) + 1, false)); if (qtr != nullptr) { memcpy(qtr, buffer, (*outLength) + 1); } TRI_Free(zone, buffer); // may be nullptr return qtr; } //////////////////////////////////////////////////////////////////////////////// /// @brief escapes special characters using unicode escapes //////////////////////////////////////////////////////////////////////////////// char* TRI_EscapeUtf8String (TRI_memory_zone_t* zone, char const* in, size_t inLength, bool escapeSlash, size_t* outLength, bool compactResult) { char * buffer; char * qtr; char const * ptr; char const * end; buffer = (char*) TRI_Allocate(zone, 6 * inLength + 1, false); if (buffer == nullptr) { return nullptr; } qtr = buffer; for (ptr = in, end = ptr + inLength; ptr < end; ++ptr, ++qtr) { switch (*ptr) { case '/': if (escapeSlash) { *qtr++ = '\\'; } *qtr = *ptr; break; case '\\': case '"': *qtr++ = '\\'; *qtr = *ptr; break; case '\b': *qtr++ = '\\'; *qtr = 'b'; break; case '\f': *qtr++ = '\\'; *qtr = 'f'; break; case '\n': *qtr++ = '\\'; *qtr = 'n'; break; case '\r': *qtr++ = '\\'; *qtr = 'r'; break; case '\t': *qtr++ = '\\'; *qtr = 't'; break; case '\0': *qtr++ = '\\'; *qtr++ = 'u'; *qtr++ = '0'; *qtr++ = '0'; *qtr++ = '0'; *qtr = '0'; break; default: { uint8_t c; // next character as unsigned char c = (uint8_t) *ptr; // character is in the normal latin1 range if ((c & 
0x80) == 0) { // special character, escape if (c < 32) { EscapeUtf8Range0000T007F(&qtr, &ptr); } // normal latin1 else { *qtr = *ptr; } } // unicode range 0080 - 07ff (2-byte sequence UTF-8) else if ((c & 0xE0) == 0xC0) { // hopefully correct UTF-8 if (ptr + 1 < end) { EscapeUtf8Range0080T07FF(&qtr, &ptr); } // corrupted UTF-8 else { *qtr = *ptr; } } // unicode range 0800 - ffff (3-byte sequence UTF-8) else if ((c & 0xF0) == 0xE0) { // hopefully correct UTF-8 if (ptr + 2 < end) { EscapeUtf8Range0800TFFFF(&qtr, &ptr); } // corrupted UTF-8 else { *qtr = *ptr; } } // unicode range 10000 - 10ffff (4-byte sequence UTF-8) else if ((c & 0xF8) == 0xF0) { // hopefully correct UTF-8 if (ptr + 3 < end) { EscapeUtf8Range10000T10FFFF(&qtr, &ptr); } // corrupted UTF-8 else { *qtr = *ptr; } } // unicode range above 10ffff -- NOT IMPLEMENTED else { *qtr = *ptr; } } break; } } *qtr = '\0'; *outLength = (size_t) (qtr - buffer); if (! compactResult) { return buffer; } qtr = static_cast(TRI_Allocate(zone, *outLength + 1, false)); if (qtr != nullptr) { memcpy(qtr, buffer, *outLength + 1); } TRI_Free(zone, buffer); return qtr; } //////////////////////////////////////////////////////////////////////////////// /// @brief unescapes unicode escape sequences //////////////////////////////////////////////////////////////////////////////// char* TRI_UnescapeUtf8String (TRI_memory_zone_t* zone, char const* in, size_t inLength, size_t* outLength) { char * buffer; char * qtr; char const * ptr; char const * end; size_t tmpLength = 0; buffer = static_cast(TRI_Allocate(zone, inLength + 1, false)); if (buffer == nullptr) { return nullptr; } qtr = buffer; for (ptr = in, end = ptr + inLength; ptr < end; ++ptr, ++qtr) { if (*ptr == '\\' && ptr + 1 < end) { ++ptr; switch (*ptr) { case 'b': *qtr = '\b'; break; case 'f': *qtr = '\f'; break; case 'n': *qtr = '\n'; break; case 'r': *qtr = '\r'; break; case 't': *qtr = '\t'; break; case 'u': // expecting at least 6 characters: \uXXXX if (ptr + 4 < end) { // 
check, if we have a surrogate pair if (ptr + 10 < end) { bool sp; char c1 = ptr[1]; sp = (c1 == 'd' || c1 == 'D'); if (sp) { char c2 = ptr[2]; sp &= (c2 == '8' || c2 == '9' || c2 == 'A' || c2 == 'a' || c2 == 'B' || c2 == 'b'); } if (sp) { char c3 = ptr[7]; sp &= (ptr[5] == '\\' && ptr[6] == 'u'); sp &= (c3 == 'd' || c3 == 'D'); } if (sp) { char c4 = ptr[8]; sp &= (c4 == 'C' || c4 == 'c' || c4 == 'D' || c4 == 'd' || c4 == 'E' || c4 == 'e' || c4 == 'F' || c4 == 'f'); } if (sp) { DecodeSurrogatePair(&qtr, ptr + 1, ptr + 7); ptr += 10; } else { DecodeUnicodeEscape(&qtr, ptr + 1); ptr += 4; } } else { DecodeUnicodeEscape(&qtr, ptr + 1); ptr += 4; } } // ignore wrong format else { *qtr = *ptr; } break; default: // this includes cases \/, \\, and \" *qtr = *ptr; break; } continue; } *qtr = *ptr; } *qtr = '\0'; *outLength = (size_t) (qtr - buffer); if (*outLength > 0) { char * utf8_nfc = TRI_normalize_utf8_to_NFC(zone, buffer, *outLength, &tmpLength); if (utf8_nfc != nullptr) { *outLength = tmpLength; TRI_Free(zone, buffer); return utf8_nfc; } // intentional fall-through } return buffer; } //////////////////////////////////////////////////////////////////////////////// /// @brief determine the number of characters in a UTF-8 string /// the UTF-8 string must be well-formed and end with a NUL terminator //////////////////////////////////////////////////////////////////////////////// size_t TRI_CharLengthUtf8String (const char* in) { size_t length; unsigned char* p; p = (unsigned char*) in; length = 0; while (*p) { unsigned char c = *p; if (c < 128) { // single byte p++; } else if (c < 224) { p += 2; } else if (c < 240) { p += 3; } else if (c < 248) { p += 4; } else { printf("invalid utf\n"); // invalid UTF-8 sequence break; } ++length; } return length; } //////////////////////////////////////////////////////////////////////////////// /// @brief get the string end position for a leftmost prefix of a UTF-8 string /// eg. 
/// when specifying (müller, 2), the return value will be a pointer to the
/// first "l".
/// the UTF-8 string should be well-formed and must end with a NUL terminator.
/// scanning stops at the first byte that cannot start a sequence (>= 248). a
/// multi-byte sequence that is cut short by the terminator ends the prefix at
/// the terminator, so the result never points behind the end of the string
///
/// @param in             NUL-terminated UTF-8 string
/// @param maximumLength  maximum number of characters in the prefix
/// @return pointer into the input directly behind the prefix
////////////////////////////////////////////////////////////////////////////////

char* TRI_PrefixUtf8String (const char* in, const uint32_t maximumLength) {
  unsigned char const* p = (unsigned char const*) in;
  uint32_t length = 0;

  while (*p != '\0' && length < maximumLength) {
    unsigned char const c = *p;
    unsigned int width;

    if (c < 128) {
      // single byte
      width = 1;
    }
    else if (c < 224) {
      width = 2;
    }
    else if (c < 240) {
      width = 3;
    }
    else if (c < 248) {
      width = 4;
    }
    else {
      // invalid UTF-8 sequence
      break;
    }

    // skip the lead byte plus its continuation bytes, but never step over
    // the NUL terminator of a truncated sequence
    ++p;

    for (unsigned int i = 1;  i < width && *p != '\0';  ++i) {
      ++p;
    }

    ++length;
  }

  return (char*) p;
}

// -----------------------------------------------------------------------------
// --SECTION--                                                       END-OF-FILE
// -----------------------------------------------------------------------------

// Local Variables:
// mode: outline-minor
// outline-regexp: "/// @brief\\|/// {@inheritDoc}\\|/// @page\\|// --SECTION--\\|/// @\\}"
// End: