1
0
Fork 0
arangodb/3rdParty/iresearch/core/utils/locale_utils.cpp

3777 lines
112 KiB
C++

////////////////////////////////////////////////////////////////////////////////
/// DISCLAIMER
///
/// Copyright 2016 by EMC Corporation, All Rights Reserved
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is EMC Corporation
///
/// @author Andrey Abramov
/// @author Vasiliy Nabatchikov
////////////////////////////////////////////////////////////////////////////////
#ifdef _WIN32
#include <Windows.h> // for GetACP()
#else
#include <langinfo.h> // for nl_langinfo(...)
#endif
#include <algorithm>
#include <cstring>
#include <map>
#include <unordered_map>
#if defined (__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#include <boost/locale/generator.hpp>
#if defined (__GNUC__)
#pragma GCC diagnostic pop
#endif
#include <boost/locale/info.hpp>
#if defined (__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#include <boost/locale/util.hpp>
#if defined (__GNUC__)
#pragma GCC diagnostic pop
#endif
#include <unicode/coll.h> // for icu::Collator
#include <unicode/decimfmt.h> // for icu::DecimalFormat
#include <unicode/numfmt.h> // for icu::NumberFormat
#include <unicode/ucnv.h> // for UConverter
#include <unicode/ustring.h> // for u_strToUTF32, u_strToUTF8
#include "hash_utils.hpp"
#include "map_utils.hpp"
#include "object_pool.hpp"
#include "numeric_utils.hpp"
#include "error/error.hpp"
#include "locale_utils.hpp"
NS_BEGIN(std)
// GCC < v5 does not explicitly define
// std::codecvt<char16_t, char, mbstate_t>::id or std::codecvt<char32_t, char, mbstate_t>::id
// this causes linking issues in optimized code
// Note: clang tries to pretend to be GCC, so it must be explicitly excluded
#if !defined(__APPLE__) && !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 5)
/*static*/ template<> locale::id codecvt<char16_t, char, mbstate_t>::id;
/*static*/ template<> locale::id codecvt<char32_t, char, mbstate_t>::id;
#endif
NS_END // std
NS_LOCAL
////////////////////////////////////////////////////////////////////////////////
/// @brief size of internal buffers, arbitrary size
////////////////////////////////////////////////////////////////////////////////
const size_t BUFFER_SIZE = 1024;
////////////////////////////////////////////////////////////////////////////////
/// @brief size of ICU object pools, arbitrary size
////////////////////////////////////////////////////////////////////////////////
const size_t POOL_SIZE = 8;
// -----------------------------------------------------------------------------
// --SECTION-- facets required by std::locale
// -----------------------------------------------------------------------------
std::string system_encoding() {
#ifdef _WIN32
static std::string prefix("cp");
return prefix + std::to_string(GetACP());
#else
return nl_langinfo(CODESET);
#endif
}
////////////////////////////////////////////////////////////////////////////////
/// @brief a thread-safe pool of ICU converters for a given encoding
/// may hold nullptr on ICU converter instantiation failure
////////////////////////////////////////////////////////////////////////////////
class converter_pool: private irs::util::noncopyable {
public:
DECLARE_SHARED_PTR(UConverter);
converter_pool(std::string&& encoding)
: encoding_(std::move(encoding)), pool_(POOL_SIZE) {}
ptr get() { return pool_.emplace(encoding_).release(); }
const std::string& encoding() const NOEXCEPT { return encoding_; }
private:
struct builder {
DECLARE_SHARED_PTR(UConverter);
static ptr make(const std::string& encoding) {
UErrorCode status = U_ZERO_ERROR;
ptr value(
ucnv_open(encoding.c_str(), &status),
[](UConverter* ptr)->void{ ucnv_close(ptr); }
);
return U_SUCCESS(status) ? std::move(value) : nullptr;
}
};
std::string encoding_;
irs::unbounded_object_pool_volatile<builder> pool_;
};
////////////////////////////////////////////////////////////////////////////////
/// @param encoding the converter encoding (null == system encoding)
/// @@return a converter for the specified encoding
////////////////////////////////////////////////////////////////////////////////
converter_pool& get_converter(const irs::string_ref& encoding) {
static auto generator = [](
const irs::hashed_string_ref& key,
const converter_pool& pool
) NOEXCEPT->irs::hashed_string_ref {
// reuse hash but point ref at value in pool
return irs::hashed_string_ref(key.hash(), pool.encoding());
};
static std::mutex mutex;
static std::unordered_map<irs::hashed_string_ref, converter_pool> encodings;
auto key = encoding;
std::string tmp;
// use system encoding if encoding.null()
if (key.null()) {
tmp = system_encoding();
key = tmp;
}
SCOPED_LOCK(mutex);
return irs::map_utils::try_emplace_update_key(
encodings,
generator,
irs::make_hashed_ref(key, std::hash<irs::string_ref>()),
key
).first->second;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief base implementation for converters between 'internal' representation
/// and an 'external' user-specified encoding (unicode internal)
////////////////////////////////////////////////////////////////////////////////
template<typename InternType>
class codecvtu_base: public std::codecvt<InternType, char, mbstate_t> {
public:
typedef std::codecvt<InternType, char, mbstate_t> parent_t;
typedef typename parent_t::extern_type extern_type;
typedef typename parent_t::intern_type intern_type;
typedef typename parent_t::state_type state_type;
codecvtu_base(converter_pool& converters)
: contexts_(POOL_SIZE), converters_(converters) {}
protected:
struct context_t {
DECLARE_UNIQUE_PTR(context_t);
std::basic_string<typename parent_t::intern_type> buf_;
converter_pool::ptr converter_;
static ptr make(converter_pool& pool) {
auto ctx = irs::memory::make_unique<context_t>();
ctx->converter_ = pool.get();
return ctx->converter_ ? std::move(ctx) : nullptr;
}
};
typedef irs::unbounded_object_pool<context_t> context_pool;
typename context_pool::ptr context() const {
return contexts_.emplace(converters_);
}
const std::string& context_encoding() const NOEXCEPT {
return converters_.encoding();
}
virtual bool do_always_noconv() const NOEXCEPT final override {
return false; // not an identity conversion
}
virtual int do_encoding() const NOEXCEPT override = 0;
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override = 0;
virtual int do_length(
state_type& state,
const extern_type* from,
const extern_type* from_end,
std::size_t max
) const final override;
virtual int do_max_length() const NOEXCEPT override = 0;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override = 0;
virtual std::codecvt_base::result do_unshift(
state_type& state,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const final override;
private:
mutable context_pool contexts_;
converter_pool& converters_;
};
template<typename InternType>
int codecvtu_base<InternType>::do_length(
state_type& state,
const extern_type* from,
const extern_type* from_end,
std::size_t max
) const {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing number of required input characters from encoding '%s' to produce at most '" IR_SIZE_T_SPECIFIER "' output characters",
context_encoding().c_str(), max
);
return std::codecvt_base::error;
}
ctx->buf_.resize(max);
auto* from_next = from;
auto* to = &(ctx->buf_[0]);
auto* to_end = to + max;
auto* to_next = to;
auto res = do_in(state, from, from_end, from_next, to, to_end, to_next);
return res == std::codecvt_base::ok ? std::distance(from, from_next) : 0;
}
template<typename InternType>
std::codecvt_base::result codecvtu_base<InternType>::do_unshift(
state_type& state,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
to_next = to;
return std::codecvt_base::ok;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converter between an 'internal' utf16 representation and
/// an 'external' user-specified encoding
////////////////////////////////////////////////////////////////////////////////
class codecvt16_facet final: public codecvtu_base<char16_t> {
public:
MSVC2015_ONLY(static std::locale::id id;) // MSVC2015 requires a static instance of an 'id' member
MSVC2017_ONLY(static std::locale::id id;) // MSVC2017 requires a static instance of an 'id' member
codecvt16_facet(converter_pool& converters): codecvtu_base(converters) {}
bool append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const;
protected:
virtual int do_encoding() const NOEXCEPT override;
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override;
virtual int do_max_length() const NOEXCEPT override;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override;
};
MSVC2015_ONLY(/*static*/ std::locale::id codecvt16_facet::id;) // MSVC2015 requires a static instance of an 'id' member
MSVC2017_ONLY(/*static*/ std::locale::id codecvt16_facet::id;) // MSVC2017 requires a static instance of an 'id' member
#if defined (__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
#endif
bool codecvt16_facet::append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const {
auto size = value.length();
if (size < 0) {
IR_FRMT_WARN(
"ICU returned invalid string size while converting unicode string to UTF16"
);
return false;
}
static_assert(sizeof(UChar) == sizeof(intern_type), "sizeof(UChar) != sizeof(intern_type)");
buf.append(reinterpret_cast<const intern_type*>(value.getBuffer()), size);
return true;
}
#if defined (__GNUC__)
#pragma GCC diagnostic pop
#endif
int codecvt16_facet::do_encoding() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing number of required input characters from encoding '%s' to produce a single output character",
context_encoding().c_str()
);
return -1;
}
UErrorCode status = U_ZERO_ERROR;
// the exact number of externT characters that correspond to one internT character, if constant
return ucnv_isFixedWidth(ctx->converter_.get(), &status)
? ucnv_getMinCharSize(ctx->converter_.get()) : 0;
}
std::codecvt_base::result codecvt16_facet::do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting encoding '%s' to unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error;
}
UErrorCode status = U_ZERO_ERROR;
ucnv_reset(ctx->converter_.get());
static_assert(sizeof(UChar) == sizeof(intern_type), "sizeof(UChar) != sizeof(intern_type)");
ucnv_toUnicode(
ctx->converter_.get(),
reinterpret_cast<UChar**>(&to_next),
reinterpret_cast<const UChar*>(to_end),
&from_next,
from_end,
nullptr,
true,
&status
);
if (U_BUFFER_OVERFLOW_ERROR == status) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
if (!U_SUCCESS(status)) {
from_next = from;
to_next = to;
IR_FRMT_WARN(
"failure to convert from locale encoding to UTF16 while converting encoding '%s' unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
return std::codecvt_base::ok;
}
int codecvt16_facet::do_max_length() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing maximum number of required input characters from encoding '%s' to produce a single output character",
context_encoding().c_str()
);
return -1;
}
return ucnv_getMaxCharSize(ctx->converter_.get());
}
std::codecvt_base::result codecvt16_facet::do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
return std::codecvt_base::error;
}
UErrorCode status = U_ZERO_ERROR;
ucnv_reset(ctx->converter_.get());
static_assert(sizeof(UChar) == sizeof(intern_type), "sizeof(UChar) != sizeof(intern_type)");
ucnv_fromUnicode(
ctx->converter_.get(),
&to_next,
to_end,
reinterpret_cast<const UChar**>(&from_next),
reinterpret_cast<const UChar *>(from_end),
nullptr,
true,
&status
);
if (U_BUFFER_OVERFLOW_ERROR == status) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
if (!U_SUCCESS(status)) {
from_next = from;
to_next = to;
IR_FRMT_WARN(
"failure to convert from TF16 to locale encoding while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
return std::codecvt_base::ok;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converter between an 'internal' utf32 representation and
/// an 'external' user-specified encoding and an
////////////////////////////////////////////////////////////////////////////////
class codecvt32_facet final: public codecvtu_base<char32_t> {
public:
MSVC2015_ONLY(static std::locale::id id;) // MSVC2015 requires a static instance of an 'id' member
MSVC2017_ONLY(static std::locale::id id;) // MSVC2017 requires a static instance of an 'id' member
codecvt32_facet(converter_pool& converters): codecvtu_base(converters) {}
bool append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const;
protected:
virtual int do_encoding() const NOEXCEPT final override;
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const final override;
virtual int do_max_length() const NOEXCEPT override;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override;
};
MSVC2015_ONLY(/*static*/ std::locale::id codecvt32_facet::id;) // MSVC2015 requires a static instance of an 'id' member
MSVC2017_ONLY(/*static*/ std::locale::id codecvt32_facet::id;) // MSVC2017 requires a static instance of an 'id' member
bool codecvt32_facet::append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const {
auto size = value.countChar32();
if (size < 0) {
IR_FRMT_WARN(
"ICU returned invalid string size while converting unicode string to UTF32"
);
return false;
}
auto start = buf.size();
UErrorCode status = U_ZERO_ERROR;
buf.resize(buf.size() + size);
static_assert(sizeof(UChar32) == sizeof(intern_type), "sizeof(UChar32) != sizeof(intern_type)");
auto written =
value.toUTF32(reinterpret_cast<UChar32*>(&buf[start]), size, status);
if (U_SUCCESS(status) && written == size) {
return true;
}
buf.resize(start);
return false;
}
int codecvt32_facet::do_encoding() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing number of required input characters from encoding '%s' to produce a single output character",
context_encoding().c_str()
);
return -1;
}
UErrorCode status = U_ZERO_ERROR;
// the exact number of extern_type characters that correspond to one intern_type character, if constant
return ucnv_isFixedWidth(ctx->converter_.get(), &status)
? int(ucnv_getMinCharSize(ctx->converter_.get())) : 0;
}
std::codecvt_base::result codecvt32_facet::do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting encoding '%s' to unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
int32_t src_offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
int32_t dst_offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end (use same size since always: count of UTF32 chars <= count of UTF16 chars)
ucnv_reset(ctx->converter_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert from desired encoding to the intermediary representation
ucnv_toUnicode(
ctx->converter_.get(),
&buf_next,
buf_end,
&from_next,
from_end,
src_offsets,
true,
&src_status
);
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from locale encoding to UTF16 while converting encoding '%s' unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
assert(buf_next >= buf && IRESEARCH_COUNTOF(src_offsets) > size_t(buf_next - buf));
src_offsets[buf_next - buf] = from_next - from_next_prev; // remember past-end position
auto* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
// convert one char at a time to track source position to destination position
do {
int32_t to_used = 0;
static_assert(sizeof(UChar32) == sizeof(intern_type), "sizeof(UChar32) != sizeof(intern_type)");
u_strToUTF32(
reinterpret_cast<UChar32*>(to_next),
to_end - to_next,
&to_used, // set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow
buf_dst_next,
U_IS_SURROGATE(*buf_dst_next) ? 2 : 1, // 1 char at a time to track source/destination position mapping
&dst_status
);
if (U_BUFFER_OVERFLOW_ERROR == dst_status) {
break; // conversion buffer not large enough to hold result
}
if (U_SUCCESS(dst_status) && to_used < 0) {
dst_status = U_INTERNAL_PROGRAM_ERROR; // ICU internal error
}
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to UTF32 while converting encoding '%s' to unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
// all of 'to_used' since if not enough space in 'to' buffer then would have had U_BUFFER_OVERFLOW_ERROR
while (to_used) {
assert(to_next >= to_next_prev && IRESEARCH_COUNTOF(dst_offsets) > size_t(to_next - to_next_prev));
dst_offsets[to_next - to_next_prev] = buf_dst_next - buf; // remember converted position (start)
++to_next;
--to_used;
}
buf_dst_next += U_IS_SURROGATE(*buf_dst_next) ? 2 : 1; // +1 for 1 char at a time (+2 for surrogate)
} while (buf_dst_next + 1 < buf_dst_end); // +1 for possible surrogate
assert(to_next >= to_next_prev && IRESEARCH_COUNTOF(dst_offsets) > size_t(to_next - to_next_prev));
dst_offsets[to_next - to_next_prev] = buf_dst_next - buf; // remember past-end position
auto buf_pos = dst_offsets[to_next - to_next_prev];
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(src_offsets) > size_t(buf_pos));
from_next = from_next_prev + src_offsets[buf_pos]; // update successfully converted
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
int codecvt32_facet::do_max_length() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing maximum number of required input characters from encoding '%s' to produce a single output character",
context_encoding().c_str()
);
return -1;
}
auto bytes_per_utf16 = ucnv_getMaxCharSize(ctx->converter_.get());
// *2 for UTF16->UTF32 conversion if can't fit each encoded character into a utf16 character
return bytes_per_utf16 <= 2 ? bytes_per_utf16 : (bytes_per_utf16 * 2);
}
std::codecvt_base::result codecvt32_facet::do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
size_t offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
ucnv_reset(ctx->converter_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
const UChar* buf_from = buf;
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert one char at a time to track source position to destination position
do {
int32_t buf_used = 0;
static_assert(sizeof(UChar32) == sizeof(intern_type), "sizeof(UChar32) != sizeof(intern_type)");
u_strFromUTF32(
buf_next,
buf_end - buf_next,
&buf_used, // set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow
reinterpret_cast<const UChar32*>(from_next),
1, // 1 char at a time to track source/destination position mapping
&src_status
);
if (U_BUFFER_OVERFLOW_ERROR == src_status) {
break; // conversion buffer not large enough to hold result
}
if (U_SUCCESS(src_status) && buf_used < 0) {
src_status = U_INTERNAL_PROGRAM_ERROR; // ICU internal error
}
if (!U_SUCCESS(src_status)) {
IR_FRMT_WARN(
"failure to convert from UTF32 to UTF16 while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
break; // finish copying all successfully converted
}
assert(buf_next >= buf && IRESEARCH_COUNTOF(offsets) > size_t(buf_next - buf));
offsets[buf_next - buf] = from_next - from; // remember converted position
buf_next += buf_used;
++from_next; // +1 for 1 char at a time
} while (from_next < from_end);
assert(buf_next >= buf && IRESEARCH_COUNTOF(offsets) > size_t(buf_next - buf));
offsets[buf_next - buf] = from_next - from; // remember past-end position
// convert intermediary representation to the desired encoding
ucnv_fromUnicode(
ctx->converter_.get(),
&to_next,
to_end,
&buf_from,
buf_next,
nullptr,
true,
&dst_status
);
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to locale encoding while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
assert(buf_from >= buf && IRESEARCH_COUNTOF(offsets) > size_t(buf_from - buf));
from_next = from + offsets[buf_from - buf]; // update successfully converted
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
return std::codecvt_base::error; // error occured during intermediary conversion
}
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converter between an 'internal' utf8 representation and
/// an 'external' user-specified encoding and an
////////////////////////////////////////////////////////////////////////////////
class codecvt8u_facet: public codecvtu_base<char> {
public:
codecvt8u_facet(converter_pool& converters): codecvtu_base(converters) {}
bool append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const;
protected:
virtual int do_encoding() const NOEXCEPT override { return 0; } // only non-zero for ASCII
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override;
virtual int do_max_length() const NOEXCEPT override;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override;
};
bool codecvt8u_facet::append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const {
struct sink_t: public icu::ByteSink {
std::basic_string<intern_type>& buf_;
bool error_;
sink_t(std::basic_string<intern_type>& buf): buf_(buf), error_(false) {}
virtual void Append(const char* bytes, int32_t n) override {
if (n < 0 || error_) {
error_ = true;
return;
}
buf_.append(bytes, n);
}
} sink(buf);
auto start = buf.size();
value.toUTF8(sink);
if (!sink.error_) {
return true;
}
IR_FRMT_WARN(
"ICU returned invalid string size while converting unicode string to UTF8"
);
buf.resize(start);
return false;
}
std::codecvt_base::result codecvt8u_facet::do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting encoding '%s' to unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
int32_t src_offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
int32_t dst_offsets[IRESEARCH_COUNTOF(buf) * U8_MAX_LENGTH + 1]; // +1 for end
ucnv_reset(ctx->converter_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert from desired encoding to the intermediary representation
ucnv_toUnicode(
ctx->converter_.get(),
&buf_next,
buf_end,
&from_next,
from_end,
src_offsets,
true,
&src_status
);
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from locale encoding to UTF16 while converting encoding '%s' unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_pos = buf_next - buf;
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(src_offsets) > size_t(buf_pos >= 0));
src_offsets[buf_pos] = from_next - from_next_prev; // remember past-end position
auto* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
// convert one char at a time to track source position to destination position
do {
int32_t to_used = 0;
u_strToUTF8(
to_next,
to_end - to_next,
&to_used, // set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow
buf_dst_next,
U_IS_SURROGATE(*buf_dst_next) ? 2 : 1, // 1 char at a time to track source/destination position mapping
&dst_status
);
if (U_BUFFER_OVERFLOW_ERROR == dst_status) {
break; // conversion buffer not large enough to hold result
}
if (U_SUCCESS(dst_status) && to_used < 0) {
dst_status = U_INTERNAL_PROGRAM_ERROR; // ICU internal error
}
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to UTF8 while converting encoding '%s' to unicode system encoding",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
// all of 'to_used' since if not enough space in 'to' buffer then would have had U_BUFFER_OVERFLOW_ERROR
while (to_used) {
assert(to_next >= to_next_prev && IRESEARCH_COUNTOF(dst_offsets) > size_t(to_next - to_next_prev));
dst_offsets[to_next - to_next_prev] = buf_dst_next - buf; // remember converted position (start)
++to_next;
--to_used;
}
buf_dst_next += U_IS_SURROGATE(*buf_dst_next) ? 2 : 1; // +1 for 1 char at a time (+2 for surrogate)
} while (buf_dst_next + 1 < buf_dst_end); // +1 for possible surrogate
assert(to_next >= to_next_prev && IRESEARCH_COUNTOF(dst_offsets) > size_t(to_next - to_next_prev));
dst_offsets[to_next - to_next_prev] = buf_dst_next - buf; // remember past-end position
auto buf_dst_pos = dst_offsets[to_next - to_next_prev];
assert(buf_dst_pos >= 0 && IRESEARCH_COUNTOF(src_offsets) > size_t(buf_dst_pos));
from_next = from_next_prev + src_offsets[buf_dst_pos]; // update successfully converted
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
int codecvt8u_facet::do_max_length() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing maximum number of required input characters from encoding '%s' to produce a single output character",
context_encoding().c_str()
);
return -1;
}
return ucnv_getMaxCharSize(ctx->converter_.get()); // fo non-ASCII this will produce 2+ UTF8 encoded chars
}
std::codecvt_base::result codecvt8u_facet::do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
size_t offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
ucnv_reset(ctx->converter_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert one char at a time to track source position to destination position
do {
size_t from_size = 1;
int32_t buf_used = 0;
if (!U8_IS_SINGLE(*from_next)) {
// find all the tail UTF8 chars if possible
for (auto* from_tail = from_next + 1;
from_tail < from_end && U8_IS_TRAIL(*from_tail);
++from_tail) {
++from_size;
}
}
u_strFromUTF8(
buf_next,
buf_end - buf_next,
&buf_used, // set to the number of output units corresponding to the transformation of all the input units, even in case of a buffer overflow
from_next,
from_size, // 1 char at a time to track source/destination position mapping
&src_status
);
if (U_BUFFER_OVERFLOW_ERROR == src_status) {
break; // conversion buffer not large enough to hold result
}
if (U_SUCCESS(src_status) && buf_used < 0) {
src_status = U_INTERNAL_PROGRAM_ERROR; // ICU internal error
}
if (!U_SUCCESS(src_status)) {
IR_FRMT_WARN(
"failure to convert from UTF8 to UTF16 while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
break; // finish copying all successfully converted
}
// all of 'buf_used' since if not enough space in 'buf' buffer then would have had U_BUFFER_OVERFLOW_ERROR
while (buf_used) {
assert(buf_next >= buf && IRESEARCH_COUNTOF(offsets) > size_t(buf_next - buf));
offsets[buf_next - buf] = from_next - from; // remember converted position
++buf_next;
--buf_used;
}
from_next += from_size; // +1 for 1 char at a time
} while (from_next + 3 < from_end); // +3 for possible surrogates
auto buf_pos = buf_next - buf;
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_pos));
offsets[buf_pos] = from_next - from_next_prev; // remember past-end position
const UChar* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
// convert intermediary representation to the desired encoding
ucnv_fromUnicode(
ctx->converter_.get(),
&to_next,
to_end,
&buf_dst_next,
buf_dst_end,
nullptr,
true,
&dst_status
);
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to locale encoding while converting unicode system encoding to encoding '%s'",
context_encoding().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_dst_pos = buf_dst_next - buf;
assert(buf_dst_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_dst_pos));
from_next = from_next_prev + offsets[buf_dst_pos]; // update successfully converted
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
return std::codecvt_base::error; // error occured during intermediary conversion
}
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converter between an 'internal' utf8/utf16/uf32 representation,
/// based on sizeof(wchar_t), and
/// an 'external' user-specified encoding
////////////////////////////////////////////////////////////////////////////////
class codecvtwu_facet: public std::codecvt<wchar_t, char, mbstate_t> {
public:
codecvtwu_facet(converter_pool& pool): impl_(pool) {}
bool append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const {
static_assert(sizeof(impl_t::intern_type) == sizeof(intern_type), "sizeof(impl_t::intern_type) != sizeof(intern_type)");
return impl_.append(
reinterpret_cast<std::basic_string<impl_t::intern_type>&>(buf),
value
);
}
protected:
virtual bool do_always_noconv() const NOEXCEPT override {
return impl_.always_noconv();
}
virtual int do_encoding() const NOEXCEPT override {
return impl_.encoding();
}
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override {
static_assert(sizeof(impl_t::intern_type) == sizeof(intern_type), "sizeof(impl_t::intern_type) != sizeof(intern_type)");
return impl_.in(
state,
from,
from_end,
from_next,
reinterpret_cast<impl_t::intern_type*>(to),
reinterpret_cast<impl_t::intern_type*>(to_end),
reinterpret_cast<impl_t::intern_type*&>(to_next)
);
}
virtual int do_length(
state_type& state,
const extern_type* from,
const extern_type* from_end,
std::size_t max
) const override {
return impl_.length(state, from, from_end, max);
}
virtual int do_max_length() const NOEXCEPT override {
return impl_.max_length();
}
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override {
static_assert(sizeof(impl_t::intern_type) == sizeof(intern_type), "sizeof(impl_t::intern_type) != sizeof(intern_type)");
return impl_.out(
state,
reinterpret_cast<const impl_t::intern_type*>(from),
reinterpret_cast<const impl_t::intern_type*>(from_end),
reinterpret_cast<const impl_t::intern_type*&>(from_next),
to,
to_end,
to_next
);
}
virtual std::codecvt_base::result do_unshift(
state_type& state,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override {
return impl_.unshift(state, to, to_end, to_next);
}
private:
typedef std::conditional<
sizeof(char32_t) == sizeof(wchar_t),
codecvt32_facet,
std::conditional<
sizeof(char16_t) == sizeof(wchar_t),
codecvt16_facet,
std::conditional<
sizeof(char) == sizeof(wchar_t),
codecvt8u_facet,
void
>::type
>::type
>::type impl_t; // unicode implementation depends on sizeof(whcar_t)
impl_t impl_;
};
////////////////////////////////////////////////////////////////////////////////
/// @brief base implementation for converters between 'internal' representation
/// and an 'external' user-specified encoding (custom internal)
////////////////////////////////////////////////////////////////////////////////
template<typename InternType>
class codecvt_base: public std::codecvt<InternType, char, mbstate_t> {
public:
typedef std::codecvt<InternType, char, mbstate_t> parent_t;
typedef typename parent_t::extern_type extern_type;
typedef typename parent_t::intern_type intern_type;
typedef typename parent_t::state_type state_type;
codecvt_base(converter_pool& converters_int, converter_pool& converters_ext)
: contexts_(POOL_SIZE),
converters_ext_(converters_ext),
converters_int_(converters_int) {
}
protected:
struct context_t {
DECLARE_UNIQUE_PTR(context_t);
std::basic_string<typename parent_t::intern_type> buf_;
converter_pool::ptr converter_ext_;
converter_pool::ptr converter_int_;
static ptr make(converter_pool& pool_int, converter_pool& pool_ext) {
auto ctx = irs::memory::make_unique<context_t>();
if (!ctx) {
return nullptr;
}
ctx->converter_ext_ = pool_ext.get();
ctx->converter_int_ = pool_int.get();
return ctx->converter_ext_ && ctx->converter_int_
? std::move(ctx) : nullptr;
}
};
typedef irs::unbounded_object_pool<context_t> context_pool;
typename context_pool::ptr context() const {
return contexts_.emplace(converters_int_, converters_ext_);
}
const std::string& context_encoding_ext() const NOEXCEPT {
return converters_ext_.encoding();
}
const std::string& context_encoding_int() const NOEXCEPT {
return converters_int_.encoding();
}
virtual bool do_always_noconv() const NOEXCEPT final override {
return false; // not an identity conversion
}
virtual int do_encoding() const NOEXCEPT override = 0;
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override = 0;
virtual int do_length(
state_type& state,
const extern_type* from,
const extern_type* from_end,
std::size_t max
) const final override;
virtual int do_max_length() const NOEXCEPT override = 0;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override = 0;
virtual std::codecvt_base::result do_unshift(
state_type& state,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const final override;
private:
mutable context_pool contexts_;
converter_pool& converters_ext_;
converter_pool& converters_int_;
};
template<typename InternType>
int codecvt_base<InternType>::do_length(
state_type& state,
const extern_type* from,
const extern_type* from_end,
std::size_t max
) const {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing number of required input characters from encoding '%s' to produce at most '" IR_SIZE_T_SPECIFIER "' system encoding '%s' output characters",
context_encoding_ext().c_str(), max, context_encoding_int().c_str()
);
return std::codecvt_base::error;
}
ctx->buf_.resize(max);
auto* from_next = from;
auto* to = &(ctx->buf_[0]);
auto* to_end = to + max;
auto* to_next = to;
auto res = do_in(state, from, from_end, from_next, to, to_end, to_next);
return res == std::codecvt_base::ok ? std::distance(from, from_next) : 0;
}
template<typename InternType>
std::codecvt_base::result codecvt_base<InternType>::do_unshift(
state_type& state,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
to_next = to;
return std::codecvt_base::ok;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converter between an 'internal' 'system' encoding representation and
/// an 'external' user-specified encoding and an
////////////////////////////////////////////////////////////////////////////////
class codecvt8_facet final: public codecvt_base<char> {
public:
codecvt8_facet(converter_pool& pool_int, converter_pool& pool_ext)
: codecvt_base(pool_int, pool_ext) {
}
bool append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const;
protected:
virtual int do_encoding() const NOEXCEPT override;
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override;
virtual int do_max_length() const NOEXCEPT override;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override;
};
bool codecvt8_facet::append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting unicode string to encoding '%s'",
context_encoding_ext().c_str()
);
return false;
}
auto size = value.length();
if (size < 0) {
IR_FRMT_WARN(
"ICU returned invalid string size while converting unicode string to encoding '%s'",
context_encoding_ext().c_str()
);
return false;
}
auto* from_next = value.getBuffer();
auto* from_end = from_next + size;
auto start = buf.size();
UErrorCode status = U_ZERO_ERROR;
ucnv_reset(ctx->converter_ext_.get());
// convert 'BUFFER_SIZE' at a time
do {
auto offset = buf.size();
buf.resize(buf.size() + BUFFER_SIZE);
auto* to_next = &buf[offset];
auto* to_end = to_next + BUFFER_SIZE;
status = U_ZERO_ERROR;
ucnv_fromUnicode(
ctx->converter_int_.get(),
&to_next,
to_end,
&from_next,
from_end,
nullptr,
true,
&status
);
if (U_SUCCESS(status)) {
buf.resize(to_next - &buf[0]); // truncate to actual data size
return true;
}
} while (status == U_BUFFER_OVERFLOW_ERROR);
IR_FRMT_WARN(
"failure while converting unicode string to encoding '%s'",
context_encoding_ext().c_str()
);
buf.resize(start); // truncate all appended data
return false;
}
int codecvt8_facet::do_encoding() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing number of required input characters from encoding '%s' to produce a single system encoding '%s' output character",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return -1;
}
UErrorCode status = U_ZERO_ERROR;
// the exact number of externT characters that correspond to one internT character, if constant
return ucnv_isFixedWidth(ctx->converter_ext_.get(), &status)
&& ucnv_isFixedWidth(ctx->converter_int_.get(), &status)
? (std::max)(
ucnv_getMinCharSize(ctx->converter_ext_.get()),
ucnv_getMinCharSize(ctx->converter_int_.get())
)
: 0
;
}
std::codecvt_base::result codecvt8_facet::do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
int32_t offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
ucnv_reset(ctx->converter_ext_.get());
ucnv_reset(ctx->converter_int_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert from desired encoding to the intermediary representation
ucnv_toUnicode(
ctx->converter_ext_.get(),
&buf_next,
buf_end,
&from_next,
from_end,
offsets,
true,
&src_status
);
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from locale encoding to UTF16 while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_pos = buf_next - buf;
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_pos));
offsets[buf_pos] = from_next - from_next_prev; // remember past-end position
const UChar* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
// convert from intermediary representation to the internal encoding
ucnv_fromUnicode(
ctx->converter_int_.get(),
&to_next,
to_end,
&buf_dst_next,
buf_dst_end,
nullptr,
true,
&dst_status
);
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to internal encoding while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_dst_pos = buf_dst_next - buf;
assert(buf_dst_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_dst_pos));
from_next = from_next_prev + offsets[buf_dst_pos]; // update successfully converted
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
int codecvt8_facet::do_max_length() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing maximum number of required input characters from encoding '%s' to produce a single system encoding '%s' output character",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return -1;
}
return (std::max)(
ucnv_getMaxCharSize(ctx->converter_ext_.get()),
ucnv_getMaxCharSize(ctx->converter_int_.get())
);
}
std::codecvt_base::result codecvt8_facet::do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
int32_t offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
ucnv_reset(ctx->converter_ext_.get());
ucnv_reset(ctx->converter_int_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert from desired encoding to the intermediary representation
ucnv_toUnicode(
ctx->converter_int_.get(),
&buf_next,
buf_end,
&from_next,
from_end,
offsets,
true,
&src_status
);
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from locale encoding to UTF16 while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_pos = buf_next - buf;
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_pos));
offsets[buf_pos] = from_next - from_next_prev; // remember past-end position
const UChar* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
// convert intermediary representation to the desired encoding
ucnv_fromUnicode(
ctx->converter_ext_.get(),
&to_next,
to_end,
&buf_dst_next,
buf_dst_end,
nullptr,
true,
&dst_status
);
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to locale encoding while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_dst_pos = buf_dst_next - buf;
assert(buf_dst_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_dst_pos));
from_next = from_next_prev + offsets[buf_dst_pos]; // update successfully converted
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
return std::codecvt_base::error; // error occured during intermediary conversion
}
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief converter between an 'internal' 'system' encoding representation and
/// an 'external' user-specified encoding and an
////////////////////////////////////////////////////////////////////////////////
class codecvtw_facet final: public codecvt_base<wchar_t> {
public:
codecvtw_facet(converter_pool& pool_int, converter_pool& pool_ext)
: codecvt_base(pool_int, pool_ext) {
}
bool append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const;
protected:
virtual int do_encoding() const NOEXCEPT override;
virtual std::codecvt_base::result do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const override;
virtual int do_max_length() const NOEXCEPT override;
virtual std::codecvt_base::result do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const override;
};
bool codecvtw_facet::append(
std::basic_string<intern_type>& buf, const icu::UnicodeString& value
) const {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting unicode string to encoding '%s'",
context_encoding_ext().c_str()
);
return false;
}
UErrorCode status = U_ZERO_ERROR;
const auto char_size =
size_t((std::max)(int8_t(1), ucnv_getMinCharSize(ctx->converter_int_.get())));
// cannot support conversion to variable-width system encoding since no way to
// determine internal char size
// cannot support conversion of fixed-width system encoding where
// sizeof(intern_type) < sizeof(<internal char>)
if (!ucnv_isFixedWidth(ctx->converter_int_.get(), &status)
|| sizeof(intern_type) < char_size) {
IR_FRMT_WARN(
"unsupported encoding while converting unicode string encoding '%s'",
context_encoding_ext().c_str()
);
return false;
}
auto size = value.length();
if (size < 0) {
IR_FRMT_WARN(
"ICU returned invalid string size while converting unicode string to encoding '%s'",
context_encoding_ext().c_str()
);
return false;
}
auto* from_next = value.getBuffer();
auto* from_end = from_next + size;
auto start = buf.size();
ucnv_reset(ctx->converter_ext_.get());
// convert 'BUFFER_SIZE' at a time
do {
auto offset = buf.size();
UErrorCode status = U_ZERO_ERROR;
buf.resize(buf.size() + BUFFER_SIZE);
auto* to_next = &buf[offset];
auto* to_end = to_next + BUFFER_SIZE;// * sizeof(intern_type);
static_assert(sizeof(char) == 1, "sizeof(char) != 1"); // otherwise have to divide lower
// convert from intermediary representation to the internal encoding
if (sizeof(intern_type) == char_size) {
auto* buf_to_next = reinterpret_cast<char*>(to_next);
auto* buf_to_end = reinterpret_cast<char*>(to_next + BUFFER_SIZE);
ucnv_fromUnicode(
ctx->converter_ext_.get(),
&buf_to_next,
buf_to_end,
&from_next,
from_end,
nullptr,
true,
&status
);
} else {
intern_type ch = 0;
auto* buf_to = reinterpret_cast<char*>(&ch) + (sizeof(intern_type) - char_size);
auto* buf_to_end = reinterpret_cast<char*>(&ch + 1); // +1 for char after buf
// convert one char at a time and left pad with 0's
while (to_next < to_end) {
auto* buf_to_next = buf_to;
status = U_ZERO_ERROR;
ucnv_fromUnicode(
ctx->converter_int_.get(),
&buf_to_next,
buf_to_end,
&from_next,
from_end,
nullptr,
true,
&status
);
if (!U_SUCCESS(status) && U_BUFFER_OVERFLOW_ERROR != status) {
break;
}
*to_next = ch; // copy over char
++to_next;
ch = 0;
if (U_SUCCESS(status)) {
break; // nothing was converted from source to destination
}
}
}
if (U_SUCCESS(status)) {
buf.resize(to_next - &buf[0]); // truncate to actual data size
return true;
}
} while (status == U_BUFFER_OVERFLOW_ERROR);
IR_FRMT_WARN(
"failure while converting unicode string to encoding '%s'",
context_encoding_ext().c_str()
);
buf.resize(start); // truncate all appended data
return false;
}
int codecvtw_facet::do_encoding() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing number of required input characters from encoding '%s' to produce a single system encoding '%s' output character",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return -1;
}
UErrorCode status = U_ZERO_ERROR;
// the exact number of externT characters that correspond to one internT character, if constant
return ucnv_isFixedWidth(ctx->converter_ext_.get(), &status)
? ucnv_getMinCharSize(ctx->converter_ext_.get()) : 0;
}
std::codecvt_base::result codecvtw_facet::do_in(
state_type& state,
const extern_type* from,
const extern_type* from_end,
const extern_type*& from_next,
intern_type* to,
intern_type* to_end,
intern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error;
}
UErrorCode status = U_ZERO_ERROR;
const auto char_size =
size_t((std::max)(int8_t(1), ucnv_getMinCharSize(ctx->converter_int_.get())));
// cannot support conversion to variable-width system encoding since no way to
// determine internal char size
// cannot support conversion of fixed-width system encoding where
// sizeof(intern_type) < sizeof(<internal char>)
if (!ucnv_isFixedWidth(ctx->converter_int_.get(), &status)
|| sizeof(intern_type) < char_size) {
IR_FRMT_WARN(
"unsupported encoding while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
int32_t offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
ucnv_reset(ctx->converter_ext_.get());
ucnv_reset(ctx->converter_int_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
// convert from desired encoding to the intermediary representation
ucnv_toUnicode(
ctx->converter_ext_.get(),
&buf_next,
buf_end,
&from_next,
from_end,
offsets,
true,
&src_status
);
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from locale encoding to UTF16 while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_pos = buf_next - buf;
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_pos));
offsets[buf_pos] = from_next - from_next_prev; // remember past-end position
const UChar* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
static_assert(sizeof(char) == 1, "sizeof(char) != 1"); // otherwise have to divide lower
// convert from intermediary representation to the internal encoding
if (sizeof(intern_type) == char_size) {
auto* buf_to_next = reinterpret_cast<char*>(to_next);
auto* buf_to_end =
buf_to_next + std::distance(to_next, to_end) * sizeof(intern_type);
ucnv_fromUnicode(
ctx->converter_int_.get(),
&buf_to_next,
buf_to_end,
&buf_dst_next,
buf_dst_end,
nullptr,
true,
&dst_status
);
} else {
intern_type ch = 0;
auto* buf_to = reinterpret_cast<char*>(&ch) + (sizeof(intern_type) - char_size);
auto* buf_to_end = reinterpret_cast<char*>(&ch + 1); // +1 for char after buf
// convert one char at a time and left pad with 0's
while (to_next < to_end) {
auto* buf_to_next = buf_to;
dst_status = U_ZERO_ERROR;
ucnv_fromUnicode(
ctx->converter_int_.get(),
&buf_to_next,
buf_to_end,
&buf_dst_next,
buf_dst_end,
nullptr,
true,
&dst_status
);
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
break;
}
*to_next = ch; // copy over char
++to_next;
ch = 0;
if (U_SUCCESS(dst_status)) {
break; // nothing was converted from source to destination
}
}
}
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to internal encoding while converting encoding '%s' to system encoding '%s'",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_dst_pos = buf_dst_next - buf;
assert(buf_dst_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_dst_pos));
from_next = from_next_prev + offsets[buf_dst_pos]; // update successfully converted
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
int codecvtw_facet::do_max_length() const NOEXCEPT {
auto ctx = context();
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while computing maximum number of required input characters from encoding '%s' to produce a single system encoding '%s' output character",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return -1;
}
UErrorCode status = U_ZERO_ERROR;
const auto char_size =
size_t((std::max)(int8_t(1), ucnv_getMinCharSize(ctx->converter_int_.get())));
// cannot support conversion to variable-width system encoding since no way to
// determine internal char size
// cannot support conversion of fixed-width system encoding where
// sizeof(intern_type) < sizeof(<internal char>)
if (!ucnv_isFixedWidth(ctx->converter_int_.get(), &status)
|| sizeof(intern_type) < char_size) {
IR_FRMT_WARN(
"unsupported encoding while computing maximum number of required input characters from encoding '%s' to produce a single system encoding '%s' output character",
context_encoding_ext().c_str(), context_encoding_int().c_str()
);
return -1;
}
return ucnv_getMaxCharSize(ctx->converter_ext_.get());
}
std::codecvt_base::result codecvtw_facet::do_out(
state_type& state,
const intern_type* from,
const intern_type* from_end,
const intern_type*& from_next,
extern_type* to,
extern_type* to_end,
extern_type*& to_next
) const {
auto ctx = context();
from_next = from;
to_next = to;
if (!ctx) {
IR_FRMT_WARN(
"failure to get conversion context while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error;
}
UErrorCode status = U_ZERO_ERROR;
const auto char_size =
size_t((std::max)(int8_t(1), ucnv_getMinCharSize(ctx->converter_int_.get())));
// cannot support conversion from variable-width system encoding since no way
// to determine internal char size
// cannot support conversion of fixed-width system encoding where
// sizeof(intern_type) < sizeof(<internal char>)
if (!ucnv_isFixedWidth(ctx->converter_int_.get(), &status)
|| sizeof(intern_type) < char_size) {
IR_FRMT_WARN(
"unsupported encoding while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error;
}
UChar buf[BUFFER_SIZE];
auto* buf_end = buf + IRESEARCH_COUNTOF(buf);
int32_t offsets[IRESEARCH_COUNTOF(buf) + 1]; // +1 for end
ucnv_reset(ctx->converter_ext_.get());
ucnv_reset(ctx->converter_int_.get());
// convert 'BUFFER_SIZE' at a time
while (from_next < from_end) {
auto* buf_next = buf;
auto* from_next_prev = from_next;
auto* to_next_prev = to_next;
UErrorCode src_status = U_ZERO_ERROR;
UErrorCode dst_status = U_ZERO_ERROR;
static_assert(sizeof(char) == 1, "sizeof(char) != 1"); // otherwise have to divide lower
// convert from the internal encoding to the intermediary representation
if (sizeof(intern_type) == char_size) {
auto* buf_from_next = reinterpret_cast<const char*>(from_next);
auto* buf_from_end =
buf_from_next + std::distance(from_next, from_end) * sizeof(intern_type);
src_status = U_ZERO_ERROR;
ucnv_toUnicode(
ctx->converter_int_.get(),
&buf_next,
buf_end,
&buf_from_next,
buf_from_end,
offsets,
true,
&src_status
);
} else {
// convert one char at a time
do {
auto* buf_from = reinterpret_cast<const char*>(from_next) + (sizeof(intern_type) - char_size);
auto* buf_from_end = reinterpret_cast<const char*>(from_next + 1); // +1 for char after buf
auto* buf_next_start = buf_next;
auto* buf_from_next = buf_from;
src_status = U_ZERO_ERROR;
ucnv_toUnicode(
ctx->converter_int_.get(),
&buf_next,
buf_end,
&buf_from_next,
buf_from_end,
nullptr,
true,
&src_status
);
if (U_BUFFER_OVERFLOW_ERROR == src_status) {
break; // conversion buffer not large enough to hold result
}
if (!U_SUCCESS(src_status)) {
IR_FRMT_WARN(
"failure to convert from system encoding to UTF16 while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
break; // finish copying all successfully converted
}
assert(buf_next >= buf && IRESEARCH_COUNTOF(offsets) > size_t(buf_next - buf));
while(buf_next_start < buf_next) {
offsets[buf_next_start - buf] = from_next - from; // remember converted position
++buf_next_start;
}
++from_next; // +1 for 1 char at a time
} while(from_next < from_end);
}
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from system encoding to UTF16 while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_pos = buf_next - buf;
assert(buf_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_pos));
offsets[buf_pos] = from_next - from_next_prev; // remember past-end position
const UChar* buf_dst_next = buf;
auto* buf_dst_end = buf_next;
// convert intermediary representation to the desired encoding
ucnv_fromUnicode(
ctx->converter_ext_.get(),
&to_next,
to_end,
&buf_dst_next,
buf_dst_end,
nullptr,
true,
&dst_status
);
if (!U_SUCCESS(dst_status) && U_BUFFER_OVERFLOW_ERROR != dst_status) {
from_next = from_next_prev;
to_next = to_next_prev;
IR_FRMT_WARN(
"failure to convert from UTF16 to locale encoding while converting system encoding '%s' to encoding '%s'",
context_encoding_int().c_str(), context_encoding_ext().c_str()
);
return std::codecvt_base::error; // error occured during final conversion
}
auto buf_dst_pos = buf_dst_next - buf;
assert(buf_dst_pos >= 0 && IRESEARCH_COUNTOF(offsets) > size_t(buf_dst_pos));
from_next = from_next_prev + offsets[buf_dst_pos]; // update successfully converted
if (!U_SUCCESS(src_status) && U_BUFFER_OVERFLOW_ERROR != src_status) {
return std::codecvt_base::error; // error occured during intermediary conversion
}
if (U_BUFFER_OVERFLOW_ERROR == dst_status
|| (U_BUFFER_OVERFLOW_ERROR == src_status && from_next >= from_end)) {
return std::codecvt_base::partial; // destination buffer is not large enough
}
}
return std::codecvt_base::ok;
}
class collate_facet: public std::collate<char> {
// FIXME TODO implement
};
class collatew_facet: public std::collate<wchar_t> {
// FIXME TODO implement
};
class ctype_facet: public std::ctype<char> {
// FIXME TODO implement
};
class ctypew_facet: public std::ctype<wchar_t> {
// FIXME TODO implement
};
class money_get_facet: public std::money_get<char> {
// FIXME TODO implement
};
class money_getw_facet: public std::money_get<wchar_t> {
// FIXME TODO implement
};
class money_put_facet: public std::money_put<char> {
// FIXME TODO implement
};
class money_putw_facet: public std::money_put<wchar_t> {
// FIXME TODO implement
};
class moneypunct_facet: public std::moneypunct<char> {
// FIXME TODO implement
};
class moneypunctintl_facet: public std::moneypunct<char, true> {
// FIXME TODO implement
};
class moneypunctw_facet: public std::moneypunct<wchar_t> {
// FIXME TODO implement
};
class moneypunctwintl_facet: public std::moneypunct<wchar_t, true> {
// FIXME TODO implement
};
class num_get_facet: public std::num_get<char> {
// FIXME TODO implement
};
class num_getw_facet: public std::num_get<wchar_t> {
// FIXME TODO implement
};
template<typename CharType, typename CvtType>
class num_put_facet: public std::num_put<CharType> {
public:
typedef typename std::num_put<CharType>::char_type char_type;
typedef typename std::num_put<CharType>::iter_type iter_type;
num_put_facet(const icu::Locale& locale, const CvtType& converter)
: contexts_(POOL_SIZE), converter_(converter), locale_(locale) {
}
protected:
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, bool value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, long value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, long long value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, unsigned long value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, unsigned long long value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, double value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, long double value
) const override;
virtual iter_type do_put(
iter_type out, std::ios_base& str, char_type fill, const void* value
) const override;
private:
struct context_t {
DECLARE_UNIQUE_PTR(context_t);
std::basic_string<char_type> buf_;
UnicodeString icu_buf0_;
UnicodeString icu_buf1_;
std::unique_ptr<icu::NumberFormat> regular_;
std::unique_ptr<icu::NumberFormat> scientific_; // uppercase (instead of mixed case by default)
static ptr make(const icu::Locale& locale) {
auto ctx = irs::memory::make_unique<context_t>();
if (!ctx) {
return nullptr;
}
UErrorCode status = U_ZERO_ERROR;
ctx->regular_.reset(icu::NumberFormat::createInstance(locale, status));
if (!U_SUCCESS(status) && !ctx->regular_) {
return nullptr;
}
// at least on ICU v55/v57/v59 createScientificInstance(...) will create different,
// (even on the same version but different hosts) and mostly incorrct formats,
// e.g. incorrect decimal or exponent precision
// hence use createInstance(...) and cast to DecimalFormat as per ICU documentation
ctx->scientific_.reset(icu::NumberFormat::createInstance(locale, status));
if (!U_SUCCESS(status) && !ctx->scientific_) {
return nullptr;
}
auto* decimal = dynamic_cast<icu::DecimalFormat*>(ctx->scientific_.get());
if (!decimal) {
return nullptr; // can't set to scientific
}
decimal->setScientificNotation(true);
// uppercase (instead of mixed case with UDisplayContext::UDISPCTX_CAPITALIZATION_NONE)
ctx->scientific_->setContext(UDisplayContext::UDISPCTX_CAPITALIZATION_FOR_STANDALONE, status);
if (!U_SUCCESS(status)) {
return nullptr;
}
return std::move(ctx);
}
void reset(const std::ios_base& str) {
auto grouping =
!std::use_facet<std::numpunct<char_type>>(str.getloc()).grouping().empty();
buf_.clear();
icu_buf0_.truncate(0);
icu_buf1_.truncate(0);
regular_->setGroupingUsed(grouping);
regular_->setMinimumFractionDigits(0);
regular_->setMaximumFractionDigits(0);
scientific_->setGroupingUsed(grouping);
scientific_->setMinimumFractionDigits(0);
scientific_->setMaximumFractionDigits(0);
}
};
typedef irs::unbounded_object_pool<context_t> context_pool;
mutable context_pool contexts_;
const CvtType& converter_;
icu::Locale locale_;
typename context_pool::ptr context() const {
return contexts_.emplace(locale_);
}
template<typename T>
static iter_type do_put_float_hex(
iter_type out, std::ios_base& str, char_type fill, T value
);
template<typename T>
static iter_type do_put_int_hex(
iter_type out, std::ios_base& str, char_type fill, T value, bool full_width
);
template<typename T>
static iter_type do_put_int_oct(
iter_type out, std::ios_base& str, char_type fill, T value
);
static iter_type do_put_int_zero(
iter_type out, std::ios_base& str, char_type fill
);
};
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, bool value
) const {
if (!(str.flags() & std::ios_base::boolalpha)) {
return do_put(out, str, fill, long(value));
}
auto val = value
? std::use_facet<std::numpunct<char_type>>(str.getloc()).truename()
: std::use_facet<std::numpunct<char_type>>(str.getloc()).falsename()
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
for (size_t i = lpad < val.size() ? 0 : lpad - val.size(); i; --i) {
*out++ = fill;
++size;
}
for (size_t i = 0, count = val.size(); i < count; ++i) {
*out++ = val[i];
++size;
}
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, long value
) const {
if (str.flags() & std::ios_base::oct) {
return do_put_int_oct(out, str, fill, (unsigned long)value);
}
if (str.flags() & std::ios_base::hex) {
return do_put_int_hex(out, str, fill, (unsigned long)value, false);
}
// the ICU operations are identical
return do_put(out, str, fill, (long long)value);
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, long long value
) const {
if (str.flags() & std::ios_base::oct) {
return do_put_int_oct(out, str, fill, (unsigned long long)value);
}
if (str.flags() & std::ios_base::hex) {
static_assert(sizeof(uint64_t) == sizeof(unsigned long long), "sizeof(uint64_t) != sizeof(unsigned long long)");
return do_put_int_hex(out, str, fill, (uint64_t)value, false);
}
if (value >= 0) {
return do_put(out, str, fill, (unsigned long long)value);
}
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
auto ctx = context();
if (!ctx) {
throw irs::detailed_io_error(
"failed to retrieve ICU formatter in num_put_facet::do_put(...)"
);
}
static_assert(sizeof(int64_t) == sizeof(long long), "sizeof(int64_t) != sizeof(long long)");
ctx->reset(str);
ctx->regular_->format(int64_t(0 - value), ctx->icu_buf0_);
if (!converter_.append(ctx->buf_, ctx->icu_buf0_)) {
throw irs::detailed_io_error(
"failed to convert data from UTF8 in num_put_facet::do_put(...)"
);
}
size_t len = ctx->buf_.size() + 1; // +1 for '-'
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
*out++ = '-';
++size;
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
for (size_t i = 0, count = ctx->buf_.size(); i < count; ++i) {
*out++ = ctx->buf_[i];
++size;
}
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, unsigned long value
) const {
if (str.flags() & std::ios_base::oct) {
return do_put_int_oct(out, str, fill, (unsigned long)value);
}
if (str.flags() & std::ios_base::hex) {
return do_put_int_hex(out, str, fill, (unsigned long)value, false);
}
// the ICU operations are identical
return do_put(out, str, fill, (unsigned long long)value);
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, unsigned long long value
) const {
if (str.flags() & std::ios_base::oct) {
return do_put_int_oct(out, str, fill, (unsigned long long)value);
}
if (str.flags() & std::ios_base::hex) {
static_assert(sizeof(uint64_t) == sizeof(unsigned long long), "sizeof(uint64_t) != sizeof(unsigned long long)");
return do_put_int_hex(out, str, fill, (uint64_t)value, false);
}
if (!value) {
return do_put_int_zero(out, str,fill); // optimization for '0'
}
if ((unsigned long long)irs::integer_traits<int64_t>::const_max < value) {
throw irs::detailed_io_error(
"value too large while converting data from UTF8 in num_put_facet::do_put(...)"
);
}
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
auto ctx = context();
if (!ctx) {
throw irs::detailed_io_error(
"failed to retrieve ICU formatter in num_put_facet::do_put(...)"
);
}
static_assert(sizeof(int64_t) == sizeof(long long), "sizeof(int64_t) != sizeof(long long)");
ctx->reset(str);
ctx->regular_->format(int64_t(value), ctx->icu_buf0_);
if (!converter_.append(ctx->buf_, ctx->icu_buf0_)) {
throw irs::detailed_io_error(
"failed to convert data from UTF8 in num_put_facet::do_put(...)"
);
}
size_t len = ctx->buf_.size() + (str.flags() & std::ios_base::showpos ? 1 : 0);
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
if (str.flags() & std::ios_base::showpos) {
*out++ = '+';
++size;
}
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
for (size_t i = 0, count = ctx->buf_.size(); i < count; ++i) {
*out++ = ctx->buf_[i];
++size;
}
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, double value
) const {
if ((str.flags() & std::ios_base::floatfield) == (std::ios_base::fixed | std::ios_base::scientific)) {
return do_put_float_hex(out, str, fill, value);
}
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
auto ctx = context();
if (!ctx) {
throw irs::detailed_io_error(
"failed to retrieve ICU formatter in num_put_facet::do_put(...)"
);
}
ctx->reset(str);
ctx->regular_->setMinimumFractionDigits(6); // default 6 as per specification
ctx->regular_->setMaximumFractionDigits(6); // default 6 as per specification
ctx->scientific_->setMinimumFractionDigits(6); // default 6 as per specification
ctx->scientific_->setMaximumFractionDigits(6); // default 6 as per specification
static const UnicodeString point(".");
icu::UnicodeString* icu_buf;
bool negative = false;
if (value < 0) {
value = 0 - value;
negative = true;
}
if ((str.flags() & std::ios_base::floatfield) == std::ios_base::fixed) {
icu::FieldPosition decimal(UNumberFormatFields::UNUM_DECIMAL_SEPARATOR_FIELD);
// Decimal floating point, lowercase
ctx->regular_->format(value, ctx->icu_buf0_, decimal);
icu_buf = &ctx->icu_buf0_;
if ((str.flags() & std::ios_base::showpoint)
&& !decimal.getBeginIndex() && !decimal.getEndIndex()) { // 0,0 indicates no decimal
icu_buf->append(point); // append at end
}
} else if ((str.flags() & std::ios_base::floatfield) == std::ios_base::scientific) {
icu::FieldPosition decimal(UNumberFormatFields::UNUM_DECIMAL_SEPARATOR_FIELD);
// Scientific notation (mantissa/exponent), uppercase/lowercase
ctx->scientific_->format(value, ctx->icu_buf0_, decimal);
icu_buf = &ctx->icu_buf0_;
if ((str.flags() & std::ios_base::showpoint)
&& !decimal.getBeginIndex() && !decimal.getEndIndex()) { // 0,0 indicates no decimal
icu_buf->insert(icu_buf->length() - 2, point); // -2 to insert before 'e0'
}
} else {
icu::FieldPosition decimal_r(UNumberFormatFields::UNUM_DECIMAL_SEPARATOR_FIELD);
icu::FieldPosition decimal_s(UNumberFormatFields::UNUM_DECIMAL_SEPARATOR_FIELD);
// set the maximum number of significant digits to be printed (as per spec)
ctx->regular_->setMinimumFractionDigits(0);
ctx->regular_->setMaximumFractionDigits(str.precision());
ctx->scientific_->setMinimumFractionDigits(0);
ctx->scientific_->setMaximumFractionDigits(str.precision());
// Use the shortest representation:
// Decimal floating point
// Scientific notation (mantissa/exponent), uppercase/lowercase
ctx->regular_->format(value, ctx->icu_buf0_, decimal_r);
ctx->scientific_->format(value, ctx->icu_buf1_, decimal_s);
if ((str.flags() & std::ios_base::showpoint)) {
if (!decimal_r.getBeginIndex() && !decimal_r.getEndIndex()) { // 0,0 indicates no decimal
ctx->icu_buf0_.append(point); // append at end
}
if (!decimal_s.getBeginIndex() && !decimal_s.getEndIndex()) { // 0,0 indicates no decimal
ctx->icu_buf1_.insert(ctx->icu_buf1_.length() - 2, point); // -2 to insert before 'e0'
}
}
icu_buf = ctx->icu_buf1_.length() < ctx->icu_buf1_.length()
? &ctx->icu_buf1_ : &ctx->icu_buf0_;
}
// ensure all letters are uppercased/lowercased
if (!(str.flags() & std::ios_base::uppercase)) {
icu_buf->toLower();
}
if (!converter_.append(ctx->buf_, *icu_buf)) {
throw irs::detailed_io_error(
"failed to convert data from UTF8 in num_put_facet::do_put(...)"
);
}
size_t len = ctx->buf_.size()
+ (negative || (str.flags() & std::ios_base::showpos) ? 1 : 0);
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
if (negative) {
*out++ = '-';
++size;
} else if (str.flags() & std::ios_base::showpos) {
*out++ = '+';
++size;
}
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
for (size_t i = 0, count = ctx->buf_.size(); i < count; ++i) {
*out++ = ctx->buf_[i];
++size;
}
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, long double value
) const {
if ((str.flags() & std::ios_base::floatfield) == (std::ios_base::fixed | std::ios_base::scientific)) {
return do_put_float_hex(out, str, fill, value);
}
// the ICU operations are identical (with lower precision)
return do_put(out, str, fill, (double)value);
}
template<typename CharType, typename CvtType>
typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put(
iter_type out, std::ios_base& str, char_type fill, const void* value
) const {
return do_put_int_hex(out, str, fill, size_t(value), true);
}
template<typename CharType, typename CvtType>
template<typename T>
/*static*/ typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put_float_hex(
iter_type out, std::ios_base& str, char_type fill, T value
) {
typedef typename std::enable_if<std::is_floating_point<T>::value, T>::type type;
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
static auto mantissa_bits = std::numeric_limits<type>::digits;
static const char lower[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
static const char upper[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
auto* table = str.flags() & std::ios_base::uppercase ? upper : lower;
bool negative = false;
if (value < 0) {
value = 0 - value;
negative = true;
}
// optimization for '0'
if (!value) {
size_t len = 6 // 0x0p+0
+ ((str.flags() & std::ios_base::showpos) ? 1 : 0)
+ ((str.flags() & std::ios_base::showpoint) ? 1 : 0)
;
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
// if a sign character occurs in the representation, will pad after the sign
if (str.flags() & std::ios_base::showpos) {
*out++ = '+';
++size;
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++len; // subtract from 'ipad'
++size;
}
}
*out++ = '0'; // hexadecimal prefix
*out++ = str.flags() & std::ios_base::uppercase ? 'X' : 'x';
size += 2;
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
*out++ = '0';
++size;
if (str.flags() & std::ios_base::showpoint) {
*out++ = '.';
++size;
}
*out++ = str.flags() & std::ios_base::uppercase ? 'P' : 'p';
*out++ = '+';
*out++ = '0';
size += 3;
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
int exponent;
auto mantissa_f = std::frexp(value, &exponent);
auto mantissa_i = size_t(std::ldexp(mantissa_f, mantissa_bits));
int half_byte = sizeof(size_t) * 2;
// strip leading/trailing zero half-bytes
{
static_assert(std::numeric_limits<size_t>::digits < irs::integer_traits<int>::const_max, "std::numeric_limits<size_t>::digits >= std::numeric_limits<int>::max()");
auto clz = int(irs::math::math_traits<size_t>::clz(mantissa_i));
auto ctz = int(irs::math::math_traits<size_t>::ctz(mantissa_i));
exponent -= 4 - (clz % 4); // number of bits used in the first half-byte
half_byte -= clz / 4; // 4 for half-byte
half_byte -= ctz / 4; // 4 for half-byte
mantissa_i >>= ctz & ~size_t(0x3); // (ctz / 4) * 4
}
auto exp_str = std::to_string(exponent);
size_t len = half_byte
+ 4 // for 0x...p+
+ (negative || (str.flags() & std::ios_base::showpos) ? 1 : 0)
+ (!half_byte || (str.flags() & std::ios_base::showpoint) ? 1 : 0)
+ exp_str.size()
;
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
// if a sign character occurs in the representation, will pad after the sign
if (negative || (str.flags() & std::ios_base::showpos)) {
*out++ = negative ? '-' : '+';
++size;
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++len; // subtract from 'ipad'
++size;
}
}
*out++ = '0'; // hexadecimal prefix
*out++ = str.flags() & std::ios_base::uppercase ? 'X' : 'x';
size += 2;
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
bool started = false;
while(half_byte) {
auto val = (mantissa_i >> (--half_byte * 4)) & 0xF;
*out++ = table[val];
++size;
if (!started) {
started = true;
if (half_byte || (str.flags() & std::ios_base::showpoint)) {
*out++ = '.';
++size;
}
}
}
*out++ = str.flags() & std::ios_base::uppercase ? 'P' : 'p'; // exponent suffix
*out++ = '+';
size += 2;
for (size_t i = 0, count = exp_str.size(); i < count; ++i) {
*out++ = exp_str[i];
++size;
}
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
template<typename T>
/*static*/ typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put_int_hex(
iter_type out, std::ios_base& str, char_type fill, T value, bool full_width
) {
typedef typename std::enable_if<std::is_unsigned<T>::value, T>::type type;
if (!value && !full_width) {
return do_put_int_zero(out, str, fill); // optimization for '0'
}
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
static const char lower[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
static const char upper[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
auto* table = str.flags() & std::ios_base::uppercase ? upper : lower;
auto val = irs::numeric_utils::numeric_traits<type>::hton(value);
auto* v = reinterpret_cast<uint8_t*>(&val);
bool started = false;
size_t len = sizeof(val) * 2 // *2 for hi+lo
+ (str.flags() & std::ios_base::showpos ? 1 : 0)
+ (str.flags() & std::ios_base::showbase ? 2 : 0)
;
for (auto i = sizeof(type); i; --i, ++v) {
if (started) {
*out++ = table[*v >> 4];
*out++ = table[*v & 0xF];
size += 2;
continue;
}
if (!*v && !full_width) {
len -= 2; // 2 for hi+lo
continue;
}
auto hi = *v >> 4;
auto lo = *v & 0xF;
len -= hi || full_width ? 0 : 1;
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
// if a sign character occurs in the representation, will pad after the sign
if (str.flags() & std::ios_base::showpos) {
*out++ = '+';
++size;
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++len; // subtract from 'ipad'
++size;
}
}
// else if representation began with 0x or 0X, will pad after the x or X
if (str.flags() & std::ios_base::showbase) {
*out++ = '0'; // hexadecimal prefix
*out++ = str.flags() & std::ios_base::uppercase ? 'X' : 'x';
size += 2;
}
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
if (hi || full_width) {
*out++ = table[hi];
++size;
}
*out++ = table[lo];
++size;
started = true;
}
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
template<typename T>
/*static*/ typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put_int_oct(
iter_type out, std::ios_base& str, char_type fill, T value
) {
typedef typename std::enable_if<std::is_unsigned<T>::value, T>::type type;
if (!value) {
return do_put_int_zero(out, str, fill); // optimization for '0'
}
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
static const char table[] = { '0', '1', '2', '3', '4', '5', '6', '7' };
size_t shift = (sizeof(type) * 8 / 3) + 1; // shift in blocks of 3 bits, +1 for initial decrement
bool started = false;
size_t len = shift
+ (str.flags() & std::ios_base::showpos ? 1 : 0)
+ (str.flags() & std::ios_base::showbase ? 1 : 0)
;
do {
auto v = (value >> (--shift * 3)) & 0x7; // shift in blocks of 3 bits
if (started) {
*out++ = table[v];
++size;
continue;
}
if (!v) {
--len;
continue;
}
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
if (str.flags() & std::ios_base::showpos) {
*out++ = '+';
++size;
}
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++len; // subtract from 'ipad'
++size;
}
if (str.flags() & std::ios_base::showbase) {
*out++ = '0'; // octal prefix
++size;
}
*out++ = table[v];
++size;
started = true;
} while (shift);
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
template<typename CharType, typename CvtType>
/*static*/ typename num_put_facet<CharType, CvtType>::iter_type num_put_facet<CharType, CvtType>::do_put_int_zero(
iter_type out, std::ios_base& str, char_type fill
) {
auto ipad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::internal
? str.width() : size_t(0)
;
auto rpad = (str.flags() & std::ios_base::adjustfield) == std::ios_base::left
? str.width() : size_t(0)
;
auto lpad = !ipad && !rpad ? str.width() : size_t(0);
size_t size = 0;
str.width(0); // reset padding
size_t len = strlen("0") + (str.flags() & std::ios_base::showpos ? 1 : 0);
for (size_t i = lpad < len ? 0 : lpad - len; i; --i) {
*out++ = fill;
++size;
}
if (str.flags() & std::ios_base::showpos) {
*out++ = '+';
++size;
}
for (size_t i = ipad < len ? 0 : ipad - len; i; --i) {
*out++ = fill;
++size;
}
*out++ = '0';
++size;
for (size_t i = rpad < size ? 0 : rpad - size; i; --i) {
*out++ = fill;
}
return out;
}
class num_putw_facet: public std::num_put<wchar_t> {
// FIXME TODO implement
};
class numpunct_facet: public std::numpunct<char> {
// FIXME TODO implement
};
class numpunctw_facet: public std::numpunct<wchar_t> {
// FIXME TODO implement
};
class time_get_facet: public std::time_get<char> {
// FIXME TODO implement
};
class time_getw_facet: public std::time_get<wchar_t> {
// FIXME TODO implement
};
class time_put_facet: public std::time_put<char> {
// FIXME TODO implement
};
class time_putw_facet: public std::time_put<wchar_t> {
// FIXME TODO implement
};
class messages_facet: public std::messages<char> {
// FIXME TODO implement
};
class messagesw_facet: public std::messages<wchar_t> {
// FIXME TODO implement
};
// -----------------------------------------------------------------------------
// --SECTION-- custom std::locale facets
// -----------------------------------------------------------------------------
class locale_info_facet: public std::locale::facet {
public:
static std::locale::id id; // required for each class derived from std::locale::facet as per spec
locale_info_facet(const irs::string_ref& name);
locale_info_facet(locale_info_facet const& other) = delete; // because of string_ref
locale_info_facet(locale_info_facet&& other) NOEXCEPT { *this = std::move(other); }
locale_info_facet& operator=(const locale_info_facet& other) = delete; // because of string_ref
locale_info_facet& operator=(locale_info_facet&& other) NOEXCEPT;
bool operator<(const locale_info_facet& other) const NOEXCEPT { return name_ < other.name_; }
const irs::string_ref& country() const NOEXCEPT { return country_; }
const irs::string_ref& encoding() const NOEXCEPT { return encoding_; }
const irs::string_ref& language() const NOEXCEPT { return language_; }
const std::string& name() const NOEXCEPT { return name_; }
bool unicode() const NOEXCEPT { return unicode_t::NONE != unicode_; }
bool utf8() const NOEXCEPT { return unicode_t::UTF8 == unicode_; }
const irs::string_ref& variant() const NOEXCEPT { return variant_; }
private:
enum class unicode_t { NONE, UTF7, UTF8, UTF16, UTF32 };
std::string name_; // the normalized locale name: language[_COUNTRY][.encoding][@variant]
irs::string_ref country_;
irs::string_ref encoding_;
irs::string_ref language_;
irs::string_ref variant_;
unicode_t unicode_;
};
/*static*/ std::locale::id locale_info_facet::id;
//////////////////////////////////////////////////////////////////////////////
/// The name has the following format: language[_COUNTRY][.encoding][@variant]
/// Where 'language' is ISO-639 language code like "en" or "ru",
/// 'COUNTRY' is ISO-3166 country identifier like "US" or "RU",
/// 'encoding' is a charracter set name like "UTF-8" or "ISO-8859-1",
/// 'variant' is backend specific variant like "euro" or "calendar=hebrew"
//////////////////////////////////////////////////////////////////////////////
locale_info_facet::locale_info_facet(const irs::string_ref& name)
: name_(name),
country_(""),
encoding_("us-ascii"),
language_("C"),
variant_(""),
unicode_(unicode_t::NONE) { // us-ascii is not unicode
if (name_ == "C") {
return;
}
if (name_.empty() || name_ == "c") {
name_ = "C"; // uppercase 'classic' locale name
return;
}
auto data = &name_[0];
std::transform(data, data + name_.size(), data, ::tolower); // lowercase full string
auto length = ::strcspn(data, "-_.@");
language_ = irs::string_ref(data, length);
data += length;
// found country
if ('-' == data[0] || '_' == data[0]) {
++data;
length = ::strcspn(data, ".@");
country_ = irs::string_ref(data, length);
std::transform(data, data + length, data, ::toupper); // uppercase country
data += length;
}
// found encoding
if ('.' == data[0]) {
++data;
length = ::strcspn(data, "@");
encoding_ = irs::string_ref(data, length);
data += length;
// normalize encoding and compare to 'utf8' (data already in lower case)
std::string buf = encoding_;
auto* str = &buf[0];
auto end = std::remove_if(
str, str + buf.size(),
[](char x){ return !(('0' <= x && '9' >= x) || ('a' <= x && 'z' >= x)); }
);
irs::string_ref enc(str, std::distance(str, end));
if (enc == "utf7") {
unicode_ = unicode_t::UTF7;
} else if (enc == "utf8") {
unicode_ = unicode_t::UTF8;
} else if (enc == "utf16") {
unicode_ = unicode_t::UTF16;
} else if (enc == "utf32") {
unicode_ = unicode_t::UTF32;
}
}
// found variant
if ('@' == data[0]) {
++data;
variant_ = data;
}
}
locale_info_facet& locale_info_facet::operator=(
locale_info_facet&& other
) NOEXCEPT {
if (this != &other) {
const char* start = &(other.name_[0]);
const char* end = start + other.name_.size();
name_ = std::move(other.name_); // move first since string_ref point into it
country_ = other.country_.c_str() < start || other.country_.c_str() >= end
? other.country_ // does not point into 'name_'
: irs::string_ref(
&name_[0] + std::distance(start, other.country_.c_str()),
other.country_.size()
)
;
encoding_ = other.encoding_.c_str() < start || other.encoding_.c_str() >= end
? other.encoding_ // does not point into 'name_'
: irs::string_ref(
&name_[0] + std::distance(start, other.encoding_.c_str()),
other.encoding_.size()
)
;
language_ = other.language_.c_str() < start || other.language_.c_str() >= end
? other.language_ // does not point into 'name_'
: irs::string_ref(
&name_[0] + std::distance(start, other.language_.c_str()),
other.language_.size()
)
;
variant_ = other.variant_.c_str() < start || other.variant_.c_str() >= end
? other.variant_ // does not point into 'name_'
: irs::string_ref(
&name_[0] + std::distance(start, other.variant_.c_str()),
other.variant_.size()
)
;
unicode_ = other.unicode_;
other.country_ = irs::string_ref::NIL;
other.encoding_ = irs::string_ref::NIL;
other.language_ = irs::string_ref::NIL;
other.variant_ = irs::string_ref::NIL;
other.unicode_ = unicode_t::NONE;
}
return *this;
}
const std::locale& get_locale(
const irs::string_ref& name, bool forceUnicodeSystem = true
) {
struct less_t {
bool operator()(
const locale_info_facet* lhs, const locale_info_facet* rhs
) const NOEXCEPT {
return (!lhs && rhs) || (lhs && rhs && *lhs < *rhs);
}
};
auto unicodeSystem =
forceUnicodeSystem || locale_info_facet(system_encoding()).unicode();
locale_info_facet info(name);
static std::map<locale_info_facet*, std::locale, less_t> locales_s;
static std::map<locale_info_facet*, std::locale, less_t> locales_u;
auto& locales = unicodeSystem ? locales_u : locales_s;
static std::mutex mutex;
SCOPED_LOCK(mutex);
auto itr = locales.find(&info);
if (itr != locales.end()) {
return itr->second;
}
// Boost locales always assume system is unicode
boost::locale::generator locale_genrator; // stateful object, cannot be static
icu::Locale icu_locale(
std::string(info.language()).c_str(),
std::string(info.country()).c_str(),
std::string(info.variant()).c_str()
);
if (icu_locale.isBogus()) {
IR_FRMT_WARN("locale '%s' is not supported by ICU", info.name().c_str());
}
std::locale boost_locale;
// FIXME TODO this is a workaround for boost throwning exceptions for
// unsupported encodings which are overriden below anyway
try {
boost_locale = locale_genrator.generate(info.name());
} catch(...) {
if (info.encoding().c_str() < info.name().c_str()
|| info.encoding().c_str() >= info.name().c_str() + info.name().size()) {
throw;
}
auto boost_locale_name = info.name();
boost_locale_name.erase(
info.encoding().c_str() - info.name().c_str() - 1, // -1 for '_'
info.encoding().size() + 1 // +1 for '_'
); // skip encoding
boost_locale = locale_genrator.generate(boost_locale_name);
}
auto locale_info =
irs::memory::make_unique<locale_info_facet>(std::move(info));
auto* locale_info_ptr = locale_info.get();
auto& converter = get_converter(locale_info->encoding());
auto locale = std::locale(boost_locale, locale_info.release());
locale = std::locale(
locale, irs::memory::make_unique<codecvt16_facet>(converter).release()
);
locale = std::locale(
locale, irs::memory::make_unique<codecvt32_facet>(converter).release()
);
if (unicodeSystem) {
auto cvt8 = irs::memory::make_unique<codecvt8u_facet>(converter);
auto cvtw = irs::memory::make_unique<codecvtwu_facet>(converter);
locale = std::locale(
locale,
irs::memory::make_unique<num_put_facet<char,codecvt8u_facet>>(icu_locale, *cvt8).release()
);
locale = std::locale(
locale,
irs::memory::make_unique<num_put_facet<wchar_t, codecvtwu_facet>>(icu_locale, *cvtw).release()
);
locale = std::locale(locale, cvt8.release());
locale = std::locale(locale, cvtw.release());
} else {
auto& converter_int = get_converter(system_encoding());
auto cvt8 = irs::memory::make_unique<codecvt8_facet>(converter_int, converter);
auto cvtw = irs::memory::make_unique<codecvtw_facet>(converter_int, converter);
locale = std::locale(
locale,
irs::memory::make_unique<num_put_facet<char, codecvt8_facet>>(icu_locale, *cvt8).release()
);
locale = std::locale(
locale,
irs::memory::make_unique<num_put_facet<wchar_t, codecvtw_facet>>(icu_locale, *cvtw).release()
);
locale = std::locale(locale, cvt8.release());
locale = std::locale(locale, cvtw.release());
}
return locales.emplace(locale_info_ptr, locale).first->second;
}
NS_END
NS_ROOT
NS_BEGIN( locale_utils )
#if defined(_MSC_VER) && _MSC_VER <= 1800 && defined(IRESEARCH_DLL) // MSVC2013
// MSVC2013 does not properly export
// std::codecvt<char32_t, char, mbstate_t>::id for shared libraries
template<>
const std::codecvt<char32_t, char, mbstate_t>& codecvt(
std::locale const& locale
) {
return std::use_facet<std::codecvt<char32_t, char, mbstate_t>>(locale);
}
#elif defined(_MSC_VER) && _MSC_VER <= 1916 // MSVC2015/MSVC2017
// MSVC2015/MSVC2017 implementations do not support char16_t/char32_t 'codecvt'
// due to a missing export, as per their comment:
// This is an active bug in our database (VSO#143857), which we'll investigate
// for a future release, but we're currently working on higher priority things
template<>
const std::codecvt<char16_t, char, mbstate_t>& codecvt(
std::locale const& locale
) {
return std::use_facet<codecvt16_facet>(locale);
}
template<>
const std::codecvt<char32_t, char, mbstate_t>& codecvt(
std::locale const& locale
) {
return std::use_facet<codecvt32_facet>(locale);
}
#endif
const irs::string_ref& country(std::locale const& locale) {
auto* loc = &locale;
if (!std::has_facet<locale_info_facet>(*loc)) {
loc = &get_locale(loc->name());
}
return std::use_facet<locale_info_facet>(*loc).country();
}
const irs::string_ref& encoding(std::locale const& locale) {
auto* loc = &locale;
if (!std::has_facet<locale_info_facet>(*loc)) {
loc = &get_locale(loc->name());
}
return std::use_facet<locale_info_facet>(*loc).encoding();
}
const irs::string_ref& language(std::locale const& locale) {
auto* loc = &locale;
if (!std::has_facet<locale_info_facet>(*loc)) {
loc = &get_locale(loc->name());
}
return std::use_facet<locale_info_facet>(*loc).language();
}
std::locale locale(
irs::string_ref const& name,
irs::string_ref const& encodingOverride /*= irs::string_ref::NIL*/,
bool forceUnicodeSystem /*= true*/
) {
if (encodingOverride.null()) {
return get_locale(name, forceUnicodeSystem);
}
locale_info_facet info(name);
std::string locale_name = info.language();
if (!info.country().empty()) {
locale_name.append(1, '_').append(info.country());
}
if (!encodingOverride.empty()) {
locale_name.append(1, '.').append(encodingOverride);
}
if (!info.variant().empty()) {
locale_name.append(1, '@').append(info.variant());
}
return get_locale(locale_name, forceUnicodeSystem);
}
const std::string& name(std::locale const& locale) {
auto* loc = &locale;
if (!std::has_facet<locale_info_facet>(*loc)) {
loc = &get_locale(loc->name());
}
return std::use_facet<locale_info_facet>(*loc).name();
}
bool utf8(std::locale const& locale) {
auto* loc = &locale;
if (!std::has_facet<locale_info_facet>(*loc)) {
loc = &get_locale(loc->name());
}
return std::use_facet<locale_info_facet>(*loc).utf8();
}
NS_END // locale_utils
NS_END
// -----------------------------------------------------------------------------
// --SECTION-- END-OF-FILE
// -----------------------------------------------------------------------------