mirror of https://gitee.com/bigwinds/arangodb
Bug fix 3.4/arangorestore add cleanup duplicate attributes (#7876)
This commit is contained in:
parent
1ad9ae3115
commit
a4a7867451
|
@ -1,6 +1,13 @@
|
|||
v3.4.2 (XXXX-XX-XX)
|
||||
-------------------
|
||||
|
||||
* added arangorestore option `--cleanup-duplicate-attributes` to clean up input documents
|
||||
with redundant attribute names
|
||||
|
||||
Importing such documents without the option set will make arangorestore fail with an
|
||||
error, and setting the option will make the restore process clean up the input by using
|
||||
just the first specified value for each redundant attribute.
|
||||
|
||||
* the arangorestore options `--default-number-of-shards` and `--default-replication-factor`
|
||||
are now deprecated in favor of the much more powerful options `--number-of-shards`
|
||||
and `--replication-factor`
|
||||
|
|
|
@ -129,26 +129,13 @@ Encryption
|
|||
|
||||
See [Arangodump](../Arangodump/Examples.md#encryption) for details.
|
||||
|
||||
Restoring Revision IDs and Collection IDs
|
||||
-----------------------------------------
|
||||
|
||||
_arangorestore_ will reload document and edges data with the exact same *_key*, *_from* and
|
||||
*_to* values found in the input directory. However, when loading document data, it will assign
|
||||
its own values for the *_rev* attribute of the reloaded documents. Though this difference is
|
||||
intentional (normally, every server should create its own *_rev* values) there might be
|
||||
situations when it is required to re-use the exact same *_rev* values for the reloaded data.
|
||||
This can be achieved by setting the *--recycle-ids* parameter to *true*:
|
||||
|
||||
arangorestore --collection myusers --collection myvalues --recycle-ids true --input-directory "dump"
|
||||
|
||||
Note that setting *--recycle-ids* to *true* will also cause collections to be (re-)created in
|
||||
the target database with the exact same collection id as in the input directory. Any potentially
|
||||
existing collection in the target database with the same collection id will then be dropped.
|
||||
|
||||
Reloading Data into a different Collection
|
||||
------------------------------------------
|
||||
|
||||
With some creativity you can use _arangodump_ and _arangorestore_ to transfer data from one
|
||||
_arangorestore_ will restore document and edges data with the exact same *_key*, *_rev*, *_from*
|
||||
and *_to* values as found in the input directory.
|
||||
|
||||
With some creativity you can also use _arangodump_ and _arangorestore_ to transfer data from one
|
||||
collection into another (either on the same server or not). For example, to copy data from
|
||||
a collection *myvalues* in database *mydb* into a collection *mycopyvalues* in database *mycopy*,
|
||||
you can start with the following command:
|
||||
|
@ -229,6 +216,6 @@ For restore this short overview is sufficient:
|
|||
|
||||
- When importing into an existing database, the given user needs `Administrate`
|
||||
access on this database.
|
||||
- When creating a new Database during restore, the given user needs `Administrate`
|
||||
- When creating a new database during restore, the given user needs `Administrate`
|
||||
access on `_system`. The user will be granted `Administrate` access on the
|
||||
newly created database.
|
||||
|
|
|
@ -273,11 +273,12 @@ void RestCollectionHandler::handleCommandPost() {
|
|||
// for some "security" a white-list of allowed parameters
|
||||
VPackBuilder filtered = VPackCollection::keep(
|
||||
body, std::unordered_set<std::string>{
|
||||
"doCompact", "isSystem", "id", "isVolatile", "journalSize",
|
||||
"indexBuckets", "keyOptions", "waitForSync", "cacheEnabled",
|
||||
"doCompact", StaticStrings::DataSourceSystem, StaticStrings::DataSourceId,
|
||||
"isVolatile", "journalSize", "indexBuckets", "keyOptions",
|
||||
StaticStrings::WaitForSyncString, "cacheEnabled",
|
||||
StaticStrings::ShardKeys, StaticStrings::NumberOfShards,
|
||||
StaticStrings::DistributeShardsLike, "avoidServers", "isSmart",
|
||||
"shardingStrategy", "smartGraphAttribute", "replicationFactor",
|
||||
StaticStrings::DistributeShardsLike, "avoidServers", StaticStrings::IsSmart,
|
||||
"shardingStrategy", StaticStrings::GraphSmartGraphAttribute, StaticStrings::ReplicationFactor,
|
||||
"servers"});
|
||||
VPackSlice const parameters = filtered.slice();
|
||||
|
||||
|
|
|
@ -1322,7 +1322,11 @@ Result RestReplicationHandler::processRestoreDataBatch(transaction::Methods& trx
|
|||
std::string const& collectionName) {
|
||||
std::unordered_map<std::string, VPackValueLength> latest;
|
||||
VPackBuilder allMarkers;
|
||||
parseBatch(collectionName, latest, allMarkers);
|
||||
|
||||
Result res = parseBatch(collectionName, latest, allMarkers);
|
||||
if (res.fail()) {
|
||||
return res;
|
||||
}
|
||||
|
||||
// First remove all keys of which the last marker we saw was a deletion
|
||||
// marker:
|
||||
|
@ -1750,6 +1754,7 @@ void RestReplicationHandler::handleCommandRestoreView() {
|
|||
auto nameSlice = slice.get(StaticStrings::DataSourceName);
|
||||
auto typeSlice = slice.get(StaticStrings::DataSourceType);
|
||||
|
||||
|
||||
if (!nameSlice.isString() || !typeSlice.isString()) {
|
||||
generateError(ResponseCode::BAD, TRI_ERROR_BAD_PARAMETER);
|
||||
return;
|
||||
|
@ -1759,7 +1764,7 @@ void RestReplicationHandler::handleCommandRestoreView() {
|
|||
|
||||
try {
|
||||
CollectionNameResolver resolver(_vocbase);
|
||||
auto view = resolver.getView(nameSlice.toString());
|
||||
auto view = resolver.getView(nameSlice.copyString());
|
||||
|
||||
if (view) {
|
||||
if (!overwrite) {
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
|
||||
#include <velocypack/Builder.h>
|
||||
#include <velocypack/Collection.h>
|
||||
#include <velocypack/Iterator.h>
|
||||
#include <velocypack/StringRef.h>
|
||||
#include <velocypack/velocypack-aliases.h>
|
||||
#include <boost/algorithm/clamp.hpp>
|
||||
|
||||
|
@ -154,8 +156,8 @@ arangodb::Result checkHttpResponse(arangodb::httpclient::SimpleHttpClient& clien
|
|||
if (response == nullptr || !response->isComplete()) {
|
||||
return {TRI_ERROR_INTERNAL,
|
||||
"got invalid response from server: '" + client.getErrorMessage() +
|
||||
"' while executing '" + requestAction +
|
||||
"' with this payload: '" + originalRequest + "'"};
|
||||
"' while executing " + requestAction +
|
||||
" with this payload: '" + originalRequest + "'"};
|
||||
}
|
||||
if (response->wasHttpError()) {
|
||||
int errorNum = TRI_ERROR_INTERNAL;
|
||||
|
@ -210,6 +212,46 @@ bool sortCollections(VPackBuilder const& l, VPackBuilder const& r) {
|
|||
return strcasecmp(leftName.c_str(), rightName.c_str()) < 0;
|
||||
}
|
||||
|
||||
/// @brief Copy `slice` into `builder`, dropping repeated attribute names
///
/// Objects are rebuilt attribute by attribute: the first value encountered for
/// an attribute name is kept, and any later attribute with the same name in
/// the same object is skipped. Arrays are rebuilt member by member. Both
/// compound types are processed recursively, so duplicates nested at any depth
/// are removed as well. Every other value is added to the builder unchanged.
///
/// @param builder  target builder that receives the cleaned-up value
/// @param slice    input value to copy
void makeAttributesUnique(arangodb::velocypack::Builder& builder, arangodb::velocypack::Slice slice) {
  if (slice.isObject()) {
    // attribute names already emitted for *this* object (not for nested ones)
    std::unordered_set<arangodb::velocypack::StringRef> seen;

    builder.openObject();

    for (arangodb::velocypack::ObjectIterator member(slice, true); member.valid(); member.next()) {
      bool const firstOccurrence = keys_inserted(seen, member);
      if (!firstOccurrence) {
        // attribute name was seen before: drop this key/value pair
        continue;
      }

      // emit the key, then the (recursively cleaned) value
      builder.add(member.key());
      makeAttributesUnique(builder, member.value());
    }

    builder.close();
    return;
  }

  if (slice.isArray()) {
    builder.openArray();

    for (arangodb::velocypack::ArrayIterator member(slice); member.valid(); member.next()) {
      // array members may themselves contain objects with duplicates
      makeAttributesUnique(builder, member.value());
    }

    builder.close();
    return;
  }

  // scalar (non-compound) value: copy as is
  builder.add(slice);
}
|
||||
|
||||
/// @brief Create the database to restore to, connecting manually
|
||||
arangodb::Result tryCreateDatabase(std::string const& name) {
|
||||
using arangodb::httpclient::SimpleHttpClient;
|
||||
|
@ -414,6 +456,75 @@ arangodb::Result sendRestoreData(arangodb::httpclient::SimpleHttpClient& httpCli
|
|||
size_t bufferSize) {
|
||||
using arangodb::basics::StringUtils::urlEncode;
|
||||
using arangodb::httpclient::SimpleHttpResult;
|
||||
|
||||
// the following two structs are needed for cleaning up duplicate attributes
|
||||
arangodb::velocypack::Builder result;
|
||||
arangodb::basics::StringBuffer cleaned;
|
||||
|
||||
if (options.cleanupDuplicateAttributes) {
|
||||
int res = cleaned.reserve(bufferSize);
|
||||
|
||||
if (res != TRI_ERROR_NO_ERROR) {
|
||||
// out of memory
|
||||
THROW_ARANGO_EXCEPTION(res);
|
||||
}
|
||||
|
||||
arangodb::velocypack::Options options = arangodb::velocypack::Options::Defaults;
|
||||
// do *not* check duplicate attributes here (because that would throw)
|
||||
options.checkAttributeUniqueness = false;
|
||||
arangodb::velocypack::Builder builder(&options);
|
||||
|
||||
// instead, we need to manually check for duplicate attributes...
|
||||
char const* p = buffer;
|
||||
char const* e = p + bufferSize;
|
||||
|
||||
while (p < e) {
|
||||
while (p < e && (*p == ' ' || *p == '\r' || *p == '\n' || *p == '\t')) {
|
||||
++p;
|
||||
}
|
||||
|
||||
// detect line ending
|
||||
size_t length;
|
||||
char const* nl = static_cast<char const*>(memchr(p, '\n', e - p));
|
||||
if (nl == nullptr) {
|
||||
length = e - p;
|
||||
} else {
|
||||
length = nl - p;
|
||||
}
|
||||
|
||||
builder.clear();
|
||||
try {
|
||||
VPackParser parser(builder, builder.options);
|
||||
parser.parse(p, length);
|
||||
} catch (arangodb::velocypack::Exception const& ex) {
|
||||
return {TRI_ERROR_HTTP_CORRUPTED_JSON, ex.what()};
|
||||
} catch (std::bad_alloc const& ex) {
|
||||
return {TRI_ERROR_OUT_OF_MEMORY};
|
||||
} catch (std::exception const& ex) {
|
||||
return {TRI_ERROR_INTERNAL, ex.what()};
|
||||
}
|
||||
|
||||
// recursively clean up duplicate attributes in the document
|
||||
result.clear();
|
||||
makeAttributesUnique(result, builder.slice());
|
||||
|
||||
std::string const json = result.toJson();
|
||||
cleaned.appendText(json.data(), json.size());
|
||||
|
||||
if (nl == nullptr) {
|
||||
// done
|
||||
break;
|
||||
}
|
||||
|
||||
cleaned.appendChar('\n');
|
||||
// advance behind newline
|
||||
p = nl + 1;
|
||||
}
|
||||
|
||||
// now point to the cleaned up data
|
||||
buffer = cleaned.c_str();
|
||||
bufferSize = cleaned.length();
|
||||
}
|
||||
|
||||
std::string const url = "/_api/replication/restore-data?collection=" + urlEncode(cname) +
|
||||
"&force=" + (options.force ? "true" : "false");
|
||||
|
@ -1026,6 +1137,10 @@ void RestoreFeature::collectOptions(std::shared_ptr<options::ProgramOptions> opt
|
|||
|
||||
options->addOption("--input-directory", "input directory",
|
||||
new StringParameter(&_options.inputPath));
|
||||
|
||||
options->addOption("--cleanup-duplicate-attributes", "clean up duplicate attributes in input documents instead of making the restore operation fail (since v3.3.22 and v3.4.2)",
|
||||
new BooleanParameter(&_options.cleanupDuplicateAttributes),
|
||||
arangodb::options::makeFlags(arangodb::options::Flags::Hidden));
|
||||
|
||||
options->addOption("--import-data", "import data into collection",
|
||||
new BooleanParameter(&_options.importData));
|
||||
|
|
|
@ -89,6 +89,7 @@ class RestoreFeature final : public application_features::ApplicationFeature {
|
|||
bool includeSystemCollections{false};
|
||||
bool indexesFirst{false};
|
||||
bool overwrite{true};
|
||||
bool cleanupDuplicateAttributes{false};
|
||||
bool progress{true};
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue