1
0
Fork 0

Bug fix 3.4/arangorestore add cleanup duplicate attributes (#7876)

This commit is contained in:
Jan 2019-01-04 15:26:11 +01:00 committed by GitHub
parent 1ad9ae3115
commit a4a7867451
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 142 additions and 26 deletions

View File

@ -1,6 +1,13 @@
v3.4.2 (XXXX-XX-XX)
-------------------
* added arangorestore option `--cleanup-duplicate-attributes` to clean up input documents
with redundant attribute names
Importing such documents without the option set will make arangorestore fail with an
error, and setting the option will make the restore process clean up the input by using
just the first specified value for each redundant attribute.
* the arangorestore options `--default-number-of-shards` and `--default-replication-factor`
are now deprecated in favor of the much more powerful options `--number-of-shards`
and `--replication-factor`

View File

@ -129,26 +129,13 @@ Encryption
See [Arangodump](../Arangodump/Examples.md#encryption) for details.
Restoring Revision IDs and Collection IDs
-----------------------------------------
_arangorestore_ will reload document and edge data with the exact same *_key*, *_from* and
*_to* values found in the input directory. However, when loading document data, it will assign
its own values for the *_rev* attribute of the reloaded documents. Though this difference is
intentional (normally, every server should create its own *_rev* values) there might be
situations when it is required to re-use the exact same *_rev* values for the reloaded data.
This can be achieved by setting the *--recycle-ids* parameter to *true*:
arangorestore --collection myusers --collection myvalues --input-directory "dump"
Note that setting *--recycle-ids* to *true* will also cause collections to be (re-)created in
the target database with the exact same collection id as in the input directory. Any potentially
existing collection in the target database with the same collection id will then be dropped.
Reloading Data into a different Collection
------------------------------------------
With some creativity you can use _arangodump_ and _arangorestore_ to transfer data from one
_arangorestore_ will restore document and edge data with the exact same *_key*, *_rev*, *_from*
and *_to* values as found in the input directory.
With some creativity you can also use _arangodump_ and _arangorestore_ to transfer data from one
collection into another (either on the same server or not). For example, to copy data from
a collection *myvalues* in database *mydb* into a collection *mycopyvalues* in database *mycopy*,
you can start with the following command:
@ -229,6 +216,6 @@ For restore this short overview is sufficient:
- When importing into an existing database, the given user needs `Administrate`
access on this database.
- When creating a new Database during restore, the given user needs `Administrate`
- When creating a new database during restore, the given user needs `Administrate`
access on `_system`. The user will be promoted with `Administrate` access on the
newly created database.

View File

@ -273,11 +273,12 @@ void RestCollectionHandler::handleCommandPost() {
// for some "security" a white-list of allowed parameters
VPackBuilder filtered = VPackCollection::keep(
body, std::unordered_set<std::string>{
"doCompact", "isSystem", "id", "isVolatile", "journalSize",
"indexBuckets", "keyOptions", "waitForSync", "cacheEnabled",
"doCompact", StaticStrings::DataSourceSystem, StaticStrings::DataSourceId,
"isVolatile", "journalSize", "indexBuckets", "keyOptions",
StaticStrings::WaitForSyncString, "cacheEnabled",
StaticStrings::ShardKeys, StaticStrings::NumberOfShards,
StaticStrings::DistributeShardsLike, "avoidServers", "isSmart",
"shardingStrategy", "smartGraphAttribute", "replicationFactor",
StaticStrings::DistributeShardsLike, "avoidServers", StaticStrings::IsSmart,
"shardingStrategy", StaticStrings::GraphSmartGraphAttribute, StaticStrings::ReplicationFactor,
"servers"});
VPackSlice const parameters = filtered.slice();

View File

@ -1322,7 +1322,11 @@ Result RestReplicationHandler::processRestoreDataBatch(transaction::Methods& trx
std::string const& collectionName) {
std::unordered_map<std::string, VPackValueLength> latest;
VPackBuilder allMarkers;
parseBatch(collectionName, latest, allMarkers);
Result res = parseBatch(collectionName, latest, allMarkers);
if (res.fail()) {
return res;
}
// First remove all keys of which the last marker we saw was a deletion
// marker:
@ -1750,6 +1754,7 @@ void RestReplicationHandler::handleCommandRestoreView() {
auto nameSlice = slice.get(StaticStrings::DataSourceName);
auto typeSlice = slice.get(StaticStrings::DataSourceType);
if (!nameSlice.isString() || !typeSlice.isString()) {
generateError(ResponseCode::BAD, TRI_ERROR_BAD_PARAMETER);
return;
@ -1759,7 +1764,7 @@ void RestReplicationHandler::handleCommandRestoreView() {
try {
CollectionNameResolver resolver(_vocbase);
auto view = resolver.getView(nameSlice.toString());
auto view = resolver.getView(nameSlice.copyString());
if (view) {
if (!overwrite) {

View File

@ -25,6 +25,8 @@
#include <velocypack/Builder.h>
#include <velocypack/Collection.h>
#include <velocypack/Iterator.h>
#include <velocypack/StringRef.h>
#include <velocypack/velocypack-aliases.h>
#include <boost/algorithm/clamp.hpp>
@ -154,8 +156,8 @@ arangodb::Result checkHttpResponse(arangodb::httpclient::SimpleHttpClient& clien
if (response == nullptr || !response->isComplete()) {
return {TRI_ERROR_INTERNAL,
"got invalid response from server: '" + client.getErrorMessage() +
"' while executing '" + requestAction +
"' with this payload: '" + originalRequest + "'"};
"' while executing " + requestAction +
" with this payload: '" + originalRequest + "'"};
}
if (response->wasHttpError()) {
int errorNum = TRI_ERROR_INTERNAL;
@ -210,6 +212,46 @@ bool sortCollections(VPackBuilder const& l, VPackBuilder const& r) {
return strcasecmp(leftName.c_str(), rightName.c_str()) < 0;
}
/// @brief recursively copy `slice` into `builder`, dropping duplicate
/// attribute names from objects. For each attribute name only the first
/// occurrence (and its value) is kept; later occurrences are skipped.
/// Arrays are traversed element-wise; scalar values are copied verbatim.
void makeAttributesUnique(arangodb::velocypack::Builder& builder, arangodb::velocypack::Slice slice) {
  if (slice.isObject()) {
    // attribute names already written for this object
    std::unordered_set<arangodb::velocypack::StringRef> seen;

    builder.openObject();
    for (auto it = arangodb::velocypack::ObjectIterator(slice, true); it.valid(); it.next()) {
      if (seen.emplace(it.key().stringRef()).second) {
        // first occurrence of this attribute: keep it, and clean up
        // its value recursively as well
        builder.add(it.key());
        makeAttributesUnique(builder, it.value());
      }
      // otherwise this is a duplicate attribute name - drop it, so
      // that the first specified value wins
    }
    builder.close();
  } else if (slice.isArray()) {
    builder.openArray();
    for (auto it = arangodb::velocypack::ArrayIterator(slice); it.valid(); it.next()) {
      // arrays cannot have duplicate keys themselves, but their
      // members may contain objects that do
      makeAttributesUnique(builder, it.value());
    }
    builder.close();
  } else {
    // scalar (non-compound) value: copy as-is
    builder.add(slice);
  }
}
/// @brief Create the database to restore to, connecting manually
arangodb::Result tryCreateDatabase(std::string const& name) {
using arangodb::httpclient::SimpleHttpClient;
@ -414,6 +456,75 @@ arangodb::Result sendRestoreData(arangodb::httpclient::SimpleHttpClient& httpCli
size_t bufferSize) {
using arangodb::basics::StringUtils::urlEncode;
using arangodb::httpclient::SimpleHttpResult;
// the following two structs are needed for cleaning up duplicate attributes
arangodb::velocypack::Builder result;
arangodb::basics::StringBuffer cleaned;
if (options.cleanupDuplicateAttributes) {
int res = cleaned.reserve(bufferSize);
if (res != TRI_ERROR_NO_ERROR) {
// out of memory
THROW_ARANGO_EXCEPTION(res);
}
arangodb::velocypack::Options options = arangodb::velocypack::Options::Defaults;
// do *not* check duplicate attributes here (because that would throw)
options.checkAttributeUniqueness = false;
arangodb::velocypack::Builder builder(&options);
// instead, we need to manually check for duplicate attributes...
char const* p = buffer;
char const* e = p + bufferSize;
while (p < e) {
while (p < e && (*p == ' ' || *p == '\r' || *p == '\n' || *p == '\t')) {
++p;
}
// detect line ending
size_t length;
char const* nl = static_cast<char const*>(memchr(p, '\n', e - p));
if (nl == nullptr) {
length = e - p;
} else {
length = nl - p;
}
builder.clear();
try {
VPackParser parser(builder, builder.options);
parser.parse(p, length);
} catch (arangodb::velocypack::Exception const& ex) {
return {TRI_ERROR_HTTP_CORRUPTED_JSON, ex.what()};
} catch (std::bad_alloc const& ex) {
return {TRI_ERROR_OUT_OF_MEMORY};
} catch (std::exception const& ex) {
return {TRI_ERROR_INTERNAL, ex.what()};
}
// recursively clean up duplicate attributes in the document
result.clear();
makeAttributesUnique(result, builder.slice());
std::string const json = result.toJson();
cleaned.appendText(json.data(), json.size());
if (nl == nullptr) {
// done
break;
}
cleaned.appendChar('\n');
// advance behind newline
p = nl + 1;
}
// now point to the cleaned up data
buffer = cleaned.c_str();
bufferSize = cleaned.length();
}
std::string const url = "/_api/replication/restore-data?collection=" + urlEncode(cname) +
"&force=" + (options.force ? "true" : "false");
@ -1026,6 +1137,10 @@ void RestoreFeature::collectOptions(std::shared_ptr<options::ProgramOptions> opt
options->addOption("--input-directory", "input directory",
new StringParameter(&_options.inputPath));
options->addOption("--cleanup-duplicate-attributes", "clean up duplicate attributes in input documents instead of making the restore operation fail (since v3.3.22 and v3.4.2)",
new BooleanParameter(&_options.cleanupDuplicateAttributes),
arangodb::options::makeFlags(arangodb::options::Flags::Hidden));
options->addOption("--import-data", "import data into collection",
new BooleanParameter(&_options.importData));

View File

@ -89,6 +89,7 @@ class RestoreFeature final : public application_features::ApplicationFeature {
bool includeSystemCollections{false};
bool indexesFirst{false};
bool overwrite{true};
bool cleanupDuplicateAttributes{false};
bool progress{true};
};