1
0
Fork 0

compress-data option for arangodump. ported from 3.4 (#8861)

This commit is contained in:
Matthew Von-Maszewski 2019-04-26 12:50:58 -04:00 committed by Jan
parent c054cddfe5
commit eef492721a
8 changed files with 192 additions and 45 deletions

View File

@ -1,6 +1,11 @@
devel
-----
* added `--compress-output` flag to arangodump. It activates gzip compression for
collection data. Metadata files, such as `.structure.json` and `.view.json`,
do not get compressed. No option is needed for arangorestore to restore
`.data.json.gz` files.
* added options to make server more secure:
- `--server.harden`: denies access to certain REST APIs that return server internals

View File

@ -192,3 +192,29 @@ Using a different key will lead to the backup being non-recoverable.
Note that encrypted backups can be used together with the already existing
RocksDB encryption-at-rest feature, but they can also be used for the MMFiles
engine, which does not have encryption-at-rest.
Compression
-----------
<small>Introduced in: v3.4.6, v3.5.0</small>
`--compress-output`
Data can optionally be dumped in a compressed format to save space on disk.
The `--compress-output` option cannot be used together with [Encryption](#encryption).
If compression is enabled, no `.data.json` files are written. Instead, the
collection data gets compressed using the Gzip algorithm and for each collection
a `.data.json.gz` file is written. Metadata files such as `.structure.json` and
`.view.json` do not get compressed.
```
arangodump --output-directory "dump" --compress-output
```
Compressed dumps can be restored with *arangorestore*, which automatically
detects whether the data is compressed or not based on the file extension.
```
arangorestore --input-directory "dump"
```

View File

@ -58,6 +58,23 @@
"section" : "",
"type" : "string..."
},
"compress-output" : {
"category" : "option",
"default" : true,
"deprecatedIn" : null,
"description" : "compress files containing collection contents using gzip format",
"dynamic" : false,
"enterpriseOnly" : false,
"hidden" : false,
"introducedIn" : [
"v3.4.6",
"v3.5.0"
],
"obsolete" : false,
"requiresValue" : false,
"section" : "",
"type" : "boolean"
},
"config" : {
"category" : "option",
"default" : "",

View File

@ -491,7 +491,7 @@ arangodb::Result processJob(arangodb::httpclient::SimpleHttpClient& client,
auto file = jobData.directory.writableFile(
jobData.name + (jobData.options.clusterMode ? "" : ("_" + hexString)) +
".structure.json",
true);
true, 0, false);
if (!::fileOk(file.get())) {
return ::fileError(file.get(), true);
}
@ -647,6 +647,12 @@ void DumpFeature::collectOptions(std::shared_ptr<options::ProgramOptions> option
new StringParameter(&_options.maskingsFile))
.setIntroducedIn(30322)
.setIntroducedIn(30402);
options->addOption("--compress-output",
"compress files containing collection contents using gzip format",
new BooleanParameter(&_options.useGzip))
.setIntroducedIn(30406)
.setIntroducedIn(30500);
}
void DumpFeature::validateOptions(std::shared_ptr<options::ProgramOptions> options) {
@ -987,7 +993,7 @@ Result DumpFeature::storeDumpJson(VPackSlice const& body, std::string const& dbN
meta.close();
// save last tick in file
auto file = _directory->writableFile("dump.json", true);
auto file = _directory->writableFile("dump.json", true, 0, false);
if (!::fileOk(file.get())) {
return ::fileError(file.get(), true);
}
@ -1018,7 +1024,7 @@ Result DumpFeature::storeViews(VPackSlice const& views) const {
std::string fname = nameSlice.copyString();
fname.append(".view.json");
// save last tick in file
auto file = _directory->writableFile(fname, true);
auto file = _directory->writableFile(fname, true, 0, false);
if (!::fileOk(file.get())) {
return ::fileError(file.get(), true);
}
@ -1073,7 +1079,8 @@ void DumpFeature::start() {
// set up the output directory, not much else
_directory = std::make_unique<ManagedDirectory>(_options.outputPath,
!_options.overwrite, true);
!_options.overwrite, true,
_options.useGzip);
if (_directory->status().fail()) {
switch (_directory->status().errorNumber()) {
case TRI_ERROR_FILE_EXISTS:

View File

@ -80,6 +80,7 @@ class DumpFeature : public application_features::ApplicationFeature {
bool includeSystemCollections{false};
bool overwrite{false};
bool progress{true};
bool useGzip{true};
};
/// @brief Stores stats about the overall dump progress

View File

@ -609,6 +609,7 @@ arangodb::Result restoreData(arangodb::httpclient::SimpleHttpClient& httpClient,
arangodb::Result result;
StringBuffer buffer(true);
bool isGzip(false);
VPackSlice const parameters = jobData.collection.get("parameters");
std::string const cname =
@ -618,12 +619,24 @@ arangodb::Result restoreData(arangodb::httpclient::SimpleHttpClient& httpClient,
std::string const collectionType(type == 2 ? "document" : "edge");
// import data. check if we have a datafile
// ... there are 4 possible names
auto datafile = jobData.directory.readableFile(
cname + "_" + arangodb::rest::SslInterface::sslMD5(cname) + ".data.json");
if (!datafile || datafile->status().fail()) {
datafile = jobData.directory.readableFile(cname + ".data.json");
datafile = jobData.directory.readableFile(
cname + "_" + arangodb::rest::SslInterface::sslMD5(cname) + ".data.json.gz");
isGzip = true;
} // if
if (!datafile || datafile->status().fail()) {
result = {TRI_ERROR_CANNOT_READ_FILE, "could not open data file for collection " + cname + "'"};
datafile = jobData.directory.readableFile(
cname + ".data.json.gz");
isGzip = true;
} // if
if (!datafile || datafile->status().fail()) {
datafile = jobData.directory.readableFile(cname + ".data.json");
isGzip = false;
if (!datafile || datafile->status().fail()) {
result = {TRI_ERROR_CANNOT_READ_FILE, "could not open file"};
return result;
}
}
@ -702,11 +715,21 @@ arangodb::Result restoreData(arangodb::httpclient::SimpleHttpClient& httpClient,
if (jobData.options.progress && fileSize > 0 &&
numReadSinceLastReport > 1024 * 1024 * 8) {
// report every 8MB of transferred data
// currently do not have unzipped size for .gz files
std::stringstream percentage, ofFilesize;
if (isGzip) {
ofFilesize << "";
percentage << "";
} else {
ofFilesize << " of " << fileSize;
percentage << " ("
<< int(100. * double(numReadForThisCollection) / double(fileSize)) << " %)";
} // else
LOG_TOPIC("69a73", INFO, Logger::RESTORE)
<< "# Still loading data into " << collectionType << " collection '"
<< cname << "', " << numReadForThisCollection << " of " << fileSize
<< " byte(s) restored ("
<< int(100. * double(numReadForThisCollection) / double(fileSize)) << " %)";
<< cname << "', " << numReadForThisCollection << ofFilesize.str()
<< " byte(s) restored" << percentage.str();
numReadSinceLastReport = 0;
}
}

View File

@ -203,7 +203,7 @@ void writeEncryptionFile(std::string const& directory, std::string& type) {
namespace arangodb {
ManagedDirectory::ManagedDirectory(std::string const& path, bool requireEmpty, bool create)
ManagedDirectory::ManagedDirectory(std::string const& path, bool requireEmpty, bool create, bool writeGzip)
:
#ifdef USE_ENTERPRISE
_encryptionFeature{
@ -211,6 +211,7 @@ ManagedDirectory::ManagedDirectory(std::string const& path, bool requireEmpty, b
#endif
_path{path},
_encryptionType{::EncryptionTypeNone},
_writeGzip(writeGzip),
_status{TRI_ERROR_NO_ERROR} {
if (_path.empty()) {
_status.reset(TRI_ERROR_BAD_PARAMETER, "must specify a path");
@ -264,6 +265,11 @@ ManagedDirectory::ManagedDirectory(std::string const& path, bool requireEmpty, b
}
}
// currently gzip and encryption are mutually exclusive, encryption wins
if (::EncryptionTypeNone != _encryptionType) {
_writeGzip = false;
} // if
#ifdef USE_ENTERPRISE
::writeEncryptionFile(_path, _encryptionType, _encryptionFeature);
#else
@ -306,8 +312,9 @@ std::unique_ptr<ManagedDirectory::File> ManagedDirectory::readableFile(std::stri
}
try {
// Treat the file as gzip only if its name ends in ".gz". The length check
// comes first: for names shorter than the suffix, `filename.size() - 3`
// would wrap around and the string access would throw std::out_of_range,
// turning a perfectly readable short-named file into an "error opening file".
bool gzFlag = filename.size() >= 3 &&
              0 == filename.compare(filename.size() - 3, 3, ".gz");
file = std::make_unique<File>(*this, filename,
(ManagedDirectory::DefaultReadFlags ^ flags));
(ManagedDirectory::DefaultReadFlags ^ flags), gzFlag);
} catch (...) {
_status.reset(TRI_ERROR_CANNOT_READ_FILE, "error opening file " +
::filePath(*this, filename) +
@ -319,7 +326,7 @@ std::unique_ptr<ManagedDirectory::File> ManagedDirectory::readableFile(std::stri
}
std::unique_ptr<ManagedDirectory::File> ManagedDirectory::writableFile(
std::string const& filename, bool overwrite, int flags) {
std::string const& filename, bool overwrite, int flags, bool gzipOk) {
std::unique_ptr<File> file{nullptr};
if (_status.fail()) { // directory is in a bad state
@ -327,8 +334,13 @@ std::unique_ptr<ManagedDirectory::File> ManagedDirectory::writableFile(
}
try {
std::string filenameCopy = filename;
if (_writeGzip && gzipOk) {
filenameCopy.append(".gz");
} // if
// deal with existing file first if it exists
auto path = ::filePath(*this, filename);
auto path = ::filePath(*this, filenameCopy);
bool fileExists = TRI_ExistsFile(path.c_str());
if (fileExists) {
if (overwrite) {
@ -341,7 +353,7 @@ std::unique_ptr<ManagedDirectory::File> ManagedDirectory::writableFile(
}
// Create the file under the same (possibly ".gz"-suffixed) name that the
// existence check above was performed on; otherwise gzip-compressed data
// would be written to a file without the ".gz" extension, which is what
// readableFile() uses to decide whether to read through zlib.
file = std::make_unique<File>(*this, filenameCopy,
                              (ManagedDirectory::DefaultWriteFlags ^ flags),
                              _writeGzip && gzipOk);
} catch (...) {
return {nullptr};
}
@ -385,11 +397,14 @@ VPackBuilder ManagedDirectory::vpackFromJsonFile(std::string const& filename) {
}
ManagedDirectory::File::File(ManagedDirectory const& directory,
std::string const& filename, int flags)
std::string const& filename, int flags,
bool isGzip)
: _directory{directory},
_path{::filePath(_directory, filename)},
_flags{flags},
_fd{::openFile(_path, _flags)},
_gzfd(-1),
_gzFile(nullptr),
#ifdef USE_ENTERPRISE
_context{::getContext(_directory, _fd, _flags)},
_status {
@ -402,10 +417,31 @@ ManagedDirectory::File::File(ManagedDirectory const& directory,
#endif
{
TRI_ASSERT(::flagNotSet(_flags, O_RDWR)); // disallow read/write (encryption)
if (isGzip) {
const char * gzFlags(nullptr);
// gzip is going to perform a redundant close,
// simpler code to give it redundant handle
_gzfd = dup(_fd);
if (O_WRONLY & flags) {
gzFlags = "wb";
} else {
gzFlags = "rb";
} // else
_gzFile = gzdopen(_gzfd, gzFlags);
} // if
}
ManagedDirectory::File::~File() {
try {
if (_gzfd >=0) {
gzclose(_gzFile);
_gzfd = -1;
_gzFile = nullptr;
} // if
if (_fd >= 0) {
::closeFile(_fd, _status);
}
@ -428,11 +464,17 @@ void ManagedDirectory::File::write(char const* data, size_t length) {
if (!written) {
_status = _context->status();
}
} else if (isGzip()) {
gzwrite(_gzFile, data, length);
} else {
::rawWrite(_fd, data, length, _status, _path, _flags);
}
#else
if (isGzip()) {
gzwrite(_gzFile, data, length);
} else {
::rawWrite(_fd, data, length, _status, _path, _flags);
} // else
#endif
}
@ -448,11 +490,17 @@ ssize_t ManagedDirectory::File::read(char* buffer, size_t length) {
if (bytesRead < 0) {
_status = _context->status();
}
} else if (isGzip()) {
bytesRead = gzread(_gzFile, buffer, length);
} else {
bytesRead = ::rawRead(_fd, buffer, length, _status, _path, _flags);
}
#else
if (isGzip()) {
bytesRead = gzread(_gzFile, buffer, length);
} else {
bytesRead = ::rawRead(_fd, buffer, length, _status, _path, _flags);
} // else
#endif
return bytesRead;
}
@ -499,6 +547,12 @@ void ManagedDirectory::File::spit(std::string const& content) {
}
Result const& ManagedDirectory::File::close() {
if (_gzfd >=0) {
gzclose(_gzFile);
_gzfd = -1;
_gzFile = nullptr;
} // if
if (_fd >= 0) {
::closeFile(_fd, _status);
}

View File

@ -24,6 +24,8 @@
#ifndef ARANGOSH_UTILS_MANAGED_DIRECTORY_H
#define ARANGOSH_UTILS_MANAGED_DIRECTORY_H 1
#include "zlib.h"
#include <velocypack/Builder.h>
#include <velocypack/Parser.h>
#include <velocypack/velocypack-aliases.h>
@ -60,8 +62,9 @@ class ManagedDirectory {
* @param directory A reference to the containing directory
* @param filename The name of the file within the directory
* @param flags The flags to pass to the OS to open the file
* @param isGzip True if reads/writes should go through gzip functions
*/
File(ManagedDirectory const& directory, std::string const& filename, int flags);
File(ManagedDirectory const& directory, std::string const& filename, int flags, bool isGzip);
/**
* @brief Closes the file if it is still open
*/
@ -113,11 +116,19 @@ class ManagedDirectory {
*/
Result const& close();
/**
* @brief Tells whether reads/writes on this file go through the gzip functions
* @return `true` while a gzip descriptor is held (`_gzfd` is not -1)
*/
bool isGzip() const { return _gzfd != -1; }
private:
ManagedDirectory const& _directory;
std::string _path;
int _flags;
int _fd;
int _gzfd; // duplicate fd for gzip close
gzFile _gzFile;
#ifdef USE_ENTERPRISE
std::unique_ptr<EncryptionFeature::Context> _context;
#endif
@ -139,8 +150,9 @@ class ManagedDirectory {
* @param path The path to the directory
* @param requireEmpty If `true`, opening a non-empty directory will fail
* @param create If `true` and directory does not exist, create it
* @param writeGzip True if writes should use gzip (reads autodetect .gz)
*/
ManagedDirectory(std::string const& path, bool requireEmpty, bool create);
ManagedDirectory(std::string const& path, bool requireEmpty, bool create, bool writeGzip = true);
~ManagedDirectory();
public:
@ -204,10 +216,11 @@ class ManagedDirectory {
* @param name The filename, relative to the directory
* @param overwrite Whether to overwrite file if it exists (otherwise fail)
* @param flags Flags (will be XORed with `DefaultWriteFlags`
* @param gzipOk Flag whether this file is suitable for gzip (when enabled)
* @return Unique pointer to file, if opened
*/
std::unique_ptr<File> writableFile(std::string const& filename,
bool overwrite, int flags = 0);
bool overwrite, int flags = 0, bool gzipOk = true );
/**
* @brief Write a string to file
@ -236,6 +249,7 @@ class ManagedDirectory {
#endif
std::string const _path;
std::string _encryptionType;
bool _writeGzip;
Result _status;
};
} // namespace arangodb