1
0
Fork 0

Bug fix 3.4/create collections better preconditions (#9306)

* Backward port of #9296

* Updated changelog

* Updated changelog
This commit is contained in:
Michael Hackstein 2019-06-24 08:43:46 +02:00 committed by KVS85
parent 5d5eb23e7f
commit f06706e53d
3 changed files with 64 additions and 8 deletions

View File

@ -1,6 +1,16 @@
v3.4.7 (2019-XX-XX)
-------------------
* bug-fix for a very rare race condition on cluster collection creation process. It can only occur in the following
situation:
1) DBServer sucessfully creates all shards for collections, and reports it back
2) DBServer dies
3) Agency recognized death and reorganizes shards
4) Coordinator receives (1) but not (3) and decided everything was good
5) => Coordinator reverts (3) only on this collection
6) Coordinator receives (3) now
The bugfix disallows (5) if (3) is run in Agency.
* fixed internal issue #4040: gharial api is now checking existence of `_from` and `_to` vertices
during edge replacements and edge updates

View File

@ -40,9 +40,16 @@ arangodb::ClusterCollectionCreationInfo::ClusterCollectionCreationInfo(
name(arangodb::basics::VelocyPackHelper::getStringValue(json, arangodb::StaticStrings::DataSourceName,
StaticStrings::Empty)),
state(State::INIT) {
if (numberOfShards == 0 || arangodb::basics::VelocyPackHelper::getBooleanValue(
json, arangodb::StaticStrings::IsSmart, false)) {
if (numberOfShards == 0) {
// Nothing to do this cannot fail
// Deactivated this assertion, our testing mock for coordinator side
// tries to get away without other servers by initially adding only 0
// shard collections (non-smart). We do not want to loose these test.
// So we will loose this assertion for now.
/*
TRI_ASSERT(arangodb::basics::VelocyPackHelper::getBooleanValue(
json, arangodb::StaticStrings::IsSmart, false));
*/
state = State::DONE;
}
TRI_ASSERT(!name.empty());
@ -63,6 +70,13 @@ VPackSlice const arangodb::ClusterCollectionCreationInfo::isBuildingSlice() cons
}
bool arangodb::ClusterCollectionCreationInfo::needsBuildingFlag() const {
// Deactivated the smart graph check, our testing mock for coordinator side
// tries to get away without other servers by initially adding only 0
// shard collections (non-smart). We do not want to loose these test.
// So we will loose the more precise check for now.
/*
return numberOfShards > 0 ||
arangodb::basics::VelocyPackHelper::getBooleanValue(json, StaticStrings::IsSmart, false);
*/
return numberOfShards > 0;
}

View File

@ -114,6 +114,11 @@ static inline arangodb::AgencyOperation IncreaseVersion() {
arangodb::AgencySimpleOperationType::INCREMENT_OP};
}
static inline std::string collectionPath(std::string const& dbName,
std::string const& collection) {
return "Plan/Collections/" + dbName + "/" + collection;
}
static inline arangodb::AgencyOperation CreateCollectionOrder(std::string const& dbName,
std::string const& collection,
VPackSlice const& info,
@ -127,16 +132,24 @@ static inline arangodb::AgencyOperation CreateCollectionOrder(std::string const&
TRI_ASSERT(info.get(arangodb::StaticStrings::IsBuilding).getBool() == true);
}
#endif
arangodb::AgencyOperation op{"Plan/Collections/" + dbName + "/" + collection,
arangodb::AgencyOperation op{collectionPath(dbName, collection),
arangodb::AgencyValueOperationType::SET, info};
op._ttl = timeout;
return op;
}
static inline arangodb::AgencyPrecondition CreateCollectionOrderPrecondition(
std::string const& dbName, std::string const& collection, VPackSlice const& info) {
arangodb::AgencyPrecondition prec{collectionPath(dbName, collection),
arangodb::AgencyPrecondition::Type::VALUE, info};
return prec;
}
static inline arangodb::AgencyOperation CreateCollectionSuccess(
std::string const& dbName, std::string const& collection, VPackSlice const& info) {
TRI_ASSERT(!info.hasKey(arangodb::StaticStrings::IsBuilding));
return arangodb::AgencyOperation{"Plan/Collections/" + dbName + "/" + collection,
return arangodb::AgencyOperation{collectionPath(dbName,collection),
arangodb::AgencyValueOperationType::SET, info};
}
@ -1486,7 +1499,9 @@ int ClusterInfo::dropDatabaseCoordinator(std::string const& name,
AgencyPrecondition::Type::EMPTY, false);
AgencyWriteTransaction trans({delPlanDatabases, delPlanCollections, delPlanViews, incrementVersion},
databaseExists);
VPackBuilder hass;
trans.toVelocyPack(hass);
LOG_DEVEL << hass.toJson();
AgencyCommResult res = ac.sendTransactionWithFailover(trans);
if (!res.successful()) {
if (res._statusCode == (int)arangodb::rest::ResponseCode::PRECONDITION_FAILED) {
@ -1798,6 +1813,13 @@ Result ClusterInfo::createCollectionsCoordinator(std::string const& databaseName
basics::VelocyPackHelper::getStringValue(info.json, StaticStrings::DistributeShardsLike,
StaticStrings::Empty);
if (!otherCidString.empty() && conditions.find(otherCidString) == conditions.end()) {
// Distribute shards like case.
// Precondition: Master collection is not moving while we create this
// collection We only need to add these once for every Master, we cannot
// add multiple because we will end up with duplicate entries.
// NOTE: We do not need to add all collections created here, as they will not succeed
// In callbacks if they are moved during creation.
// If they are moved after creation was reported success they are under protection by Supervision.
conditions.emplace(otherCidString);
otherCidShardMap = getCollection(databaseName, otherCidString)->shardIds();
// Any of the shards locked?
@ -1924,22 +1946,32 @@ Result ClusterInfo::createCollectionsCoordinator(std::string const& databaseName
cbGuard.fire();
// Now we need to remove TTL + the IsBuilding flag in Agency
opers.clear();
precs.clear();
opers.push_back(IncreaseVersion());
for (auto const& info : infos) {
opers.push_back(CreateCollectionSuccess(databaseName, info.collectionID, info.json));
opers.emplace_back(
CreateCollectionSuccess(databaseName, info.collectionID, info.json));
// NOTE:
// We cannot do anything better than: "noone" has modified our collections while
// we tried to create them...
// Preconditions cover against supervision jobs injecting other leaders / followers during failovers.
// If they are it is not valid to confirm them here. (bad luck we were almost there)
precs.emplace_back(CreateCollectionOrderPrecondition(databaseName, info.collectionID,
info.isBuildingSlice()));
}
// TODO: Should we use preconditions?
AgencyWriteTransaction transaction(opers);
AgencyWriteTransaction transaction(opers, precs);
// This is a best effort, in the worst case the collection stays:
auto res = ac.sendTransactionWithFailover(transaction);
if (!res.successful()) {
// Everything worked, report success
// could not update TTL
for (auto const& info : infos) {
TRI_ASSERT(info.state == ClusterCollectionCreationInfo::State::DONE);
events::CreateCollection(info.name, res.errorCode());
}
return TRI_ERROR_CLUSTER_COULD_NOT_CREATE_COLLECTION;
} else {
// Everything worked, report success
for (auto const& info : infos) {