mirror of https://gitee.com/bigwinds/arangodb
4125 lines
162 KiB
C
4125 lines
162 KiB
C
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief skiplist implementation
|
|
///
|
|
/// @file
|
|
///
|
|
/// DISCLAIMER
|
|
///
|
|
/// Copyright 2004-2012 triagens GmbH, Cologne, Germany
|
|
///
|
|
/// Licensed under the Apache License, Version 2.0 (the "License");
|
|
/// you may not use this file except in compliance with the License.
|
|
/// You may obtain a copy of the License at
|
|
///
|
|
/// http://www.apache.org/licenses/LICENSE-2.0
|
|
///
|
|
/// Unless required by applicable law or agreed to in writing, software
|
|
/// distributed under the License is distributed on an "AS IS" BASIS,
|
|
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
/// See the License for the specific language governing permissions and
|
|
/// limitations under the License.
|
|
///
|
|
/// Copyright holder is triAGENS GmbH, Cologne, Germany
|
|
///
|
|
/// @author Anonymous
|
|
/// @author Copyright 2006-2012, triAGENS GmbH, Cologne, Germany
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include <BasicsC/locks.h>
|
|
#include <BasicsC/logging.h>
|
|
#include <BasicsC/random.h>
|
|
|
|
#include "skiplistEx.h"
|
|
#include "compareEx.h"
|
|
|
|
#ifdef _WIN32
|
|
#include <BasicsC/win-utils.h>
|
|
#endif
|
|
|
|
#ifdef TRI_SKIPLIST_EX
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- SKIPLIST_EX
|
|
// -----------------------------------------------------------------------------
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- Private Type Structures
|
|
// -----------------------------------------------------------------------------
|
|
// Flag values used by the skiplist. The enumerators come in three pairs;
// their order (and therefore their numeric values 0..5) must not change.
typedef enum {
  // state of a nearest-neighbour link
  TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,          // the nearest neighbour node is normal
  TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,         // the nearest neighbour node is bricked - next/prev pointers can not be modified

  // state of a whole tower (node)
  TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG,                 // normal tower node, no removal pending
  TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,                  // glass tower node, skipped in a lookup, removal pending

  // the start/end nodes are special: this pair of flags ensures that growing
  // the tower height of these nodes is performed sequentially
  TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG,
  TRI_SKIPLIST_EX_NOT_FREE_TO_GROW_START_END_NODES_FLAG
}
TRI_skiplistEx_tower_node_flag_e;
|
|
|
|
// Tuning knobs for the CAS based algorithms in this file. They are plain
// (non-const) statics because they are meant to become configurable at
// server start-up.
static unsigned int CAS_FAILURE_SLEEP_TIME            = 1000; // argument handed to usleep() before a search is restarted
static unsigned int SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT   = 100;  // upper bound (and default) for the maximum height of a skiplist
static unsigned int SKIPLIST_EX_CAS_FAILURES_MAX_LOOP = 10;   // number of restarts after which an operation gives up
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- STATIC FORWARD DECLARATIONS
|
|
// --SECTION-- common private functions
|
|
// -----------------------------------------------------------------------------
|
|
|
|
static void DestroyBaseSkipListEx (TRI_skiplistEx_base_t*);
|
|
static void DestroySkipListExNode (TRI_skiplistEx_base_t*, TRI_skiplistEx_node_t*);
|
|
static void FreeSkipListExNode (TRI_skiplistEx_base_t*, TRI_skiplistEx_node_t*);
|
|
static int GrowNewNodeHeight (TRI_skiplistEx_node_t*, uint32_t, uint32_t, int);
|
|
static int GrowStartEndNodes (TRI_skiplistEx_base_t*, uint32_t);
|
|
|
|
static void* NextNodeBaseSkipListEx (TRI_skiplistEx_base_t*, void*, uint64_t);
|
|
static void* PrevNodeBaseSkipListEx (TRI_skiplistEx_base_t*, void*, uint64_t);
|
|
static int32_t RandLevel (TRI_skiplistEx_base_t*);
|
|
|
|
static void JoinStartEndNodes (TRI_skiplistEx_node_t*, TRI_skiplistEx_node_t*, uint32_t, uint32_t);
|
|
static int JoinNewNodeCas (TRI_skiplistEx_node_t* newNode); // when node is inserted
|
|
static int UnJoinOldNodeCas (TRI_skiplistEx_node_t* oldNode); // when node is removed
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- unique skiplist constructors and destructors
|
|
// -----------------------------------------------------------------------------
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @addtogroup Skiplist_unique
|
|
/// @{
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief initialises an skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// .............................................................................
|
|
// TODO: The static integer variables CAS_FAILURE_SLEEP_TIME(1000),
|
|
// SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT(100) and SKIPLIST_EX_CAS_FAILURES_MAX_LOOP(10)
|
|
// should be adjusted upon startup of the server -- command line perhaps?
|
|
// .............................................................................
|
|
|
|
int TRI_InitSkipListEx (TRI_skiplistEx_t* skiplist, size_t elementSize,
|
|
int (*compareElementElement) (TRI_skiplistEx_t*, void*, void*, int),
|
|
int (*compareKeyElement) (TRI_skiplistEx_t*, void*, void*, int),
|
|
TRI_skiplistEx_prob_e probability,
|
|
uint32_t maximumHeight,
|
|
uint64_t lastKnownTransID) {
|
|
|
|
int result;
|
|
|
|
if (skiplist == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// ..........................................................................
|
|
// Assign the STATIC comparision call back functions
|
|
// ..........................................................................
|
|
|
|
skiplist->compareElementElement = IndexStaticCompareElementElement; // compareElementElement;
|
|
skiplist->compareKeyElement = IndexStaticCompareKeyElement; // compareKeyElement;
|
|
|
|
// ..........................................................................
|
|
// Assign the maximum height of the skip list. This maximum height must be
|
|
// no greater than the absolute max height defined as a compile time parameter
|
|
// ..........................................................................
|
|
if (maximumHeight == 0) {
|
|
maximumHeight = SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT;
|
|
}
|
|
skiplist->_base._maxHeight = maximumHeight;
|
|
if (maximumHeight > SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT) {
|
|
LOG_ERROR("Invalid maximum height for skiplist");
|
|
assert(false);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// ..........................................................................
|
|
// Assign the probability and determine the number of random numbers which
|
|
// we will require -- do it once off here
|
|
// ..........................................................................
|
|
skiplist->_base._prob = probability;
|
|
skiplist->_base._numRandom = 0;
|
|
switch (skiplist->_base._prob) {
|
|
case TRI_SKIPLIST_EX_PROB_HALF: {
|
|
// determine the number of random numbers which we require.
|
|
skiplist->_base._numRandom = (skiplist->_base._maxHeight / 32);
|
|
if ((skiplist->_base._maxHeight % 32) != 0) {
|
|
++(skiplist->_base._numRandom);
|
|
}
|
|
break;
|
|
}
|
|
case TRI_SKIPLIST_EX_PROB_THIRD: {
|
|
// determine the number of random numbers which we require.
|
|
skiplist->_base._numRandom = (skiplist->_base._maxHeight / 16);
|
|
if ((skiplist->_base._maxHeight % 16) != 0) {
|
|
++(skiplist->_base._numRandom);
|
|
}
|
|
break;
|
|
}
|
|
case TRI_SKIPLIST_EX_PROB_QUARTER: {
|
|
// determine the number of random numbers which we require.
|
|
skiplist->_base._numRandom = (skiplist->_base._maxHeight / 16);
|
|
if ((skiplist->_base._maxHeight % 16) != 0) {
|
|
++(skiplist->_base._numRandom);
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
LOG_ERROR("Invalid probability assigned to skiplist");
|
|
assert(false);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
} // end of switch statement
|
|
|
|
|
|
// ..........................................................................
|
|
// Create storage for where to store the random numbers which we generated
|
|
// do it here once off.
|
|
// ..........................................................................
|
|
|
|
skiplist->_base._random = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(uint32_t) * skiplist->_base._numRandom, false);
|
|
if (skiplist->_base._random == NULL) {
|
|
return TRI_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
|
|
// ..........................................................................
|
|
// Assign the element size
|
|
// ..........................................................................
|
|
skiplist->_base._elementSize = elementSize;
|
|
|
|
|
|
// ..........................................................................
|
|
// Initialise the vertical storage of the lists and the place where we
|
|
// are going to store elements
|
|
// ..........................................................................
|
|
skiplist->_base._startNode._column = NULL;
|
|
skiplist->_base._startNode._colLength = 0;
|
|
skiplist->_base._startNode._extraData = NULL;
|
|
skiplist->_base._startNode._element = NULL;
|
|
skiplist->_base._startNode._delTransID = UINT64_MAX;
|
|
skiplist->_base._startNode._insTransID = lastKnownTransID;
|
|
|
|
|
|
skiplist->_base._endNode._column = NULL;
|
|
skiplist->_base._endNode._colLength = 0;
|
|
skiplist->_base._endNode._extraData = NULL;
|
|
skiplist->_base._endNode._element = NULL;
|
|
skiplist->_base._endNode._delTransID = UINT64_MAX;
|
|
skiplist->_base._endNode._insTransID = lastKnownTransID;
|
|
|
|
|
|
// ...........................................................................
|
|
// 32 bit integer CAS flag
|
|
// ...........................................................................
|
|
skiplist->_base._growStartEndNodesFlag = TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG;
|
|
|
|
// ..........................................................................
|
|
// Whenever a probability of 1/2, 1/3, 1/4 is used, on average there will be
|
|
// each node will have a height of two. So initialise the start and end nodes
|
|
// with this 'average' height
|
|
// ..........................................................................
|
|
result = GrowNewNodeHeight(&(skiplist->_base._startNode), skiplist->_base._maxHeight, 2,TRI_ERROR_NO_ERROR); // may fail
|
|
result = GrowNewNodeHeight(&(skiplist->_base._endNode), skiplist->_base._maxHeight, 2, result); // may fail
|
|
|
|
if (result != TRI_ERROR_NO_ERROR) {
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._random));
|
|
|
|
if (skiplist->_base._startNode._column != NULL) {
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._startNode._column));
|
|
}
|
|
|
|
if (skiplist->_base._endNode._column != NULL) {
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._endNode._column));
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
// ..........................................................................
|
|
// Join the empty lists together
|
|
// no locking requirements for joining nodes since the skip list index is not known
|
|
// to anyone yet!
|
|
// [N]<----------------------------------->[N]
|
|
// [N]<----------------------------------->[N]
|
|
// ..........................................................................
|
|
JoinStartEndNodes(&(skiplist->_base._startNode), &(skiplist->_base._endNode), 0, skiplist->_base._maxHeight - 1); // joins list 0 & 1
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys a skip list, but does not free the pointer
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_DestroySkipListEx(TRI_skiplistEx_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
DestroyBaseSkipListEx( (TRI_skiplistEx_base_t*)(skiplist) );
|
|
}
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys a skip list and frees the pointer
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_FreeSkipListEx(TRI_skiplistEx_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
TRI_DestroySkipListEx(skiplist);
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, skiplist);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- unique skiplist public functions
|
|
// -----------------------------------------------------------------------------
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @addtogroup Skiplist_unique
|
|
/// @{
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns the end node associated with a skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// .............................................................................
|
|
// Observe that this is some sort of read transaction. The only possibilitiy
|
|
// we have is that the index must have been created AFTER this read transaction
|
|
// occurred (given that the skip list is valid of course). We do not check
|
|
// for this internal error.
|
|
// Also note that the ADDRESS of the START (HEAD) and END (TAIL) nodes never
|
|
// change once the skip list is created. (These addresses are static.)
|
|
// .............................................................................
|
|
|
|
void* TRI_EndNodeSkipListEx(TRI_skiplistEx_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
return &(skiplist->_base._endNode);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief adds an key/element to the skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_InsertElementSkipListEx(TRI_skiplistEx_t* skiplist, void* element, bool overwrite, uint64_t thisTransID) {
|
|
// Use TRI_InsertKeySkipList instead of calling this method
|
|
LOG_TRACE("Insertions into a skip list require a key. Elements/items are not currently supported.");
|
|
assert(false);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief inserts (adds) an element to the skip list using a key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_InsertKeySkipListEx (TRI_skiplistEx_t* skiplist, // the skiplist we are using
|
|
void* key, // the key used to locate the position of the item within the list
|
|
void* element, // the data stored within the skiplist node
|
|
bool overwrite, // if true, then if the key already exists, the element will be replaced by this one
|
|
uint64_t thisTransID) { // the transaction id of the writer which has requested the insertion
|
|
int32_t newHeight;
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode;
|
|
TRI_skiplistEx_node_t* nextNode;
|
|
TRI_skiplistEx_node_t* newNode;
|
|
int compareResult;
|
|
int result;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the number of levels in which to add the item. That is, determine
|
|
// the height of the node so that it participates in that many lists.
|
|
// Convert the level to a height
|
|
// ...........................................................................
|
|
|
|
newHeight = (RandLevel(&(skiplist->_base))) + 1;
|
|
|
|
|
|
// ...........................................................................
|
|
// Something wrong since the newHeight must be at least 1
|
|
// ...........................................................................
|
|
|
|
if (newHeight < 1) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Grow lists if required by increasing the height of the start and end nodes
|
|
// ...........................................................................
|
|
|
|
result = GrowStartEndNodes(&(skiplist->_base), newHeight);
|
|
if (result != TRI_ERROR_NO_ERROR) {
|
|
return result;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Create the new node to be inserted. If there is some sort of failure,
|
|
// then we delete the node memory.
|
|
// ...........................................................................
|
|
|
|
newNode = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_skiplistEx_node_t) + skiplist->_base._elementSize, false);
|
|
if (newNode == NULL) { // out of memory?
|
|
// no necessity to undo the start/end node growth
|
|
return TRI_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Copy the contents of element into the new node to be inserted.
|
|
// If a duplicate has been found, then we destroy the allocated memory.
|
|
// ...........................................................................
|
|
|
|
newNode->_column = NULL;
|
|
newNode->_colLength = 0;
|
|
newNode->_extraData = NULL;
|
|
|
|
result = IndexStaticCopyElementElement(&(skiplist->_base), &(newNode->_element), element);
|
|
result = GrowNewNodeHeight(newNode, newHeight, newHeight, result);
|
|
if (result != TRI_ERROR_NO_ERROR) {
|
|
FreeSkipListExNode(&(skiplist->_base), newNode);
|
|
return result;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Assign the deletion transaction id and the insertion transaction id
|
|
// ...........................................................................
|
|
|
|
newNode->_delTransID = UINT64_MAX; // since we are inserting this new node it can not be deleted
|
|
newNode->_insTransID = thisTransID; // this is what was given to us
|
|
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
FreeSkipListExNode(&(skiplist->_base), newNode);
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, lock? The sleep time should be something
|
|
// needs to be adjusted.
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the path where the new item is to be inserted. If the item
|
|
// already exists either replace it or return false. Recall that this
|
|
// skip list is used for unique key/value pairs. Use the skiplist-multi
|
|
// non-unique key/value pairs.
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._startNode._colLength - 1; // NOT current height BUT current level is required here
|
|
currentNode = &(skiplist->_base._startNode);
|
|
nextNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
|
|
// .........................................................................
|
|
// The current node (which we have called the nextNode below) should never
|
|
// be null. Protect yourself in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// An insert/lookup/removal SEARCH like this, can ONLY ever find 1 glass
|
|
// node when we are very unlucky. (The GC makes the node glass and then
|
|
// goes and unlinks the pointers.) If we skip the glass node, then we
|
|
// will have the wrong pointers to compare, so we have to CAS_RESTART
|
|
// .........................................................................
|
|
|
|
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (nextNode == &(skiplist->_base._endNode)) {
|
|
|
|
|
|
// .......................................................................
|
|
// Store the current node and level in the path
|
|
// .......................................................................
|
|
|
|
if (currentLevel < newHeight) {
|
|
newNode->_column[currentLevel]._prev = currentNode;
|
|
newNode->_column[currentLevel]._next = nextNode;
|
|
}
|
|
|
|
// .......................................................................
|
|
// if we are at the lowest level of the lists, insert the item to the
|
|
// right of the current node
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
goto END;
|
|
}
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down.
|
|
// .......................................................................
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element.
|
|
// .......................................................................
|
|
compareResult = IndexStaticCompareKeyElement(skiplist,key,&(nextNode->_element), 0);
|
|
|
|
|
|
// .......................................................................
|
|
// The element to be inserted has a key which greater than the next node's
|
|
// element key. Keep going on this level.
|
|
// .......................................................................
|
|
|
|
if (compareResult > 0) {
|
|
currentNode = nextNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element matches the next element.
|
|
// However since we support transactions some things are different and we
|
|
// we have to tread carefully. Note that any nodes with the same key are
|
|
// ALWAYS inserted to the LEFT of the existing node. This means we need
|
|
// only check the next node.
|
|
// .......................................................................
|
|
|
|
if (compareResult == 0) {
|
|
|
|
// .....................................................................
|
|
// It may happen that this node is NOT deleted and simply there -
|
|
// check the ins & del transaction numbers.
|
|
// .....................................................................
|
|
|
|
|
|
if (nextNode->_insTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// Something terrible has happened since writers have been serialized,
|
|
// how is that an existing node has a higher transaction number than
|
|
// this transaction
|
|
// ...................................................................
|
|
|
|
printf("%s:%s:%d:Can not be here!\n",__FILE__,__FUNCTION__,__LINE__);
|
|
assert(false); // there is no way we can be here
|
|
}
|
|
|
|
|
|
// .....................................................................
|
|
// node has been previously inserted
|
|
// .....................................................................
|
|
|
|
|
|
if (nextNode->_delTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// Node has NOT been deleted (e.g. imagine it will be deleted some
|
|
// time in the future). Treat this as a duplicate key, overwrite if
|
|
// possible and return. We do not allow elements with duplicate 'keys'.
|
|
// ...................................................................
|
|
|
|
FreeSkipListExNode(&(skiplist->_base), newNode);
|
|
|
|
if (overwrite) {
|
|
result = IndexStaticCopyElementElement(&(skiplist->_base), &(nextNode->_element), element);
|
|
return result;
|
|
}
|
|
return TRI_set_errno(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
|
|
}
|
|
|
|
// .....................................................................
|
|
// The only case left here is that the node has been deleted by either
|
|
// this transaction (which could happen in an UPDATE) or by some
|
|
// previous write transaction. Treat this case as if the element is
|
|
// less than the next node element - this ensure that that the
|
|
// most recent revision of the data is always to the LEFT.
|
|
// Keep going on this level.
|
|
// .....................................................................
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element is less than the next node. Can we drop down the list?
|
|
// Store the current node and level in the path.
|
|
// .......................................................................
|
|
if (currentLevel < newHeight) {
|
|
newNode->_column[currentLevel]._prev = currentNode;
|
|
newNode->_column[currentLevel]._next = nextNode;
|
|
}
|
|
|
|
// .......................................................................
|
|
// We have reached the lowest level of the lists. Time to insert item.
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
goto END;
|
|
}
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
|
|
goto START;
|
|
}
|
|
|
|
} // end of label START
|
|
|
|
} // end of label CAS_RESTART
|
|
|
|
|
|
END: {
|
|
|
|
// ..........................................................................
|
|
// Ok finished with the loop and we should have a path with AT MOST
|
|
// SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT number of elements.
|
|
// ..........................................................................
|
|
|
|
|
|
// ..........................................................................
|
|
// this is the tricky part since we have to attempt to do this as
|
|
// 'lock-free' as possible. This is acheived in three passes:
|
|
// Pass 1: Mark each prev and next node of the new node so that the GC
|
|
// can not modify it. If this fails goto CAS_RESTART
|
|
// Pass 2: Ensure that each prev and next tower is not glassed.
|
|
// Pass 3: Modify the newnode.prev.next to newnode and newnode.next.prev = newnode
|
|
// ..........................................................................
|
|
|
|
result = JoinNewNodeCas(newNode);
|
|
if (result == TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE) {
|
|
goto CAS_RESTART;
|
|
}
|
|
return result;
|
|
|
|
} // end of END label
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns greatest node less than a given key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_LeftLookupByKeySkipListEx(TRI_skiplistEx_t* skiplist, void* key, uint64_t thisTransID) {
  // ...........................................................................
  // Descends the towers from the top level of the start node down to level 0.
  // On each level it moves right while the comparator reports that the key is
  // greater than the element of the node ahead, and drops one level otherwise.
  // The node reached when level 0 is exhausted is returned; this may be the
  // start node itself.
  //
  // skiplist     the skiplist to search (NULL is logged and yields NULL)
  // key          the key handed to IndexStaticCompareKeyElement
  // thisTransID  not referenced by this function
  //
  // Returns NULL when skiplist is NULL or when the restart limit
  // SKIPLIST_EX_CAS_FAILURES_MAX_LOOP has been reached.
  // ...........................................................................

  int32_t level;
  TRI_skiplistEx_node_t* leftNode;   // last node known to compare below the key
  TRI_skiplistEx_node_t* probe;      // node currently being examined
  int restarts = -1;                 // becomes 0 on the first (regular) attempt

  if (skiplist == NULL) {
    LOG_ERROR("Internal Error");
    return NULL;
  }

  // ...........................................................................
  // Outer loop: one iteration per complete search attempt. A 'break' out of the
  // inner loop lands here again and restarts the search from the start node.
  // ...........................................................................

  for (;;) {

    // give up once the maximum number of restarts has been used
    if (restarts == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
      LOG_ERROR("CAS Failure");
      return NULL;
    }

    // every attempt but the first one backs off for a moment
    if (restarts >= 0) {
      assert(0); // a test to see why it blocks - should not block!
      usleep(CAS_FAILURE_SLEEP_TIME);
    }

    restarts++;

    // begin at the top level of the start node
    level    = skiplist->_base._startNode._colLength - 1;
    leftNode = &(skiplist->_base._startNode);
    probe    = leftNode;

    // .........................................................................
    // Inner loop: one iteration per step (to the right or one level down)
    // .........................................................................

    for (;;) {
      int cmp;

      // protect against a broken chain - restart the whole search
      if (probe == NULL) {
        LOG_ERROR("CAS Failure");
        assert(0);
        break;
      }

      // the successor on this level is required; it should never be NULL
      probe = (TRI_skiplistEx_node_t*)(probe->_column[level]._next);

      if (probe == NULL) {
        LOG_ERROR("CAS Failure");
        assert(0);
        break;
      }

      // glass nodes are stepped over; there may be several of them in a row
      if (probe->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
        continue;
      }

      if (probe != &(skiplist->_base._endNode)) {

        // .....................................................................
        // The comparison is made as if we were looking for "key - epsilon"
        // (last argument -1), hence an exact match is not expected here.
        // .....................................................................

        cmp = IndexStaticCompareKeyElement(skiplist, key, &(probe->_element), -1);

        if (cmp == 0) {
          assert(false);
        }

        // key is greater than the probed element: keep going on this level
        if (cmp > 0) {
          leftNode = probe;
          continue;
        }
      }

      // .......................................................................
      // Either the end node has been reached or the key is less than the
      // probed element. On the lowest level the search is over, otherwise
      // continue one level further down, starting from the left node again.
      // .......................................................................

      if (level == 0) {
        return leftNode;
      }

      --level;
      probe = leftNode;
    }
  }

  // never reached
  assert(false);
  return NULL;
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns node which matches a key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_LookupByKeySkipListEx (TRI_skiplistEx_t* skiplist, void* key, uint64_t thisTransID) {
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode;
|
|
TRI_skiplistEx_node_t* nextNode;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case ...
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
LOG_ERROR("Internal Error");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, block?
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the starting level and the starting node
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._startNode._colLength - 1;
|
|
currentNode = &(skiplist->_base._startNode);
|
|
nextNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
|
|
// .........................................................................
|
|
// Find the next node in the current level of the lists. Protect yourself
|
|
// in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// Is our next node a glass node? If so we must skip it!
|
|
// Note: since Garbage Collection is performed in TWO passes, it is possible
|
|
// that we have more than one glass node.
|
|
// .........................................................................
|
|
|
|
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (nextNode == &(skiplist->_base._endNode)) {
|
|
|
|
|
|
// .......................................................................
|
|
// We are at the lowest level of the lists, and we haven't found the item
|
|
// yet. The currentNode does NOT compare and the next node is +\infinty.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down. Possibly our
|
|
// item we seek is to be found a lower level.
|
|
// .......................................................................
|
|
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
int compareResult;
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element. We treat the comparison by assuming we are
|
|
// looking for a "key - epsilon". With this assumption we always find the
|
|
// last key to our right if it exists. The reason this is necessary is as
|
|
// follows: we allow a multiple documents with the same key to be stored
|
|
// here with the proviso that all but the last one is marked as deleted.
|
|
// This is how we cater for multiple revisions.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticCompareKeyElement(skiplist,key,&(nextNode->_element), 0);
|
|
|
|
// .......................................................................
|
|
// The element is greater than the next node element. Keep going on this
|
|
// level.
|
|
// .......................................................................
|
|
|
|
if (compareResult > 0) {
|
|
currentNode = nextNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
if (compareResult == 0) {
|
|
|
|
// .....................................................................
|
|
// It may happen that this node is NOT deleted and simply there -
|
|
// check the ins & del transaction numbers.
|
|
// .....................................................................
|
|
|
|
|
|
if (nextNode->_insTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// This node has been inserted AFTER the reading starting reading!
|
|
// Treat this as if the node was NEVER there.
|
|
// ...................................................................
|
|
|
|
//return NULL;
|
|
}
|
|
|
|
|
|
// .....................................................................
|
|
// node has been previously inserted
|
|
// .....................................................................
|
|
|
|
|
|
if (nextNode->_delTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// Node has NOT been deleted (e.g. imagine it will be deleted some
|
|
// time in the future). This is the node we want, even though it may
|
|
// be deleted very very soon.
|
|
// ...................................................................
|
|
|
|
return nextNode;
|
|
}
|
|
|
|
// .....................................................................
|
|
// The only case left here is that the node has been deleted by either
|
|
// this transaction (which could happen in an UPDATE) or by some
|
|
// previous write transaction. Treat this case as if the element is
|
|
// less than the next node element - this ensures that that the
|
|
// most recent revision of the data is always to the LEFT.
|
|
// Keep going on this level.
|
|
// .....................................................................
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element is less than the next node. Can we drop down the list?
|
|
// If have reached the lowest level of the lists -- no such item.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return NULL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
--currentLevel;
|
|
nextNode = currentNode;
|
|
goto START;
|
|
}
|
|
|
|
} // end of label START
|
|
|
|
} // end of label CAS_RESTART
|
|
|
|
assert(0);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief given a node returns the next node (if possible) in the skiplist
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_NextNodeSkipListEx(TRI_skiplistEx_t* skiplist, void* currentNode, uint64_t thisTransID) {
|
|
if (skiplist != NULL) {
|
|
return NextNodeBaseSkipListEx( (TRI_skiplistEx_base_t*)(skiplist), currentNode, thisTransID);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief given a node returns the previous node (if possible) in the skiplist
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_PrevNodeSkipListEx(TRI_skiplistEx_t* skiplist, void* currentNode, uint64_t thisTransID) {
|
|
if (skiplist != NULL) {
|
|
return PrevNodeBaseSkipListEx( (TRI_skiplistEx_base_t*)(skiplist), currentNode, thisTransID);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief removes an element from the skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_RemoveElementSkipListEx (TRI_skiplistEx_t* skiplist, void* element, void* old,
|
|
const int passLevel, const uint64_t thisTransID,
|
|
TRI_skiplistEx_node_t** passNode) {
|
|
// ...........................................................................
|
|
// To remove an element from this skip list we have three pass levels:
|
|
// Pass 1: locate (if possible) the exact NODE - must match exactly.
|
|
// Once located, add the transaction id to the node. Return.
|
|
// Pass 2: locate the node (if not possible report error) - must match exactly.
|
|
// Once located, attempt to unlink all the pointers and make the
|
|
// node a Glass Node.
|
|
// Pass 3: Excise the node by destroying it's allocated memory.
|
|
// ...........................................................................
|
|
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode = NULL;
|
|
TRI_skiplistEx_node_t* nextNode = NULL;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
LOG_ERROR("Internal Error");
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Only for pass level 1 do we have a requirement to locate the actual node
|
|
// using the key. For pass levels 2 & 3 we have the pointer to the node.
|
|
// ...........................................................................
|
|
|
|
if (passLevel != 1) { goto END; }
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, block?
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the starting level and the starting node
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._startNode._colLength - 1;
|
|
currentNode = &(skiplist->_base._startNode);
|
|
nextNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
// .........................................................................
|
|
// Find the next node in the current level of the lists. Protect yourself
|
|
// in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// Is our next node a glass node? If so we must skip it. Recall we are in
|
|
// Phase I here -- meaning that we are searching for a node which has not
|
|
// be removed and previously inserted.
|
|
// .........................................................................
|
|
|
|
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (nextNode == &(skiplist->_base._endNode)) {
|
|
|
|
// .......................................................................
|
|
// We are at the lowest level of the lists, and we haven't found the item
|
|
// yet. Nothing to remove so return.
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
|
|
}
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down.
|
|
// .......................................................................
|
|
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
int compareResult;
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticCompareElementElement(skiplist,element,&(nextNode->_element), -1);
|
|
|
|
|
|
// .......................................................................
|
|
// The element is greater than the next node element. Keep going on this
|
|
// level.
|
|
// .......................................................................
|
|
if (compareResult > 0) {
|
|
currentNode = nextNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
if (compareResult == 0) { // a node matches the key exactly
|
|
|
|
if (nextNode->_insTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// This node has been inserted AFTER the reader starting reading!
|
|
// An insertion can only have occured if (a) there never was a previous
|
|
// node with the same key or (b) there exists another with the same
|
|
// key but of course now must be marked as deleted.
|
|
// ...................................................................
|
|
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_POST_INSERTED;
|
|
}
|
|
|
|
|
|
// .....................................................................
|
|
// node has been previously inserted
|
|
// .....................................................................
|
|
|
|
|
|
if (nextNode->_delTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// Node has NOT been deleted (e.g. imagine it will be deleted some
|
|
// time in the future). This is the node we want.
|
|
// ...................................................................
|
|
|
|
currentNode = nextNode;
|
|
goto END;
|
|
}
|
|
|
|
// .....................................................................
|
|
// The only case left here is that the node has been deleted by either
|
|
// this transaction (which could happen in an UPDATE) or by some
|
|
// previous write transaction. Treat this case as if the element is
|
|
// less than the next node element - this ensure that that the
|
|
// most recent revision of the data is always to the LEFT.
|
|
// Keep going on this level.
|
|
// .....................................................................
|
|
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have reached the lowest level of the lists -- no such item.
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
|
|
}
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
--currentLevel;
|
|
nextNode = currentNode;
|
|
goto START;
|
|
}
|
|
|
|
} // end of START label
|
|
} // end of CAS_RESTART label
|
|
|
|
END: {
|
|
|
|
switch (passLevel) {
|
|
|
|
// .........................................................................
|
|
// In this case we simply add the del transaction id with a CAS statement.
|
|
// It should never fail!
|
|
// .........................................................................
|
|
|
|
case 1: {
|
|
bool ok;
|
|
|
|
if (currentNode == NULL) { // something terribly wrong
|
|
assert(0);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
ok = TRI_CompareAndSwapIntegerUInt64 (&(currentNode->_delTransID),
|
|
UINT64_MAX, thisTransID);
|
|
if (!ok) {
|
|
assert(0);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
// ....................................................................
|
|
// If requested copy the contents of the element we have located into the
|
|
// storage sent.
|
|
// ....................................................................
|
|
|
|
if (old != NULL) {
|
|
IndexStaticCopyElementElement(&(skiplist->_base), old, &(currentNode->_element));
|
|
}
|
|
|
|
*passNode = currentNode;
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// In this case we wish to make the node a glass node and to unjoin all
|
|
// other connected nodes.
|
|
// .........................................................................
|
|
case 2: {
|
|
|
|
// .......................................................................
|
|
// We can not now rely upon looking up the node using the key, since
|
|
// we would need to traverse right and attempt to match either then
|
|
// transaction id and/or the pointer to the doc. Easier to simply
|
|
// send the address of the node back.
|
|
// .......................................................................
|
|
if (*passNode == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
currentNode = (TRI_skiplistEx_node_t*)(*passNode);
|
|
|
|
|
|
// .......................................................................
|
|
// Only the Garbage Collector can transform a node into a glass node, and
|
|
// since the GC is only operating in one thread safe to do a simple
|
|
// comparison here.
|
|
// .......................................................................
|
|
if (currentNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// safety check
|
|
// .......................................................................
|
|
if (currentNode->_delTransID != thisTransID) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// The stragey is this:
|
|
// (a) Brick each nearest neighbour on this node. This ensures that NO
|
|
// other nodes can be attached to this node.
|
|
// (b) Mark this node as being glass. This ensures that it is skipped
|
|
// since it is no longer required in the index.
|
|
// (c) Unbrick each of its nearest neighbours on this node. This ensures
|
|
// that an inserted node MAY be allowed to be attached but will later fail.
|
|
// Also allows us to brick other glass nodes.
|
|
// (d) Brick each prev and next nearest neighbour of this node. Irrespective
|
|
// if one of these are glass or not. This ensures that lookups can
|
|
// proceed unhinded.
|
|
// (e) Unjoin the node from the list.
|
|
// (f) Unbrick each prev/next nearest neigbour
|
|
// .......................................................................
|
|
|
|
return UnJoinOldNodeCas(currentNode);
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// In this case since no other reader/writer can be accessing the node,
|
|
// we simply destroy it. we require the node to be glass.
|
|
// .........................................................................
|
|
case 3: {
|
|
// .......................................................................
|
|
// We can not now rely upon looking up the node using the key, since
|
|
// we would need to traverse right and attempt to match either then
|
|
// transaction id and/or the pointer to the doc. Easier to simply
|
|
// send the address of the node back.
|
|
// .......................................................................
|
|
if (*passNode == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
currentNode = (TRI_skiplistEx_node_t*)(*passNode);
|
|
|
|
// .......................................................................
|
|
// Only the Garbage Collector can transform a node into a glass node, and
|
|
// since the GC is only operating in one thread safe to do a simple
|
|
// comparison here.
|
|
// .......................................................................
|
|
if (currentNode->_towerFlag != TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// safety check
|
|
// .......................................................................
|
|
if (currentNode->_delTransID != thisTransID) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
FreeSkipListExNode(&(skiplist->_base), currentNode);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default: {
|
|
assert(0);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
} // end of switch statement
|
|
|
|
} // end of END label
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief removes an key/element to the skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_RemoveKeySkipListEx(TRI_skiplistEx_t* skiplist, void* key, void* old,
|
|
const int passLevel, const uint64_t thisTransID,
|
|
TRI_skiplistEx_node_t** passNode) {
|
|
// Use the TRI_RemoveElementSkipList method instead.
|
|
assert(false);
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns smallest node greater than a given key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_RightLookupByKeySkipListEx(TRI_skiplistEx_t* skiplist, void* key, uint64_t thisTransID) {
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode;
|
|
TRI_skiplistEx_node_t* prevNode;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case ...
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
LOG_ERROR("Internal Error");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, block?
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
assert(0); // a test to see why it blocks - should not block!
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the starting level and the starting node
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._endNode._colLength - 1;
|
|
currentNode = &(skiplist->_base._endNode);
|
|
prevNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
|
|
// .........................................................................
|
|
// Find the next node in the current level of the lists. Protect yourself
|
|
// in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (prevNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
prevNode = (TRI_skiplistEx_node_t*)(prevNode->_column[currentLevel]._prev);
|
|
if (prevNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// Is our next node a glass node? If so we must skip it!
|
|
// Note: since Garbage Collection is performed in TWO passes, it is possible
|
|
// that we have more than one glass node.
|
|
// .........................................................................
|
|
|
|
if (prevNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (prevNode == &(skiplist->_base._startNode)) {
|
|
|
|
|
|
// .......................................................................
|
|
// We are at the lowest level of the lists, and we haven't found the item
|
|
// yet. The currentNode does NOT compare and the next node is +\infinty.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return currentNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down. Possibly our
|
|
// item we seek is to be found a lower level.
|
|
// .......................................................................
|
|
|
|
prevNode = currentNode;
|
|
--currentLevel;
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
int compareResult;
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element. We treat the comparison by assuming we are
|
|
// looking for a "key - epsilon". With this assumption we always find the
|
|
// last key to our right if it exists. The reason this is necessary is as
|
|
// follows: we allow a multiple documents with the same key to be stored
|
|
// here with the proviso that all but the last one is marked as deleted.
|
|
// This is how we cater for multiple revisions.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticCompareKeyElement(skiplist,key,&(prevNode->_element), 1);
|
|
|
|
// .......................................................................
|
|
// If the number of fields (attributes) in the key is LESS than the number
|
|
// of fields in the element to be compared to, then EVEN if the keys which
|
|
// which are common to both equate as EQUAL, we STILL return 1 rather than
|
|
// 0! This ensures that the right interval end point is correctly positioned
|
|
// -- slightly inefficient since the lowest level skip list 0 has to be reached
|
|
// in this case.
|
|
// .......................................................................
|
|
|
|
// .......................................................................
|
|
// We have found the item!
|
|
// .......................................................................
|
|
|
|
if (compareResult == 0) {
|
|
assert(false);
|
|
}
|
|
|
|
if (compareResult < 0) {
|
|
currentNode = prevNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element is less than the next node. Can we drop down the list?
|
|
// If have reached the lowest level of the lists -- no such item.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return currentNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
|
|
--currentLevel;
|
|
prevNode = currentNode;
|
|
goto START;
|
|
}
|
|
|
|
} // end of label START
|
|
|
|
} // end of label CAS_RESTART
|
|
|
|
assert(false);
|
|
return NULL;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns the start node associated with a skip list.
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
void* TRI_StartNodeSkipListEx(TRI_skiplistEx_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
return &(skiplist->_base._startNode);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- non-unique skiplist constructors and destructors
|
|
// -----------------------------------------------------------------------------
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @addtogroup Skiplist_non_unique
|
|
/// @{
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief initialises a multi skip list which allows duplicate entries
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_InitSkipListExMulti (TRI_skiplistEx_multi_t* skiplist, size_t elementSize,
|
|
int (*compareElementElement) (TRI_skiplistEx_multi_t*, void*, void*, int),
|
|
int (*compareKeyElement) (TRI_skiplistEx_multi_t*, void*, void*, int),
|
|
bool (*equalElementElement) (TRI_skiplistEx_multi_t*, void*, void*),
|
|
TRI_skiplistEx_prob_e probability,
|
|
uint32_t maximumHeight,
|
|
uint64_t lastKnownTransID) {
|
|
|
|
int result;
|
|
|
|
if (skiplist == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// ..........................................................................
|
|
// Assign the STATIC comparision call back functions
|
|
// ..........................................................................
|
|
|
|
skiplist->compareElementElement = IndexStaticMultiCompareElementElement; //compareElementElement;
|
|
skiplist->compareKeyElement = IndexStaticMultiCompareKeyElement; // compareKeyElement;
|
|
skiplist->equalElementElement = IndexStaticMultiEqualElementElement; //equalElementElement;
|
|
|
|
// ..........................................................................
|
|
// Assign the maximum height of the skip list. This maximum height must be
|
|
// no greater than the absolute max height defined as a compile time parameter
|
|
// ..........................................................................
|
|
if (maximumHeight == 0) {
|
|
maximumHeight = SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT;
|
|
}
|
|
skiplist->_base._maxHeight = maximumHeight;
|
|
if (maximumHeight > SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT) {
|
|
LOG_ERROR("Invalid maximum height for skiplist");
|
|
assert(false);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// ..........................................................................
|
|
// Assign the probability and determine the number of random numbers which
|
|
// we will require -- do it once off here
|
|
// ..........................................................................
|
|
skiplist->_base._prob = probability;
|
|
skiplist->_base._numRandom = 0;
|
|
switch (skiplist->_base._prob) {
|
|
case TRI_SKIPLIST_EX_PROB_HALF: {
|
|
// determine the number of random numbers which we require.
|
|
skiplist->_base._numRandom = (skiplist->_base._maxHeight / 32);
|
|
if ((skiplist->_base._maxHeight % 32) != 0) {
|
|
++(skiplist->_base._numRandom);
|
|
}
|
|
break;
|
|
}
|
|
case TRI_SKIPLIST_EX_PROB_THIRD: {
|
|
// determine the number of random numbers which we require.
|
|
skiplist->_base._numRandom = (skiplist->_base._maxHeight / 16);
|
|
if ((skiplist->_base._maxHeight % 16) != 0) {
|
|
++(skiplist->_base._numRandom);
|
|
}
|
|
break;
|
|
}
|
|
case TRI_SKIPLIST_EX_PROB_QUARTER: {
|
|
// determine the number of random numbers which we require.
|
|
skiplist->_base._numRandom = (skiplist->_base._maxHeight / 16);
|
|
if ((skiplist->_base._maxHeight % 16) != 0) {
|
|
++(skiplist->_base._numRandom);
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
LOG_ERROR("Invalid probability assigned to skiplist");
|
|
assert(false);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
} // end of switch statement
|
|
|
|
|
|
// ..........................................................................
|
|
// Create storage for where to store the random numbers which we generated
|
|
// do it here once off.
|
|
// ..........................................................................
|
|
|
|
skiplist->_base._random = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(uint32_t) * skiplist->_base._numRandom, false);
|
|
if (skiplist->_base._random == NULL) {
|
|
return TRI_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
|
|
// ..........................................................................
|
|
// Assign the element size
|
|
// ..........................................................................
|
|
skiplist->_base._elementSize = elementSize;
|
|
|
|
|
|
// ..........................................................................
|
|
// Initialise the vertical storage of the lists and the place where we
|
|
// are going to store elements
|
|
// ..........................................................................
|
|
skiplist->_base._startNode._column = NULL;
|
|
skiplist->_base._startNode._colLength = 0;
|
|
skiplist->_base._startNode._extraData = NULL;
|
|
skiplist->_base._startNode._element = NULL;
|
|
skiplist->_base._startNode._delTransID = UINT64_MAX;
|
|
skiplist->_base._startNode._insTransID = lastKnownTransID;
|
|
|
|
skiplist->_base._endNode._column = NULL;
|
|
skiplist->_base._endNode._colLength = 0;
|
|
skiplist->_base._endNode._extraData = NULL;
|
|
skiplist->_base._endNode._element = NULL;
|
|
skiplist->_base._endNode._delTransID = UINT64_MAX;
|
|
skiplist->_base._endNode._insTransID = lastKnownTransID;
|
|
|
|
|
|
// ...........................................................................
|
|
// 32 bit integer CAS flag
|
|
// ...........................................................................
|
|
skiplist->_base._growStartEndNodesFlag = TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG;
|
|
|
|
|
|
// ..........................................................................
|
|
// Whenever a probability of 1/2, 1/3, 1/4 is used, on average
|
|
// each node will have a height of two. So initialise the start and end nodes
|
|
// with this 'average' height
|
|
// ..........................................................................
|
|
result = GrowNewNodeHeight(&(skiplist->_base._startNode), skiplist->_base._maxHeight, 2, TRI_ERROR_NO_ERROR); // may fail
|
|
result = GrowNewNodeHeight(&(skiplist->_base._endNode), skiplist->_base._maxHeight, 2, result); // may fail
|
|
|
|
if (result != TRI_ERROR_NO_ERROR) {
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._random));
|
|
|
|
if (skiplist->_base._startNode._column != NULL) {
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._startNode._column));
|
|
}
|
|
|
|
if (skiplist->_base._endNode._column != NULL) {
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._endNode._column));
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// ..........................................................................
|
|
// Join the empty lists together
|
|
// [N]<----------------------------------->[N]
|
|
// [N]<----------------------------------->[N]
|
|
// ..........................................................................
|
|
JoinStartEndNodes(&(skiplist->_base._startNode),&(skiplist->_base._endNode),0, skiplist->_base._maxHeight - 1); // joins list 0 & 1
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys a multi skip list, but does not free the pointer
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_DestroySkipListExMulti (TRI_skiplistEx_multi_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
DestroyBaseSkipListEx( (TRI_skiplistEx_base_t*)(skiplist) );
|
|
}
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys a multi skip list and frees the pointer
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void TRI_FreeSkipListExMulti (TRI_skiplistEx_multi_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
TRI_DestroySkipListExMulti(skiplist);
|
|
TRI_Free(TRI_UNKNOWN_MEM_ZONE, skiplist);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// --SECTION-- non-unique skiplist public methods
|
|
// -----------------------------------------------------------------------------
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @addtogroup Skiplist_non_unique
|
|
/// @{
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief Returns the end node associated with a skip list.
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_EndNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist) {
|
|
if (skiplist != NULL) {
|
|
return &(skiplist->_base._endNode);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief adds an element to a multi skip list using an element for searching
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_InsertElementSkipListExMulti(TRI_skiplistEx_multi_t* skiplist,
|
|
void* element,
|
|
bool overwrite,
|
|
uint64_t thisTransID) {
|
|
int32_t newHeight;
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode;
|
|
TRI_skiplistEx_node_t* nextNode;
|
|
TRI_skiplistEx_node_t* newNode;
|
|
int compareResult;
|
|
int result;
|
|
int casFailures = -1;
|
|
|
|
|
|
// ...........................................................................
|
|
// Just in case
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the number of levels in which to add the item. That is, determine
|
|
// the height of the node so that it participates in that many lists.
|
|
// ...........................................................................
|
|
|
|
newHeight = RandLevel(&(skiplist->_base)) + 1;
|
|
|
|
// ...........................................................................
|
|
// Something wrong since the newHeight must be non-negative
|
|
// ...........................................................................
|
|
|
|
if (newHeight < 1) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Grow lists if required by increasing the height of the start and end nodes
|
|
// ...........................................................................
|
|
|
|
result = GrowStartEndNodes(&(skiplist->_base), newHeight);
|
|
if (result != TRI_ERROR_NO_ERROR) {
|
|
return result;
|
|
}
|
|
|
|
// ...........................................................................
|
|
// Create the new node to be inserted. If there is some sort of failure,
|
|
// then we delete the node memory.
|
|
// ...........................................................................
|
|
newNode = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_skiplistEx_node_t) + skiplist->_base._elementSize, false);
|
|
if (newNode == NULL) { // out of memory?
|
|
return TRI_ERROR_OUT_OF_MEMORY;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Copy the contents of element into the new node to be inserted.
|
|
// If a duplicate has been found, then we destroy the allocated memory.
|
|
// ...........................................................................
|
|
newNode->_column = NULL;
|
|
newNode->_colLength = 0;
|
|
newNode->_extraData = NULL;
|
|
|
|
result = IndexStaticCopyElementElement(&(skiplist->_base), &(newNode->_element), element);
|
|
result = GrowNewNodeHeight(newNode, newHeight, newHeight, result);
|
|
if (result != TRI_ERROR_NO_ERROR) {
|
|
FreeSkipListExNode(&(skiplist->_base), newNode);
|
|
return result;
|
|
}
|
|
|
|
// ...........................................................................
|
|
// Assign the deletion transaction id and the insertion transaction id
|
|
// ...........................................................................
|
|
|
|
newNode->_delTransID = UINT64_MAX; // since we are inserting this new node it can not be deleted
|
|
newNode->_insTransID = thisTransID; // this is what was given to us
|
|
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
FreeSkipListExNode(&(skiplist->_base), newNode);
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, lock? The sleep time should be something
|
|
// needs to be adjusted.
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the path where the new item is to be inserted. If the item
|
|
// already exists either replace it or return false. Recall that this
|
|
// skip list is used for unique key/value pairs. Use the skiplist-multi
|
|
// non-unique key/value pairs.
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._startNode._colLength - 1; // NOT current height BUT current level is required here
|
|
currentNode = &(skiplist->_base._startNode);
|
|
nextNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
|
|
// .........................................................................
|
|
// The current node (which we have called the nextNode below) should never
|
|
// be null. Protect yourself in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// An insert/lookup/removal SEARCH like this, can ONLY ever find 1 glass
|
|
// node when we are very unlucky. (The GC makes the node glass and then
|
|
// goes and unlinks the pointers.) If we skip the glass node, then we
|
|
// will have the wrong pointers to compare, so we have to CAS_RESTART
|
|
// .........................................................................
|
|
|
|
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (nextNode == &(skiplist->_base._endNode)) {
|
|
|
|
|
|
// .......................................................................
|
|
// Store the current node and level in the path
|
|
// .......................................................................
|
|
|
|
if (currentLevel < newHeight) {
|
|
newNode->_column[currentLevel]._prev = currentNode;
|
|
newNode->_column[currentLevel]._next = nextNode;
|
|
}
|
|
|
|
// .......................................................................
|
|
// if we are at the lowest level of the lists, insert the item to the
|
|
// right of the current node
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
goto END;
|
|
}
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down.
|
|
// .......................................................................
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticMultiCompareElementElement(skiplist, element, &(nextNode->_element), -1);
|
|
|
|
|
|
// .......................................................................
|
|
// The element matches the next element. Overwrite if possible and return.
|
|
// The only possiblity of obtaining a compareResult equal to 0 is for the
|
|
// the element being the same, NOT the keys being the same.
|
|
// .......................................................................
|
|
|
|
if (compareResult == 0) {
|
|
FreeSkipListExNode(&(skiplist->_base), newNode);
|
|
if (overwrite) {
|
|
// ...................................................................
|
|
// Warning: there is NO check to ensure that this node has not been
|
|
// previously deleted.
|
|
// ...................................................................
|
|
result = IndexStaticCopyElementElement(&(skiplist->_base), &(nextNode->_element), element);
|
|
return result;
|
|
}
|
|
return TRI_ERROR_ARANGO_INDEX_SKIPLIST_INSERT_ITEM_DUPLICATED;
|
|
}
|
|
|
|
// .......................................................................
|
|
// The element to be inserted has a key which is greater than the next node's
|
|
// element key. Keep going on this level.
|
|
// .......................................................................
|
|
|
|
if (compareResult > 0) {
|
|
currentNode = nextNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element is less than the next node. Can we drop down the list?
|
|
// Store the current node and level in the path.
|
|
// .......................................................................
|
|
|
|
if (currentLevel < newHeight) {
|
|
newNode->_column[currentLevel]._prev = currentNode;
|
|
newNode->_column[currentLevel]._next = nextNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have reached the lowest level of the lists. Time to insert item.
|
|
// Note that we will insert this item to the left of all the items with
|
|
// the same key. Note also that the higher transaction numbers are to
|
|
// the left always.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
goto END;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
|
|
goto START;
|
|
}
|
|
|
|
} // end of label START
|
|
|
|
} // end of label CAS_RESTART
|
|
|
|
|
|
END: {
|
|
|
|
// ..........................................................................
|
|
// Ok finished with the loop and we should have a path with AT MOST
|
|
// SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT number of elements.
|
|
// ..........................................................................
|
|
|
|
|
|
// ..........................................................................
|
|
// this is the tricky part since we have to attempt to do this as
|
|
// 'lock-free' as possible. This is acheived in three passes:
|
|
// Pass 1: Mark each prev and next node of the new node so that the GC
|
|
// can not modify it. If this fails goto CAS_RESTART
|
|
// Pass 2: Ensure that each prev and next tower is not glassed.
|
|
// Pass 3: Modify the newnode.prev.next to newnode and newnode.next.prev = newnode
|
|
// ..........................................................................
|
|
|
|
result = JoinNewNodeCas(newNode);
|
|
if (result == TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE) {
|
|
goto CAS_RESTART;
|
|
}
|
|
return result;
|
|
|
|
} // end of END label
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief adds an key/element to a multi skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_InsertKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, void* element, bool overwrite, uint64_t thisTransID) {
|
|
// Use TRI_InsertElementSkipListExMulti instead of calling this method
|
|
assert(false);
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns greatest node less than a given key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_LeftLookupByKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, uint64_t thistransID) {
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode;
|
|
TRI_skiplistEx_node_t* nextNode;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case ...
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
LOG_ERROR("Internal Error");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, block?
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
assert(0); // a test to see why it blocks - should not block!
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the starting level and the starting node
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._startNode._colLength - 1;
|
|
currentNode = &(skiplist->_base._startNode);
|
|
nextNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
|
|
// .........................................................................
|
|
// Find the next node in the current level of the lists. Protect yourself
|
|
// in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// Is our next node a glass node? If so we must skip it!
|
|
// Note: since Garbage Collection is performed in TWO passes, it is possible
|
|
// that we have more than one glass node.
|
|
// .........................................................................
|
|
|
|
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (nextNode == &(skiplist->_base._endNode)) {
|
|
|
|
|
|
// .......................................................................
|
|
// We are at the lowest level of the lists, and we haven't found the item
|
|
// yet. The currentNode does NOT compare and the next node is +\infinty.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return currentNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down. Possibly our
|
|
// item we seek is to be found a lower level.
|
|
// .......................................................................
|
|
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
int compareResult;
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element. We treat the comparison by assuming we are
|
|
// looking for a "key - epsilon". With this assumption we always find the
|
|
// last key to our right if it exists. The reason this is necessary is as
|
|
// follows: we allow a multiple documents with the same key to be stored
|
|
// here with the proviso that all but the last one is marked as deleted.
|
|
// This is how we cater for multiple revisions.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticMultiCompareKeyElement(skiplist, key, &(nextNode->_element), -1);
|
|
|
|
// .......................................................................
|
|
// -1 is returned if the number of fields (attributes) in the key is LESS
|
|
// than the number of fields in the index definition. This has the effect
|
|
// of being slightly less efficient since we have to proceed to the level
|
|
// 0 list in the set of skip lists. Where we allow duplicates such as this
|
|
// -1 is also returned when all the keys match.
|
|
// .......................................................................
|
|
|
|
// .......................................................................
|
|
// We have found the item!
|
|
// .......................................................................
|
|
|
|
if (compareResult == 0) {
|
|
assert(false);
|
|
}
|
|
|
|
if (compareResult > 0) {
|
|
currentNode = nextNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element is less than the next node. Can we drop down the list?
|
|
// If have reached the lowest level of the lists -- no such item.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return currentNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
|
|
--currentLevel;
|
|
nextNode = currentNode;
|
|
goto START;
|
|
}
|
|
|
|
} // end of label START
|
|
|
|
} // end of label CAS_RESTART
|
|
|
|
assert(false);
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief locate a node using an element
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_LookupByElementSkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* element, uint64_t thisTransID) {
|
|
assert(false); // there is no way you should be here
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns node which matches a key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_LookupByKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, uint64_t thisTransID) {
|
|
// Since this index supports duplicate keys, it makes no sense to lookup an element in the index
|
|
// using a key - if there are such elements - what is returned is undefined (in the sense that a valid
|
|
// element is returned but which one?). Hence lookups can only really make sense to say give me the
|
|
// first such element and the last such element, so that we can traverse the elements which match the
|
|
// keys.
|
|
assert(false); // there is no way you should be here
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief given a node returns the next node (if possible) in the skiplist
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_NextNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* currentNode, uint64_t thisTransID) {
|
|
if (skiplist != NULL) {
|
|
return NextNodeBaseSkipListEx( (TRI_skiplistEx_base_t*)(skiplist), currentNode, thisTransID);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief given a node returns the previous node (if possible) in the skiplist
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_PrevNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* currentNode, uint64_t thisTransID) {
|
|
if (skiplist != NULL) {
|
|
return PrevNodeBaseSkipListEx( (TRI_skiplistEx_base_t*)(skiplist), currentNode, thisTransID);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief removes a key/element from a multi skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
int TRI_RemoveElementSkipListExMulti (TRI_skiplistEx_multi_t* skiplist, void* element, void* old,
|
|
const int passLevel, const uint64_t thisTransID,
|
|
TRI_skiplistEx_node_t** passNode) {
|
|
// ...........................................................................
|
|
// To remove an element from this skip list we have three pass levels:
|
|
// Pass 1: locate (if possible) the exact NODE - must match exactly.
|
|
// Once located, add the transaction id to the node. Return.
|
|
// Pass 2: locate the node (if not possible report error) - must match exactly.
|
|
// Once located, attempt to unlink all the pointers and make the
|
|
// node a Glass Node.
|
|
// Pass 3: Excise the node by destroying it's allocated memory.
|
|
// ...........................................................................
|
|
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode = NULL;
|
|
TRI_skiplistEx_node_t* nextNode = NULL;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
LOG_ERROR("Internal Error");
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Only for pass level 1 do we have a requirement to locate the actual node
|
|
// using the key. For pass levels 2 & 3 we have the pointer to the node.
|
|
// ...........................................................................
|
|
|
|
if (passLevel != 1) { goto END; }
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, block?
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the starting level and the starting node
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._startNode._colLength - 1;
|
|
currentNode = &(skiplist->_base._startNode);
|
|
nextNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
// .........................................................................
|
|
// Find the next node in the current level of the lists. Protect yourself
|
|
// in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
|
|
if (nextNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// Is our next node a glass node? If so we must skip it. Recall we are in
|
|
// Phase I here -- meaning that we are searching for a node which has not
|
|
// be removed and previously inserted.
|
|
// .........................................................................
|
|
|
|
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (nextNode == &(skiplist->_base._endNode)) {
|
|
|
|
// .......................................................................
|
|
// We are at the lowest level of the lists, and we haven't found the item
|
|
// yet. Nothing to remove so return.
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
|
|
}
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down.
|
|
// .......................................................................
|
|
|
|
nextNode = currentNode;
|
|
--currentLevel;
|
|
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
int compareResult;
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticMultiCompareElementElement(skiplist,element,&(nextNode->_element), -1);
|
|
|
|
|
|
// .......................................................................
|
|
// The element is greater than the next node element. Keep going on this
|
|
// level.
|
|
// .......................................................................
|
|
if (compareResult > 0) {
|
|
currentNode = nextNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
if (compareResult == 0) { // a node matches exactly based upon the element
|
|
|
|
if (nextNode->_delTransID > thisTransID) {
|
|
|
|
// ...................................................................
|
|
// Node has NOT been deleted (e.g. imagine it will be deleted some
|
|
// time in the future). This is the node we want.
|
|
// ...................................................................
|
|
|
|
currentNode = nextNode;
|
|
goto END;
|
|
}
|
|
|
|
|
|
// .....................................................................
|
|
// In a skiplist supporting duplicate entries, the comparison function
|
|
// test ensures the elements are the same (e.g. same address in memory)
|
|
// it can never be the case that we rely simply on the keys matching.
|
|
// So the question remains: why has the item has been previously
|
|
// deleted? Has someone tried to remove this item twice?
|
|
// Don't know return error.
|
|
// .....................................................................
|
|
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_PRIOR_REMOVED;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have reached the lowest level of the lists -- no such item.
|
|
// .......................................................................
|
|
if (currentLevel == 0) {
|
|
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
|
|
}
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
--currentLevel;
|
|
nextNode = currentNode;
|
|
goto START;
|
|
}
|
|
|
|
} // end of START label
|
|
} // end of CAS_RESTART label
|
|
|
|
END: {
|
|
|
|
switch (passLevel) {
|
|
|
|
// .........................................................................
|
|
// In this case we simply add the del transaction id with a CAS statement.
|
|
// It should never fail!
|
|
// .........................................................................
|
|
|
|
case 1: {
|
|
bool ok;
|
|
|
|
if (currentNode == NULL) { // something terribly wrong
|
|
assert(0);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
ok = TRI_CompareAndSwapIntegerUInt64 (&(currentNode->_delTransID),
|
|
UINT64_MAX, thisTransID);
|
|
if (!ok) {
|
|
assert(0);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
// ....................................................................
|
|
// If requested copy the contents of the element we have located into the
|
|
// storage sent.
|
|
// ....................................................................
|
|
|
|
if (old != NULL) {
|
|
IndexStaticCopyElementElement(&(skiplist->_base), old, &(currentNode->_element));
|
|
}
|
|
|
|
*passNode = currentNode;
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// In this case we wish to make the node a glass node and to unjoin all
|
|
// other connected nodes.
|
|
// .........................................................................
|
|
case 2: {
|
|
|
|
// .......................................................................
|
|
// We can not now rely upon looking up the node using the key, since
|
|
// we would need to traverse right and attempt to match either then
|
|
// transaction id and/or the pointer to the doc. Easier to simply
|
|
// send the address of the node back.
|
|
// .......................................................................
|
|
if (*passNode == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
currentNode = (TRI_skiplistEx_node_t*)(*passNode);
|
|
|
|
|
|
// .......................................................................
|
|
// Only the Garbage Collector can transform a node into a glass node, and
|
|
// since the GC is only operating in one thread safe to do a simple
|
|
// comparison here.
|
|
// .......................................................................
|
|
if (currentNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// safety check
|
|
// .......................................................................
|
|
if (currentNode->_delTransID != thisTransID) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// The stragey is this:
|
|
// (a) Brick each nearest neighbour on this node. This ensures that NO
|
|
// other nodes can be attached to this node.
|
|
// (b) Mark this node as being glass. This ensures that it is skipped
|
|
// since it is no longer required in the index.
|
|
// (c) Unbrick each of its nearest neighbours on this node. This ensures
|
|
// that an inserted node MAY be allowed to be attached but will later fail.
|
|
// Also allows us to brick other glass nodes.
|
|
// (d) Brick each prev and next nearest neighbour of this node. Irrespective
|
|
// if one of these are glass or not. This ensures that lookups can
|
|
// proceed unhinded.
|
|
// (e) Unjoin the node from the list.
|
|
// (f) Unbrick each prev/next nearest neigbour
|
|
// .......................................................................
|
|
|
|
return UnJoinOldNodeCas(currentNode);
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// In this case since no other reader/writer can be accessing the node,
|
|
// we simply destroy it. we require the node to be glass.
|
|
// .........................................................................
|
|
case 3: {
|
|
// .......................................................................
|
|
// We can not now rely upon looking up the node using the key, since
|
|
// we would need to traverse right and attempt to match either then
|
|
// transaction id and/or the pointer to the doc. Easier to simply
|
|
// send the address of the node back.
|
|
// .......................................................................
|
|
if (*passNode == NULL) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
currentNode = (TRI_skiplistEx_node_t*)(*passNode);
|
|
|
|
// .......................................................................
|
|
// Only the Garbage Collector can transform a node into a glass node, and
|
|
// since the GC is only operating in one thread safe to do a simple
|
|
// comparison here.
|
|
// .......................................................................
|
|
if (currentNode->_towerFlag != TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
// .......................................................................
|
|
// safety check
|
|
// .......................................................................
|
|
if (currentNode->_delTransID != thisTransID) {
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
FreeSkipListExNode(&(skiplist->_base), currentNode);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default: {
|
|
assert(0);
|
|
return TRI_ERROR_INTERNAL;
|
|
}
|
|
|
|
} // end of switch statement
|
|
|
|
} // end of END label
|
|
|
|
return TRI_ERROR_NO_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief removes a key/element from a multi skip list
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int TRI_RemoveKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, void* old,
|
|
const int passLevel, const uint64_t thisTransID,
|
|
TRI_skiplistEx_node_t** passNode) {
|
|
// Use the TRI_RemoveElementSkipListExMulti method instead.
|
|
assert(false);
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns smallest node greater than a given key
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_RightLookupByKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, uint64_t thisTransID) {
|
|
int32_t currentLevel;
|
|
TRI_skiplistEx_node_t* currentNode;
|
|
TRI_skiplistEx_node_t* prevNode;
|
|
int casFailures = -1;
|
|
|
|
// ...........................................................................
|
|
// Just in case ...
|
|
// ...........................................................................
|
|
|
|
if (skiplist == NULL) {
|
|
LOG_ERROR("Internal Error");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Big loop to restart the whole search routine
|
|
// ...........................................................................
|
|
|
|
CAS_RESTART: {
|
|
|
|
// ...........................................................................
|
|
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
|
|
// ...........................................................................
|
|
|
|
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
|
|
LOG_ERROR("CAS Failure");
|
|
return NULL;
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Provide a simple non-blocking, block?
|
|
// ...........................................................................
|
|
|
|
if (casFailures > -1) {
|
|
assert(0); // a test to see why it blocks - should not block!
|
|
usleep(CAS_FAILURE_SLEEP_TIME);
|
|
}
|
|
|
|
|
|
// ...........................................................................
|
|
// Increment the cas failures (which should always be hopefully 0).
|
|
// ...........................................................................
|
|
|
|
++casFailures;
|
|
|
|
|
|
// ...........................................................................
|
|
// Determine the starting level and the starting node
|
|
// ...........................................................................
|
|
|
|
currentLevel = skiplist->_base._endNode._colLength - 1;
|
|
currentNode = &(skiplist->_base._endNode);
|
|
prevNode = currentNode;
|
|
|
|
|
|
START: {
|
|
|
|
|
|
// .........................................................................
|
|
// Find the next node in the current level of the lists. Protect yourself
|
|
// in case something has gone wrong.
|
|
// .........................................................................
|
|
|
|
if (prevNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// We require the successor of the current node so we can perform a
|
|
// comparison. It should never be null.
|
|
// .........................................................................
|
|
|
|
prevNode = (TRI_skiplistEx_node_t*)(prevNode->_column[currentLevel]._prev);
|
|
if (prevNode == NULL) {
|
|
LOG_ERROR("CAS Failure");
|
|
assert(0);
|
|
goto CAS_RESTART;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// Is our next node a glass node? If so we must skip it!
|
|
// Note: since Garbage Collection is performed in TWO passes, it is possible
|
|
// that we have more than one glass node.
|
|
// .........................................................................
|
|
|
|
if (prevNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .........................................................................
|
|
// WE HAVE FOUR CASES TO CONSIDER
|
|
// .........................................................................
|
|
|
|
// .........................................................................
|
|
// CASE ONE:
|
|
// At this level we have the smallest (start) and largest (end) nodes ONLY.
|
|
// CASE TWO:
|
|
// We have arrived at the end of the nodes and we are not at the
|
|
// start of the nodes either.
|
|
// .........................................................................
|
|
|
|
if (prevNode == &(skiplist->_base._startNode)) {
|
|
|
|
|
|
// .......................................................................
|
|
// We are at the lowest level of the lists, and we haven't found the item
|
|
// yet. The currentNode does NOT compare and the next node is +\infinty.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return currentNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// We have not yet reached the lowest level continue down. Possibly our
|
|
// item we seek is to be found a lower level.
|
|
// .......................................................................
|
|
|
|
prevNode = currentNode;
|
|
--currentLevel;
|
|
goto START;
|
|
}
|
|
|
|
|
|
|
|
// .........................................................................
|
|
// CASE THREE:
|
|
// We are the smallest left most node and the NEXT node is NOT the end node.
|
|
// Compare this element with the element in the right node to see what we do.
|
|
// CASE FOUR:
|
|
// We are somewhere in the middle of a list, away from the smallest and
|
|
// largest nodes.
|
|
// .........................................................................
|
|
|
|
else { // nextNode != &(skiplist->_endNode
|
|
int compareResult;
|
|
|
|
// .......................................................................
|
|
// Use the callback to determine if the element is less or greater than
|
|
// the next node element. We treat the comparison by assuming we are
|
|
// looking for a "key - epsilon". With this assumption we always find the
|
|
// last key to our right if it exists. The reason this is necessary is as
|
|
// follows: we allow a multiple documents with the same key to be stored
|
|
// here with the proviso that all but the last one is marked as deleted.
|
|
// This is how we cater for multiple revisions.
|
|
// .......................................................................
|
|
|
|
compareResult = IndexStaticMultiCompareKeyElement(skiplist, key, &(prevNode->_element), 1);
|
|
|
|
// .......................................................................
|
|
// If the number of fields (attributes) in the key is LESS than the number
|
|
// of fields in the element to be compared to, then EVEN if the keys which
|
|
// which are common to both equate as EQUAL, we STILL return 1 rather than
|
|
// 0! This ensures that the right interval end point is correctly positioned
|
|
// -- slightly inefficient since the lowest level skip list 0 has to be reached
|
|
// in this case.
|
|
// .......................................................................
|
|
|
|
// .......................................................................
|
|
// We have found the item!
|
|
// .......................................................................
|
|
|
|
if (compareResult == 0) {
|
|
assert(false);
|
|
}
|
|
|
|
if (compareResult < 0) {
|
|
currentNode = prevNode;
|
|
goto START;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// The element is less than the next node. Can we drop down the list?
|
|
// If have reached the lowest level of the lists -- no such item.
|
|
// .......................................................................
|
|
|
|
if (currentLevel == 0) {
|
|
return currentNode;
|
|
}
|
|
|
|
|
|
// .......................................................................
|
|
// Drop down the list
|
|
// .......................................................................
|
|
|
|
--currentLevel;
|
|
prevNode = currentNode;
|
|
goto START;
|
|
}
|
|
|
|
} // end of label START
|
|
|
|
} // end of label CAS_RESTART
|
|
|
|
assert(false);
|
|
return NULL;
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief returns the start node associated with a multi skip list.
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
void* TRI_StartNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist) {
  // without a skip list there is no start node to hand out
  if (skiplist == NULL) {
    return NULL;
  }

  // the start node is embedded in the base part of the multi skip list
  return &(skiplist->_base._startNode);
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// IMPLEMENTATION OF STATIC FORWARD DECLARED FUNCTIONS
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @addtogroup SkiplistEx_common
|
|
/// @{
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys a skip list, but does not free the pointer
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void DestroyBaseSkipListEx(TRI_skiplistEx_base_t* baseSkiplist) {

  // ...........................................................................
  // Releases everything owned by the base skip list: the column array of every
  // node reachable through the level 0 chain, the element and the memory of
  // every node except the embedded start and end nodes, and the random number
  // buffer. The base skip list structure itself is NOT freed.
  //
  // No locking, blocking or CAS is performed here. It is assumed that no
  // further read/write operations which require this index are accepted.
  //
  // TODO:
  // Warning: a background garbage collection may still be working on this
  // skip list while it is being destroyed. The idea is to signal the garbage
  // collector so that ALL references to this index are expunged first, and
  // only then have the same process call this function.
  // ...........................................................................

  TRI_skiplistEx_node_t* current;

  if (baseSkiplist == NULL) {
    return;
  }

  current = &(baseSkiplist->_startNode);

  while (current != NULL) {
    // remember the successor before the column holding the link is released
    TRI_skiplistEx_node_t* successor = (TRI_skiplistEx_node_t*)(current->_column[0]._next);
    bool isSentinel = (current == &(baseSkiplist->_startNode)) ||
                      (current == &(baseSkiplist->_endNode));

    TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(current->_column));

    // start and end node live inside the base structure and carry no element
    // that is destroyed here; only regular nodes are destroyed and freed
    if (!isSentinel) {
      IndexStaticDestroyElement(baseSkiplist, &(current->_element));
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, current);
    }

    current = successor;
  }

  TRI_Free(TRI_UNKNOWN_MEM_ZONE, baseSkiplist->_random);
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief destroys the internal structure allocation for a node
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void DestroySkipListExNode (TRI_skiplistEx_base_t* skiplist, TRI_skiplistEx_node_t* node) {
  if (node != NULL) {
    // release the tower (column array) of the node
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(node->_column));

    // the storage of node->_element is part of the node itself, so it is not
    // freed here - only the element's internals are destroyed. The storage
    // goes away when the node as a whole is freed.
    IndexStaticDestroyElement(skiplist, &(node->_element));
  }
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief frees a node, destroying it first
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void FreeSkipListExNode (TRI_skiplistEx_base_t* skiplist, TRI_skiplistEx_node_t* node) {
  bool isSentinel;

  // first release whatever the node owns internally
  DestroySkipListExNode(skiplist, node);

  // the start and end nodes are embedded in the skip list structure, their
  // memory must not be handed back to the allocator
  isSentinel = (node == &(skiplist->_startNode)) || (node == &(skiplist->_endNode));

  if (!isSentinel) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, node);
  }
}
|
|
|
|
|
|
// ...............................................................................
|
|
// This function is thread safe since the node has just been created and has
|
|
// NOT YET been linked into the skiplist.
|
|
// ...............................................................................
|
|
|
|
static int GrowNewNodeHeight(TRI_skiplistEx_node_t* node, uint32_t height, uint32_t colLength, int result) {
  uint32_t level;

  // ............................................................................
  // A previous error is simply passed through, nothing is touched in that case.
  // ............................................................................

  if (result != TRI_ERROR_NO_ERROR) {
    return result;
  }

  // ............................................................................
  // Usually height equals colLength. The allocated height may however be larger
  // than the column length currently in use, which avoids continually
  // allocating and deallocating memory. The opposite is an internal error.
  // ............................................................................

  if (colLength > height) {
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  node->_colLength = colLength;
  node->_column    = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_skiplistEx_nb_t) * height, false);

  if (node->_column == NULL) { // out of memory?
    return TRI_ERROR_OUT_OF_MEMORY;
  }

  // ............................................................................
  // A new node always starts out with a normal tower ...
  // ............................................................................

  node->_towerFlag = TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG;

  // ............................................................................
  // ... and with every level unlinked and every neighbour entry normal
  // ............................................................................

  for (level = 0; level < height; ++level) {
    TRI_skiplistEx_nb_t* slot = &(node->_column[level]);

    slot->_prev   = NULL;
    slot->_next   = NULL;
    slot->_nbFlag = TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG;
  }

  return TRI_ERROR_NO_ERROR;
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief joins the start node to the end node and vice versa
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void JoinStartEndNodes(TRI_skiplistEx_node_t* leftNode,
                              TRI_skiplistEx_node_t* rightNode,
                              uint32_t startLevel, uint32_t endLevel) {
  uint32_t level;
  uint32_t height;

  // an inverted level range indicates a programming error
  if (startLevel > endLevel) {
    assert(false);
    return;
  }

  // endLevel is inclusive, so convert the level into a height (exclusive bound)
  height = endLevel + 1;

  // link both nodes directly to each other on every level of the range
  for (level = startLevel; level < height; ++level) {
    (leftNode->_column)[level]._next  = rightNode;
    (rightNode->_column)[level]._prev = leftNode;
  }
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief given a node returns the next node (if possible) in the skiplist
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void* NextNodeBaseSkipListEx(TRI_skiplistEx_base_t* skiplist, void* currentNode, uint64_t thisTransID) {
  TRI_skiplistEx_node_t* volatile cursor = (TRI_skiplistEx_node_t* volatile)(currentNode);

  // a NULL node means: begin the traversal at the start node
  if (cursor == NULL) {
    cursor = &(skiplist->_startNode);
  }

  // ...........................................................................
  // Walk to the right along level 0. A node is only handed out if it is visible
  // for the given transaction id, that is, it was inserted at or before this
  // transaction and its deletion id is greater than this transaction.
  // ...........................................................................

  while (cursor != &(skiplist->_endNode)) {
    cursor = cursor->_column[0]._next;

    if (cursor == NULL) {
      // this should not happen!
      LOG_ERROR("CAS Failure");
      assert(0);
      return NULL;
    }

    if ((cursor->_insTransID <= thisTransID) && (cursor->_delTransID > thisTransID)) {
      return (void*)(cursor);
    }

    // otherwise: inserted AFTER this transaction started, or already deleted
    // from the point of view of this transaction - skip it
  }

  return(NULL);
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief given a node returns the previous node (if possible) in the skiplist
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static void* PrevNodeBaseSkipListEx(TRI_skiplistEx_base_t* skiplist, void* currentNode, uint64_t thisTransID) {
  TRI_skiplistEx_node_t* volatile cursor = (TRI_skiplistEx_node_t*)(currentNode);

  // a NULL node means: hand out the end node as the starting point
  if (cursor == NULL) {
    return &(skiplist->_endNode);
  }

  // ...........................................................................
  // Walk to the left along level 0. A node is only handed out if it is visible
  // for the given transaction id, that is, it was inserted at or before this
  // transaction and its deletion id is greater than this transaction.
  // ...........................................................................

  while (cursor != &(skiplist->_startNode)) {
    cursor = cursor->_column[0]._prev;

    if (cursor == NULL) {
      // this should not happen!
      LOG_ERROR("CAS Failure");
      assert(0);
      return NULL;
    }

    if ((cursor->_insTransID <= thisTransID) && (cursor->_delTransID > thisTransID)) {
      return (void*)(cursor);
    }

    // otherwise: inserted AFTER this transaction started, or already deleted
    // from the point of view of this transaction - skip it
  }

  return(NULL);
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @brief determines at what 'height' the item is to be added
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
static int32_t RandLevel (TRI_skiplistEx_base_t* skiplist) {

  // ...........................................................................
  // Draws the level for a new node from the random bit stream stored in
  // skiplist->_random. Returns the level (at most skiplist->_maxHeight), or -1
  // if the probability setting is unknown or no random storage is available.
  //
  // The bit stream is interpreted as follows:
  // For 1/2: if bit (0) we stop, otherwise increase level.
  // For 1/3: if bits (0,0) we stop, if bits (1,1) ignore and continue,
  //          otherwise increase level
  // For 1/4: if bits (0,0) we stop, otherwise increase level
  //
  // NOTE(review): the random storage is shared state of the skip list and is
  // modified here without synchronisation - confirm that callers never run
  // this function concurrently for the same skip list.
  // ...........................................................................

  uint32_t  level = 0;
  uint32_t  mask;               // selects the bits inspected per step
  uint32_t* ptr;                // current random word
  int       bitsPerStep;        // bits consumed per step
  int       stepsPerWord;       // steps one 32 bit random word provides
  int       counter   = 0;      // steps consumed from the current word
  int       wordIndex = 0;      // index of the current word in the storage
  int       j;
  bool      skipAllOnes;        // 1/3 scheme only: pattern (1,1) is discarded
  bool      exhausted = true;   // true whenever fresh random words are needed

  if (skiplist->_random == NULL || skiplist->_numRandom < 1) {
    return -1;
  }

  switch (skiplist->_prob) {
    case TRI_SKIPLIST_EX_PROB_HALF: {
      mask        = 1;
      bitsPerStep = 1;
      skipAllOnes = false;
      break;
    }

    case TRI_SKIPLIST_EX_PROB_THIRD: {
      mask        = 3;
      bitsPerStep = 2;
      skipAllOnes = true;
      break;
    }

    case TRI_SKIPLIST_EX_PROB_QUARTER: {
      mask        = 3;
      bitsPerStep = 2;
      skipAllOnes = false;
      break;
    }

    default: {
      return -1;
    }
  }

  stepsPerWord = 32 / bitsPerStep;
  ptr          = skiplist->_random;

  while (level < skiplist->_maxHeight) {
    uint32_t bits;

    // .........................................................................
    // Obtain the random numbers and store them in the pre allocated storage.
    // This happens once up front and again whenever every stored word has been
    // used up. Previously the cursor simply ran past the end of the storage in
    // that case, which is possible in particular for the 1/3 scheme since the
    // ignored (1,1) patterns consume bits without increasing the level.
    // .........................................................................

    if (exhausted) {
      for (j = 0; j < skiplist->_numRandom; ++j) {
        skiplist->_random[j] = TRI_UInt32Random();
      }
      ptr       = skiplist->_random; // go back to the beginning
      wordIndex = 0;
      counter   = 0;
      exhausted = false;
    }

    bits = (*ptr) & mask;

    if (bits == 0) {
      break;
    }

    if (!skipAllOnes || bits != mask) {
      ++level;
    }

    // discard the bits just inspected and move on to the next word if required
    (*ptr) = (*ptr) >> bitsPerStep;
    ++counter;

    if (counter == stepsPerWord) {
      ++ptr;
      ++wordIndex;
      counter   = 0;
      exhausted = (wordIndex >= skiplist->_numRandom);
    }
  }

  return level;
}
|
|
|
|
|
|
// .................................................................................
|
|
// If we have simultaneous inserts, then this function will keep retrying and
|
|
// attempt to wait until the CAS statement succeeds. It is safe for
|
|
// simultaneous inserts.
|
|
// .................................................................................
|
|
|
|
static int GrowStartEndNodes(TRI_skiplistEx_base_t* skiplist, uint32_t newHeight) {
  int status   = TRI_ERROR_NO_ERROR;
  int attempts = 0;
  uint32_t startHeight;
  uint32_t endHeight;

  // ................................................................................
  // Acquire the "grow" flag. If someone else is currently growing the start/end
  // nodes, sleep and retry a bounded number of times before giving up.
  // Notice that this loop is only necessary if we assume multiple unordered inserts.
  // ................................................................................

  while (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_growStartEndNodesFlag),
                                          TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG,
                                          TRI_SKIPLIST_EX_NOT_FREE_TO_GROW_START_END_NODES_FLAG)) {
    ++attempts;
    if (attempts > SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
      LOG_ERROR("CAS failed for GrowStartEndNodes");
      return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
    }
    usleep(CAS_FAILURE_SLEEP_TIME);
  }

  startHeight = skiplist->_startNode._colLength;
  endHeight   = skiplist->_endNode._colLength;

  if (startHeight != endHeight) {
    // both sentinels must always have the same height
    status = TRI_ERROR_INTERNAL;
  }
  else if (startHeight < newHeight) {
    // ..............................................................................
    // CAS statements are needed here since multiple readers may be busy reading
    // the height of the towers.
    // ..............................................................................
    if (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_startNode._colLength),
                                         startHeight, newHeight)) {
      // should never happen
      status = TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
    }
    else if (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_endNode._colLength),
                                              endHeight, newHeight)) {
      // should never happen - put the start node back to its old height
      status = TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
      TRI_CompareAndSwapIntegerUInt32(&(skiplist->_startNode._colLength), newHeight, startHeight);
    }
  }

  // ................................................................................
  // Release the "grow" flag again, whatever the outcome above was.
  // ................................................................................

  if (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_growStartEndNodesFlag),
                                       TRI_SKIPLIST_EX_NOT_FREE_TO_GROW_START_END_NODES_FLAG,
                                       TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG)) {
    // failure is not a word we recognise - eventually send signal to database to rebuild index
    LOG_ERROR("CAS failed for GrowStartEndNodes");
    assert(0); // remove after debugging
    if (status == TRI_ERROR_NO_ERROR) {
      status = TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
    }
  }

  return status;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief resets the bricked neighbour entries of a node back to normal
///
/// For the levels 0 .. counter-1 the entries of the left and the right
/// neighbour of the node are switched from bricked to normal. Every level is
/// attempted, even if an earlier one failed.
////////////////////////////////////////////////////////////////////////////////

static int UndoBricking (TRI_skiplistEx_node_t* node, int counter) {
  bool failed = false;
  int level;

  for (level = 0; level < counter; ++level) {
    TRI_skiplistEx_node_t* leftNode  = (TRI_skiplistEx_node_t*)(node->_column[level]._prev);
    TRI_skiplistEx_node_t* rightNode = (TRI_skiplistEx_node_t*)(node->_column[level]._next);
    TRI_skiplistEx_nb_t*   leftSlot  = &(leftNode->_column[level]);
    TRI_skiplistEx_nb_t*   rightSlot = &(rightNode->_column[level]);

    if (!TRI_CompareAndSwapIntegerUInt32 (&(leftSlot->_nbFlag),
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
      failed = true;
    }

    if (!TRI_CompareAndSwapIntegerUInt32 (&(rightSlot->_nbFlag),
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
      failed = true;
    }
  }

  if (failed) {
    // should never occur - if it does eventually send signal to database to rebuild index
    LOG_ERROR("CAS failed for UndoBricking");
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief bricks the left and right neighbour entries of a node on every level
///
/// On return *counter holds the number of levels which were bricked on both
/// sides. If a level can not be bricked, the levels bricked so far are
/// released again and an insert CAS failure warning is returned.
////////////////////////////////////////////////////////////////////////////////

static int DoBricking (TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t level;
  bool contended = false;
  int undoResult;

  *counter = 0;

  for (level = 0; level < node->_colLength; ++level) {
    TRI_skiplistEx_node_t* leftNode  = (TRI_skiplistEx_node_t*)(node->_column[level]._prev);
    TRI_skiplistEx_node_t* rightNode = (TRI_skiplistEx_node_t*)(node->_column[level]._next);
    TRI_skiplistEx_nb_t*   leftSlot  = &(leftNode->_column[level]);
    TRI_skiplistEx_nb_t*   rightSlot = &(rightNode->_column[level]);

    // left
    if (!TRI_CompareAndSwapIntegerUInt32 (&(leftSlot->_nbFlag),
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG)) {
      contended = true;
      break;
    }

    // right
    if (!TRI_CompareAndSwapIntegerUInt32 (&(rightSlot->_nbFlag),
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG)) {
      // the left side of this level has already been bricked - release it again,
      // since this level is not counted and thus not handled by UndoBricking
      if (!TRI_CompareAndSwapIntegerUInt32 (&(leftSlot->_nbFlag),
                                            TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                            TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
        // should never occur - if it does, then we need to eventually send signal to database to rebuild index
        abort();
      }
      contended = true;
      break;
    }

    ++(*counter);
  }

  if (!contended) {
    return TRI_ERROR_NO_ERROR;
  }

  // release the levels which have been bricked completely
  undoResult = UndoBricking (node, *counter);
  if (undoResult == TRI_ERROR_NO_ERROR) {
    return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
  }

  LOG_ERROR("CAS failed for UndoBricking");
  assert(0);
  return undoResult;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief takes a node out of the pointer chains it has been joined into
///
/// For the levels 0 .. counter-1 the next pointer of the left neighbour and the
/// prev pointer of the right neighbour are switched from the node back to the
/// respective other neighbour. Every level is attempted, even if an earlier
/// one failed. Returns TRI_ERROR_INTERNAL if any swap failed.
////////////////////////////////////////////////////////////////////////////////

static int UndoJoinPointers(TRI_skiplistEx_node_t* node, const int counter) {
  int j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  for (j = 0; j < counter; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    // the CAS is placed first so that it is executed even after a failure
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), node, rightNode) && ok;
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), node, leftNode) && ok;
  }

  if (!ok) {
    // should never occur - if it does eventually send signal to database to rebuild index
    // (the message used to name UndoBricking, which is a different operation)
    LOG_ERROR("CAS failed for UndoJoinPointers");
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief links a node into the pointer chains of all of its levels
///
/// For every level the next pointer of the left neighbour and the prev pointer
/// of the right neighbour are switched to the node. On return *counter holds
/// the number of levels which were joined completely. If any swap fails, the
/// levels joined so far are unjoined again and TRI_ERROR_INTERNAL is returned.
////////////////////////////////////////////////////////////////////////////////

static int DoJoinPointers (TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  *counter = 0;

  for (j = 0; j < node->_colLength; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    if (!TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), rightNode, node)) {
      ok = false;
      break;
    }

    if (!TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), leftNode, node)) {
      // the left side of this level has already been joined - take it back,
      // since this level is not counted and thus not handled by UndoJoinPointers
      if (!TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), node, rightNode)) {
        // should never occur - if it does eventually send signal to database to rebuild index
        abort();
      }

      // The join of this level has failed, no matter that the roll back above
      // succeeded. Previously the result of the roll back was stored in 'ok',
      // so that success was reported for a partially joined node.
      ok = false;
      break;
    }

    ++(*counter);
  }

  if (ok) {
    return TRI_ERROR_NO_ERROR;
  }

  UndoJoinPointers(node, *counter);
  return TRI_ERROR_INTERNAL;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief links a new node into the skip list using CAS operations
///
/// The prev/next entries of every level of the new node must already refer to
/// its designated neighbours. Returns an insert CAS failure warning if the
/// neighbours could not be bricked or one of them is no longer a normal tower.
////////////////////////////////////////////////////////////////////////////////

static int JoinNewNodeCas (TRI_skiplistEx_node_t* newNode) {
  int bricked = 0;
  int joined  = 0;
  int status;
  uint32_t level;

  // Pass 1: do bricking
  status = DoBricking(newNode, &bricked);
  if (status != TRI_ERROR_NO_ERROR) {
    return status;
  }

  // Pass 2: Ensure that each tower node is not glassed - glassing by the GC is NOT
  //         possible if Pass 1 above has succeeded.
  for (level = 0; level < newNode->_colLength; ++level) {
    TRI_skiplistEx_node_t* leftNode  = (TRI_skiplistEx_node_t*)(newNode->_column[level]._prev);
    TRI_skiplistEx_node_t* rightNode = (TRI_skiplistEx_node_t*)(newNode->_column[level]._next);

    if ( (leftNode->_towerFlag == TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG) &&
         (rightNode->_towerFlag == TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG) ) {
      continue;
    }

    // at least one neighbour is not a normal tower: back out and let the caller retry
    status = UndoBricking (newNode, bricked);
    if (status != TRI_ERROR_NO_ERROR) {
      LOG_ERROR("failed unbricking");
      abort();
      return status;
    }
    return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
  }

  // Pass 3: Join the new node by assigning pointers
  status = DoJoinPointers(newNode, &joined);
  if (status != TRI_ERROR_NO_ERROR) {
    UndoBricking (newNode, bricked);
    return status;
  }

  // Now unbrick the left/right nodes so other processes can access them
  return UndoBricking (newNode, bricked);
}
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////
|
|
// removal static functions below
|
|
//////////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief resets the node's own bricked neighbour entries back to normal
///
/// The entries of the levels 0 .. counter-1 of the node itself are switched
/// from bricked to normal. Every level is attempted, even if an earlier one
/// failed.
////////////////////////////////////////////////////////////////////////////////

static int SelfUndoBricking(TRI_skiplistEx_node_t* node, const int counter) {
  bool failed = false;
  int level;

  for (level = 0; level < counter; ++level) {
    TRI_skiplistEx_nb_t* slot = &(node->_column[level]);

    if (!TRI_CompareAndSwapIntegerUInt32 (&(slot->_nbFlag),
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
      failed = true;
    }
  }

  if (failed) {
    // should never occur - if it does eventually send signal to database to rebuild index
    LOG_ERROR("CAS failed for UndoBricking");
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief bricks the node's own neighbour entries on every level
///
/// On return *counter holds the number of levels which were bricked. If a
/// level can not be bricked, the levels bricked so far are released again and
/// a remove CAS failure warning is returned.
////////////////////////////////////////////////////////////////////////////////

static int SelfBricking(TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t level;
  bool contended = false;
  int undoResult;

  *counter = 0;

  for (level = 0; level < node->_colLength; ++level) {
    TRI_skiplistEx_nb_t* slot = &(node->_column[level]);

    if (!TRI_CompareAndSwapIntegerUInt32 (&(slot->_nbFlag),
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG)) {
      contended = true;
      break;
    }

    ++(*counter);
  }

  if (!contended) {
    return TRI_ERROR_NO_ERROR;
  }

  // release the levels which have been bricked so far
  undoResult = SelfUndoBricking(node, *counter);
  if (undoResult == TRI_ERROR_NO_ERROR) {
    return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
  }

  LOG_ERROR("CAS failed for UndoBricking");
  assert(0);
  return undoResult;
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief puts a node back into the pointer chains it has been unjoined from
///
/// For the levels 0 .. counter-1 the next pointer of the left neighbour and the
/// prev pointer of the right neighbour are switched from the respective other
/// neighbour back to the node. Every level is attempted, even if an earlier
/// one failed. Returns TRI_ERROR_INTERNAL if any swap failed.
////////////////////////////////////////////////////////////////////////////////

static int UndoUnjoinPointers(TRI_skiplistEx_node_t* node, const int counter) {
  int j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  for (j = 0; j < counter; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    // the CAS is placed first so that it is executed even after a failure
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), rightNode, node) && ok;
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), leftNode, node) && ok;
  }

  if (!ok) {
    // should never occur - if it does eventually send signal to database to rebuild index
    // (the message used to name UndoBricking, which is a different operation)
    LOG_ERROR("CAS failed for UndoUnjoinPointers");
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief takes a node out of the pointer chains of all of its levels
///
/// For every level the next pointer of the left neighbour and the prev pointer
/// of the right neighbour are switched from the node to the respective other
/// neighbour. On return *counter holds the number of levels which were
/// unjoined completely. If any swap fails, the levels unjoined so far are
/// joined again and TRI_ERROR_INTERNAL is returned.
////////////////////////////////////////////////////////////////////////////////

static int DoUnjoinPointers (TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  *counter = 0;

  for (j = 0; j < node->_colLength; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    if (!TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), node, rightNode)) {
      ok = false;
      break;
    }

    if (!TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), node, leftNode)) {
      // the left side of this level has already been unjoined - restore it,
      // since this level is not counted and thus not handled by UndoUnjoinPointers
      if (!TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), rightNode, node)) {
        // should never occur - if it does eventually send signal to database to rebuild index
        abort();
      }

      // The unjoin of this level has failed, no matter that the roll back above
      // succeeded. Previously the result of the roll back was stored in 'ok',
      // so that success was reported for a partially unjoined node.
      ok = false;
      break;
    }

    ++(*counter);
  }

  if (ok) {
    return TRI_ERROR_NO_ERROR;
  }

  UndoUnjoinPointers(node, *counter);
  return TRI_ERROR_INTERNAL;
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
/// @brief unlinks an old node from the skip list using CAS operations
///
/// The node is bricked, marked as a glass tower and unbricked again, then its
/// neighbours are bricked, the pointers are unjoined and the neighbours are
/// unbricked. Whenever a pass fails, the preceding passes are rolled back; if
/// a roll back itself fails the process is aborted.
////////////////////////////////////////////////////////////////////////////////

static int UnJoinOldNodeCas (TRI_skiplistEx_node_t* oldNode) {
  int ownBricks       = 0;
  int neighbourBricks = 0;
  int unjoinedLevels  = 0;
  int status;

  // Pass 1: brick the nearest neighbours on the node itself.
  status = SelfBricking(oldNode, &ownBricks);
  if (status != TRI_ERROR_NO_ERROR) {
    return status;
  }

  // Pass 2: make the node glass
  if (!TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                        TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG,
                                        TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG)) {
    if (SelfUndoBricking(oldNode, ownBricks) != TRI_ERROR_NO_ERROR) {
      LOG_ERROR("UnJoinOldNodeCas failed ");
      abort();
      return TRI_ERROR_INTERNAL;
    }
    return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
  }

  // Pass 3: unbrick each nearest neighbour entry of the node itself
  status = SelfUndoBricking(oldNode, ownBricks);
  if (status != TRI_ERROR_NO_ERROR) {
    // undo the glassing of the node
    if (!TRI_CompareAndSwapIntegerUInt32(&(oldNode->_towerFlag),
                                         TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,
                                         TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG)) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }
    return status;
  }

  // Pass 4: brick each of its nearest neighbours
  status = DoBricking(oldNode, &neighbourBricks);
  if (status != TRI_ERROR_NO_ERROR) {
    // undo the glassing of the node
    if (!TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                          TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG)) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }

    // anything but an internal error is reported as a remove CAS failure
    return (status != TRI_ERROR_INTERNAL) ? TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE
                                          : status;
  }

  // Pass 5: unjoin the old node from the list by assigning pointers
  status = DoUnjoinPointers(oldNode, &unjoinedLevels);
  if (status != TRI_ERROR_NO_ERROR) {
    // release the neighbours first ...
    if (UndoBricking(oldNode, neighbourBricks) != TRI_ERROR_NO_ERROR) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }

    // ... then undo the glassing of the node
    if (!TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                          TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG)) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }

    // anything but an internal error is reported as a remove CAS failure
    return (status != TRI_ERROR_INTERNAL) ? TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE
                                          : status;
  }

  // Now unbrick the left/right nodes so other processes can access them
  return UndoBricking (oldNode, neighbourBricks);
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// @}
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#endif
|
|
|
|
// Local Variables:
|
|
// mode: outline-minor
|
|
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
|
|
// End:
|