1
0
Fork 0
arangodb/arangod/SkipListsEx/skiplistEx.c

4125 lines
162 KiB
C

////////////////////////////////////////////////////////////////////////////////
/// @brief skiplist implementation
///
/// @file
///
/// DISCLAIMER
///
/// Copyright 2004-2012 triagens GmbH, Cologne, Germany
///
/// Licensed under the Apache License, Version 2.0 (the "License");
/// you may not use this file except in compliance with the License.
/// You may obtain a copy of the License at
///
/// http://www.apache.org/licenses/LICENSE-2.0
///
/// Unless required by applicable law or agreed to in writing, software
/// distributed under the License is distributed on an "AS IS" BASIS,
/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
/// See the License for the specific language governing permissions and
/// limitations under the License.
///
/// Copyright holder is triAGENS GmbH, Cologne, Germany
///
/// @author Anonymous
/// @author Copyright 2006-2012, triAGENS GmbH, Cologne, Germany
////////////////////////////////////////////////////////////////////////////////
#include <BasicsC/locks.h>
#include <BasicsC/logging.h>
#include <BasicsC/random.h>
#include "skiplistEx.h"
#include "compareEx.h"
#ifdef _WIN32
#include <BasicsC/win-utils.h>
#endif
#ifdef TRI_SKIPLIST_EX
// -----------------------------------------------------------------------------
// --SECTION-- SKIPLIST_EX
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// --SECTION-- Private Type Structures
// -----------------------------------------------------------------------------
// Flag values shared by the skip list nodes and the skip list base structure.
// The constants come in three pairs; the numeric values are the implicit
// enumeration order (0..5) and are spelled out here so they cannot drift.
typedef enum {
  // nearest-neighbour pair
  TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG         = 0, // the nearest neighbour node is normal
  TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG        = 1, // the nearest neighbour node is bricked: its next/prev pointers can not be modified

  // tower-node pair
  TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG                = 2, // normal tower node, no removal pending
  TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG                 = 3, // glass tower node: skipped in a lookup, removal pending

  // start/end node pair: together these two values ensure that growing the
  // tower height of the start and end nodes is performed sequentially
  TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG     = 4,
  TRI_SKIPLIST_EX_NOT_FREE_TO_GROW_START_END_NODES_FLAG = 5
} TRI_skiplistEx_tower_node_flag_e;
// File-local tunables. They are deliberately not const: the TODO next to
// TRI_InitSkipListEx proposes adjusting them at server start-up.

// Pause taken before a search is restarted after a CAS failure; the value is
// handed to usleep() unchanged.
static unsigned int CAS_FAILURE_SLEEP_TIME            = 1000U;

// Largest tower height any skip list may be configured with; also the height
// used when a caller asks for a maximum height of 0.
static unsigned int SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT   = 100U;

// Value of the restart counter at which an operation stops retrying and
// reports a CAS failure.
static unsigned int SKIPLIST_EX_CAS_FAILURES_MAX_LOOP = 10U;
// -----------------------------------------------------------------------------
// --SECTION-- STATIC FORWARD DECLARATIONS
// --SECTION-- common private functions
// -----------------------------------------------------------------------------
// Prototypes of the file-local helpers shared by the public skip list
// functions. Their definitions appear later in this file.
static void DestroyBaseSkipListEx (TRI_skiplistEx_base_t*);
static void DestroySkipListExNode (TRI_skiplistEx_base_t*, TRI_skiplistEx_node_t*);
static void FreeSkipListExNode (TRI_skiplistEx_base_t*, TRI_skiplistEx_node_t*);
// Returns an error code. The last argument receives the result of the
// preceding step at every call site in this file.
// NOTE(review): presumably a non-success input is passed straight through - confirm in the definition.
static int GrowNewNodeHeight (TRI_skiplistEx_node_t*, uint32_t, uint32_t, int);
static int GrowStartEndNodes (TRI_skiplistEx_base_t*, uint32_t);
static void* NextNodeBaseSkipListEx (TRI_skiplistEx_base_t*, void*, uint64_t);
static void* PrevNodeBaseSkipListEx (TRI_skiplistEx_base_t*, void*, uint64_t);
// Returns a level; the insert routine adds 1 to turn it into a height.
static int32_t RandLevel (TRI_skiplistEx_base_t*);
static void JoinStartEndNodes (TRI_skiplistEx_node_t*, TRI_skiplistEx_node_t*, uint32_t, uint32_t);
static int JoinNewNodeCas (TRI_skiplistEx_node_t* newNode); // when node is inserted
static int UnJoinOldNodeCas (TRI_skiplistEx_node_t* oldNode); // when node is removed
// -----------------------------------------------------------------------------
// --SECTION-- unique skiplist constructors and destructors
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Skiplist_unique
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief initialises a skip list
///
/// Prepares a caller-provided unique skip list structure for use: installs the
/// static comparison callbacks, records height, probability and element size,
/// allocates the random number scratch buffer, and sets up the start and end
/// nodes with an initial height of two before joining them.
///
/// @param skiplist               structure to initialise
/// @param elementSize            stored as the element size of the list
/// @param compareElementElement  currently ignored, the static callback
///                               IndexStaticCompareElementElement is installed
/// @param compareKeyElement      currently ignored, the static callback
///                               IndexStaticCompareKeyElement is installed
/// @param probability            one of the TRI_SKIPLIST_EX_PROB_* values
/// @param maximumHeight          maximum tower height; 0 selects
///                               SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT, larger values
///                               than that are rejected
/// @param lastKnownTransID       recorded as the insertion transaction id of
///                               the start and end nodes
///
/// @return TRI_ERROR_NO_ERROR on success; TRI_ERROR_INTERNAL for a NULL
///         skiplist, an invalid maximum height or an invalid probability;
///         TRI_ERROR_OUT_OF_MEMORY if the random number buffer can not be
///         allocated; otherwise the error reported by GrowNewNodeHeight.
///         On a failure after allocation everything allocated here is released
///         again and the corresponding pointers are reset to NULL.
////////////////////////////////////////////////////////////////////////////////

// .............................................................................
// TODO: The static integer variables CAS_FAILURE_SLEEP_TIME(1000),
// SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT(100) and SKIPLIST_EX_CAS_FAILURES_MAX_LOOP(10)
// should be adjusted upon startup of the server -- command line perhaps?
// .............................................................................

int TRI_InitSkipListEx (TRI_skiplistEx_t* skiplist, size_t elementSize,
                        int (*compareElementElement) (TRI_skiplistEx_t*, void*, void*, int),
                        int (*compareKeyElement) (TRI_skiplistEx_t*, void*, void*, int),
                        TRI_skiplistEx_prob_e probability,
                        uint32_t maximumHeight,
                        uint64_t lastKnownTransID) {
  uint32_t levelsPerRandom;
  int result;

  if (skiplist == NULL) {
    return TRI_ERROR_INTERNAL;
  }

  // ..........................................................................
  // Assign the STATIC comparison call back functions. The callbacks handed in
  // by the caller are intentionally not used.
  // ..........................................................................

  (void) compareElementElement;
  (void) compareKeyElement;
  skiplist->compareElementElement = IndexStaticCompareElementElement;
  skiplist->compareKeyElement     = IndexStaticCompareKeyElement;

  // ..........................................................................
  // Determine the maximum height of the skip list. It must be no greater than
  // SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT. Validate BEFORE storing, so that a
  // rejected value never ends up in the structure.
  // ..........................................................................

  if (maximumHeight == 0) {
    maximumHeight = SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT;
  }

  if (maximumHeight > SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT) {
    LOG_ERROR("Invalid maximum height for skiplist");
    assert(false);
    return TRI_ERROR_INTERNAL;
  }

  skiplist->_base._maxHeight = maximumHeight;

  // ..........................................................................
  // Assign the probability and determine the number of 32 bit random numbers
  // which we will require -- do it once off here. One random number serves 32
  // levels for probability 1/2 and 16 levels for probabilities 1/3 and 1/4.
  // NOTE(review): presumably one resp. two random bits are consumed per level;
  // confirm against RandLevel.
  // ..........................................................................

  skiplist->_base._prob      = probability;
  skiplist->_base._numRandom = 0;

  switch (skiplist->_base._prob) {
    case TRI_SKIPLIST_EX_PROB_HALF: {
      levelsPerRandom = 32;
      break;
    }

    case TRI_SKIPLIST_EX_PROB_THIRD:
    case TRI_SKIPLIST_EX_PROB_QUARTER: {
      levelsPerRandom = 16;
      break;
    }

    default: {
      LOG_ERROR("Invalid probability assigned to skiplist");
      assert(false);
      return TRI_ERROR_INTERNAL;
    }
  } // end of switch statement

  // maximum height divided by the levels per random number, rounded up
  skiplist->_base._numRandom = skiplist->_base._maxHeight / levelsPerRandom;
  if ((skiplist->_base._maxHeight % levelsPerRandom) != 0) {
    ++(skiplist->_base._numRandom);
  }

  // ..........................................................................
  // Create storage for where to store the random numbers which we generate,
  // do it here once off.
  // ..........................................................................

  skiplist->_base._random = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(uint32_t) * skiplist->_base._numRandom, false);
  if (skiplist->_base._random == NULL) {
    return TRI_ERROR_OUT_OF_MEMORY;
  }

  // ..........................................................................
  // Assign the element size
  // ..........................................................................

  skiplist->_base._elementSize = elementSize;

  // ..........................................................................
  // Initialise the vertical storage of the lists and the place where we
  // are going to store elements
  // ..........................................................................

  skiplist->_base._startNode._column     = NULL;
  skiplist->_base._startNode._colLength  = 0;
  skiplist->_base._startNode._extraData  = NULL;
  skiplist->_base._startNode._element    = NULL;
  skiplist->_base._startNode._delTransID = UINT64_MAX;
  skiplist->_base._startNode._insTransID = lastKnownTransID;

  skiplist->_base._endNode._column       = NULL;
  skiplist->_base._endNode._colLength    = 0;
  skiplist->_base._endNode._extraData    = NULL;
  skiplist->_base._endNode._element      = NULL;
  skiplist->_base._endNode._delTransID   = UINT64_MAX;
  skiplist->_base._endNode._insTransID   = lastKnownTransID;

  // ...........................................................................
  // 32 bit integer CAS flag
  // ...........................................................................

  skiplist->_base._growStartEndNodesFlag = TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG;

  // ..........................................................................
  // Whenever a probability of 1/2, 1/3, 1/4 is used, on average each node will
  // have a height of two. So initialise the start and end nodes with this
  // 'average' height. The result of the first call is fed into the second one.
  // ..........................................................................

  result = GrowNewNodeHeight(&(skiplist->_base._startNode), skiplist->_base._maxHeight, 2, TRI_ERROR_NO_ERROR); // may fail
  result = GrowNewNodeHeight(&(skiplist->_base._endNode), skiplist->_base._maxHeight, 2, result); // may fail

  if (result != TRI_ERROR_NO_ERROR) {
    // release whatever has been allocated so far and reset the pointers, so
    // that the caller-visible structure holds no dangling pointers
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._random));
    skiplist->_base._random = NULL;

    if (skiplist->_base._startNode._column != NULL) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._startNode._column));
      skiplist->_base._startNode._column    = NULL;
      skiplist->_base._startNode._colLength = 0;
    }

    if (skiplist->_base._endNode._column != NULL) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._endNode._column));
      skiplist->_base._endNode._column    = NULL;
      skiplist->_base._endNode._colLength = 0;
    }

    return result;
  }

  // ..........................................................................
  // Join the empty lists together
  // no locking requirements for joining nodes since the skip list index is not
  // known to anyone yet!
  // [N]<----------------------------------->[N]
  // [N]<----------------------------------->[N]
  // NOTE(review): the previous comment on this call read "joins list 0 & 1",
  // yet the upper level passed is _maxHeight - 1; confirm which of the two is
  // intended against JoinStartEndNodes.
  // ..........................................................................

  JoinStartEndNodes(&(skiplist->_base._startNode), &(skiplist->_base._endNode), 0, skiplist->_base._maxHeight - 1);

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys a skip list, but does not free the pointer
///
/// Hands the embedded base structure to DestroyBaseSkipListEx. The skip list
/// structure itself is not released. A NULL skiplist is ignored.
///
/// @param skiplist  skip list to destroy, may be NULL
////////////////////////////////////////////////////////////////////////////////

void TRI_DestroySkipListEx(TRI_skiplistEx_t* skiplist) {
  if (skiplist == NULL) {
    return;
  }

  // take the address of the member instead of casting the outer pointer: this
  // is type checked and does not depend on where _base sits in the structure
  DestroyBaseSkipListEx(&(skiplist->_base));
}
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys a skip list and frees the pointer
///
/// Runs TRI_DestroySkipListEx on the skip list and afterwards releases the
/// structure itself with TRI_Free. A NULL skiplist is ignored.
///
/// @param skiplist  skip list to destroy and release, may be NULL
////////////////////////////////////////////////////////////////////////////////

void TRI_FreeSkipListEx(TRI_skiplistEx_t* skiplist) {
  if (skiplist == NULL) {
    return;
  }

  TRI_DestroySkipListEx(skiplist);
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, skiplist);
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
// -----------------------------------------------------------------------------
// --SECTION-- unique skiplist public functions
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Skiplist_unique
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the end node associated with a skip list
///
/// @param skiplist  skip list to query, may be NULL
///
/// @return address of the end node embedded in the skip list, or NULL if
///         skiplist is NULL
////////////////////////////////////////////////////////////////////////////////

// .............................................................................
// Observe that this is some sort of read transaction. The only possibility for
// trouble is that the index has been created AFTER this read transaction
// occurred (given that the skip list is valid of course). We do not check for
// this internal error.
// Also note that the ADDRESS of the START (HEAD) and END (TAIL) nodes never
// changes once the skip list is created. (These addresses are static.)
// .............................................................................

void* TRI_EndNodeSkipListEx(TRI_skiplistEx_t* skiplist) {
  if (skiplist == NULL) {
    return NULL;
  }

  return &(skiplist->_base._endNode);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief adds an element to the skip list without a key (not supported)
///
/// Insertions require a key, use TRI_InsertKeySkipListEx instead. This
/// function never inserts anything: it writes a trace message, asserts and
/// returns TRI_ERROR_INTERNAL. None of its arguments is consulted.
///
/// @return always TRI_ERROR_INTERNAL
////////////////////////////////////////////////////////////////////////////////

int TRI_InsertElementSkipListEx(TRI_skiplistEx_t* skiplist, void* element, bool overwrite, uint64_t thisTransID) {
  (void) skiplist;
  (void) element;
  (void) overwrite;
  (void) thisTransID;

  LOG_TRACE("Insertions into a skip list require a key. Elements/items are not currently supported.");
  assert(false);

  return TRI_ERROR_INTERNAL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief inserts (adds) an element to the skip list using a key
///
/// Allocates a new node with a randomly chosen height, copies the element into
/// it, searches from the top level of the start node down to level 0 for the
/// insertion position (recording the prev/next node of every level the new
/// node takes part in) and finally links the node in via JoinNewNodeCas. The
/// whole search is restarted whenever a glass node is met or the join reports
/// a CAS failure.
///
/// @return TRI_ERROR_INTERNAL if skiplist is NULL or the random height is
///         below 1; the error of GrowStartEndNodes, of the element copy or of
///         GrowNewNodeHeight; TRI_ERROR_OUT_OF_MEMORY if the node can not be
///         allocated; TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE once
///         the restart counter reaches SKIPLIST_EX_CAS_FAILURES_MAX_LOOP; for
///         an existing, not deleted key either the result of overwriting the
///         stored element (overwrite == true) or the value of
///         TRI_set_errno(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
///         otherwise the result of JoinNewNodeCas.
////////////////////////////////////////////////////////////////////////////////
int TRI_InsertKeySkipListEx (TRI_skiplistEx_t* skiplist, // the skiplist we are using
void* key, // the key used to locate the position of the item within the list
void* element, // the data which is copied into the new skiplist node
bool overwrite, // if true and the key already exists (not deleted), the stored element is replaced by this one
uint64_t thisTransID) { // the transaction id of the writer which has requested the insertion
int32_t newHeight;
int32_t currentLevel;
TRI_skiplistEx_node_t* currentNode;
TRI_skiplistEx_node_t* nextNode;
TRI_skiplistEx_node_t* newNode;
int compareResult;
int result;
// number of restarts so far; starts at -1 so that the first pass through
// CAS_RESTART neither sleeps nor counts as a failure
int casFailures = -1;
// ...........................................................................
// Just in case
// ...........................................................................
if (skiplist == NULL) {
return TRI_ERROR_INTERNAL;
}
// ...........................................................................
// Determine the number of levels in which to add the item. That is, determine
// the height of the node so that it participates in that many lists.
// Convert the level to a height
// ...........................................................................
newHeight = (RandLevel(&(skiplist->_base))) + 1;
// ...........................................................................
// Something is wrong, since the newHeight must be at least 1
// ...........................................................................
if (newHeight < 1) {
return TRI_ERROR_INTERNAL;
}
// ...........................................................................
// Grow lists if required by increasing the height of the start and end nodes
// ...........................................................................
result = GrowStartEndNodes(&(skiplist->_base), newHeight);
if (result != TRI_ERROR_NO_ERROR) {
return result;
}
// ...........................................................................
// Create the new node to be inserted. The allocation is the node structure
// plus _elementSize additional bytes. If there is some sort of failure later
// on, the node memory is released again.
// ...........................................................................
newNode = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_skiplistEx_node_t) + skiplist->_base._elementSize, false);
if (newNode == NULL) { // out of memory
// no necessity to undo the start/end node growth
return TRI_ERROR_OUT_OF_MEMORY;
}
// ...........................................................................
// Copy the contents of element into the new node to be inserted and give the
// node its column storage. The result of the copy is fed into
// GrowNewNodeHeight, a single check afterwards covers both steps.
// ...........................................................................
newNode->_column = NULL;
newNode->_colLength = 0;
newNode->_extraData = NULL;
result = IndexStaticCopyElementElement(&(skiplist->_base), &(newNode->_element), element);
result = GrowNewNodeHeight(newNode, newHeight, newHeight, result);
if (result != TRI_ERROR_NO_ERROR) {
FreeSkipListExNode(&(skiplist->_base), newNode);
return result;
}
// ...........................................................................
// Assign the deletion transaction id and the insertion transaction id
// ...........................................................................
newNode->_delTransID = UINT64_MAX; // since we are inserting this new node it can not be deleted
newNode->_insTransID = thisTransID; // this is what was given to us
// ...........................................................................
// Big loop to restart the whole search routine
// ...........................................................................
CAS_RESTART: {
// ...........................................................................
// To stop this loop CAS_RESTART becoming an infinite loop, use this check.
// The new node is released before giving up.
// ...........................................................................
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS Failure");
FreeSkipListExNode(&(skiplist->_base), newNode);
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
// ...........................................................................
// Provide a simple back-off: every restart (but not the first pass) sleeps
// before searching again. The sleep time is something which needs to be
// adjusted.
// ...........................................................................
if (casFailures > -1) {
usleep(CAS_FAILURE_SLEEP_TIME);
}
// ...........................................................................
// Increment the cas failures (which should hopefully always be 0).
// ...........................................................................
++casFailures;
// ...........................................................................
// Determine the path where the new item is to be inserted. If the item
// already exists either replace it or return an error. Recall that this
// skip list is used for unique key/value pairs. Use the skiplist-multi for
// non-unique key/value pairs.
// ...........................................................................
currentLevel = skiplist->_base._startNode._colLength - 1; // NOT current height BUT current level is required here
currentNode = &(skiplist->_base._startNode);
nextNode = currentNode;
START: {
// .........................................................................
// The current node (which we have called the nextNode below) should never
// be null. Protect yourself in case something has gone wrong.
// .........................................................................
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// We require the successor of the current node so we can perform a
// comparison. It should never be null.
// .........................................................................
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// An insert/lookup/removal SEARCH like this can ONLY ever find 1 glass
// node when we are very unlucky. (The GC makes the node glass and then
// goes and unlinks the pointers.) If we skipped the glass node, we would
// have the wrong pointers to compare, so we have to CAS_RESTART
// .........................................................................
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
goto CAS_RESTART;
}
// .........................................................................
// WE HAVE FOUR CASES TO CONSIDER
// .........................................................................
// .........................................................................
// CASE ONE:
// At this level we have the smallest (start) and largest (end) nodes ONLY.
// CASE TWO:
// We have arrived at the end of the nodes and we are not at the
// start of the nodes either.
// .........................................................................
if (nextNode == &(skiplist->_base._endNode)) {
// .......................................................................
// Store the current node and its successor in the path, but only for
// levels in which the new node takes part
// .......................................................................
if (currentLevel < newHeight) {
newNode->_column[currentLevel]._prev = currentNode;
newNode->_column[currentLevel]._next = nextNode;
}
// .......................................................................
// if we are at the lowest level of the lists, insert the item to the
// right of the current node
// .......................................................................
if (currentLevel == 0) {
goto END;
}
// .......................................................................
// We have not yet reached the lowest level, continue one level down,
// starting again from the current node.
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
// .........................................................................
// CASE THREE:
// We are the smallest left most node and the NEXT node is NOT the end node.
// Compare this element with the element in the right node to see what we do.
// CASE FOUR:
// We are somewhere in the middle of a list, away from the smallest and
// largest nodes.
// .........................................................................
else { // nextNode is not the end node
// .......................................................................
// Use the callback to determine if the key is less or greater than
// the next node element.
// .......................................................................
compareResult = IndexStaticCompareKeyElement(skiplist,key,&(nextNode->_element), 0);
// .......................................................................
// The element to be inserted has a key which is greater than the next
// node's element key. Keep going on this level.
// .......................................................................
if (compareResult > 0) {
currentNode = nextNode;
goto START;
}
// .......................................................................
// The element matches the next element.
// However since we support transactions some things are different and
// we have to tread carefully. Note that any nodes with the same key are
// ALWAYS inserted to the LEFT of the existing node. This means we need
// only check the next node.
// .......................................................................
if (compareResult == 0) {
// .....................................................................
// It may happen that this node is NOT deleted and simply there -
// check the ins & del transaction numbers.
// .....................................................................
if (nextNode->_insTransID > thisTransID) {
// ...................................................................
// Something terrible has happened: since writers have been serialized,
// how is it that an existing node has a higher transaction number than
// this transaction?
// NOTE(review): when assertions are compiled out, execution simply
// continues below after the message has been printed.
// ...................................................................
printf("%s:%s:%d:Can not be here!\n",__FILE__,__FUNCTION__,__LINE__);
assert(false); // there is no way we can be here
}
// .....................................................................
// node has been previously inserted
// .....................................................................
if (nextNode->_delTransID > thisTransID) {
// ...................................................................
// Node has NOT been deleted (e.g. imagine it will be deleted some
// time in the future). Treat this as a duplicate key, overwrite if
// requested and return. We do not allow elements with duplicate 'keys'.
// The new node is no longer needed in either case and is released.
// ...................................................................
FreeSkipListExNode(&(skiplist->_base), newNode);
if (overwrite) {
result = IndexStaticCopyElementElement(&(skiplist->_base), &(nextNode->_element), element);
return result;
}
return TRI_set_errno(TRI_ERROR_ARANGO_UNIQUE_CONSTRAINT_VIOLATED);
}
// .....................................................................
// The only case left here is that the node has been deleted by either
// this transaction (which could happen in an UPDATE) or by some
// previous write transaction. Treat this case as if the element is
// less than the next node element - this ensures that the
// most recent revision of the data is always to the LEFT.
// Fall through to the "less than" handling below.
// .....................................................................
}
// .......................................................................
// The element is less than the next node. Can we drop down the list?
// Store the current node and its successor in the path, but only for
// levels in which the new node takes part.
// .......................................................................
if (currentLevel < newHeight) {
newNode->_column[currentLevel]._prev = currentNode;
newNode->_column[currentLevel]._next = nextNode;
}
// .......................................................................
// We have reached the lowest level of the lists. Time to insert item.
// .......................................................................
if (currentLevel == 0) {
goto END;
}
// .......................................................................
// Drop down the list, starting again from the current node
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
} // end of label START
} // end of label CAS_RESTART
END: {
// ..........................................................................
// Ok finished with the loop and we should have a path with AT MOST
// SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT number of elements.
// ..........................................................................
// ..........................................................................
// this is the tricky part since we have to attempt to do this as
// 'lock-free' as possible. This is achieved in three passes:
// Pass 1: Mark each prev and next node of the new node so that the GC
// can not modify it. If this fails goto CAS_RESTART
// Pass 2: Ensure that each prev and next tower is not glassed.
// Pass 3: Modify the newnode.prev.next to newnode and newnode.next.prev = newnode
// A CAS failure reported by the join restarts the whole search; any other
// result (success or error) is returned to the caller as is.
// ..........................................................................
result = JoinNewNodeCas(newNode);
if (result == TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE) {
goto CAS_RESTART;
}
return result;
} // end of END label
// not reached: the END block above either returns or jumps back to CAS_RESTART
return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns greatest node less than a given key
////////////////////////////////////////////////////////////////////////////////
void* TRI_LeftLookupByKeySkipListEx(TRI_skiplistEx_t* skiplist, void* key, uint64_t thisTransID) {
int32_t currentLevel;
TRI_skiplistEx_node_t* currentNode;
TRI_skiplistEx_node_t* nextNode;
int casFailures = -1;
// ...........................................................................
// Just in case ...
// ...........................................................................
if (skiplist == NULL) {
LOG_ERROR("Internal Error");
return NULL;
}
// ...........................................................................
// Big loop to restart the whole search routine
// ...........................................................................
CAS_RESTART: {
// ...........................................................................
// To stop this loop CAS_RESTART becomming an infinite loop, use this check
// ...........................................................................
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS Failure");
return NULL;
}
// ...........................................................................
// Provide a simple non-blocking, block?
// ...........................................................................
if (casFailures > -1) {
assert(0); // a test to see why it blocks - should not block!
usleep(CAS_FAILURE_SLEEP_TIME);
}
// ...........................................................................
// Increment the cas failures (which should always be hopefully 0).
// ...........................................................................
++casFailures;
// ...........................................................................
// Determine the starting level and the starting node
// ...........................................................................
currentLevel = skiplist->_base._startNode._colLength - 1;
currentNode = &(skiplist->_base._startNode);
nextNode = currentNode;
START: {
// .........................................................................
// Find the next node in the current level of the lists. Protect yourself
// in case something has gone wrong.
// .........................................................................
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// We require the successor of the current node so we can perform a
// comparison. It should never be null.
// .........................................................................
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// Is our next node a glass node? If so we must skip it!
// Note: since Garbage Collection is performed in TWO passes, it is possible
// that we have more than one glass node.
// .........................................................................
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
goto START;
}
// .........................................................................
// WE HAVE FOUR CASES TO CONSIDER
// .........................................................................
// .........................................................................
// CASE ONE:
// At this level we have the smallest (start) and largest (end) nodes ONLY.
// CASE TWO:
// We have arrived at the end of the nodes and we are not at the
// start of the nodes either.
// .........................................................................
if (nextNode == &(skiplist->_base._endNode)) {
// .......................................................................
// We are at the lowest level of the lists, and we haven't found the item
// yet. The currentNode does NOT compare and the next node is +\infinty.
// .......................................................................
if (currentLevel == 0) {
return currentNode;
}
// .......................................................................
// We have not yet reached the lowest level continue down. Possibly our
// item we seek is to be found a lower level.
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
// .........................................................................
// CASE THREE:
// We are the smallest left most node and the NEXT node is NOT the end node.
// Compare this element with the element in the right node to see what we do.
// CASE FOUR:
// We are somewhere in the middle of a list, away from the smallest and
// largest nodes.
// .........................................................................
else { // nextNode != &(skiplist->_endNode
int compareResult;
// .......................................................................
// Use the callback to determine if the element is less or greater than
// the next node element. We treat the comparison by assuming we are
// looking for a "key - epsilon". With this assumption we always find the
// last key to our right if it exists. The reason this is necessary is as
// follows: we allow a multiple documents with the same key to be stored
// here with the proviso that all but the last one is marked as deleted.
// This is how we cater for multiple revisions.
// .......................................................................
compareResult = IndexStaticCompareKeyElement(skiplist,key,&(nextNode->_element), -1);
// .......................................................................
// -1 is returned if the number of fields (attributes) in the key is LESS
// than the number of fields in the index definition. This has the effect
// of being slightly less efficient since we have to proceed to the level
// 0 list in the set of skip lists.
// .......................................................................
// .......................................................................
// We have found the item!
// .......................................................................
if (compareResult == 0) {
assert(false);
}
if (compareResult > 0) {
currentNode = nextNode;
goto START;
}
// .......................................................................
// The element is less than the next node. Can we drop down the list?
// If have reached the lowest level of the lists -- no such item.
// .......................................................................
if (currentLevel == 0) {
return currentNode;
}
// .......................................................................
// Drop down the list
// .......................................................................
--currentLevel;
nextNode = currentNode;
goto START;
}
} // end of label START
} // end of label CAS_RESTART
assert(false);
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the node whose element matches a key, or NULL
///
/// The search begins at the start node on level (_colLength - 1) of that node.
/// On each level it steps right while the key compares greater than the next
/// element and otherwise drops one level. Nodes flagged as glass are stepped
/// over without being compared.
///
/// A node that compares equal (comparison mode 0) is returned only if its
/// _delTransID is greater than thisTransID. NULL is returned when skiplist is
/// NULL, when level 0 is exhausted without such a node, or when the restart
/// counter reaches SKIPLIST_EX_CAS_FAILURES_MAX_LOOP.
////////////////////////////////////////////////////////////////////////////////
void* TRI_LookupByKeySkipListEx (TRI_skiplistEx_t* skiplist, void* key, uint64_t thisTransID) {
  int restarts = -1;   // -1 marks the very first pass (no sleep before it)

  if (skiplist == NULL) {
    LOG_ERROR("Internal Error");
    return NULL;
  }

  // ...........................................................................
  // Outer loop: one iteration per complete search attempt. A new attempt is
  // started whenever a NULL link is encountered during the walk.
  // ...........................................................................
  for (;;) {
    int32_t level;
    TRI_skiplistEx_node_t* anchor;   // last node known to lie left of the key
    TRI_skiplistEx_node_t* probe;    // node whose successor is inspected next

    // bound the number of attempts so that we can not spin forever
    if (restarts == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
      LOG_ERROR("CAS Failure");
      return NULL;
    }

    // back off a little before every attempt except the first one
    if (restarts > -1) {
      usleep(CAS_FAILURE_SLEEP_TIME);
    }
    ++restarts;

    level  = skiplist->_base._startNode._colLength - 1;
    anchor = &(skiplist->_base._startNode);
    probe  = anchor;

    // .........................................................................
    // Inner loop: the actual walk. It is left via return (result known) or
    // because a link turned out to be NULL (attempt has to be repeated).
    // .........................................................................
    while (probe != NULL) {
      int cmp;

      probe = (TRI_skiplistEx_node_t*)(probe->_column[level]._next);
      if (probe == NULL) {
        break;
      }

      // glass nodes are pending removal: continue from their successor on the
      // same level, the anchor stays where it is
      if (probe->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
        continue;
      }

      // ran into the end node: nothing further to the right on this level
      if (probe == &(skiplist->_base._endNode)) {
        if (level == 0) {
          return NULL;
        }
        probe = anchor;
        --level;
        continue;
      }

      cmp = IndexStaticCompareKeyElement(skiplist, key, &(probe->_element), 0);

      // key is greater than this element: advance along the current level
      if (cmp > 0) {
        anchor = probe;
        continue;
      }

      // .......................................................................
      // Exact match. The node counts as present for this reader only while its
      // deletion stamp is greater than thisTransID.
      // NOTE(review): the original code contained a check for
      // _insTransID > thisTransID whose "return NULL" was commented out, so a
      // node inserted by a later transaction is NOT filtered here - confirm
      // that this is intended.
      // .......................................................................
      if (cmp == 0 && probe->_delTransID > thisTransID) {
        return probe;
      }

      // key is smaller than this element, or the matching node is already
      // deleted for this reader: descend, or give up on the bottom level
      if (level == 0) {
        return NULL;
      }
      --level;
      probe = anchor;
    }

    // only reached when a NULL link has been seen
    LOG_ERROR("CAS Failure");
    assert(0);
  }

  // not reachable
  assert(0);
  return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief given a node returns the next node (if possible) in the skiplist
///
/// Thin wrapper: hands the skiplist (cast to its base type), the node and the
/// transaction id to NextNodeBaseSkipListEx and returns its result. Returns
/// NULL without calling anything when skiplist is NULL.
////////////////////////////////////////////////////////////////////////////////
void* TRI_NextNodeSkipListEx(TRI_skiplistEx_t* skiplist, void* currentNode, uint64_t thisTransID) {
  if (skiplist == NULL) {
    return NULL;
  }

  return NextNodeBaseSkipListEx((TRI_skiplistEx_base_t*) skiplist, currentNode, thisTransID);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief given a node returns the previous node (if possible) in the skiplist
///
/// Thin wrapper: hands the skiplist (cast to its base type), the node and the
/// transaction id to PrevNodeBaseSkipListEx and returns its result. Returns
/// NULL without calling anything when skiplist is NULL.
////////////////////////////////////////////////////////////////////////////////
void* TRI_PrevNodeSkipListEx(TRI_skiplistEx_t* skiplist, void* currentNode, uint64_t thisTransID) {
  if (skiplist == NULL) {
    return NULL;
  }

  return PrevNodeBaseSkipListEx((TRI_skiplistEx_base_t*) skiplist, currentNode, thisTransID);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief removes an element from the skip list
///
/// Removal is carried out in three passes, selected by passLevel:
///
///   Pass 1: search for a node whose element compares equal to the given
///           element and whose _delTransID is greater than thisTransID. Stamp
///           thisTransID into the node's _delTransID with a compare-and-swap,
///           copy the node's element into old (if old is not NULL) and hand
///           the node back through *passNode.
///   Pass 2: take the node from *passNode, verify that it is not yet glass and
///           that it carries thisTransID as deletion stamp, then unjoin it via
///           UnJoinOldNodeCas and return that function's result.
///   Pass 3: take the node from *passNode, verify that it is glass and that it
///           carries thisTransID as deletion stamp, then free it.
///
/// Returns TRI_ERROR_NO_ERROR on success. Pass 1 returns one of the
/// TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_* codes if no suitable node could
/// be located. TRI_ERROR_INTERNAL is returned for a NULL skiplist, a NULL
/// passNode, an unknown passLevel, or a failed consistency check.
////////////////////////////////////////////////////////////////////////////////
int TRI_RemoveElementSkipListEx (TRI_skiplistEx_t* skiplist, void* element, void* old,
                                 const int passLevel, const uint64_t thisTransID,
                                 TRI_skiplistEx_node_t** passNode) {
  int32_t currentLevel;
  TRI_skiplistEx_node_t* currentNode = NULL;
  TRI_skiplistEx_node_t* nextNode    = NULL;
  int casFailures = -1;   // -1 marks the very first search attempt

  // ...........................................................................
  // Every pass dereferences passNode (pass 1 writes through it, passes 2 and 3
  // read through it), so reject a NULL passNode before anything is modified.
  // In particular pass 1 must not stamp a node as deleted and then be unable
  // to report which node it was.
  // ...........................................................................
  if (skiplist == NULL || passNode == NULL) {
    LOG_ERROR("Internal Error");
    return TRI_ERROR_INTERNAL;
  }

  // ...........................................................................
  // Only pass 1 has to locate the node by comparing elements. Passes 2 and 3
  // receive the node directly through *passNode.
  // ...........................................................................
  if (passLevel != 1) {
    goto END;
  }

  // ...........................................................................
  // Big loop to restart the whole search routine
  // ...........................................................................
  CAS_RESTART: {

    // bound the number of restarts so that this can not loop forever
    if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
      LOG_ERROR("CAS Failure");
      return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
    }

    // back off a little before every attempt except the first one
    if (casFailures > -1) {
      usleep(CAS_FAILURE_SLEEP_TIME);
    }
    ++casFailures;

    // start at level (_colLength - 1) of the start node
    currentLevel = skiplist->_base._startNode._colLength - 1;
    currentNode  = &(skiplist->_base._startNode);
    nextNode     = currentNode;

    START: {
      int compareResult;

      // a NULL link is unexpected: log it and repeat the whole search
      if (nextNode == NULL) {
        LOG_ERROR("CAS Failure");
        assert(0);
        goto CAS_RESTART;
      }

      nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
      if (nextNode == NULL) {
        LOG_ERROR("CAS Failure");
        assert(0);
        goto CAS_RESTART;
      }

      // .......................................................................
      // Glass nodes are already on their way out. Pass 1 is looking for a node
      // that has not been removed yet, so step over them on the same level.
      // .......................................................................
      if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
        goto START;
      }

      // .......................................................................
      // The successor is the end node: nothing further to the right on this
      // level. On level 0 this means there is nothing to remove.
      // .......................................................................
      if (nextNode == &(skiplist->_base._endNode)) {
        if (currentLevel == 0) {
          return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
        }
        nextNode = currentNode;
        --currentLevel;
        goto START;
      }

      // .......................................................................
      // The successor is a regular node: compare the element with it.
      // .......................................................................
      compareResult = IndexStaticCompareElementElement(skiplist, element, &(nextNode->_element), -1);

      // element is greater than the successor: advance along this level
      if (compareResult > 0) {
        currentNode = nextNode;
        goto START;
      }

      if (compareResult == 0) { // a node matches the element exactly

        // .....................................................................
        // The node was inserted by a transaction later than this one. Report
        // that instead of removing it.
        // .....................................................................
        if (nextNode->_insTransID > thisTransID) {
          return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_POST_INSERTED;
        }

        // .....................................................................
        // The deletion stamp is greater than thisTransID, so the node is not
        // deleted from this transaction's point of view: this is the node to
        // remove.
        // .....................................................................
        if (nextNode->_delTransID > thisTransID) {
          currentNode = nextNode;
          goto END;
        }

        // .....................................................................
        // Otherwise the node has already been deleted by this or an earlier
        // transaction. It is handled exactly like a successor that is greater
        // than the element: fall through and descend.
        // .....................................................................
      }

      // nothing more to inspect below level 0 -- no such item
      if (currentLevel == 0) {
        return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
      }

      // drop down one level and continue from the current node
      --currentLevel;
      nextNode = currentNode;
      goto START;

    } // end of START label
  } // end of CAS_RESTART label


  END: {
    switch (passLevel) {

      // .......................................................................
      // Pass 1: stamp the deletion transaction id onto the located node.
      // .......................................................................
      case 1: {
        bool ok;

        if (currentNode == NULL) { // something terribly wrong
          assert(0);
          return TRI_ERROR_INTERNAL;
        }

        // The swap is attempted against UINT64_MAX (presumably the value of a
        // node that carries no deletion stamp yet - confirm against the
        // helper). A failed swap is treated as an internal error.
        ok = TRI_CompareAndSwapIntegerUInt64(&(currentNode->_delTransID),
                                             UINT64_MAX, thisTransID);
        if (!ok) {
          assert(0);
          return TRI_ERROR_INTERNAL;
        }

        // if requested, copy the located element into the caller's storage
        if (old != NULL) {
          IndexStaticCopyElementElement(&(skiplist->_base), old, &(currentNode->_element));
        }

        // hand the node back so that passes 2 and 3 do not have to search
        *passNode = currentNode;
        return TRI_ERROR_NO_ERROR;
      }

      // .......................................................................
      // Pass 2: turn the node into a glass node and unjoin it from its
      // neighbours.
      // .......................................................................
      case 2: {

        // The node can not be looked up by element any more (several nodes
        // may compare equal), so it has to be supplied by the caller.
        if (*passNode == NULL) {
          return TRI_ERROR_INTERNAL;
        }
        currentNode = (TRI_skiplistEx_node_t*)(*passNode);

        // According to the original author only the single-threaded garbage
        // collector turns nodes into glass nodes, which is why a plain
        // comparison is used here. The node must not be glass already.
        if (currentNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
          return TRI_ERROR_INTERNAL;
        }

        // safety check: the node must have been stamped by this transaction
        if (currentNode->_delTransID != thisTransID) {
          return TRI_ERROR_INTERNAL;
        }

        // .....................................................................
        // Intended strategy of UnJoinOldNodeCas as described by the original
        // author (the helper itself is defined elsewhere):
        //  (a) brick each nearest neighbour link of this node so that no other
        //      node can be attached to it
        //  (b) mark this node as glass so that lookups skip it
        //  (c) unbrick the links of this node again
        //  (d) brick the prev and next nearest neighbours of this node
        //  (e) unjoin the node from the list
        //  (f) unbrick the prev/next nearest neighbours
        // .....................................................................
        return UnJoinOldNodeCas(currentNode);
      }

      // .......................................................................
      // Pass 3: destroy the node. It is required to be a glass node.
      // .......................................................................
      case 3: {

        if (*passNode == NULL) {
          return TRI_ERROR_INTERNAL;
        }
        currentNode = (TRI_skiplistEx_node_t*)(*passNode);

        // pass 2 must have made the node glass before it may be freed
        if (currentNode->_towerFlag != TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
          return TRI_ERROR_INTERNAL;
        }

        // safety check: the node must have been stamped by this transaction
        if (currentNode->_delTransID != thisTransID) {
          return TRI_ERROR_INTERNAL;
        }

        // NOTE(review): *passNode is left pointing at the freed node; the
        // caller must not use it afterwards.
        FreeSkipListExNode(&(skiplist->_base), currentNode);
        break;
      }

      default: {
        assert(0);
        return TRI_ERROR_INTERNAL;
      }

    } // end of switch statement
  } // end of END label

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief removal by key -- not implemented for this skip list
///
/// This is a placeholder: none of the arguments are looked at. Callers are
/// expected to use TRI_RemoveElementSkipListEx instead. The function trips an
/// assertion and, if assertions are disabled, returns 0.
///
/// NOTE(review): with assertions compiled out the caller receives 0 although
/// nothing has been removed - consider returning an error code instead.
////////////////////////////////////////////////////////////////////////////////
int TRI_RemoveKeySkipListEx(TRI_skiplistEx_t* skiplist, void* key, void* old,
                            const int passLevel, const uint64_t thisTransID,
                            TRI_skiplistEx_node_t** passNode) {
  // deliberately unused
  (void) skiplist;
  (void) key;
  (void) old;
  (void) passLevel;
  (void) thisTransID;
  (void) passNode;

  assert(false);
  return 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns smallest node greater than a given key
///
/// Mirror image of a left-to-right search: the walk begins at the end node on
/// level (_colLength - 1) of that node and follows the _prev links. It steps
/// left while the key compares less than the previous element (comparison
/// mode 1) and otherwise drops one level. Nodes flagged as glass are stepped
/// over without being compared.
///
/// The result is the last node the walk advanced to, i.e. the end node itself
/// if no element compared greater than the key. NULL is returned when skiplist
/// is NULL or when the restart counter reaches
/// SKIPLIST_EX_CAS_FAILURES_MAX_LOOP.
////////////////////////////////////////////////////////////////////////////////
void* TRI_RightLookupByKeySkipListEx(TRI_skiplistEx_t* skiplist, void* key, uint64_t thisTransID) {
  int restarts = -1;   // -1 marks the very first pass

  if (skiplist == NULL) {
    LOG_ERROR("Internal Error");
    return NULL;
  }

  // ...........................................................................
  // Outer loop: one iteration per complete search attempt. A new attempt is
  // started whenever a NULL link is encountered during the walk.
  // ...........................................................................
  for (;;) {
    int32_t level;
    TRI_skiplistEx_node_t* anchor;   // last node known to lie right of the key
    TRI_skiplistEx_node_t* probe;    // node whose predecessor is inspected next

    // bound the number of attempts so that we can not spin forever
    if (restarts == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
      LOG_ERROR("CAS Failure");
      return NULL;
    }

    // a second attempt is not expected to happen at all; the assertion is the
    // original author's trap for finding out why it would
    if (restarts > -1) {
      assert(0); // a test to see why it blocks - should not block!
      usleep(CAS_FAILURE_SLEEP_TIME);
    }
    ++restarts;

    level  = skiplist->_base._endNode._colLength - 1;
    anchor = &(skiplist->_base._endNode);
    probe  = anchor;

    // .........................................................................
    // Inner loop: the actual walk. It is left via return (result known) or
    // because a link turned out to be NULL (attempt has to be repeated).
    // .........................................................................
    while (probe != NULL) {
      int cmp;

      probe = (TRI_skiplistEx_node_t*)(probe->_column[level]._prev);
      if (probe == NULL) {
        break;
      }

      // glass nodes are pending removal: continue from their predecessor on
      // the same level, the anchor stays where it is
      if (probe->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
        continue;
      }

      // ran into the start node: nothing further to the left on this level
      if (probe == &(skiplist->_base._startNode)) {
        if (level == 0) {
          return anchor;
        }
        probe = anchor;
        --level;
        continue;
      }

      // .......................................................................
      // Mode 1 is passed to the comparison. According to the original author
      // the comparison then never reports equality (a key with fewer fields
      // than the element yields 1 even if the common fields are equal), which
      // forces the walk down to level 0. Equality is therefore asserted to be
      // impossible.
      // .......................................................................
      cmp = IndexStaticCompareKeyElement(skiplist, key, &(probe->_element), 1);

      if (cmp == 0) {
        assert(false);
      }

      // key is less than this element: advance leftwards on the current level
      if (cmp < 0) {
        anchor = probe;
        continue;
      }

      // key is not less than this element: descend, or finish on level 0
      if (level == 0) {
        return anchor;
      }
      --level;
      probe = anchor;
    }

    // only reached when a NULL link has been seen
    LOG_ERROR("CAS Failure");
    assert(0);
  }

  // not reachable
  assert(false);
  return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the start node associated with a skip list.
///
/// @param skiplist  the skip list to query
///
/// @return address of the start node embedded in the skip list's base
///         structure, or NULL when no skip list was supplied
////////////////////////////////////////////////////////////////////////////////
void* TRI_StartNodeSkipListEx(TRI_skiplistEx_t* skiplist) {
  // nothing to hand out without a list
  if (skiplist == NULL) {
    return NULL;
  }

  return &(skiplist->_base._startNode);
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
// -----------------------------------------------------------------------------
// --SECTION-- non-unique skiplist constructors and destructors
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Skiplist_non_unique
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief initialises a multi skip list which allows duplicate entries
///
/// The three comparison callbacks handed in are accepted but not used: the
/// list is always wired to the static IndexStaticMulti* comparison functions.
///
/// @param skiplist               list to initialise
/// @param elementSize            stored as the element size of the list
/// @param compareElementElement  ignored (see above)
/// @param compareKeyElement      ignored (see above)
/// @param equalElementElement    ignored (see above)
/// @param probability            one of the TRI_SKIPLIST_EX_PROB_* values
/// @param maximumHeight          0 selects SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT,
///                               values above that limit are rejected
/// @param lastKnownTransID       stored as the insertion transaction id of the
///                               start and end nodes
///
/// @return TRI_ERROR_NO_ERROR on success, TRI_ERROR_INTERNAL for a NULL list,
///         an oversized height or an unknown probability,
///         TRI_ERROR_OUT_OF_MEMORY if the random number storage can not be
///         allocated, otherwise the error reported by GrowNewNodeHeight
////////////////////////////////////////////////////////////////////////////////
int TRI_InitSkipListExMulti (TRI_skiplistEx_multi_t* skiplist, size_t elementSize,
                    int (*compareElementElement) (TRI_skiplistEx_multi_t*, void*, void*, int),
                    int (*compareKeyElement) (TRI_skiplistEx_multi_t*, void*, void*, int),
                    bool (*equalElementElement) (TRI_skiplistEx_multi_t*, void*, void*),
                    TRI_skiplistEx_prob_e probability,
                    uint32_t maximumHeight,
                    uint64_t lastKnownTransID) {
  // a requested height of 0 means "use the compile time maximum"
  const uint32_t height = (maximumHeight == 0) ? SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT : maximumHeight;
  // number of levels covered by one 32 bit random number
  // (presumably 1 bit per level for 1/2 and 2 bits per level otherwise)
  int levelsPerRandom = 0;
  int status;

  if (skiplist == NULL) {
    return TRI_ERROR_INTERNAL;
  }

  // ..........................................................................
  // Always use the STATIC comparison functions, whatever the caller passed in
  // ..........................................................................

  skiplist->compareElementElement = IndexStaticMultiCompareElementElement;
  skiplist->compareKeyElement     = IndexStaticMultiCompareKeyElement;
  skiplist->equalElementElement   = IndexStaticMultiEqualElementElement;

  // ..........................................................................
  // Record the maximum height, then make sure it does not exceed the absolute
  // maximum defined at compile time
  // ..........................................................................

  skiplist->_base._maxHeight = height;

  if (height > SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT) {
    LOG_ERROR("Invalid maximum height for skiplist");
    assert(false);
    return TRI_ERROR_INTERNAL;
  }

  // ..........................................................................
  // Record the probability and work out, once, how many random numbers are
  // needed to cover the maximum height
  // ..........................................................................

  skiplist->_base._prob      = probability;
  skiplist->_base._numRandom = 0;

  switch (skiplist->_base._prob) {
    case TRI_SKIPLIST_EX_PROB_HALF:
      levelsPerRandom = 32;
      break;

    case TRI_SKIPLIST_EX_PROB_THIRD:
    case TRI_SKIPLIST_EX_PROB_QUARTER:
      levelsPerRandom = 16;
      break;

    default:
      LOG_ERROR("Invalid probability assigned to skiplist");
      assert(false);
      return TRI_ERROR_INTERNAL;
  }

  // integer division rounded up
  skiplist->_base._numRandom = (skiplist->_base._maxHeight / levelsPerRandom);
  if ((skiplist->_base._maxHeight % levelsPerRandom) != 0) {
    ++(skiplist->_base._numRandom);
  }

  // ..........................................................................
  // Reserve the storage for the random numbers once, up front
  // ..........................................................................

  skiplist->_base._random = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(uint32_t) * skiplist->_base._numRandom, false);
  if (skiplist->_base._random == NULL) {
    return TRI_ERROR_OUT_OF_MEMORY;
  }

  skiplist->_base._elementSize = elementSize;

  // ..........................................................................
  // Reset the two sentinel nodes: no columns, no payload, never deleted and
  // inserted by the last known transaction
  // ..........................................................................

  skiplist->_base._startNode._column     = NULL;
  skiplist->_base._endNode._column       = NULL;

  skiplist->_base._startNode._colLength  = 0;
  skiplist->_base._endNode._colLength    = 0;

  skiplist->_base._startNode._extraData  = NULL;
  skiplist->_base._endNode._extraData    = NULL;

  skiplist->_base._startNode._element    = NULL;
  skiplist->_base._endNode._element      = NULL;

  skiplist->_base._startNode._delTransID = UINT64_MAX;
  skiplist->_base._endNode._delTransID   = UINT64_MAX;

  skiplist->_base._startNode._insTransID = lastKnownTransID;
  skiplist->_base._endNode._insTransID   = lastKnownTransID;

  // ..........................................................................
  // CAS flag guarding the growth of the start and end nodes
  // ..........................................................................

  skiplist->_base._growStartEndNodesFlag = TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG;

  // ..........................................................................
  // With a probability of 1/2, 1/3 or 1/4 a node has an average height of
  // two, so the sentinels are set up with this 'average' height. The second
  // call receives the status of the first one.
  // ..........................................................................

  status = GrowNewNodeHeight(&(skiplist->_base._startNode), skiplist->_base._maxHeight, 2, TRI_ERROR_NO_ERROR); // may fail
  status = GrowNewNodeHeight(&(skiplist->_base._endNode), skiplist->_base._maxHeight, 2, status); // may fail

  if (status != TRI_ERROR_NO_ERROR) {
    // undo everything allocated so far
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._random));

    if (skiplist->_base._startNode._column != NULL) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._startNode._column));
    }

    if (skiplist->_base._endNode._column != NULL) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(skiplist->_base._endNode._column));
    }

    return status;
  }

  // ..........................................................................
  // Join the empty lists together
  // [N]<----------------------------------->[N]
  // [N]<----------------------------------->[N]
  // The level arguments passed are 0 and _maxHeight - 1.
  // ..........................................................................

  JoinStartEndNodes(&(skiplist->_base._startNode), &(skiplist->_base._endNode), 0, skiplist->_base._maxHeight - 1);

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys a multi skip list, but does not free the pointer
///
/// Hands the list, viewed as its base structure, to DestroyBaseSkipListEx.
/// A NULL list is ignored.
////////////////////////////////////////////////////////////////////////////////
void TRI_DestroySkipListExMulti (TRI_skiplistEx_multi_t* skiplist) {
  if (skiplist == NULL) {
    return;
  }

  DestroyBaseSkipListEx((TRI_skiplistEx_base_t*) skiplist);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys a multi skip list and frees the pointer
///
/// Runs TRI_DestroySkipListExMulti on the list and then releases the list
/// structure itself. A NULL list is ignored.
////////////////////////////////////////////////////////////////////////////////
void TRI_FreeSkipListExMulti (TRI_skiplistEx_multi_t* skiplist) {
  if (skiplist == NULL) {
    return;
  }

  // tear down the contents first, then give back the structure
  TRI_DestroySkipListExMulti(skiplist);
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, skiplist);
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
// -----------------------------------------------------------------------------
// --SECTION-- non-unique skiplist public methods
// -----------------------------------------------------------------------------
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup Skiplist_non_unique
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief Returns the end node associated with a skip list.
///
/// @param skiplist  the multi skip list to query
///
/// @return address of the end node embedded in the skip list's base
///         structure, or NULL when no skip list was supplied
////////////////////////////////////////////////////////////////////////////////
void* TRI_EndNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist) {
  // nothing to hand out without a list
  if (skiplist == NULL) {
    return NULL;
  }

  return &(skiplist->_base._endNode);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief adds an element to a multi skip list using an element for searching
///
/// Allocates a new node holding a copy of the element, walks from the top
/// level of the start node down to level 0 to find the insertion position and
/// then links the node in through JoinNewNodeCas. The search is started over
/// (sleeping CAS_FAILURE_SLEEP_TIME before each retry) whenever a NULL
/// successor or a glass node is met, or when JoinNewNodeCas reports a CAS
/// failure. The function gives up once the failure counter reaches
/// SKIPLIST_EX_CAS_FAILURES_MAX_LOOP.
///
/// @param skiplist     the multi skip list to insert into
/// @param element      the element which is copied into the new node
/// @param overwrite    if true and an equal element is already stored, the
///                     stored element is overwritten with the given one
///                     instead of reporting a duplicate
/// @param thisTransID  stored as the insertion transaction id of the new node
///
/// @return TRI_ERROR_INTERNAL if skiplist is NULL or the random height is
///         below 1, TRI_ERROR_OUT_OF_MEMORY if the node can not be allocated,
///         TRI_ERROR_ARANGO_INDEX_SKIPLIST_INSERT_ITEM_DUPLICATED if an equal
///         element exists and overwrite is false,
///         TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE once the retry
///         limit is reached, otherwise the result of the helper which ran last
///         (GrowStartEndNodes, the element copy / GrowNewNodeHeight chain, the
///         overwrite copy or JoinNewNodeCas)
////////////////////////////////////////////////////////////////////////////////
int TRI_InsertElementSkipListExMulti(TRI_skiplistEx_multi_t* skiplist,
void* element,
bool overwrite,
uint64_t thisTransID) {
int32_t newHeight;
int32_t currentLevel;
TRI_skiplistEx_node_t* currentNode;
TRI_skiplistEx_node_t* nextNode;
TRI_skiplistEx_node_t* newNode;
int compareResult;
int result;
// starts at -1 so that the first pass through CAS_RESTART neither sleeps nor
// counts as a failure
int casFailures = -1;
// ...........................................................................
// Just in case
// ...........................................................................
if (skiplist == NULL) {
return TRI_ERROR_INTERNAL;
}
// ...........................................................................
// Determine the number of levels in which to add the item. That is, determine
// the height of the node so that it participates in that many lists.
// ...........................................................................
newHeight = RandLevel(&(skiplist->_base)) + 1;
// ...........................................................................
// Something is wrong if the new height is not at least 1
// ...........................................................................
if (newHeight < 1) {
return TRI_ERROR_INTERNAL;
}
// ...........................................................................
// Grow lists if required by increasing the height of the start and end nodes
// ...........................................................................
result = GrowStartEndNodes(&(skiplist->_base), newHeight);
if (result != TRI_ERROR_NO_ERROR) {
return result;
}
// ...........................................................................
// Create the new node to be inserted. The allocation covers the node
// structure plus _elementSize additional bytes. From here on every failure
// path in this function has to release the node again.
// ...........................................................................
newNode = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_skiplistEx_node_t) + skiplist->_base._elementSize, false);
if (newNode == NULL) { // out of memory?
return TRI_ERROR_OUT_OF_MEMORY;
}
// ...........................................................................
// Copy the contents of element into the new node to be inserted and give
// the node its columns. The result of the copy is handed to
// GrowNewNodeHeight, so a single check covers both steps.
// ...........................................................................
newNode->_column = NULL;
newNode->_colLength = 0;
newNode->_extraData = NULL;
result = IndexStaticCopyElementElement(&(skiplist->_base), &(newNode->_element), element);
result = GrowNewNodeHeight(newNode, newHeight, newHeight, result);
if (result != TRI_ERROR_NO_ERROR) {
FreeSkipListExNode(&(skiplist->_base), newNode);
return result;
}
// ...........................................................................
// Assign the deletion transaction id and the insertion transaction id
// ...........................................................................
newNode->_delTransID = UINT64_MAX; // since we are inserting this new node it can not be deleted
newNode->_insTransID = thisTransID; // this is what was given to us
// ...........................................................................
// Big loop to restart the whole search routine
// ...........................................................................
CAS_RESTART: {
// ...........................................................................
// To stop this loop CAS_RESTART becoming an infinite loop, use this check.
// The new node is released before giving up.
// ...........................................................................
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS Failure");
FreeSkipListExNode(&(skiplist->_base), newNode);
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
// ...........................................................................
// Back off before every retry (not before the first attempt). The sleep
// time is something which probably needs to be adjusted.
// ...........................................................................
if (casFailures > -1) {
usleep(CAS_FAILURE_SLEEP_TIME);
}
// ...........................................................................
// Increment the cas failures (which should always be hopefully 0).
// ...........................................................................
++casFailures;
// ...........................................................................
// Determine the path where the new item is to be inserted. If an equal
// element already exists either overwrite it or report a duplicate. This is
// the multi (non-unique) skip list, so only an equal ELEMENT counts as a
// duplicate, equal keys are allowed.
// ...........................................................................
currentLevel = skiplist->_base._startNode._colLength - 1; // NOT current height BUT current level is required here
currentNode = &(skiplist->_base._startNode);
nextNode = currentNode;
START: {
// .........................................................................
// On entry nextNode holds the node whose successor is inspected next (it is
// set to currentNode before every jump to START). It should never be
// null. Protect yourself in case something has gone wrong.
// .........................................................................
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// We require the successor of the current node so we can perform a
// comparison. It should never be null.
// .........................................................................
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// An insert/lookup/removal SEARCH like this, can ONLY ever find 1 glass
// node when we are very unlucky. (The GC makes the node glass and then
// goes and unlinks the pointers.) If we skip the glass node, then we
// will have the wrong pointers to compare, so we have to CAS_RESTART
// .........................................................................
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
goto CAS_RESTART;
}
// .........................................................................
// WE HAVE FOUR CASES TO CONSIDER
// .........................................................................
// .........................................................................
// CASE ONE:
// At this level we have the smallest (start) and largest (end) nodes ONLY.
// CASE TWO:
// We have arrived at the end of the nodes and we are not at the
// start of the nodes either.
// .........................................................................
if (nextNode == &(skiplist->_base._endNode)) {
// .......................................................................
// Store the current node and level in the path. Only levels below the
// height of the new node are recorded, the new node has no column above.
// .......................................................................
if (currentLevel < newHeight) {
newNode->_column[currentLevel]._prev = currentNode;
newNode->_column[currentLevel]._next = nextNode;
}
// .......................................................................
// if we are at the lowest level of the lists, insert the item to the
// right of the current node
// .......................................................................
if (currentLevel == 0) {
goto END;
}
// .......................................................................
// We have not yet reached the lowest level continue down.
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
// .........................................................................
// CASE THREE:
// We are the smallest left most node and the NEXT node is NOT the end node.
// Compare this element with the element in the right node to see what we do.
// CASE FOUR:
// We are somewhere in the middle of a list, away from the smallest and
// largest nodes.
// .........................................................................
else { // nextNode != &(skiplist->_base._endNode)
// .......................................................................
// Use the static comparison function to determine if the element is less
// or greater than the next node element.
// .......................................................................
compareResult = IndexStaticMultiCompareElementElement(skiplist, element, &(nextNode->_element), -1);
// .......................................................................
// The element matches the next element. Overwrite if possible and return.
// The only possibility of obtaining a compareResult equal to 0 is for the
// element being the same, NOT the keys being the same.
// The new node is not needed in either case and is released first.
// .......................................................................
if (compareResult == 0) {
FreeSkipListExNode(&(skiplist->_base), newNode);
if (overwrite) {
// ...................................................................
// Warning: there is NO check to ensure that this node has not been
// previously deleted.
// ...................................................................
result = IndexStaticCopyElementElement(&(skiplist->_base), &(nextNode->_element), element);
return result;
}
return TRI_ERROR_ARANGO_INDEX_SKIPLIST_INSERT_ITEM_DUPLICATED;
}
// .......................................................................
// The element to be inserted has a key which is greater than the next node's
// element key. Keep going on this level.
// .......................................................................
if (compareResult > 0) {
currentNode = nextNode;
goto START;
}
// .......................................................................
// The element is less than the next node. Can we drop down the list?
// Store the current node and level in the path.
// .......................................................................
if (currentLevel < newHeight) {
newNode->_column[currentLevel]._prev = currentNode;
newNode->_column[currentLevel]._next = nextNode;
}
// .......................................................................
// We have reached the lowest level of the lists. Time to insert item.
// Note that we will insert this item to the left of all the items with
// the same key. Note also that the higher transaction numbers are to
// the left always.
// .......................................................................
if (currentLevel == 0) {
goto END;
}
// .......................................................................
// Drop down the list
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
} // end of label START
} // end of label CAS_RESTART
END: {
// ..........................................................................
// Ok finished with the loop and we should have a path with AT MOST
// SKIPLIST_EX_ABSOLUTE_MAX_HEIGHT number of elements.
// ..........................................................................
// ..........................................................................
// this is the tricky part since we have to attempt to do this as
// 'lock-free' as possible. This is achieved in three passes:
// Pass 1: Mark each prev and next node of the new node so that the GC
// can not modify it. If this fails goto CAS_RESTART
// Pass 2: Ensure that each prev and next tower is not glassed.
// Pass 3: Modify the newnode.prev.next to newnode and newnode.next.prev = newnode
// NOTE(review): for any result other than the CAS failure warning the new
// node is not released here - confirm that JoinNewNodeCas either linked the
// node into the list or took care of it.
// ..........................................................................
result = JoinNewNodeCas(newNode);
if (result == TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE) {
goto CAS_RESTART;
}
return result;
} // end of END label
// not reached: the END block above always returns or jumps back
return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief adds an key/element to a multi skip list
///
/// Not supported for the multi skip list: TRI_InsertElementSkipListExMulti has
/// to be used instead. Calling this function trips an assertion.
///
/// @return 0 (only reached when assertions are compiled out)
///
/// NOTE(review): with assertions disabled a stray call returns 0 without
/// inserting anything; if 0 is the success code (presumably
/// TRI_ERROR_NO_ERROR - confirm) consider returning an error code instead.
////////////////////////////////////////////////////////////////////////////////
int TRI_InsertKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, void* element, bool overwrite, uint64_t thisTransID) {
  // none of the arguments is used by this stub
  (void) skiplist;
  (void) key;
  (void) element;
  (void) overwrite;
  (void) thisTransID;

  // there is no way you should be here
  assert(false);

  return 0;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns greatest node less than a given key
///
/// Walks from the top level of the start node down to level 0. On each level
/// the walk moves right while the key compares greater than the next node's
/// element and drops one level otherwise. Glass nodes are stepped over by
/// continuing with their successor. The node reached on level 0 is returned;
/// this is the start node if no element compares smaller than the key.
///
/// @param skiplist     the multi skip list to search
/// @param key          the key handed to IndexStaticMultiCompareKeyElement
/// @param thistransID  not used by this function
///
/// @return the node found, or NULL if skiplist is NULL or the search had to be
///         restarted SKIPLIST_EX_CAS_FAILURES_MAX_LOOP times
////////////////////////////////////////////////////////////////////////////////
void* TRI_LeftLookupByKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, uint64_t thistransID) {
int32_t currentLevel;
TRI_skiplistEx_node_t* currentNode;
TRI_skiplistEx_node_t* nextNode;
// starts at -1 so that the first pass through CAS_RESTART neither sleeps nor
// counts as a failure
int casFailures = -1;
// ...........................................................................
// Just in case ...
// ...........................................................................
if (skiplist == NULL) {
LOG_ERROR("Internal Error");
return NULL;
}
// ...........................................................................
// Big loop to restart the whole search routine
// ...........................................................................
CAS_RESTART: {
// ...........................................................................
// To stop this loop CAS_RESTART becoming an infinite loop, use this check
// ...........................................................................
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS Failure");
return NULL;
}
// ...........................................................................
// Back off before every retry (not before the first attempt). A lookup is
// not expected to be restarted at all, hence the assertion: builds with
// assertions enabled stop here instead of retrying.
// ...........................................................................
if (casFailures > -1) {
assert(0); // a test to see why it blocks - should not block!
usleep(CAS_FAILURE_SLEEP_TIME);
}
// ...........................................................................
// Increment the cas failures (which should always be hopefully 0).
// ...........................................................................
++casFailures;
// ...........................................................................
// Determine the starting level and the starting node
// ...........................................................................
currentLevel = skiplist->_base._startNode._colLength - 1;
currentNode = &(skiplist->_base._startNode);
nextNode = currentNode;
START: {
// .........................................................................
// On entry nextNode holds the node whose successor is inspected next.
// Protect yourself in case something has gone wrong.
// .........................................................................
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// We require the successor of the current node so we can perform a
// comparison. It should never be null.
// .........................................................................
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// Is our next node a glass node? If so we must skip it! Jumping back to
// START with nextNode unchanged continues with the successor of the glass
// node, currentNode is left untouched.
// Note: since Garbage Collection is performed in TWO passes, it is possible
// that we have more than one glass node.
// .........................................................................
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
goto START;
}
// .........................................................................
// WE HAVE FOUR CASES TO CONSIDER
// .........................................................................
// .........................................................................
// CASE ONE:
// At this level we have the smallest (start) and largest (end) nodes ONLY.
// CASE TWO:
// We have arrived at the end of the nodes and we are not at the
// start of the nodes either.
// .........................................................................
if (nextNode == &(skiplist->_base._endNode)) {
// .......................................................................
// We are at the lowest level of the lists, and we haven't found the item
// yet. The currentNode does NOT compare and the next node is +\infinity.
// .......................................................................
if (currentLevel == 0) {
return currentNode;
}
// .......................................................................
// We have not yet reached the lowest level continue down. Possibly the
// item we seek is to be found at a lower level.
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
// .........................................................................
// CASE THREE:
// We are the smallest left most node and the NEXT node is NOT the end node.
// Compare this element with the element in the right node to see what we do.
// CASE FOUR:
// We are somewhere in the middle of a list, away from the smallest and
// largest nodes.
// .........................................................................
else { // nextNode != &(skiplist->_base._endNode)
int compareResult;
// .......................................................................
// Use the static comparison function to determine if the key is less or
// greater than the next node element. We treat the comparison by assuming
// we are looking for a "key - epsilon". With this assumption we always
// find the last key to our right if it exists. The reason this is
// necessary is as follows: we allow multiple documents with the same key
// to be stored here with the proviso that all but the last one are marked
// as deleted. This is how we cater for multiple revisions.
// .......................................................................
compareResult = IndexStaticMultiCompareKeyElement(skiplist, key, &(nextNode->_element), -1);
// .......................................................................
// -1 is returned if the number of fields (attributes) in the key is LESS
// than the number of fields in the index definition. This has the effect
// of being slightly less efficient since we have to proceed to the level
// 0 list in the set of skip lists. Where we allow duplicates such as this
// -1 is also returned when all the keys match.
// .......................................................................
// .......................................................................
// A result of 0 is therefore not expected here. With assertions compiled
// out it is handled like a "less than" result below.
// .......................................................................
if (compareResult == 0) {
assert(false);
}
// .......................................................................
// The key is greater than the next node's element. Keep going on this
// level.
// .......................................................................
if (compareResult > 0) {
currentNode = nextNode;
goto START;
}
// .......................................................................
// The key is less than the next node. Can we drop down the list?
// If we have reached the lowest level of the lists, the current node is
// the one we are looking for.
// .......................................................................
if (currentLevel == 0) {
return currentNode;
}
// .......................................................................
// Drop down the list
// .......................................................................
--currentLevel;
nextNode = currentNode;
goto START;
}
} // end of label START
} // end of label CAS_RESTART
// not reached: every path above returns or jumps back to a label
assert(false);
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief locate a node using an element
///
/// Not supported for the multi skip list. Calling this function trips an
/// assertion.
///
/// @return NULL (only reached when assertions are compiled out)
////////////////////////////////////////////////////////////////////////////////
void* TRI_LookupByElementSkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* element, uint64_t thisTransID) {
  // none of the arguments is used by this stub
  (void) skiplist;
  (void) element;
  (void) thisTransID;

  // there is no way you should be here
  assert(false);

  return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns node which matches a key
///
/// Not supported for the multi skip list. This index allows duplicate keys, so
/// "the" element for a key is not well defined: a valid element could be
/// returned, but which one? Lookups by key only make sense as "give me the
/// first such element" and "give me the last such element", so that the
/// elements matching the key can be traversed. Calling this function trips an
/// assertion.
///
/// @return NULL (only reached when assertions are compiled out)
////////////////////////////////////////////////////////////////////////////////
void* TRI_LookupByKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, uint64_t thisTransID) {
  // none of the arguments is used by this stub
  (void) skiplist;
  (void) key;
  (void) thisTransID;

  // there is no way you should be here
  assert(false);

  return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief given a node returns the next node (if possible) in the skiplist
///
/// Forwards to NextNodeBaseSkipListEx with the list viewed as its base
/// structure.
///
/// @return whatever NextNodeBaseSkipListEx returns, or NULL when no skip list
///         was supplied
////////////////////////////////////////////////////////////////////////////////
void* TRI_NextNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* currentNode, uint64_t thisTransID) {
  if (skiplist == NULL) {
    return NULL;
  }

  return NextNodeBaseSkipListEx((TRI_skiplistEx_base_t*) skiplist, currentNode, thisTransID);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief given a node returns the previous node (if possible) in the skiplist
///
/// Forwards to PrevNodeBaseSkipListEx with the list viewed as its base
/// structure.
///
/// @return whatever PrevNodeBaseSkipListEx returns, or NULL when no skip list
///         was supplied
////////////////////////////////////////////////////////////////////////////////
void* TRI_PrevNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* currentNode, uint64_t thisTransID) {
  if (skiplist == NULL) {
    return NULL;
  }

  return PrevNodeBaseSkipListEx((TRI_skiplistEx_base_t*) skiplist, currentNode, thisTransID);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief removes a key/element from a multi skip list
////////////////////////////////////////////////////////////////////////////////
int TRI_RemoveElementSkipListExMulti (TRI_skiplistEx_multi_t* skiplist, void* element, void* old,
const int passLevel, const uint64_t thisTransID,
TRI_skiplistEx_node_t** passNode) {
// ...........................................................................
// Removes an element from this skip list in three separate passes, selected
// by passLevel:
// Pass 1: locate (if possible) the exact NODE - must match exactly.
// Once located, stamp the node with the deleting transaction id
// (thisTransID), optionally copy its element into 'old' and hand
// the node back to the caller through *passNode. Return.
// Pass 2: take the node from *passNode (error if it is NULL, already glass
// or stamped with a different transaction id) and attempt to unlink
// all the pointers and make the node a Glass Node.
// Pass 3: take the node from *passNode (error if it is NULL, not glass or
// stamped with a different transaction id) and excise the node by
// destroying its allocated memory.
//
// Return values visible in this function:
// TRI_ERROR_NO_ERROR - pass 1 and pass 3 succeeded
// result of UnJoinOldNodeCas - pass 2
// TRI_ERROR_INTERNAL - NULL skiplist, failed sanity check or
// an unknown passLevel
// TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE - search restarted
// too many times
// TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING - no matching node
// TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_PRIOR_REMOVED - the matching
// node carries a delete id <= thisTransID
//
// NOTE(review): passNode itself is dereferenced in every pass without being
// checked for NULL - callers must always supply valid storage.
// ...........................................................................
int32_t currentLevel;
TRI_skiplistEx_node_t* currentNode = NULL;
TRI_skiplistEx_node_t* nextNode = NULL;
int casFailures = -1; // -1 means "search not yet attempted"
// ...........................................................................
// Just in case
// ...........................................................................
if (skiplist == NULL) {
LOG_ERROR("Internal Error");
return TRI_ERROR_INTERNAL;
}
// ...........................................................................
// Only for pass level 1 do we have a requirement to locate the actual node
// using the element. For pass levels 2 & 3 we have the pointer to the node,
// so the whole search below is skipped.
// ...........................................................................
if (passLevel != 1) { goto END; }
// ...........................................................................
// Big loop to restart the whole search routine
// ...........................................................................
CAS_RESTART: {
// ...........................................................................
// To stop this loop CAS_RESTART becoming an infinite loop, give up once the
// number of restarts reaches the configured maximum.
// ...........................................................................
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS Failure");
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
}
// ...........................................................................
// Back off briefly before every retry (but not before the first attempt).
// ...........................................................................
if (casFailures > -1) {
usleep(CAS_FAILURE_SLEEP_TIME);
}
// ...........................................................................
// Increment the cas failures (which should hopefully always be 0).
// ...........................................................................
++casFailures;
// ...........................................................................
// Determine the starting level and the starting node: the search begins at
// the top level of the start node.
// ...........................................................................
currentLevel = skiplist->_base._startNode._colLength - 1;
currentNode = &(skiplist->_base._startNode);
nextNode = currentNode;
START: {
// .........................................................................
// Find the next node in the current level of the lists. Protect yourself
// in case something has gone wrong.
// .........................................................................
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// We require the successor of the current node so we can perform a
// comparison. It should never be null.
// .........................................................................
nextNode = (TRI_skiplistEx_node_t*)(nextNode->_column[currentLevel]._next);
if (nextNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// Is our next node a glass node? If so we must skip it. Recall we are in
// Phase I here -- meaning that we are searching for a node which has been
// previously inserted and has not been removed. Jumping back to START
// advances to the successor of the glass node on the same level.
// .........................................................................
if (nextNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
goto START;
}
// .........................................................................
// WE HAVE FOUR CASES TO CONSIDER
// .........................................................................
// .........................................................................
// CASE ONE:
// At this level we have the smallest (start) and largest (end) nodes ONLY.
// CASE TWO:
// We have arrived at the end of the nodes and we are not at the
// start of the nodes either.
// .........................................................................
if (nextNode == &(skiplist->_base._endNode)) {
// .......................................................................
// We are at the lowest level of the lists, and we haven't found the item
// yet. Nothing to remove so return.
// .......................................................................
if (currentLevel == 0) {
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
}
// .......................................................................
// We have not yet reached the lowest level: continue one level down,
// resuming from the current node.
// .......................................................................
nextNode = currentNode;
--currentLevel;
goto START;
}
// .........................................................................
// CASE THREE:
// We are the smallest left most node and the NEXT node is NOT the end node.
// Compare this element with the element in the right node to see what we do.
// CASE FOUR:
// We are somewhere in the middle of a list, away from the smallest and
// largest nodes.
// .........................................................................
else { // nextNode != &(skiplist->_base._endNode)
int compareResult;
// .......................................................................
// Use the callback to determine if the element is less or greater than
// the next node element.
// .......................................................................
compareResult = IndexStaticMultiCompareElementElement(skiplist,element,&(nextNode->_element), -1);
// .......................................................................
// The element is greater than the next node element. Keep going on this
// level.
// .......................................................................
if (compareResult > 0) {
currentNode = nextNode;
goto START;
}
if (compareResult == 0) { // a node matches exactly based upon the element
if (nextNode->_delTransID > thisTransID) {
// ...................................................................
// Node has NOT been deleted (e.g. imagine it will be deleted some
// time in the future). This is the node we want.
// ...................................................................
currentNode = nextNode;
goto END;
}
// .....................................................................
// In a skiplist supporting duplicate entries, the comparison function
// test ensures the elements are the same (e.g. same address in memory);
// it can never be the case that we rely simply on the keys matching.
// So the question remains: why has the item been previously
// deleted? Has someone tried to remove this item twice?
// We do not know, so return an error.
// .....................................................................
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_PRIOR_REMOVED;
}
// .......................................................................
// The element is smaller than the next node element. If we have reached
// the lowest level of the lists there is no such item.
// .......................................................................
if (currentLevel == 0) {
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_ITEM_MISSING;
}
// .......................................................................
// Drop down the list, resuming from the current node
// .......................................................................
--currentLevel;
nextNode = currentNode;
goto START;
}
} // end of START label
} // end of CAS_RESTART label
END: {
switch (passLevel) {
// .........................................................................
// In this case we simply add the del transaction id with a CAS statement.
// It should never fail! Note that the CAS expects the current delete id to
// be UINT64_MAX.
// .........................................................................
case 1: {
bool ok;
if (currentNode == NULL) { // something terribly wrong
assert(0);
return TRI_ERROR_INTERNAL;
}
ok = TRI_CompareAndSwapIntegerUInt64 (&(currentNode->_delTransID),
UINT64_MAX, thisTransID);
if (!ok) {
assert(0);
return TRI_ERROR_INTERNAL;
}
// ....................................................................
// If requested copy the contents of the element we have located into the
// storage sent.
// ....................................................................
if (old != NULL) {
IndexStaticCopyElementElement(&(skiplist->_base), old, &(currentNode->_element));
}
// hand the located node back so that passes 2 and 3 can use it directly
*passNode = currentNode;
return TRI_ERROR_NO_ERROR;
}
// .........................................................................
// In this case we wish to make the node a glass node and to unjoin all
// other connected nodes.
// .........................................................................
case 2: {
// .......................................................................
// We can not now rely upon looking up the node using the key, since
// we would need to traverse right and attempt to match either the
// transaction id and/or the pointer to the doc. Easier to simply
// have the caller send the address of the node back.
// .......................................................................
if (*passNode == NULL) {
return TRI_ERROR_INTERNAL;
}
currentNode = (TRI_skiplistEx_node_t*)(*passNode);
// .......................................................................
// Only the Garbage Collector can transform a node into a glass node, and
// since the GC is only operating in one thread it is safe to do a simple
// comparison here. A node which is already glass is an error in pass 2.
// .......................................................................
if (currentNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
return TRI_ERROR_INTERNAL;
}
// .......................................................................
// safety check: the node must have been stamped by this transaction
// .......................................................................
if (currentNode->_delTransID != thisTransID) {
return TRI_ERROR_INTERNAL;
}
// .......................................................................
// The strategy is this:
// (a) Brick each nearest neighbour on this node. This ensures that NO
// other nodes can be attached to this node.
// (b) Mark this node as being glass. This ensures that it is skipped
// since it is no longer required in the index.
// (c) Unbrick each of its nearest neighbours on this node. This ensures
// that an inserted node MAY be allowed to be attached but will later fail.
// Also allows us to brick other glass nodes.
// (d) Brick each prev and next nearest neighbour of this node. Irrespective
// if one of these are glass or not. This ensures that lookups can
// proceed unhindered.
// (e) Unjoin the node from the list.
// (f) Unbrick each prev/next nearest neighbour
// .......................................................................
return UnJoinOldNodeCas(currentNode);
}
// .........................................................................
// In this case since no other reader/writer can be accessing the node,
// we simply destroy it. We require the node to be glass.
// .........................................................................
case 3: {
// .......................................................................
// We can not now rely upon looking up the node using the key, since
// we would need to traverse right and attempt to match either the
// transaction id and/or the pointer to the doc. Easier to simply
// have the caller send the address of the node back.
// .......................................................................
if (*passNode == NULL) {
return TRI_ERROR_INTERNAL;
}
currentNode = (TRI_skiplistEx_node_t*)(*passNode);
// .......................................................................
// Only the Garbage Collector can transform a node into a glass node, and
// since the GC is only operating in one thread it is safe to do a simple
// comparison here. A node which is not glass is an error in pass 3.
// .......................................................................
if (currentNode->_towerFlag != TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
return TRI_ERROR_INTERNAL;
}
// .......................................................................
// safety check: the node must have been stamped by this transaction
// .......................................................................
if (currentNode->_delTransID != thisTransID) {
return TRI_ERROR_INTERNAL;
}
FreeSkipListExNode(&(skiplist->_base), currentNode);
break; // falls out of the switch to the final "no error" return
}
// .........................................................................
// Unknown pass level
// .........................................................................
default: {
assert(0);
return TRI_ERROR_INTERNAL;
}
} // end of switch statement
} // end of END label
return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief removes a key/element from a multi skip list
////////////////////////////////////////////////////////////////////////////////
int TRI_RemoveKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, void* old,
                                 const int passLevel, const uint64_t thisTransID,
                                 TRI_skiplistEx_node_t** passNode) {
  // ...........................................................................
  // Unsupported operation: removal by key is not implemented for a multi
  // skiplist. Use the TRI_RemoveElementSkipListExMulti method instead.
  //
  // The assert catches misuse in builds with assertions enabled. When
  // assertions are compiled out (NDEBUG) we must still not report success,
  // since nothing has been removed - hence an error code is returned rather
  // than 0.
  // ...........................................................................

  assert(false);

  return TRI_ERROR_INTERNAL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns smallest node greater than a given key
////////////////////////////////////////////////////////////////////////////////
void* TRI_RightLookupByKeySkipListExMulti(TRI_skiplistEx_multi_t* skiplist, void* key, uint64_t thisTransID) {
// ...........................................................................
// Searches the skiplist from right to left: the traversal starts at the top
// level of the end node and follows the _prev pointers. While the key compares
// less than the element of the previous node the search moves left, otherwise
// it drops down one level. At level 0 the node reached so far (currentNode) is
// returned - this may be the end node itself.
// Returns NULL if skiplist is NULL or if the search had to be restarted too
// many times.
// Note: thisTransID is not referenced anywhere in this function body.
// ...........................................................................
int32_t currentLevel;
TRI_skiplistEx_node_t* currentNode;
TRI_skiplistEx_node_t* prevNode;
int casFailures = -1; // -1 means "search not yet attempted"
// ...........................................................................
// Just in case ...
// ...........................................................................
if (skiplist == NULL) {
LOG_ERROR("Internal Error");
return NULL;
}
// ...........................................................................
// Big loop to restart the whole search routine
// ...........................................................................
CAS_RESTART: {
// ...........................................................................
// To stop this loop CAS_RESTART becoming an infinite loop, give up once the
// number of restarts reaches the configured maximum.
// ...........................................................................
if (casFailures == SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS Failure");
return NULL;
}
// ...........................................................................
// Back off briefly before every retry (but not before the first attempt).
// NOTE(review): the assert below fires on ANY retry when assertions are
// enabled - it looks like a debugging aid left in place.
// ...........................................................................
if (casFailures > -1) {
assert(0); // a test to see why it blocks - should not block!
usleep(CAS_FAILURE_SLEEP_TIME);
}
// ...........................................................................
// Increment the cas failures (which should hopefully always be 0).
// ...........................................................................
++casFailures;
// ...........................................................................
// Determine the starting level and the starting node: the search begins at
// the top level of the END node and proceeds to the left.
// ...........................................................................
currentLevel = skiplist->_base._endNode._colLength - 1;
currentNode = &(skiplist->_base._endNode);
prevNode = currentNode;
START: {
// .........................................................................
// Find the previous node in the current level of the lists. Protect
// yourself in case something has gone wrong.
// .........................................................................
if (prevNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// We require the predecessor of the current node so we can perform a
// comparison. It should never be null.
// .........................................................................
prevNode = (TRI_skiplistEx_node_t*)(prevNode->_column[currentLevel]._prev);
if (prevNode == NULL) {
LOG_ERROR("CAS Failure");
assert(0);
goto CAS_RESTART;
}
// .........................................................................
// Is our previous node a glass node? If so we must skip it!
// Note: since Garbage Collection is performed in TWO passes, it is possible
// that we have more than one glass node. Jumping back to START advances to
// the predecessor of the glass node on the same level.
// .........................................................................
if (prevNode->_towerFlag == TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG) {
goto START;
}
// .........................................................................
// WE HAVE FOUR CASES TO CONSIDER
// .........................................................................
// .........................................................................
// CASE ONE:
// At this level we have the smallest (start) and largest (end) nodes ONLY.
// CASE TWO:
// We have arrived at the start of the nodes and we are not at the
// end of the nodes either.
// .........................................................................
if (prevNode == &(skiplist->_base._startNode)) {
// .......................................................................
// We are at the lowest level of the lists and the previous node is the
// start node (-\infinity), so there is nothing further to the left:
// return the node reached so far.
// .......................................................................
if (currentLevel == 0) {
return currentNode;
}
// .......................................................................
// We have not yet reached the lowest level: continue down. Possibly the
// item we seek is to be found at a lower level.
// .......................................................................
prevNode = currentNode;
--currentLevel;
goto START;
}
// .........................................................................
// CASE THREE:
// We are the largest right most node and the PREVIOUS node is NOT the start
// node. Compare the key with the element in the left node to see what we do.
// CASE FOUR:
// We are somewhere in the middle of a list, away from the smallest and
// largest nodes.
// .........................................................................
else { // prevNode != &(skiplist->_base._startNode)
int compareResult;
// .......................................................................
// Use the callback to determine if the key is less or greater than
// the previous node element. We treat the comparison by assuming we are
// looking for a "key - epsilon". With this assumption we always find the
// last key to our right if it exists. The reason this is necessary is as
// follows: we allow multiple documents with the same key to be stored
// here with the proviso that all but the last one is marked as deleted.
// This is how we cater for multiple revisions.
// .......................................................................
compareResult = IndexStaticMultiCompareKeyElement(skiplist, key, &(prevNode->_element), 1);
// .......................................................................
// If the number of fields (attributes) in the key is LESS than the number
// of fields in the element to be compared to, then EVEN if the keys
// which are common to both equate as EQUAL, we STILL return 1 rather than
// 0! This ensures that the right interval end point is correctly positioned
// -- slightly inefficient since the lowest level skip list 0 has to be reached
// in this case.
// .......................................................................
// .......................................................................
// Given the above, the comparison is never expected to report equality.
// If assertions are compiled out, a result of 0 is handled like a positive
// result by the code below.
// .......................................................................
if (compareResult == 0) {
assert(false);
}
// .......................................................................
// The key is less than the previous node element: that node becomes the
// new candidate, keep moving left on this level.
// .......................................................................
if (compareResult < 0) {
currentNode = prevNode;
goto START;
}
// .......................................................................
// The key is not less than the previous node element. Can we drop down the
// list? If we have reached the lowest level of the lists the candidate
// reached so far is the result.
// .......................................................................
if (currentLevel == 0) {
return currentNode;
}
// .......................................................................
// Drop down the list, resuming from the current node
// .......................................................................
--currentLevel;
prevNode = currentNode;
goto START;
}
} // end of label START
} // end of label CAS_RESTART
// every path above either returns or jumps back to a label
assert(false);
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief returns the start node associated with a multi skip list.
////////////////////////////////////////////////////////////////////////////////
void* TRI_StartNodeSkipListExMulti(TRI_skiplistEx_multi_t* skiplist) {
  // the start node is embedded in the base structure; no skiplist, no start node
  return (skiplist != NULL) ? &(skiplist->_base._startNode) : NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// IMPLEMENTATION OF STATIC FORWARD DECLARED FUNCTIONS
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @addtogroup SkiplistEx_common
/// @{
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys a skip list, but does not free the pointer
////////////////////////////////////////////////////////////////////////////////
static void DestroyBaseSkipListEx(TRI_skiplistEx_base_t* baseSkiplist) {
  // ...........................................................................
  // No locking, blocking or CAS is used here: the index is being destroyed on
  // request and it is assumed that no further read/write operations which need
  // this index are being accepted.
  //
  // TODO:
  //   Warning: there is a memory leak which requires fixing here. The garbage
  //   collection may still be working in the background; destroying the
  //   skiplist before the garbage collection thread has been terminated will
  //   end badly. The idea is to signal the garbage collector so that ALL
  //   references to this index are expunged, after which that same process
  //   calls this function.
  // ...........................................................................

  TRI_skiplistEx_node_t* node;
  TRI_skiplistEx_node_t* successor;

  if (baseSkiplist == NULL) {
    return;
  }

  // walk the level 0 list, beginning with the start node
  for (node = &(baseSkiplist->_startNode);  node != NULL;  node = successor) {
    bool isSentinel = (node == &(baseSkiplist->_startNode)) ||
                      (node == &(baseSkiplist->_endNode));

    // fetch the successor BEFORE the column holding the link is released
    successor = (TRI_skiplistEx_node_t*)(node->_column[0]._next);

    TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(node->_column));

    // the start and end nodes are embedded in the skiplist structure itself:
    // only their columns are released
    if (isSentinel) {
      continue;
    }

    IndexStaticDestroyElement(baseSkiplist, &(node->_element));
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, node);
  }

  // finally release the storage reserved for the random numbers
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, baseSkiplist->_random);
}
////////////////////////////////////////////////////////////////////////////////
/// @brief destroys the internal structure allocation for a node
////////////////////////////////////////////////////////////////////////////////
static void DestroySkipListExNode (TRI_skiplistEx_base_t* skiplist, TRI_skiplistEx_node_t* node) {
  if (node != NULL) {
    // release the tower of neighbour links
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, (void*)(node->_column));

    // The storage of node->_element is part of the node itself, so it is not
    // freed here - it goes away when the whole node is freed. Only the
    // contents of the element are destroyed.
    IndexStaticDestroyElement(skiplist, &(node->_element));
  }
}
////////////////////////////////////////////////////////////////////////////////
/// @brief frees a node, destroying it first
////////////////////////////////////////////////////////////////////////////////
static void FreeSkipListExNode (TRI_skiplistEx_base_t* skiplist, TRI_skiplistEx_node_t* node) {
  bool isSentinel;

  // release everything the node owns
  DestroySkipListExNode(skiplist, node);

  // the start and end nodes are embedded in the skiplist structure and must
  // not be handed to TRI_Free
  isSentinel = (node == &(skiplist->_startNode)) || (node == &(skiplist->_endNode));

  if (!isSentinel) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, node);
  }
}
// ...............................................................................
// This function is thread safe since the node has just been created and has
// NOT YET been linked into the skiplist.
// ...............................................................................
static int GrowNewNodeHeight(TRI_skiplistEx_node_t* node, uint32_t height, uint32_t colLength, int result) {
  uint32_t level;

  // ............................................................................
  // A previous error is simply passed through: nothing is touched.
  // ............................................................................

  if (result != TRI_ERROR_NO_ERROR) {
    return result;
  }

  // ............................................................................
  // 'height' is the number of column entries which are allocated, 'colLength'
  // the number of entries recorded as the length of the column. The allocated
  // height may exceed the column length (which saves us from continually
  // allocating and deallocating memory) but it must never be smaller.
  // ............................................................................

  if (height < colLength) {
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  node->_colLength = colLength;
  node->_column    = TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_skiplistEx_nb_t) * height, false);

  if (node->_column == NULL) { // out of memory?
    return TRI_ERROR_OUT_OF_MEMORY;
  }

  // ............................................................................
  // A new node always starts out with a normal tower
  // ............................................................................

  node->_towerFlag = TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG;

  // ............................................................................
  // Initialise every allocated entry: no neighbours, normal neighbour flag
  // ............................................................................

  level = 0;
  while (level < height) {
    node->_column[level]._prev   = NULL;
    node->_column[level]._next   = NULL;
    node->_column[level]._nbFlag = TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG;
    ++level;
  }

  return TRI_ERROR_NO_ERROR;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief joins the start node to the end node and vice versa
////////////////////////////////////////////////////////////////////////////////
static void JoinStartEndNodes(TRI_skiplistEx_node_t* leftNode,
                              TRI_skiplistEx_node_t* rightNode,
                              uint32_t startLevel, uint32_t endLevel) {
  uint32_t level;
  uint32_t height;

  if (endLevel < startLevel) { // something wrong
    assert(false);
    return;
  }

  // endLevel is inclusive: convert it to an exclusive upper bound (a height)
  height = endLevel + 1;

  // on each level in the range link left -> right and right -> left
  level = startLevel;
  while (level < height) {
    leftNode->_column[level]._next  = rightNode;
    rightNode->_column[level]._prev = leftNode;
    ++level;
  }
}
////////////////////////////////////////////////////////////////////////////////
/// @brief given a node returns the next node (if possible) in the skiplist
////////////////////////////////////////////////////////////////////////////////
static void* NextNodeBaseSkipListEx(TRI_skiplistEx_base_t* skiplist, void* currentNode, uint64_t thisTransID) {
  TRI_skiplistEx_node_t* volatile cursor = (TRI_skiplistEx_node_t* volatile)(currentNode);

  // no node given: begin the walk at the start node
  if (cursor == NULL) {
    cursor = &(skiplist->_startNode);
  }

  // ...........................................................................
  // Walk to the right along level 0 until a node which is visible to this
  // transaction turns up. Nodes are skipped based upon their transaction ids.
  // ...........................................................................

  for (;;) {
    bool insertedInTime;
    bool notYetDeleted;

    if (cursor == &(skiplist->_endNode)) {
      break;
    }

    cursor = cursor->_column[0]._next;

    if (cursor == NULL) {
      // this should not happen!
      LOG_ERROR("CAS Failure");
      assert(0);
      return NULL;
    }

    // an item inserted AFTER this transaction started must be skipped
    insertedInTime = !(cursor->_insTransID > thisTransID);

    if (!insertedInTime) {
      continue;
    }

    // an item which has been previously deleted must be skipped
    notYetDeleted = !(cursor->_delTransID <= thisTransID);

    if (notYetDeleted) {
      return (void*)(cursor);
    }
  }

  return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief given a node returns the previous node (if possible) in the skiplist
////////////////////////////////////////////////////////////////////////////////
static void* PrevNodeBaseSkipListEx(TRI_skiplistEx_base_t* skiplist, void* currentNode, uint64_t thisTransID) {
  TRI_skiplistEx_node_t* volatile cursor = (TRI_skiplistEx_node_t*)(currentNode);

  // no node given: the end node itself is handed back
  if (cursor == NULL) {
    return &(skiplist->_endNode);
  }

  // ...........................................................................
  // Walk to the left along level 0 until a node which is visible to this
  // transaction turns up. Nodes are skipped based upon their transaction ids.
  // ...........................................................................

  for (;;) {
    bool insertedInTime;
    bool notYetDeleted;

    if (cursor == &(skiplist->_startNode)) {
      break;
    }

    cursor = cursor->_column[0]._prev;

    if (cursor == NULL) {
      // this should not happen!
      LOG_ERROR("CAS Failure");
      assert(0);
      return NULL;
    }

    // an item inserted AFTER this transaction started must be skipped
    insertedInTime = !(cursor->_insTransID > thisTransID);

    if (!insertedInTime) {
      continue;
    }

    // an item which has been previously deleted must be skipped
    notYetDeleted = !(cursor->_delTransID <= thisTransID);

    if (notYetDeleted) {
      return (void*)(cursor);
    }
  }

  return NULL;
}
////////////////////////////////////////////////////////////////////////////////
/// @brief determines at what 'height' the item is to be added
////////////////////////////////////////////////////////////////////////////////
// ...............................................................................
// Helper for RandLevel: fills the pre-allocated random storage of the skiplist
// with fresh 32 bit random numbers and returns a pointer to the first word.
// ...............................................................................

static uint32_t* RefillRandomWordsSkipListEx (TRI_skiplistEx_base_t* skiplist) {
  uint32_t* word = skiplist->_random;
  int j;

  for (j = 0; j < skiplist->_numRandom; ++j) {
    *word = TRI_UInt32Random();
    ++word;
  }

  return skiplist->_random;
}

// ...............................................................................
// Draws a random level in the range [0, _maxHeight] according to the
// probability setting (_prob) of the skiplist. Returns -1 if the probability
// setting is unknown.
//
// The random bits are taken from the pre-allocated storage skiplist->_random
// (which holds skiplist->_numRandom words). Whenever all words have been used
// up the storage is refilled, so that the storage is never read or written
// beyond its end. This matters in particular for the 1/3 case, where bit pairs
// (1,1) use up random bits without increasing the level, so that the number of
// bits required is not bounded by the maximum height.
//
// Assumes skiplist->_numRandom is at least 1.
// ...............................................................................

static int32_t RandLevel (TRI_skiplistEx_base_t* skiplist) {
  uint32_t  level         = 0;
  uint32_t  mask;                  // selects the bits inspected per step
  uint32_t  shift;                 // number of bits used up per step
  int       stepsPerWord;          // number of steps one 32 bit word provides
  int       counter       = 0;     // steps taken from the current word
  bool      ignoreAllOnes = false; // 1/3 only: all bits set => level unchanged
  uint32_t* ptr;
  uint32_t* endPtr;

  // ...........................................................................
  // Obtain the random numbers and store them in the pre allocated storage
  // ...........................................................................

  ptr    = RefillRandomWordsSkipListEx(skiplist);
  endPtr = skiplist->_random + skiplist->_numRandom;

  // ...........................................................................
  // Use the bit list to determine the probability of the level.
  // For 1/2: if bit (0) we stop, otherwise increase level.
  // For 1/3: if bits (0,0) we stop, if bits (1,1) ignore and continue, otherwise increase level
  // For 1/4: if bits (0,0) we stop, otherwise increase level
  // ...........................................................................

  switch (skiplist->_prob) {
    case TRI_SKIPLIST_EX_PROB_HALF: {
      mask         = 1;
      shift        = 1;
      stepsPerWord = 32;
      break;
    }

    case TRI_SKIPLIST_EX_PROB_THIRD: {
      mask          = 3;
      shift         = 2;
      stepsPerWord  = 16;
      ignoreAllOnes = true;
      break;
    }

    case TRI_SKIPLIST_EX_PROB_QUARTER: {
      mask         = 3;
      shift        = 2;
      stepsPerWord = 16;
      break;
    }

    default: {
      return -1;
    }
  }

  while (level < skiplist->_maxHeight) {
    uint32_t bits;

    // all stored random words have been used up: fetch fresh ones instead of
    // running past the end of the storage
    if (ptr == endPtr) {
      ptr = RefillRandomWordsSkipListEx(skiplist);
    }

    bits = (*ptr) & mask;

    if (bits == 0) {
      break;
    }

    if (! (ignoreAllOnes && bits == mask)) {
      ++level;
    }

    // discard the bits just used and move on to the next word once the
    // current one is exhausted
    (*ptr) = (*ptr) >> shift;
    ++counter;

    if (counter == stepsPerWord) {
      ++ptr;
      counter = 0;
    }
  }

  return (int32_t) level;
}
// .................................................................................
// If we have simultaneous inserts, then this function will keep retrying (up to
// a bounded number of attempts) until the CAS statement succeeds. It is safe
// for simultaneous inserts.
// .................................................................................
static int GrowStartEndNodes(TRI_skiplistEx_base_t* skiplist, uint32_t newHeight) {
int result = TRI_ERROR_NO_ERROR;
int retries = 0;
uint32_t oldStartHeight, oldEndHeight;
// ................................................................................
// Is someone else growing the start/end nodes, if so return necessary error.
// Notice that this loop is only necessary if we assume multiple unordered inserts.
// ................................................................................
while (true) {
if (TRI_CompareAndSwapIntegerUInt32(&(skiplist->_growStartEndNodesFlag),
TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG,
TRI_SKIPLIST_EX_NOT_FREE_TO_GROW_START_END_NODES_FLAG) ) {
break;
}
++retries;
if (retries > SKIPLIST_EX_CAS_FAILURES_MAX_LOOP) {
LOG_ERROR("CAS failed for GrowStartEndNodes");
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
usleep(CAS_FAILURE_SLEEP_TIME);
}
oldStartHeight = skiplist->_startNode._colLength;
oldEndHeight = skiplist->_endNode._colLength;
if (oldStartHeight != oldEndHeight) {
result = TRI_ERROR_INTERNAL;
}
if (result == TRI_ERROR_NO_ERROR) {
if (oldStartHeight < newHeight) {
// ............................................................................
// need a CAS statement here since we may have multiple readers busy reading
// the height of the towers.
// ............................................................................
if (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_startNode._colLength),
oldStartHeight, newHeight) ) {
// should never happen
result = TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
else {
if (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_endNode._colLength),
oldEndHeight, newHeight) ) {
// should never happen
result = TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
if (result != TRI_ERROR_NO_ERROR) { // undo all of good work
TRI_CompareAndSwapIntegerUInt32(&(skiplist->_startNode._colLength), newHeight, oldStartHeight);
}
}
}
}
if (!TRI_CompareAndSwapIntegerUInt32(&(skiplist->_growStartEndNodesFlag),
TRI_SKIPLIST_EX_NOT_FREE_TO_GROW_START_END_NODES_FLAG,
TRI_SKIPLIST_EX_FREE_TO_GROW_START_END_NODES_FLAG) ) {
// ..............................................................................
// failure is not a word we recognise - eventually send signal to database to rebuild index
// ..............................................................................
LOG_ERROR("CAS failed for GrowStartEndNodes");
assert(0); // remove after debugging
if (result == TRI_ERROR_NO_ERROR) {
return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
}
return result;
}
// .................................................................................
// Releases the bricks placed on the neighbours of 'node'.
//
// For every level in [0, counter) the nearest-neighbour flag of the left and the
// right neighbour's column entry is switched from BRICKED back to NORMAL using
// CAS. All levels are attempted even when an earlier CAS has failed.
//
// Returns TRI_ERROR_NO_ERROR when every CAS succeeded, TRI_ERROR_INTERNAL
// otherwise.
// .................................................................................
static int UndoBricking (TRI_skiplistEx_node_t* node, int counter) {
  bool allReleased = true;
  int level;

  for (level = 0; level < counter; ++level) {
    TRI_skiplistEx_node_t* prevNode  = (TRI_skiplistEx_node_t*)(node->_column[level]._prev);
    TRI_skiplistEx_node_t* nextNode  = (TRI_skiplistEx_node_t*)(node->_column[level]._next);
    TRI_skiplistEx_nb_t*   prevEntry = &(prevNode->_column[level]);
    TRI_skiplistEx_nb_t*   nextEntry = &(nextNode->_column[level]);

    if (!TRI_CompareAndSwapIntegerUInt32 (&(prevEntry->_nbFlag),
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
      allReleased = false;
    }

    if (!TRI_CompareAndSwapIntegerUInt32 (&(nextEntry->_nbFlag),
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
      allReleased = false;
    }
  }

  if (allReleased) {
    return TRI_ERROR_NO_ERROR;
  }

  // should never occur - if it does eventually send signal to database to rebuild index
  LOG_ERROR("CAS failed for UndoBricking");
  assert(0);
  return TRI_ERROR_INTERNAL;
}
// .................................................................................
// Bricks the left and right neighbours of 'node' on every level of its tower.
//
// Bricking switches a neighbour's nearest-neighbour flag from NORMAL to BRICKED
// with CAS. '*counter' receives the number of levels on which BOTH neighbours
// were bricked. If the right neighbour of a level cannot be bricked, the brick
// already placed on the left neighbour of that level is removed again before
// the loop stops.
//
// On failure all fully bricked levels are released via UndoBricking.
//
// Returns:
//   TRI_ERROR_NO_ERROR - all levels bricked
//   TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE - bricking failed, but
//                        the roll back succeeded
//   result of UndoBricking - bricking failed and so did the roll back
// .................................................................................
static int DoBricking (TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t level = 0;
  bool bricked = true;
  int undoResult;

  *counter = 0;

  while (level < node->_colLength) {
    TRI_skiplistEx_node_t* prevNode  = (TRI_skiplistEx_node_t*)(node->_column[level]._prev);
    TRI_skiplistEx_node_t* nextNode  = (TRI_skiplistEx_node_t*)(node->_column[level]._next);
    TRI_skiplistEx_nb_t*   prevEntry = &(prevNode->_column[level]);
    TRI_skiplistEx_nb_t*   nextEntry = &(nextNode->_column[level]);

    // left neighbour
    bricked = TRI_CompareAndSwapIntegerUInt32 (&(prevEntry->_nbFlag),
                                               TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,
                                               TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG);
    if (!bricked) {
      break;
    }

    // right neighbour
    bricked = TRI_CompareAndSwapIntegerUInt32 (&(nextEntry->_nbFlag),
                                               TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,
                                               TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG);
    if (!bricked) {
      // this level is only half bricked: take the left brick away again
      if (!TRI_CompareAndSwapIntegerUInt32 (&(prevEntry->_nbFlag),
                                            TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                            TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
        // should never occur - if it does, then we need to eventually send signal to database to rebuild index
        abort();
      }
      break;
    }

    ++(*counter);
    ++level;
  }

  if (bricked) {
    return TRI_ERROR_NO_ERROR;
  }

  // release the levels which were completely bricked before the failure
  undoResult = UndoBricking (node, *counter);

  if (undoResult != TRI_ERROR_NO_ERROR) {
    LOG_ERROR("CAS failed for UndoBricking");
    assert(0);
    return undoResult;
  }

  return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
}
// .................................................................................
// Reverts the pointer changes made by DoJoinPointers for the levels
// [0, counter) of 'node'.
//
// On each of these levels the left neighbour's _next pointer is swapped from
// 'node' back to the right neighbour, and the right neighbour's _prev pointer is
// swapped from 'node' back to the left neighbour. Both swaps use CAS and all
// levels are attempted even when an earlier CAS has failed.
//
// Returns TRI_ERROR_NO_ERROR when every CAS succeeded, TRI_ERROR_INTERNAL
// otherwise.
// .................................................................................
static int UndoJoinPointers(TRI_skiplistEx_node_t* node, const int counter) {
  int j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  for (j = 0; j < counter; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    // the CAS is the left operand so that it is executed even if ok is already false
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), node, rightNode) && ok;
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), node, leftNode) && ok;
  }

  if (!ok) {
    // should never occur - if it does eventually send signal to database to rebuild index
    // (the message names this function so that the log points at the correct place)
    LOG_ERROR("CAS failed for UndoJoinPointers");
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}
// .................................................................................
// Links 'node' into the list on every level of its tower.
//
// On each level the left neighbour's _next pointer is swapped from the right
// neighbour to 'node', then the right neighbour's _prev pointer is swapped from
// the left neighbour to 'node'. Both swaps use CAS. '*counter' receives the
// number of levels on which BOTH swaps succeeded.
//
// If the second swap of a level fails, the first swap of that level is rolled
// back immediately. The levels counted in '*counter' are then rolled back with
// UndoJoinPointers.
//
// Returns TRI_ERROR_NO_ERROR when the node was linked on all levels and
// TRI_ERROR_INTERNAL when any CAS failed.
// .................................................................................
static int DoJoinPointers (TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  *counter = 0;

  for (j = 0; j < node->_colLength; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), rightNode, node);
    if (!ok) {
      break;
    }

    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), leftNode, node);
    if (!ok) {
      // Roll back the left pointer of this half-joined level. The outcome of the
      // roll back must NOT be stored in 'ok': a successful roll back does not
      // make the join successful, and 'ok' has to stay false so that the
      // remaining levels are undone and the failure is reported to the caller.
      if (!TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), node, rightNode)) {
        // should never occur - if it does eventually send signal to database to rebuild index
        abort();
      }
      break;
    }

    ++(*counter);
  }

  if (ok) {
    return TRI_ERROR_NO_ERROR;
  }

  // undo the completely joined levels; the function fails with
  // TRI_ERROR_INTERNAL whether or not the undo itself succeeds
  UndoJoinPointers(node, *counter);
  return TRI_ERROR_INTERNAL;
}
// .................................................................................
// Inserts 'newNode' between the neighbours recorded in its own column entries.
//
// The insertion proceeds in passes:
//   1. brick the left/right neighbours on every level (DoBricking)
//   2. verify that none of these neighbours has a tower flag other than NORMAL
//   3. redirect the neighbours' pointers to the new node (DoJoinPointers)
//   4. release the bricks again (UndoBricking)
//
// Returns TRI_ERROR_NO_ERROR on success. When pass 1 or pass 3 fails, the error
// of that pass is returned. When pass 2 finds a non-normal neighbour,
// TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE is returned after the
// bricks have been released.
// .................................................................................
static int JoinNewNodeCas (TRI_skiplistEx_node_t* newNode) {
  int numBricked = 0;
  int numJoined  = 0;
  int res;
  uint32_t level;

  // Pass 1: do bricking
  res = DoBricking(newNode, &numBricked);

  if (res != TRI_ERROR_NO_ERROR) {
    return res;
  }

  // Pass 2: Ensure that each tower node is not glassed. According to the original
  // author's note, glassing by the GC is NOT possible once pass 1 has succeeded.
  for (level = 0; level < newNode->_colLength; ++level) {
    TRI_skiplistEx_node_t* prevNode = (TRI_skiplistEx_node_t*)(newNode->_column[level]._prev);
    TRI_skiplistEx_node_t* nextNode = (TRI_skiplistEx_node_t*)(newNode->_column[level]._next);

    if ( (prevNode->_towerFlag == TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG) &&
         (nextNode->_towerFlag == TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG) ) {
      continue;
    }

    // at least one neighbour is not normal: release the bricks and give up
    res = UndoBricking (newNode, numBricked);

    if (res == TRI_ERROR_NO_ERROR) {
      return TRI_WARNING_ARANGO_INDEX_SKIPLIST_INSERT_CAS_FAILURE;
    }

    LOG_ERROR("failed unbricking");
    abort();
    return res;
  }

  // Pass 3: Join the new node by assigning pointers
  res = DoJoinPointers(newNode, &numJoined);

  if (res != TRI_ERROR_NO_ERROR) {
    UndoBricking (newNode, numBricked);
    return res;
  }

  // Now unbrick the left/right nodes so other processes can access them
  return UndoBricking (newNode, numBricked);
}
//////////////////////////////////////////////////////////////////////////////////
// removal static functions below
//////////////////////////////////////////////////////////////////////////////////
// .................................................................................
// Releases the bricks placed on the column entries of 'node' itself.
//
// For every level in [0, counter) the node's own nearest-neighbour flag is
// switched from BRICKED back to NORMAL using CAS. All levels are attempted even
// when an earlier CAS has failed.
//
// Returns TRI_ERROR_NO_ERROR when every CAS succeeded, TRI_ERROR_INTERNAL
// otherwise.
// .................................................................................
static int SelfUndoBricking(TRI_skiplistEx_node_t* node, const int counter) {
  bool allReleased = true;
  int level;

  for (level = 0; level < counter; ++level) {
    if (!TRI_CompareAndSwapIntegerUInt32 (&(node->_column[level]._nbFlag),
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG)) {
      allReleased = false;
    }
  }

  if (allReleased) {
    return TRI_ERROR_NO_ERROR;
  }

  // should never occur - if it does eventually send signal to database to rebuild index
  LOG_ERROR("CAS failed for UndoBricking");
  assert(0);
  return TRI_ERROR_INTERNAL;
}
// .................................................................................
// Bricks the column entries of 'node' itself on every level of its tower.
//
// The node's own nearest-neighbour flag is switched from NORMAL to BRICKED with
// CAS, level by level. '*counter' receives the number of levels bricked. At the
// first level which cannot be bricked, the levels bricked so far are released
// again via SelfUndoBricking.
//
// Returns:
//   TRI_ERROR_NO_ERROR - all levels bricked
//   TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE - bricking failed, but
//                        the roll back succeeded
//   result of SelfUndoBricking - bricking failed and so did the roll back
// .................................................................................
static int SelfBricking(TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t level;
  int undoResult;

  *counter = 0;

  for (level = 0; level < node->_colLength; ++level) {
    TRI_skiplistEx_nb_t* entry = &(node->_column[level]);

    if (!TRI_CompareAndSwapIntegerUInt32 (&(entry->_nbFlag),
                                          TRI_SKIPLIST_EX_NORMAL_NEAREST_NEIGHBOUR_FLAG,
                                          TRI_SKIPLIST_EX_BRICKED_NEAREST_NEIGHBOUR_FLAG)) {
      // could not brick this level: release everything bricked so far
      undoResult = SelfUndoBricking(node, *counter);

      if (undoResult == TRI_ERROR_NO_ERROR) {
        return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
      }

      LOG_ERROR("CAS failed for UndoBricking");
      assert(0);
      return undoResult;
    }

    ++(*counter);
  }

  return TRI_ERROR_NO_ERROR;
}
// .................................................................................
// Reverts the pointer changes made by DoUnjoinPointers for the levels
// [0, counter) of 'node', i.e. links the node back into the list.
//
// On each of these levels the left neighbour's _next pointer is swapped from the
// right neighbour back to 'node', and the right neighbour's _prev pointer is
// swapped from the left neighbour back to 'node'. Both swaps use CAS and all
// levels are attempted even when an earlier CAS has failed.
//
// Returns TRI_ERROR_NO_ERROR when every CAS succeeded, TRI_ERROR_INTERNAL
// otherwise.
// .................................................................................
static int UndoUnjoinPointers(TRI_skiplistEx_node_t* node, const int counter) {
  int j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  for (j = 0; j < counter; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    // the CAS is the left operand so that it is executed even if ok is already false
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), rightNode, node) && ok;
    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), leftNode, node) && ok;
  }

  if (!ok) {
    // should never occur - if it does eventually send signal to database to rebuild index
    // (the message names this function so that the log points at the correct place)
    LOG_ERROR("CAS failed for UndoUnjoinPointers");
    assert(0);
    return TRI_ERROR_INTERNAL;
  }

  return TRI_ERROR_NO_ERROR;
}
// .................................................................................
// Unlinks 'node' from the list on every level of its tower.
//
// On each level the left neighbour's _next pointer is swapped from 'node' to the
// right neighbour, then the right neighbour's _prev pointer is swapped from
// 'node' to the left neighbour. Both swaps use CAS. The node's own _prev/_next
// pointers are not modified. '*counter' receives the number of levels on which
// BOTH swaps succeeded.
//
// If the second swap of a level fails, the first swap of that level is rolled
// back immediately. The levels counted in '*counter' are then rolled back with
// UndoUnjoinPointers.
//
// Returns TRI_ERROR_NO_ERROR when the node was unlinked on all levels and
// TRI_ERROR_INTERNAL when any CAS failed.
// .................................................................................
static int DoUnjoinPointers (TRI_skiplistEx_node_t* node, int* counter) {
  uint32_t j;
  TRI_skiplistEx_nb_t* leftNN;
  TRI_skiplistEx_nb_t* rightNN;
  TRI_skiplistEx_node_t* leftNode;
  TRI_skiplistEx_node_t* rightNode;
  bool ok = true;

  *counter = 0;

  for (j = 0; j < node->_colLength; ++j) {
    leftNode  = (TRI_skiplistEx_node_t*)(node->_column[j]._prev);
    rightNode = (TRI_skiplistEx_node_t*)(node->_column[j]._next);
    leftNN    = &(leftNode->_column[j]);
    rightNN   = &(rightNode->_column[j]);

    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), node, rightNode);
    if (!ok) {
      break;
    }

    ok = TRI_CompareAndSwapPointer((void* volatile*)(&(rightNN->_prev)), node, leftNode);
    if (!ok) {
      // Roll back the left pointer of this half-unjoined level. The outcome of
      // the roll back must NOT be stored in 'ok': a successful roll back does not
      // make the unjoin successful, and 'ok' has to stay false so that the
      // remaining levels are undone and the failure is reported to the caller.
      if (!TRI_CompareAndSwapPointer((void* volatile*)(&(leftNN->_next)), rightNode, node)) {
        // should never occur - if it does eventually send signal to database to rebuild index
        abort();
      }
      break;
    }

    ++(*counter);
  }

  if (ok) {
    return TRI_ERROR_NO_ERROR;
  }

  // undo the completely unjoined levels; the function fails with
  // TRI_ERROR_INTERNAL whether or not the undo itself succeeds
  UndoUnjoinPointers(node, *counter);
  return TRI_ERROR_INTERNAL;
}
// .................................................................................
// Removes 'oldNode' from the list.
//
// The removal proceeds in passes:
//   1. brick the node's own column entries (SelfBricking)
//   2. switch the node's tower flag from NORMAL to GLASS
//   3. release the bricks of pass 1 (SelfUndoBricking)
//   4. brick the left/right neighbours on every level (DoBricking)
//   5. redirect the neighbours' pointers around the node (DoUnjoinPointers)
//   6. release the bricks of pass 4 (UndoBricking)
//
// When pass 3, 4 or 5 fails, the tower flag is switched back from GLASS to
// NORMAL before returning. Roll backs which fail themselves end in abort().
//
// Returns TRI_ERROR_NO_ERROR on success,
// TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE when a CAS did not succeed
// but the node could be restored, or the error reported by the failing pass.
// .................................................................................
static int UnJoinOldNodeCas (TRI_skiplistEx_node_t* oldNode) {
  int numSelfBricked = 0;
  int numBricked     = 0;
  int numUnjoined    = 0;
  int res;
  int undoRes;
  bool swapped;

  // Pass 1: brick the nearest neighbours on the node itself.
  res = SelfBricking(oldNode, &numSelfBricked);

  if (res != TRI_ERROR_NO_ERROR) {
    return res;
  }

  // Pass 2: make the node glass
  swapped = TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                             TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG,
                                             TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG);

  if (!swapped) {
    res = SelfUndoBricking(oldNode, numSelfBricked);

    if (res == TRI_ERROR_NO_ERROR) {
      return TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
    }

    LOG_ERROR("UnJoinOldNodeCas failed ");
    abort();
    return TRI_ERROR_INTERNAL;
  }

  // Pass 3: unbrick each nearest neighbour entry of the node itself
  res = SelfUndoBricking(oldNode, numSelfBricked);

  if (res != TRI_ERROR_NO_ERROR) {
    // undo the glassing of the node
    swapped = TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                               TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,
                                               TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG);

    if (swapped) {
      return res;
    }

    LOG_ERROR("UnJoinOldNodeCas failed");
    abort();
    return TRI_ERROR_INTERNAL;
  }

  // Pass 4: brick each of its nearest neighbours
  res = DoBricking(oldNode, &numBricked);

  if (res != TRI_ERROR_NO_ERROR) {
    // undo the glassing of the node
    swapped = TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                               TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,
                                               TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG);

    if (!swapped) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }

    // anything but an internal error is reported as a removal CAS failure
    return (res == TRI_ERROR_INTERNAL)
             ? res
             : TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
  }

  // Pass 5: unjoin the old node from the list by assigning pointers
  res = DoUnjoinPointers(oldNode, &numUnjoined);

  if (res != TRI_ERROR_NO_ERROR) {
    // first release the neighbours ...
    undoRes = UndoBricking(oldNode, numBricked);

    if (undoRes != TRI_ERROR_NO_ERROR) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }

    // ... then undo the glassing of the node
    swapped = TRI_CompareAndSwapIntegerUInt32 (&(oldNode->_towerFlag),
                                               TRI_SKIPLIST_EX_GLASS_TOWER_NODE_FLAG,
                                               TRI_SKIPLIST_EX_NORMAL_TOWER_NODE_FLAG);

    if (!swapped) {
      LOG_ERROR("UnJoinOldNodeCas failed");
      abort();
      return TRI_ERROR_INTERNAL;
    }

    // anything but an internal error is reported as a removal CAS failure
    return (res == TRI_ERROR_INTERNAL)
             ? res
             : TRI_WARNING_ARANGO_INDEX_SKIPLIST_REMOVE_CAS_FAILURE;
  }

  // Now unbrick the left/right nodes so other processes can access them
  return UndoBricking (oldNode, numBricked);
}
////////////////////////////////////////////////////////////////////////////////
/// @}
////////////////////////////////////////////////////////////////////////////////
#endif
// Local Variables:
// mode: outline-minor
// outline-regexp: "^\\(/// @brief\\|/// {@inheritDoc}\\|/// @addtogroup\\|// --SECTION--\\|/// @\\}\\)"
// End: