/*
Licensed to Diennea S.r.l. under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. Diennea S.r.l. licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
*/
package herddb.index.blink;
import herddb.core.Page;
import herddb.core.Page.Metadata;
import herddb.core.PageReplacementPolicy;
import herddb.index.blink.BLinkMetadata.BLinkNodeMetadata;
import herddb.utils.BooleanHolder;
import herddb.utils.Holder;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import java.util.Spliterators;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.LongAdder;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
* Java implementation of a b-link tree derived from Vladimir Lanin and Dennis
* Shasha's work: "A symmetric concurrent b-tree algorithm"
*
*
* This implementation adds variable sized nodes, range scans, truncation and a
* way to store data pages.
*
*
*
* For original work see:
*
*
* LANIN, Vladimir; SHASHA, Dennis. A symmetric concurrent b-tree algorithm.
* In: Proceedings of 1986 ACM Fall joint computer conference.
* IEEE Computer Society Press, 1986. p. 380-389
*
*
* @author diego.salvi
*/
public class BLink<K extends Comparable<K>, V> implements AutoCloseable, Page.Owner {
/**
* Debug flag to remove some logs and checks during normal operations
*/
private static final boolean DEBUG = false;
private static final Logger LOGGER = Logger.getLogger(BLink.class.getName());
/**
* Signals that a node size is unknown and needs to be recalculated (if a node
* really has size 0 the recalculation will trigger too, but that isn't a
* problem: there is nothing to recalculate)
*/
static final long UNKNOWN_SIZE = 0L;
// type
// locktype = (readlock, writelock);
// nodeptr = "node;
// height = 1 .. maxint;
// task = (add, remove);
private static final int READ_LOCK = 1;
private static final int WRITE_LOCK = 2;
private static final int ADD_TASK = 1;
private static final int REMOVE_TASK = 2;
/**
* Minimum number of children to keep a node as root.
*
* In the original algorithm it was children > 3, but when testing this facility
* too much time was spent in move_right at higher levels. It's better to keep
* it at a minimum of 2 children to reduce move_right invocations.
*
*/
private static final int CRITIC_MIN_CHILDREN = 2;
/**
* Size value for {@link #constantKeySize} {@link #constantValueSize} and {@link #constantFullSize}
* to signal that size isn't constant.
*
* @see SizeEvaluator#constantKeySize()
* @see SizeEvaluator#constantValueSize()
*/
private static final long VARIABLE_SIZE = -1L;
private final Anchor<K, V> anchor;
private final ConcurrentMap<Long, Node<K, V>> nodes;
private final AtomicLong nextID;
private final K positiveInfinity;
private final long maxSize;
private final long minSize;
private final long constantKeySize;
private final long constantValueSize;
private final long constantFullSize;
private final SizeEvaluator<K, V> evaluator;
private final BLinkIndexDataStorage<K, V> storage;
private final PageReplacementPolicy policy;
private final AtomicBoolean closed;
private final LongAdder size;
private final LongAdder usedMemory;
/**
* Support structure to evaluate the memory byte size occupancy of keys and
* values
*
* @author diego.salvi
*/
public interface SizeEvaluator<X, Y> {
/**
* Evaluate the key size only
*/
long evaluateKey(X key);
/**
* Evaluate the value size only
*/
long evaluateValue(Y value);
/**
* Evaluate both key and value size
*/
long evaluateAll(X key, Y value);
/**
* Check whether handled keys have a constant byte size or whether it changes from key to key.
*
* @return {@code true} if key size is constant, {@code false} otherwise
*/
default boolean isKeySizeConstant() {
return false;
}
/**
* Returns the constant key size if the key size doesn't change.
*
* @return constant key size
* @throws UnsupportedOperationException if key size isn't constant
* @see #isKeySizeConstant()
*/
default long constantKeySize() throws UnsupportedOperationException {
throw new UnsupportedOperationException("Method constantKeySize not supported");
}
/**
* Check whether handled values have a constant byte size or whether it changes from value to value.
*
* @return {@code true} if value size is constant, {@code false} otherwise
*/
default boolean isValueSizeConstant() {
return false;
}
/**
* Returns the constant value size if the value size doesn't change.
*
* @return constant value size
* @throws UnsupportedOperationException if value size isn't constant
* @see #isValueSizeConstant()
*/
default long constantValueSize() throws UnsupportedOperationException {
throw new UnsupportedOperationException("Method constantValueSize not supported");
}
/**
* Returns a key which is greater than all other keys.
* The code checks identity using '==', so this value must be a singleton.
*
* @return a key which is greater than every other key
*/
X getPosiviveInfinityKey();
}
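/*
* A minimal sketch (not part of HerdDB) of a SizeEvaluator for Long keys and
* Long values, both with a constant 8-byte size. The hypothetical
* LongLongEvaluator and its POSITIVE_INFINITY field are illustrative only:
* the sentinel is kept in a static field so the '==' singleton check required
* by getPosiviveInfinityKey() holds (here Long.MAX_VALUE is reserved as the
* sentinel and can no longer be used as a regular key).
*
* public static final class LongLongEvaluator implements SizeEvaluator<Long, Long> {
*
*     // Boxed once and reused: '==' comparisons match only this very instance
*     private static final Long POSITIVE_INFINITY = Long.MAX_VALUE;
*
*     @Override
*     public long evaluateKey(Long key) {
*         return Long.BYTES;
*     }
*
*     @Override
*     public long evaluateValue(Long value) {
*         return Long.BYTES;
*     }
*
*     @Override
*     public long evaluateAll(Long key, Long value) {
*         return 2 * Long.BYTES;
*     }
*
*     @Override
*     public boolean isKeySizeConstant() {
*         return true;
*     }
*
*     @Override
*     public long constantKeySize() {
*         return Long.BYTES;
*     }
*
*     @Override
*     public boolean isValueSizeConstant() {
*         return true;
*     }
*
*     @Override
*     public long constantValueSize() {
*         return Long.BYTES;
*     }
*
*     @Override
*     public Long getPosiviveInfinityKey() {
*         return POSITIVE_INFINITY;
*     }
* }
*/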
public BLink(
long maxSize, SizeEvaluator<K, V> evaluator,
PageReplacementPolicy policy, BLinkIndexDataStorage<K, V> storage
) {
this.positiveInfinity = evaluator.getPosiviveInfinityKey();
if (this.positiveInfinity != evaluator.getPosiviveInfinityKey()) {
throw new IllegalStateException("getPosiviveInfinityKey must always return the same value");
}
if (evaluator.isKeySizeConstant()) {
constantKeySize = evaluator.constantKeySize();
if (constantKeySize <= 0) {
throw new IllegalArgumentException(
"Invalid constant key size " + constantKeySize + ". It must be greater than 0");
}
} else {
constantKeySize = -1L;
}
if (evaluator.isValueSizeConstant()) {
constantValueSize = evaluator.constantValueSize();
if (constantValueSize <= 0) {
throw new IllegalArgumentException(
"Invalid constant value size " + constantValueSize + ". It must be greater than 0");
}
} else {
constantValueSize = -1L;
}
if (evaluator.isKeySizeConstant() && evaluator.isValueSizeConstant()) {
constantFullSize = constantKeySize + constantValueSize;
} else {
constantFullSize = -1L;
}
this.maxSize = maxSize;
this.minSize = maxSize / 2;
this.evaluator = evaluator;
this.storage = storage;
this.policy = policy;
this.nextID = new AtomicLong(1L);
this.closed = new AtomicBoolean(false);
this.size = new LongAdder();
this.usedMemory = new LongAdder();
this.nodes = new ConcurrentHashMap<>();
final Node<K, V> root = allocate_node(true);
this.anchor = new Anchor<>(root);
/* Nothing to load locked now (we are creating a new tree) */
final Metadata meta = policy.add(root);
if (meta != null) {
meta.owner.unload(meta.pageId);
}
}
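/*
* Construction and basic usage sketch (illustrative only: the policy and
* storage instances are assumed to be available, the hypothetical
* LongLongEvaluator is the one sketched above, and 64 KiB is just an arbitrary
* example byte budget for maxSize):
*
* SizeEvaluator<Long, Long> evaluator = new LongLongEvaluator();
* BLink<Long, Long> blink = new BLink<>(64 * 1024L, evaluator, policy, storage);
*
* blink.insert(1L, 100L);                  // returns null: no previous mapping
* Long previous = blink.insert(1L, 200L); // returns 100L, the replaced value
* Long found = blink.search(1L);          // 200L
* Long removed = blink.delete(1L);        // 200L; a further search returns null
*/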
public BLink(
long maxSize, SizeEvaluator<K, V> evaluator,
PageReplacementPolicy policy, BLinkIndexDataStorage<K, V> storage,
BLinkMetadata<K> metadata
) {
this.positiveInfinity = evaluator.getPosiviveInfinityKey();
if (this.positiveInfinity != evaluator.getPosiviveInfinityKey()) {
throw new IllegalStateException("getPosiviveInfinityKey must always return the same value");
}
if (evaluator.isKeySizeConstant()) {
constantKeySize = evaluator.constantKeySize();
if (constantKeySize <= 0) {
throw new IllegalArgumentException(
"Invalid constant key size " + constantKeySize + ". It must be greater than 0");
}
} else {
constantKeySize = -1L;
}
if (evaluator.isValueSizeConstant()) {
constantValueSize = evaluator.constantValueSize();
if (constantValueSize <= 0) {
throw new IllegalArgumentException(
"Invalid constant data size " + constantValueSize + ". It must be greater than 0");
}
} else {
constantValueSize = -1L;
}
if (evaluator.isKeySizeConstant() && evaluator.isValueSizeConstant()) {
constantFullSize = constantKeySize + constantValueSize;
} else {
constantFullSize = -1L;
}
this.maxSize = maxSize;
this.minSize = maxSize / 2;
this.evaluator = evaluator;
this.storage = storage;
this.policy = policy;
this.nextID = new AtomicLong(metadata.nextID);
this.closed = new AtomicBoolean(false);
this.size = new LongAdder();
size.add(metadata.values);
this.usedMemory = new LongAdder();
this.nodes = new ConcurrentHashMap<>();
convertNodeMetadata(metadata.nodes, nodes);
this.anchor = new Anchor<>(
nodes.get(metadata.fast), metadata.fastheight,
nodes.get(metadata.top), metadata.topheight,
nodes.get(metadata.first));
}
/**
* Converts the given node metadata into real nodes and puts them into the given map
*/
private void convertNodeMetadata(List<BLinkNodeMetadata<K>> nodes, Map<Long, Node<K, V>> map) {
/* First loop: create every node without links (no rightlink nor outlink) */
for (BLinkNodeMetadata<K> metadata : nodes) {
map.put(metadata.id, new Node<>(metadata, this));
}
/* Second loop: add missing links (rightlink or outlink) */
for (BLinkNodeMetadata<K> metadata : nodes) {
final Node<K, V> node = map.get(metadata.id);
if (metadata.rightlink != BLinkNodeMetadata.NO_LINK) {
node.rightlink = map.get(metadata.rightlink);
}
if (metadata.outlink != BLinkNodeMetadata.NO_LINK) {
node.outlink = map.get(metadata.outlink);
}
}
}
/**
* Fully close the facility
*/
@Override
public void close() {
if (closed.compareAndSet(false, true)) {
final Iterator<Node<K, V>> iterator = nodes.values().iterator();
while (iterator.hasNext()) {
Node<K, V> node = iterator.next();
/* If the node was unloaded, remove it from the policy */
if (node.unload(false, false)) {
policy.remove(node);
}
/* linked nodes dereferencing */
node.outlink = null;
node.rightlink = null;
}
/* Anchor nodes dereferencing */
anchor.fast = null;
anchor.top = null;
nodes.clear();
size.reset();
}
}
/**
* Truncates all tree data. Invokers must ensure that this method is not
* invoked concurrently.
*/
public void truncate() {
final Iterator<Node<K, V>> iterator = nodes.values().iterator();
while (iterator.hasNext()) {
Node<K, V> node = iterator.next();
/* If the node was unloaded, remove it from the policy */
if (node.unload(false, false)) {
policy.remove(node);
}
/* linked nodes dereferencing */
node.outlink = null;
node.rightlink = null;
}
nodes.clear();
size.reset();
final Node<K, V> root = allocate_node(true);
this.anchor.reset(root);
/* Nothing to load locked now */
final Metadata meta = policy.add(root);
if (meta != null) {
meta.owner.unload(meta.pageId);
}
}
/**
* Returns the number of key/value pairs stored into this tree.
*
* @return current size of the tree
*/
public long size() {
return size.sum();
}
/**
* Returns the memory actually used in bytes (only data currently loaded is accounted for).
*
* @return current memory occupancy of the tree
*/
public long getUsedMemory() {
return usedMemory.sum();
}
/**
* Returns the current node count.
*
* @return current tree node count
*/
public int nodes() {
return nodes.size();
}
/* ******************** */
/* *** PAGE LOADING *** */
/* ******************** */
@Override
public void unload(long pageId) {
nodes.get(pageId).unload(true, false);
}
/**
* Handles page unloading, using a special try & unload if the given metadata
* represents a page owned by the current BLink tree.
*
* @param unload metadata to unload
* @return {@code true} if unloaded
*/
private boolean attemptUnload(Metadata unload) {
if (unload.owner == this) {
/*
* Page owned by current BLink tree, use try -> unload to avoid deadlock on
* loadLock acquisition. If not unloaded here the invoking code will have to unload
* the page later, after releasing its load lock
*/
/* Attempt to unload metadata if a lock can be acquired */
return nodes.get(unload.pageId).unload(true, true);
} else {
/* Directly unload metadata */
unload.owner.unload(unload.pageId);
return true;
}
}
/**
* Executes a complete tree checkpoint.
*
* The invoking method must ensure that there isn't any concurrent update;
* read operations can be executed concurrently with the checkpoint.
*
*
* @return tree checkpoint metadata
* @throws IOException
*/
public BLinkMetadata<K> checkpoint() throws IOException {
final List<BLinkNodeMetadata<K>> metadatas = new LinkedList<>();
for (Node<K, V> node : nodes.values()) {
/*
* A lock shouldn't really be needed because checkpoint must be invoked when no thread is modifying
* the index, but to ensure that the latest value of the "empty" flag is read we must read it from RAM. Any
* memory barrier operation would suffice, but taking a read lock on the node is "cleaner"
*/
lock(node, READ_LOCK);
try {
/*
* Do not checkpoint empty nodes. They aren't needed at all and, because no modification
* operations are occurring, currently seen empty nodes aren't referenced by anyone.
*/
if (node.empty()) {
/* If the node existed in the policy, remove it and unload it */
if (policy.remove(node)) {
node.unload(false, false);
}
/*
* Remove the node from the nodes map: it is a safe operation, if a node is empty it will not be
* used anymore and it has just an outlink from a possibly live node. If there is a concurrent
* traversal it will continue anyway (nodes are reachable through links). The nodes map is needed only
* for page unload and for close and truncate operations (the page was just unloaded, and close and
* truncate will deal with the remaining "live" nodes).
*/
nodes.remove(node.pageId);
continue;
}
} finally {
unlock(node, READ_LOCK);
}
BLinkNodeMetadata<K> metadata = node.checkpoint();
metadatas.add(metadata);
if (LOGGER.isLoggable(Level.FINER)) {
LOGGER.log(Level.FINER, "node {0} has {1} keys at checkpoint", new Object[]{metadata.id, metadata.keys});
}
}
lock_anchor(READ_LOCK);
long fast = anchor.fast.pageId;
int fastheight = anchor.fastheight;
long top = anchor.top.pageId;
int topheight = anchor.topheight;
long first = anchor.first.pageId;
unlock_anchor(READ_LOCK);
return new BLinkMetadata<>(nextID.get(), fast, fastheight, top, topheight, first, size.sum(), metadatas);
}
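/*
* Checkpoint/restore sketch: the returned metadata feeds the restoring
* constructor above. The caller must guarantee that no updates run
* concurrently with the checkpoint (names are illustrative only):
*
* BLinkMetadata<Long> metadata = blink.checkpoint();
* // ... persist metadata through the storage layer, restart ...
* BLink<Long, Long> restored =
*         new BLink<>(64 * 1024L, evaluator, policy, storage, metadata);
*/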
/* ******************** */
/* *** TREE METHODS *** */
/* ******************** */
// function search(v: value); boolean;
// var
// n: nodeptr;
// descent: stack;
// begin
// n := locate-leaf(v, readlock, descent); {v € coverset(n), n read-locked}
// search := check-key(v, n); {decisive}
// unlock(n, readlock)
// end;
public V search(K v) {
Node<K, V> n;
@SuppressWarnings("unchecked")
Deque<ResultCouple<K, V>> descent = DummyDeque.INSTANCE;
try {
n = locate_leaf(v, READ_LOCK, descent); // v € coverset(n), n read-locked
} catch (IOException ex) {
throw new UncheckedIOException("failed to search for " + v, ex);
}
try {
V search = n.check_key(v); // decisive;
return search;
} catch (IOException ex) {
throw new UncheckedIOException("failed to search for " + v, ex);
} finally {
unlock(n, READ_LOCK);
}
}
/**
* Scans a range of keys. Supports both {@code from} and {@code to} being null.
*
* @param from inclusive lower bound (if not null)
* @param to exclusive upper bound (if not null)
* @return a stream of the key/value pairs in the given range
*/
public Stream<Entry<K, V>> scan(K from, K to) {
Node<K, V> n;
@SuppressWarnings("unchecked")
Deque<ResultCouple<K, V>> descent = DummyDeque.INSTANCE;
if (from == null) {
lock_anchor(READ_LOCK);
n = anchor.first;
unlock_anchor(READ_LOCK);
/*
* We have to lock the first node, the scan iterator requires a read-locked node (as produced by
* locate_leaf too)
*/
lock(n, READ_LOCK);
} else {
try {
n = locate_leaf(from, READ_LOCK, descent); // v € coverset(n), n read-locked
} catch (IOException ex) {
throw new UncheckedIOException("failed to scan from " + from + " to " + to, ex);
}
}
return StreamSupport.stream(
Spliterators.spliteratorUnknownSize(
new ScanIterator(n, from, from != null, to, false),
/* No characteristics */ 0),
/* No parallel */ false);
}
public Stream<Entry<K, V>> scan(K from, K to, boolean toInclusive) {
Node<K, V> n;
@SuppressWarnings("unchecked")
Deque<ResultCouple<K, V>> descent = DummyDeque.INSTANCE;
if (from == null) {
lock_anchor(READ_LOCK);
n = anchor.first;
unlock_anchor(READ_LOCK);
/*
* We have to lock the first node, the scan iterator requires a read-locked node (as produced by
* locate_leaf too)
*/
lock(n, READ_LOCK);
} else {
try {
n = locate_leaf(from, READ_LOCK, descent); // v € coverset(n), n read-locked
} catch (IOException ex) {
throw new UncheckedIOException("failed to scan from " + from + " to " + to, ex);
}
}
return StreamSupport.stream(
Spliterators.spliteratorUnknownSize(
new ScanIterator(n, from, from != null, to, toInclusive),
/* No characteristics */ 0),
/* No parallel */ false);
}
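/*
* Scan usage sketch: a null 'from' starts at the leftmost leaf and 'to' is
* exclusive unless the toInclusive overload is used ('process' is a
* hypothetical consumer, not part of this class):
*
* blink.scan(10L, 20L)                // keys in [10, 20)
*      .forEach(e -> process(e.getKey(), e.getValue()));
*
* blink.scan(null, 20L, true)         // keys up to 20, inclusive
*      .forEach(e -> process(e.getKey(), e.getValue()));
*/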
// function insert(v: value): boolean;
// var
// n: nodeptr;
// descent: stack;
// begin
// n := locate-leaf(v, writelock, descent); {v € coverset(n), n write-locked}
// insert := add-key(v, n); {decisive}
// normalize(n, descent, 1);
// unlock(n, writelock)
// end;
public boolean insert(K v, V e, V expected) {
Node<K, V> n;
Deque<ResultCouple<K, V>> descent = new LinkedList<>();
Queue<CriticJob> maintenance = new LinkedList<>();
try {
n = locate_leaf(v, WRITE_LOCK, descent); // v € coverset(n), n write-locked
} catch (IOException ex) {
throw new UncheckedIOException("failed to insert " + v, ex);
}
boolean added;
try {
added = n.add_key_if(v, e, expected); // decisive
normalize(n, descent, 1, maintenance);
} catch (IOException ex) {
throw new UncheckedIOException("failed to insert " + v, ex);
} finally {
unlock(n, WRITE_LOCK);
}
if (added && expected == null) {
size.increment();
}
handleMainenance(maintenance);
return added;
}
public V insert(K v, V e) {
Node<K, V> n;
Deque<ResultCouple<K, V>> descent = new LinkedList<>();
Queue<CriticJob> maintenance = new LinkedList<>();
try {
n = locate_leaf(v, WRITE_LOCK, descent); // v € coverset(n), n write-locked
} catch (IOException ex) {
throw new UncheckedIOException("failed to insert " + v, ex);
}
V replaced;
try {
replaced = n.add_key(v, e); // decisive
normalize(n, descent, 1, maintenance);
} catch (IOException ex) {
throw new UncheckedIOException("failed to insert " + v, ex);
} finally {
unlock(n, WRITE_LOCK);
}
if (replaced == null) {
size.increment();
}
handleMainenance(maintenance);
return replaced;
}
// function delete(v: value): boolean;
// var
// n: nodeptr;
// descent: stack;
// begin
// n := locate-leaf(v, writelock, descent); {v € coverset(n) , n write-locked}
// delete := remove-key(v, n); {decisive}
// normalize(n, descent, 1); unlock(n, writelock)
// end;
public V delete(K v) {
Node<K, V> n;
Deque<ResultCouple<K, V>> descent = new LinkedList<>();
Queue<CriticJob> maintenance = new LinkedList<>();
try {
n = locate_leaf(v, WRITE_LOCK, descent); // v € coverset(n), n write-locked
} catch (IOException ex) {
throw new UncheckedIOException("failed to delete " + v, ex);
}
V delete;
try {
delete = n.remove_key(v); // decisive
normalize(n, descent, 1, maintenance);
} catch (IOException ex) {
throw new UncheckedIOException("failed to delete " + v, ex);
} finally {
unlock(n, WRITE_LOCK);
}
if (delete != null) {
size.decrement();
}
handleMainenance(maintenance);
return delete;
}
// function locate-leaf(v: value; lastlock: locktype; var descent: stack): nodeptr;
// { locate-leaf descends from the anchor to the leaf whose coverset
// includes v, places a lock of kind specified in lastlock on that leaf,
// and returns a pointer to it. It records its path in the stack descent. }
// var
// n,m: nodeptr;
// h,enterheight: height;
// ubleftsep: value;
// { ubleftsep stands for "upper bound on the leftsep of the current node".
// This value is recorded for each node on the descent stack so that an ascending process can tell if it's too far to the right. }
// begin
// lock-anchor(readlock);
// n := anchor.fast; enterheight := anchor.fastheight; ubleftsep := +inf;
// unlock-anchor(readlock);
// set-to-empty(descent);
// for h := enterheight downto 2 do begin { v > leftsep (n)}
// move-right(v, n, ubleftsep, readlock);{ v € coverset(n) }
// push(n, ubleftsep, descent);
// (m, ubleftsep) := find(v, n, ubleftsep); { v > leftsep (m) }
// unlock(n, readlock); n := m
// end;
// move-right(v, n, ubleftsep, lastlock); {v € coverset(n) }
// locate-leaf := n
// end;
/**
* locate-leaf descends from the anchor to the leaf whose coverset includes
* v, places a lock of kind specified in lastlock on that leaf, and returns
* a pointer to it. It records its path in the stack descent.
*
* @param v
* @param lastlock
* @param descent
* @return
*/
private Node<K, V> locate_leaf(K v, int lastlock, Deque<ResultCouple<K, V>> descent) throws IOException {
Node<K, V> n, m;
int h, enterheight;
/*
* ubleftsep stands for "upper bound on the leftsep of the current node". This value is recorded for
* each node on the descent stack so that an ascending process can tell if it's too far to the right.
*/
K ubleftsep;
lock_anchor(READ_LOCK);
n = anchor.fast;
enterheight = anchor.fastheight;
ubleftsep = positiveInfinity;
unlock_anchor(READ_LOCK);
descent.clear();
for (h = enterheight; h > 1; --h) { // v > leftsep (n)
ResultCouple<K, V> move_right = move_right(v, n, ubleftsep, READ_LOCK); // v € coverset(n)
n = move_right.node;
ubleftsep = move_right.ubleftsep;
descent.push(move_right);
try {
final ResultCouple<K, V> find = n.find(v, ubleftsep); // v > leftsep (m)
m = find.node;
ubleftsep = find.ubleftsep;
} catch (IOException e) {
throw new IOException("failed to find key " + v + " on leaf " + n.pageId, e);
} finally {
unlock(n, READ_LOCK);
}
n = m;
}
ResultCouple<K, V> move_right = move_right(v, n, ubleftsep, lastlock); // v € coverset(n)
n = move_right.node;
ubleftsep = move_right.ubleftsep;
return n;
}
// procedure move-right(v: value; var n: nodeptr; var ubleftsep: value; rw: locktype);
// { move-right scans along a level starting with node n until it comes to a node into whose coverset v falls (trivially, n itself).
// It assumes that no lock is held on n initially, and leaves a lock
// of the kind specified in rw on the final node. }
// var
// m: nodeptr;
// begin {assume v > leftsep (n)}
// lock(n, rw);
// while empty(n) or (rightsep(n) < v) do begin { v > leftsep (n) }
// if empty(n) then m := outlink(n) { v > leftsep (n) = leftsep (m) }
// else begin
// m := rightlink(n); {v > rightsep(n) = leftsep(m) }
// ubleftsep := rightsep(n);
// end;
// unlock(n, rw);
// lock(m, rw)
// n := m;
// end;
// end;
/**
* move-right scans along a level starting with node n until it comes to a
* node into whose coverset v falls (trivially, n itself). It assumes that
* no lock is held on n initially, and leaves a lock of the kind specified
* in rw on the final node.
*
* @param v
* @param n
* @param ubleftsep
* @param rw
* @return
*/
private ResultCouple<K, V> move_right(K v, Node<K, V> n, K ubleftsep, int rw) {
Node<K, V> m;
// assume v > leftsep (n)
lock(n, rw);
while (n.empty() || n.rightsep().compareTo(v) < 0) { // v > leftsep (n)
if (n.empty()) {
m = n.outlink(); // v > leftsep (n) = leftsep (m)
} else {
m = n.rightlink(); // v > rightsep(n) = leftsep(m)
ubleftsep = n.rightsep();
}
unlock(n, rw);
lock(m, rw);
n = m;
}
return new ResultCouple<>(n, ubleftsep);
}
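/*
* Worked illustration (hypothetical state): suppose node A covers keys up to
* rightsep(A) = 10 and rightlinks to B, which covers (10, 20]. Then
* move_right(15, A, +inf, READ_LOCK) locks A, sees rightsep(A) = 10 < 15,
* unlocks A, locks B and returns (B, 10): 15 falls in B's coverset and 10 is
* the new upper bound on B's leftsep. Empty nodes are instead skipped through
* their outlink without updating ubleftsep, since an outlink preserves leftsep.
*/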
// procedure normalize(n: nodeptr; descent: stack; atheight: height);
// { normalize makes sure that node n is not too crowded
// or sparse by performing a split or merge as needed.
// A split may be necessary after a merge, n is assumed to be write-locked.
// descent and atheight are needed to ascend to the level above to complete a split or merge. }
// var
// sib, newsib: nodeptr;
// sep, newsep: value;
// begin
// if too-sparse(n) and (rightlink(n) <> nil) then begin
// sib := rightlink(n);
// lock(sib, writelock);
// sep := half-merge(n, sib);
// unlock(sib, writelock);
// spawn(ascend(remove, sep, sib, atheight+1, descent))
// end;
// if too-crowded(n) then begin
// allocate-node(newsib);
// newsep := half-split(n, newsib);
// spawn(ascend(add, newsep, newsib, atheight+1, descent))
// end
// end;
/**
* normalize makes sure that node n is not too crowded or sparse by
* performing a split or merge as needed. A split may be necessary after a
* merge, n is assumed to be write-locked. descent and at height are needed
* to ascend to the level above to complete a split or merge.
*
* @param n
* @param descent
*/
private void normalize(Node<K, V> n, Deque<ResultCouple<K, V>> descent, int atheight, Queue<CriticJob> maintenance) throws IOException {
Node<K, V> sib, newsib;
K sep, newsep;
if (n.too_sparse() && (n.rightlink() != null)) {
sib = n.rightlink();
lock(sib, WRITE_LOCK);
try {
sep = n.half_merge(sib);
} finally {
unlock(sib, WRITE_LOCK);
}
spawn(() -> ascend(REMOVE_TASK, sep, sib, atheight + 1, clone(descent), maintenance), maintenance);
/* Having merged a node we could potentially lower the root or shrink it too much to be effective; run a
* critic check */
/*
* TODO: improve the critic execution heuristic. The current heuristic is very conservative and the
* critic could be run far fewer times! (run the critic every x merges or based on node size?)
*/
spawn(() -> run_critic(), maintenance);
}
if (n.too_crowded()) {
newsib = allocate_node(n.leaf);
newsep = n.half_split(newsib);
/* Nothing to load locked now */
final Metadata meta = policy.add(newsib);
if (meta != null) {
meta.owner.unload(meta.pageId);
}
spawn(() -> ascend(ADD_TASK, newsep, newsib, atheight + 1, clone(descent), maintenance), maintenance);
}
}
// procedure ascend(t: task; sep: value; child: nodeptr; toheight: height; descent: stack);
// { adds or removes separator sep and downlink to child at height toheight,
// using the descent stack to ascend to it. }
// var
// n: nodeptr;
// ubleftsep: value;
// begin n := locate-internal(sep, toheight, descent)
// while not add-or-remove-link(task, sep, child, n, toheight, descent) do begin
// { wait and try again, very rare }
// unlock(n, writelock);
// delay; { sep > leftsep(n) }
// move-right(sep, n, ubleftsep, writelock) { sep € coverset(n) }
// end;
// normalize(n, descent, toheight);
// unlock(n, writelock)
// end;
/**
* adds or removes separator sep and downlink to child at height toheight,
* using the descent stack to ascend to it.
*
* @param task
* @param sep
* @param child
* @param toheight
* @param descent
*/
private void ascend(int t, K sep, Node<K, V> child, int toheight, Deque<ResultCouple<K, V>> descent, Queue<CriticJob> maintenance) throws IOException {
Node<K, V> n;
K ubleftsep;
ResultCouple<K, V> locate_internal = locate_internal(sep, toheight, descent, maintenance);
n = locate_internal.node;
ubleftsep = locate_internal.ubleftsep;
try {
while (!add_or_remove_link(t, sep, child, n, toheight, descent, maintenance)) {
// wait and try again, very rare
unlock(n, WRITE_LOCK);
delay(1L); // sep > leftsep(n)
ResultCouple<K, V> move_right = move_right(sep, n, ubleftsep, WRITE_LOCK); // sep € coverset(n)
n = move_right.node;
ubleftsep = move_right.ubleftsep;
}
normalize(n, descent, toheight, maintenance);
} finally {
unlock(n, WRITE_LOCK);
}
}
// function add-or-remove-link(t: task; sep: value; child: nodeptr;
// n: nodeptr; atheight: height; descent: stack): boolean;
// { tries to add or removes sep and downlink to child from
// node n and returns true if succeeded, if removing,
// and sep is rightmost in n, merges n with its right neighbor first,
// (if the resulting node is too large, it will be split by the upcoming normalization.). A solution that avoids this merge exists,
// but we present this for the sake of simplicity. }
// var
// sib: nodeptr;
// newsep: value;
// begin
// if t=add then add-or-remove-link := add-link(sep, child, n)
// else begin {t= remove}
// if rightsep(n) = sep then begin
// { the downlink to be removed is in n's right neighbor. }
// sib := rightlink(n); {rightsep(n) = sep < +inf, thus rightlink(n)<>nil}
// lock(sib, writelock);
// newsep := half-merge(n, sib); {newsep = sep}
// unlock(sib, writelock);
// spawn(ascend(remove, newsep, sib, atheight+1, descent))
// end;
// add-or-remove-link := remove-link(sep, child, n)
// end
// end;
/**
* tries to add or remove sep and the downlink to child from node n, returning
* true if it succeeded. If removing, and sep is rightmost in n, merges n with
* its right neighbor first (if the resulting node is too large, it will be
* split by the upcoming normalization). A solution that avoids this merge
* exists, but we present this one for the sake of simplicity.
*
* @param t
* @param sep
* @param child
* @param n
* @param atheight
* @param descent
* @return
*/
private boolean add_or_remove_link(int t, K sep, Node<K, V> child, Node<K, V> n, int atheight, Deque<ResultCouple<K, V>> descent, Queue<CriticJob> maintenance) throws IOException {
Node<K, V> sib;
K newsep;
if (t == ADD_TASK) {
return n.add_link(sep, child);
} else {
if (n.rightsep().equals(sep)) {
// the downlink to be removed is in n's right neighbor.
sib = n.rightlink(); // rightsep(n) = sep < +inf, thus rightlink(n)<>nil
lock(sib, WRITE_LOCK);
try {
newsep = n.half_merge(sib); // newsep = sep
} catch (IOException ex) {
throw new UncheckedIOException("failed to remove link from " + n.pageId + " to " + sib.pageId, ex);
} finally {
unlock(sib, WRITE_LOCK);
}
spawn(() -> ascend(REMOVE_TASK, newsep, sib, atheight + 1, clone(descent), maintenance), maintenance);
}
return n.remove_link(sep, child);
}
}
// function locate-internal(v: value; toheight: height; var descent: stack): nodeptr;
// { a modified locate phase; instead of finding a leaf whose coverset includes
// V, finds a node at height toheight whose coverset includes v.
// if possible, uses the descent stack (whose top points at toheight) }
// var
// n, m, newroot: nodeptr;
// h, enterheight: height;
// ubleftsep: value;
// begin
// if empty-stack(descent) then ubleftsep := +inf { force new descent }
// else pop(n, ubleftsep, descent);
// if v <= ubleftsep then begin
// { a new descent from the top must be made}
// lock-anchor(readlock);
// if anchor.topheight < toheight then begin
// unlock-anchor(readlock); lock-anchor(writelock);
// if anchor.topheight < toheight then begin
// allocate-node(newroot);
// grow(newroot)
// end;
// unlock-anchor(writelock); lock-anchor(readlock)
// end;
// if anchor.fastheight >= toheight then begin
// n := anchor.fast; enterheight := anchor.fastheight
// end
// else begin
// n := anchor.top; enterheight := anchor.topheight
// end;
// ubleftsep := +inf; { v > leftsep(n) }
// unlock-anchor(readlock);
// set-to-empty(descent);
// for h := enterheight downto toheight+1 do begin { v > leftsep(n) }
// move-right(v, n, ubleftsep, readlock);{ v € coverset(n) }
// push(n, ubleftsep, descent);
// (m, ubleftsep) := find(v, n, ubleftsep); { v > leftsep(m) }
// unlock(n, readlock);
// n := m
// end
// end;
// { v > leftsep(n), height of n = toheight }
// move-right(v, n, ubleftsep, writelock); { v € coverset(n) }
// locate-internal := n
// end;
/**
* A modified locate phase; instead of finding a leaf whose coverset
* includes v, finds a node at height toheight whose coverset includes v. If
* possible, uses the descent stack (whose top points at toheight).
*
* @param v
* @param toheight
* @param descent
* @return
*/
private ResultCouple<K, V> locate_internal(K v, int toheight, Deque<ResultCouple<K, V>> descent, Queue<CriticJob> maintenance) throws IOException {
Node<K, V> n, m, newroot;
int h, enterheight;
K ubleftsep;
if (descent.isEmpty()) {
/*
* Just to avoid "The local variable n may not have been initialized" at the last move_right.
*
* If there is no descent ubleftsep is +inf, then the check ubleftsep.compareTo(v) > 0 will always
* be true and a root will be retrieved
*/
n = null;
ubleftsep = positiveInfinity; // force new descent
} else {
ResultCouple<K, V> pop = descent.pop();
n = pop.node;
ubleftsep = pop.ubleftsep;
}
if (ubleftsep.compareTo(v) > 0) { // inverted check: ubleftsep may be the positive-infinity sentinel, so it must be the compareTo receiver
// a new descent from the top must be made
lock_anchor(READ_LOCK);
if (anchor.topheight < toheight) {
unlock_anchor(READ_LOCK);
lock_anchor(WRITE_LOCK);
if (anchor.topheight < toheight) {
newroot = allocate_node(false);
grow(newroot);
/* Nothing to load locked now */
final Metadata meta = policy.add(newroot);
if (meta != null) {
meta.owner.unload(meta.pageId);
}
spawn(() -> run_critic(), maintenance);
}
unlock_anchor(WRITE_LOCK);
lock_anchor(READ_LOCK);
}
if (anchor.fastheight >= toheight) {
n = anchor.fast;
enterheight = anchor.fastheight;
} else {
n = anchor.top;
enterheight = anchor.topheight;
}
ubleftsep = positiveInfinity; // v > leftsep(n)
unlock_anchor(READ_LOCK);
descent.clear();
for (h = enterheight; h > toheight; --h) { // v > leftsep(n)
ResultCouple<K, V> move_right = move_right(v, n, ubleftsep, READ_LOCK); // v € coverset(n)
try {
n = move_right.node;
ubleftsep = move_right.ubleftsep;
descent.push(move_right);
ResultCouple<K, V> find = n.find(v, ubleftsep); // v > leftsep(m)
m = find.node;
ubleftsep = find.ubleftsep;
} finally {
unlock(n, READ_LOCK);
}
n = m;
}
}
// v > leftsep(n), height of n = toheight
ResultCouple<K, V> move_right = move_right(v, n, ubleftsep, WRITE_LOCK); // v € coverset(n)
n = move_right.node;
ubleftsep = move_right.ubleftsep;
return move_right;
}
// procedure critic;
// { the critic runs continuously; its function is to keep the target
// of the fast pointer in the anchor close to the highest level containing more than one downlink. }
// var
// n, m: nodeptr;
// h: height;
// begin
// while true do begin
// lock-anchor(readlock);
// n := anchor.top; h := anchor.topheight;
// unlock-anchor(readlock);
// lock(n, readlock);
// while numberofchildren(n) <= 3 and rightlink(n) = nil and h > 1 do begin
// m := leftmostchild(n);
// unlock(n, readlock);
// n := m;
// lock(n, readlock);
// h := h - 1
// end;
// unlock(n, readlock);
// lock-anchor(readlock);
// if anchor.fastheight = h then
// unlock-anchor(readlock)
// else begin
// unlock-anchor(readlock);
// lock-anchor(writelock);
// anchor.fastheight := h; anchor.fast := n;
// unlock-anchor(writelock)
// end;
// delay
// end
// end;
/**
* Custom critic version to be run from querying threads. Avoids running
* more than one critic at a time.
*
*
* original comment: the critic runs continuously; its function is to keep
* the target of the fast pointer in the anchor close to the highest level
* containing more than one downlink.
*
*/
private final AtomicBoolean criticRunning = new AtomicBoolean(false);
private void run_critic() throws IOException {
if (!criticRunning.compareAndSet(false, true)) {
/* Someone else is already running a critic, we don't need to run it twice */
return;
}
Node<K, V> n, m;
int h;
lock_anchor(READ_LOCK);
n = anchor.top;
h = anchor.topheight;
unlock_anchor(READ_LOCK);
lock(n, READ_LOCK);
try {
while (n.number_of_children() < CRITIC_MIN_CHILDREN && n.rightlink() == null && h > 1) {
try {
m = n.leftmost_child();
} catch (IOException e) {
throw new IOException("failed to find leftmost child on node " + n.pageId, e);
} finally {
unlock(n, READ_LOCK);
}
n = m;
lock(n, READ_LOCK);
--h;
}
} catch (IOException e) {
throw new IOException("failed to evaluate anchor fast height", e);
} finally {
unlock(n, READ_LOCK);
}
lock_anchor(READ_LOCK);
if (anchor.fastheight == h) {
unlock_anchor(READ_LOCK);
} else {
unlock_anchor(READ_LOCK);
lock_anchor(WRITE_LOCK);
anchor.fastheight = h;
anchor.fast = n;
unlock_anchor(WRITE_LOCK);
}
criticRunning.set(false);
}
/**
* Differently from the original algorithm, spawning just means enqueuing
* maintenance work for execution at the end of insert/update/delete
*/
private void spawn(CriticJob runnable, Queue<CriticJob> maintenance) {
maintenance.offer(runnable);
}
private void handleMainenance(Queue<CriticJob> maintenance) {
try {
while (!maintenance.isEmpty()) {
maintenance.poll().execute();
}
} catch (IOException ex) {
throw new UncheckedIOException("failed to handle Blink maintenance", ex);
}
}
@SuppressWarnings("unchecked")
private Deque<ResultCouple<K, V>> clone(Deque<ResultCouple<K, V>> descent) {
return (Deque<ResultCouple<K, V>>) ((LinkedList<ResultCouple<K, V>>) descent).clone();
}
/*
* The search structure operations. Locking ensures that they are atomic.
*/
private Node<K, V> allocate_node(boolean leaf) {
final Long nodeID = nextID.getAndIncrement();
final Node<K, V> node = new Node<>(nodeID, leaf, this, positiveInfinity);
nodes.put(nodeID, node);
return node;
}
/**
* n is made an internal node containing only a downlink to the current
* target of the anchor's top pointer and the separator +inf to its right.
* The anchor's top pointer is then set to point to n, and its height
* indicator is incremented.
*
* @param n
*/
private void grow(Node<K, V> n) {
n.grow(anchor.top);
anchor.top = n;
anchor.topheight++;
}
private void lock_anchor(int locktype) {
lock(anchor.lock, locktype);
}
private void unlock_anchor(int locktype) {
unlock(anchor.lock, locktype);
}
private void lock(Node<K, V> n, int locktype) {
lock(n.lock, locktype);
}
private void unlock(Node<K, V> n, int locktype) {
unlock(n.lock, locktype);
}
private void lock(ReadWriteLock lock, int locktype) {
if (locktype == READ_LOCK) {
lock.readLock().lock();
} else {
lock.writeLock().lock();
}
}
private void unlock(ReadWriteLock lock, int locktype) {
if (locktype == READ_LOCK) {
lock.readLock().unlock();
} else {
lock.writeLock().unlock();
}
}
private void delay(long time) {
try {
Thread.sleep(time);
} catch (InterruptedException soaked) {/* SOAK */
Thread.currentThread().interrupt();
}
}
@Override
public String toString() {
return "BLink [anchor=" + anchor
+ ", nextID=" + nextID
+ ", keys=" + size()
+ ", maxSize=" + maxSize
+ ", minSize=" + minSize
+ ", closed=" + closed
+ "]";
}
/**
* Build a full string representation of this tree.
*
* Pay attention: it will load/unload nodes, potentially polluting the
* page replacement policy. Use this method only for test and analysis
* purposes.
*
*
* @return full tree string representation
*/
@SuppressWarnings("unchecked")
public String toStringFull() {
Node<K, V> top = anchor.top;
Deque