/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jan 7, 2008
*/
package com.bigdata.btree.proc;
import java.io.Externalizable;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.OutputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.log4j.Logger;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Errors;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ILinearList;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.ITupleSerializer;
import com.bigdata.btree.UnisolatedReadWriteIndex;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.ReadOnlyKeysRaba;
import com.bigdata.btree.raba.ReadOnlyValuesRaba;
import com.bigdata.btree.raba.SubRangeRaba;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.btree.view.FusedView;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.FixedByteArrayBuffer;
import com.bigdata.journal.IIndexManager;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.service.Split;
import com.bigdata.service.ndx.IClientIndex;
import com.bigdata.service.ndx.NopAggregator;
import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
/**
* Abstract base class supports compact serialization and compression for remote
* {@link IKeyArrayIndexProcedure} execution (procedures may be executed on a
* local index, but they are only (de-)serialized when executed on a remote
* index).
*
* @author Bryan Thompson
*
* @see BLZG-1537 (Schedule more IOs when loading data)
*/
abstract public class AbstractKeyArrayIndexProcedure<T> extends
AbstractIndexProcedure<T> implements IKeyArrayIndexProcedure<T>,
Externalizable {
private static final Logger log = Logger.getLogger(AbstractKeyArrayIndexProcedure.class);
/*
* FIXME These parameters should be specified from the derived class and
* default from the environment or be set dynamically through ergonomics. It
* might be possible to do this by sharing a global reader thread pool for
* the journal.
*
* Right now they can be set from the environment, but this is just for
* testing purposes. We need a better mechanisms / ergonomics. Some of the
* main drivers should be the #of keys, the size of the index (total range
* count), and the amount of observed scatter (inverse of locality) on the
* index (ID2TERM has none on write (but a fair amount on read), SPO has
* little on write, TERM2ID can have a lot of write if there are UUIDs, OSP
* has a lot on write).
*/
/**
* The index procedure will be read by at most this many reader tasks.
* Parallelizing the index reads lets us speed up the overall operation
* significantly. Set to ZERO (0) to always run in the caller's thread (this
* is the historical behavior).
*/
transient static private final int maxReaders = Integer
.parseInt(System.getProperty(AbstractKeyArrayIndexProcedure.class.getName() + ".maxReaders", "0"));
/**
* How many keys to skip over in the reader threads.
*
* Note: This also sets the minimum number of keys in a batch that we hand
* off to the writer.
*/
transient static private final int skipCount = Integer
.parseInt(System.getProperty(AbstractKeyArrayIndexProcedure.class.getName() + ".skipCount", "256"));
/**
* This is multiplied by the branching factor of the index (when ZERO, the
* branching factor is multiplied by itself) to determine how many tuples
* must lie between the first key to enter a batch and the last key that may
* enter a batch before a reader evicts a batch to the queue. Since we get a
* lot of locality from the tree structure, we should not require that the
* key is in the same page as the first key, but only that it is close and
* will share most of the parents in the B+Tree ancestry.
*/
transient static private final int spannedRangeMultiplier = Integer
.parseInt(System.getProperty(AbstractKeyArrayIndexProcedure.class.getName() + ".spannedRangeMultiplier", "10"));
/**
* The size of a sub-key-range that will be handed off by a reader to a
* queue. A writer will drain these key ranges and apply the index procedure
* to each sub-key-range in turn. This separation makes it possible to
* ensure that pages are in memory and that the writer only does work on
* pages that are already in memory.
*
*
* FIXME As a rule of thumb, performance is quite reasonable with a single
* thread running the batch until the indices grow relatively large. So we
* really want to increase the striping of the readers as a function of the
* index size and the proportion of scattered reads on the index. A large
* index with good update locality is not a problem. A large index with poor
* update locality is a big problem and requires a bunch of readers to
* prefetch the index pages for the writer.
*/
transient static private final int batchSize = Integer
.parseInt(System.getProperty(AbstractKeyArrayIndexProcedure.class.getName() + ".batchSize", "10240"));
/**
* The maximum depth of the queue -or- ZERO (0) to use
* <code>maxReaders * 2</code> (note that this is based on maxReaders, not
* the actual number of readers). This should be at least equal to the #of
* readers and could be a small multiple of that number.
*/
transient static private final int queueCapacity = Integer
.parseInt(System.getProperty(AbstractKeyArrayIndexProcedure.class.getName() + ".queueCapacity", "0"));
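/*
* Example (hedged; the values below are hypothetical, not recommendations):
* the tuning parameters above are read once from system properties at class
* load time, so they are supplied on the JVM command line, e.g.:
*
* java \
*   -Dcom.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.maxReaders=8 \
*   -Dcom.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.batchSize=20480 \
*   -Dcom.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.queueCapacity=16 \
*   ...
*
* With maxReaders=0 (the default) the procedure always runs in the caller's
* thread (the historical behavior).
*/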
static private class Stats {
/**
* The #of reader batches that were assigned for the parallel execution
* of the index procedure.
*/
private final AtomicLong readerBatchCount = new AtomicLong();
/**
* The #of batches that were processed by the writer.
*/
private final AtomicLong writerBatchCount = new AtomicLong();
}
/**
* The object used to (de-)code the keys when they are sent to the remote
* service.
*/
private IRabaCoder keysCoder;
/**
* The object used to (de-)code the values when they are sent to the remote
* service.
*/
private IRabaCoder valsCoder;
/**
* The object used to (de-)code the keys when they are sent to the remote
* service.
*/
protected IRabaCoder getKeysCoder() {
return keysCoder;
}
/**
* The object used to (de-)code the values when they are sent to the remote
* service.
*/
protected IRabaCoder getValuesCoder() {
return valsCoder;
}
/**
* The keys.
*/
private IRaba keys;
/**
* The values.
*/
private IRaba vals;
@Override
final public IRaba getKeys() {
return keys;
}
@Override
final public IRaba getValues() {
return vals;
}
/**
* Return an {@link IResultHandler} that will be used to combine the results
* if the index procedure is parallelized against a local index (including a
* scale-out shard). If a null
is returned, then the index
* procedure WILL NOT be parallelized against the local index. To
* parallelize index procedures that do not return anything against a local
* index, just use {@link NopAggregator}. A non-null
value will
* permit both index local parallelization of the index procedure and (in
* scale-out) parallelization of the index procedure across the shards as
* well. In order to be parallelized, the index procedure must also be
* marked as {@link IParallelizableIndexProcedure}.
*
* @return The {@link IResultHandler} -or- null
*
* @see NopAggregator
* @see IParallelizableIndexProcedure
* @see BLZG-1537 (Schedule more IOs when loading data)
*/
abstract protected IResultHandler<T, T> newAggregator();
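/*
* Illustrative sketch (hypothetical subclass, hedged): a write-only
* procedure that returns no result can still opt in to index-local
* parallelism by returning a no-op aggregator rather than null. Assuming
* NopAggregator exposes a shared singleton (otherwise construct one):
*
* @Override
* protected IResultHandler<Void, Void> newAggregator() {
*     // Permits index-local parallelization without aggregating anything.
*     return NopAggregator.INSTANCE;
* }
*/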
/**
* De-serialization constructor.
*/
protected AbstractKeyArrayIndexProcedure() {
}
/**
* @param keysCoder
* The object used to serialize the keys.
* @param valsCoder
* The object used to serialize the vals (optional IFF vals is
* <code>null</code>).
* @param fromIndex
* The index of the first key in keys to be processed
* (inclusive).
* @param toIndex
* The index of the last key in keys to be processed
* (exclusive upper bound).
* @param keys
* The keys (unsigned variable length <code>byte[]</code>s) MUST
* be in sorted order (the logic to split procedures across
* partitioned indices depends on this, plus ordered reads and
* writes on indices are MUCH more efficient).
* @param vals
* The values (optional, must be co-indexed with keys when
* non-<code>null</code>).
*/
protected AbstractKeyArrayIndexProcedure(final IRabaCoder keysCoder,
final IRabaCoder valsCoder, final int fromIndex, final int toIndex,
final byte[][] keys, final byte[][] vals) {
if (keysCoder == null)
throw new IllegalArgumentException();
if (valsCoder == null && vals != null)
throw new IllegalArgumentException();
if (keys == null)
throw new IllegalArgumentException(Errors.ERR_KEYS_NULL);
if (fromIndex < 0)
throw new IllegalArgumentException(Errors.ERR_FROM_INDEX);
if (fromIndex >= toIndex)
throw new IllegalArgumentException(Errors.ERR_FROM_INDEX);
if (toIndex > keys.length)
throw new IllegalArgumentException(Errors.ERR_TO_INDEX);
if (vals != null && toIndex > vals.length)
throw new IllegalArgumentException(Errors.ERR_TO_INDEX);
this.keysCoder = keysCoder;
this.valsCoder = valsCoder;
/*
* FIXME Are we ignoring the (fromIndex, toIndex) on the original keys and
* values? These should really be passed through. The correctness issue
* probably only shows up in scale-out.
*/
this.keys = new ReadOnlyKeysRaba(fromIndex, toIndex, keys);
this.vals = (vals == null ? null : new ReadOnlyValuesRaba(fromIndex,
toIndex, vals));
}
/**
* Applies the logic of the procedure.
*
* Note: For invocations where the {@link IRaba#size()} of the
* {@link #getKeys() keys} is large, this class breaks down the {@link IRaba
* keys} into multiple key ranges to parallelize the work. If the
* procedure is read-only, then we can trivially parallelize the operation.
* When the procedure is read-write, a prefetch pattern is used to ensure
* that the index pages are in cache and then work is handed off to a single
* thread that does the actual work while obeying the single-threaded
* writer constraint on the index.
*
* @author bryan
*
* @see BLZG-1537 (Schedule more IOs when loading data)
*/
@Override
final public T apply(final IIndex ndx) {
if (ndx instanceof IClientIndex) {
/*
* The client index views already parallelize index operations
* across the shards so we should never hit this code path. The code
* will throw an exception if we do hit this code path as an aid to
* tracking down invalid assumptions (among them that the
* IResultHandler would be null on the DS/MDS nodes in scale-out
* since only the client has access to that object).
*
* TODO Note: It *is* safe to just uncomment the applyOnce() call
* rather than throwing an exception.
*/
// return applyOnce(ndx, keys, vals);
throw new UnsupportedOperationException();
}
/*
* Note: Do not parallelize small batches. A single thread is enough.
*
* FIXME We might actually want to run parallel threads even for smaller
* batches if the index is large enough since the parallelism will be
* required to drive the disk read queue. Otherwise we will be facing
* additive latency from sequential disk reads. This would show up in
* small updates to large indices. The point of comparison would be
* that large updates to large indices were more efficient. If this is
* observed, then we do want to parallelize small batches also if the
* index is large.
*/
final boolean smallBatch = false;//keys.size() <= batchSize;
if (maxReaders <= 0 || smallBatch || !(this instanceof IParallelizableIndexProcedure)) {
// Disables parallelism entirely.
return applyOnce(ndx, keys, vals);
}
/*
* Obtain an aggregator that can be used to combine the results across
* index local splits. This index-local aggregator was introduced to
* support parallelizing the operation against a local index or local
* index view.
*/
final IResultHandler<T, T> resultHandler = newAggregator();
if (resultHandler == null) {
/*
* Can't use parallelism without an aggregator.
*
* Note: Use NopAggregator to avoid this code path and stripe an
* index procedure against a local index even if it is not going
* to return anything.
*/
return applyOnce(ndx, keys, vals);
}
// FIXME IMPLEMENT PARALLEL OPERATION FOR FusedView.
final boolean isFusedView = (ndx instanceof ILocalBTreeView) && ((ILocalBTreeView) ndx).getSourceCount() > 1;
if (isFusedView && !isReadOnly()) {
// Do not parallelize mutations against a fused view (not yet implemented).
return applyOnce(ndx, keys, vals);
}
/* If it is not a client index view, then it is one of either:
*
* - UnisolatedReadWriteIndex (wrapping a BTree)
*
* - ILocalBTreeView, which in turn is one of:
* - AbstractBTree (BTree or IndexSegment)
* - FusedView (or IsolatedFusedView)
*/
final IRawStore store;
if (ndx instanceof ILocalBTreeView) {
// Note: BTree, FusedView, IsolatedFusedView.
store = ((ILocalBTreeView) ndx).getMutableBTree().getStore();
} else if (ndx instanceof UnisolatedReadWriteIndex) {
store = ((UnisolatedReadWriteIndex) ndx).getStore();
} else {
/*
* Note: This is a trap for other cases that are not covered above.
* Not that I am aware of any.
*/
throw new AssertionError("Can't get backing store for " + ndx.getClass().getName());
}
final ExecutorService executorService;
if (store instanceof IIndexManager) {
executorService = ((IIndexManager) store).getExecutorService();
} else {
/*
* What typically hits this are unit tests that are using a
* SimpleMemoryStore or the like rather than a Journal. To avoid
* breaking those tests this does not parallelize the operation.
*/
return applyOnce(ndx, keys, vals);
// throw new AssertionError("Can't get ExecutorService for " + store.getClass().getName());
}
try {
if (isReadOnly()) {
/*
* Simpler code path for parallelizing read-only operations. We
* just split up the keys among N readers. All work is done by a
* ReadOnlyTask for its own key-range and the results are
* aggregated. No locking is required since no mutation is
* involved.
*/
return applyMultipleReadersNoWriter(executorService, ndx, resultHandler);
}
/*
* Parallelize a mutable index procedure. Here we need to take
* additional precautions since underlying BTree class is only
* thread-safe for a single writer.
*/
return applyMultipleReadersOneWriter(executorService, ndx, false/* readOnly */, resultHandler);
} catch (ExecutionException | InterruptedException ex) {
throw new RuntimeException(ex);
}
}
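/*
* Invocation sketch (illustrative; "proc" and "ndx" stand in for a concrete
* subclass instance and a local index view): the parallel fan-out above is
* transparent to the caller, which simply applies the procedure:
*
* final AbstractKeyArrayIndexProcedure<ResultBuffer> proc = ...;
* final ResultBuffer result = proc.apply(ndx); // may fan out internally.
*/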
/**
* Read-only version with concurrent readers.
*
* @param executorService
* Used to run the reader tasks.
* @param ndx
* The index.
* @param resultHandler
* Used to aggregate the results across the readers.
* @return The aggregated result.
* @throws InterruptedException
* @throws ExecutionException
*/
private T applyMultipleReadersNoWriter(final ExecutorService executorService, final IIndex ndx,
final IResultHandler<T, T> resultHandler) throws InterruptedException, ExecutionException {
// This is the #of keys in the keys IRaba.
final int keysSize = keys.size();
// Track statistics.
final Stats stats = new Stats();
// Setup readers.
final List<Callable<Void>> readerTasks = new LinkedList<Callable<Void>>();
{
/*
* Determine how many tuples to assign to each reader. Round up. The
* last reader will wind up a bit short if the tuples can not be
* divided evenly by the #of readers.
*
* Note: If there is not enough data for a single batch, then we use
* only one reader.
*/
final int readerSize = Math.max(batchSize, (int) Math.ceil(keysSize / (double) maxReaders));
int fromIndex = 0, toIndex = -1;
boolean done = false;
while (!done) {
toIndex = fromIndex + readerSize;
if (toIndex > keysSize) {
/*
* This will be the last reader.
*
* Note: toIndex is an exclusive upper bound. Clamping it to
* keysSize assigns whatever keys remain to this final reader.
*/
toIndex = keysSize;
done = true;
}
readerTasks.add(new ReadOnlyTask(ndx, resultHandler, stats, new Batch(fromIndex, toIndex, keys, vals)));
fromIndex = toIndex;
}
stats.readerBatchCount.set(readerTasks.size());
}
// Start readers. All readers are done by the time this returns (including if interrupted).
final List<Future<Void>> readerFutures = executorService.invokeAll(readerTasks);
// Check reader futures.
for (Future<Void> f : readerFutures) {
f.get();
}
// Configuration parameters, followed by invocation instance data.
if (log.isInfoEnabled())
log.info("maxReaders=" + maxReaders //
+ ", skipCount=" + skipCount //
+ ", spannedRangeMultiplier=" + spannedRangeMultiplier //
+ ", batchSize=" + batchSize //
+ ", queueCapacity=" + queueCapacity
// invocation instance data.
+ ", nkeys=" + keysSize //
+ ", nreaders=" + stats.readerBatchCount //
// + ", writerBatches=" + stats.writerBatchCount //
// + ", keys/writeBatch=" + (keysSize / stats.writerBatchCount.get()) //
+ ", proc=" + getClass().getSimpleName()//
);
return resultHandler.getResult();
}
/**
* Task for a read-only index procedure. No locking is required. Each task
* just handles its sub-key-range of the original keys raba.
*
* @author bryan
*/
private class ReadOnlyTask implements Callable<Void> {
private final IIndex view;
private final Batch batch;
private final IResultHandler<T, T> resultHandler;
private final Stats stats;
/**
*
* @param view
* The index against which the procedure will be applied.
* @param resultHandler
* Used to combine the intermediate results from the
* application of the index procedure to each {@link Batch}.
* @param batch
* A batch of keys (and optionally values) to be processed.
*/
ReadOnlyTask(final IIndex view, final IResultHandler<T, T> resultHandler, final Stats stats,
final Batch batch) {
if (view == null)
throw new IllegalArgumentException();
if (batch == null)
throw new IllegalArgumentException();
if (resultHandler == null)
throw new IllegalArgumentException();
if (stats == null)
throw new IllegalArgumentException();
this.view = view;
this.batch = batch;
this.resultHandler = resultHandler;
this.stats = stats;
}
@Override
public Void call() throws Exception {
// Setup view onto the sub-key range of the keys / vals.
final IRaba keysView = new SubRangeRaba(batch.keys, batch.fromIndex, batch.toIndex);
final IRaba valsView = batch.vals == null ? null
: new SubRangeRaba(batch.vals, batch.fromIndex, batch.toIndex);
// Apply procedure to sub-key range.
final T aResult = applyOnce(view, keysView, valsView);
// aggregate results.
resultHandler.aggregate(aResult, batch);
return null;
}
}
/**
* MROW version (multiple readers, one writer).
*
* @param ndx
* A local index (any of {@link UnisolatedReadWriteIndex},
* {@link BTree}, or {@link FusedView}).
* @param resultHandler
* The handler used to aggregate results across the parallel
* stripes.
*
* @return The result.
*
* @throws InterruptedException
* @throws ExecutionException
*/
private T applyMultipleReadersOneWriter(final ExecutorService executorService, final IIndex ndx, final boolean readOnly,
final IResultHandler<T, T> resultHandler) throws InterruptedException, ExecutionException {
/*
* Use concurrent readers.
*
* Note: With concurrent readers the data are *NOT* transferred into a
* total ordering and the mapgraph runtime will need to sort on S (or
* SPO) before building the indices.
*/
/**
* Note: When this method is invoked for an
* {@link UnisolatedReadWriteIndex}, that class hands off the
* inner {@link BTree} object. Since the invoking thread at the
* top-level owns the read or write lock for the
* {@link UnisolatedReadWriteIndex} (depending on whether the procedure
* is read only), we are not able to acquire the write lock inside of
* the {@link WriterTask} (unless it is run in the caller's thread) and
* we can not acquire the read lock in any of the {@link ReaderTask}
* threads. To work around this, we explicitly coordinate among the
* readers and with the writer thread using a read/write lock.
*/
final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
// This is the #of keys in the keys IRaba.
final int keysSize = keys.size();
final int effectiveQueueCapacity;
{
if (queueCapacity <= 0) {
/*
* Note: This is MAX readers, not the actual number of readers.
* We need to create the queue before we create the readers.
*/
effectiveQueueCapacity = maxReaders * 2;
} else {
effectiveQueueCapacity = queueCapacity;
}
}
// Queue used to pass batches from readers to writer.
final LinkedBlockingQueue<Batch> queue = new LinkedBlockingQueue<Batch>(effectiveQueueCapacity);
// Track statistics.
final Stats stats = new Stats();
// Setup writer.
final FutureTask<T> writerFuture = new FutureTask<T>(new WriterTask(lock, queue, ndx, resultHandler, stats));
// Setup readers.
final List<Callable<Void>> readerTasks = new LinkedList<Callable<Void>>();
{
/*
* Determine how many tuples to assign to each reader. Round up. The
* last reader will wind up a bit short if the tuples can not be
* divided evenly by the #of readers.
*
* Note: If there is not enough data for a single batch, then we use
* only one reader.
*/
final int readerSize = Math.max(batchSize, (int) Math.ceil(keysSize / (double) maxReaders));
int fromIndex = 0, toIndex = -1;
boolean done = false;
while (!done) {
toIndex = fromIndex + readerSize;
if (toIndex > keysSize) {
/*
* This will be the last reader.
*
* Note: toIndex is an exclusive upper bound. Clamping it to
* keysSize assigns whatever keys remain to this final reader.
*/
toIndex = keysSize;
done = true;
}
readerTasks.add(new ReaderTask(readOnly, lock, queue, writerFuture, ndx, new Batch(fromIndex, toIndex, keys, vals)));
fromIndex = toIndex;
}
stats.readerBatchCount.set(readerTasks.size());
}
try {
// start writer.
executorService.submit(writerFuture);
// Start readers. All readers are done by the time this returns (including if interrupted).
final List<Future<Void>> readerFutures = executorService.invokeAll(readerTasks);
// Readers are done. drop poison pill on writer so it will terminate.
// Note: will block if queue is full.
// Note: will notice if the writer fails.
putOnQueue(writerFuture, queue, Batch.POISON_PILL);
// check reader futures.
for (Future<Void> f : readerFutures) {
f.get();
}
// check writer future.
final T ret = writerFuture.get();
// Configuration parameters, followed by invocation instance data.
if (log.isInfoEnabled())
log.info("maxReaders=" + maxReaders //
+ ", skipCount=" + skipCount //
+ ", spannedRangeMultiplier=" + spannedRangeMultiplier //
+ ", batchSize=" + batchSize //
+ ", queueCapacity=" + queueCapacity
// invocation instance data.
+ ", nkeys=" + keysSize //
+ ", nreaders=" + stats.readerBatchCount //
+ ", writerBatches=" + stats.writerBatchCount //
+ ", keys/writeBatch=" + (keysSize / stats.writerBatchCount.get()) //
+ ", proc=" + getClass().getSimpleName()//
);
return ret;
} finally {
/*
* Ensure writer is terminated.
*
* Note: Readers will be done by the time invokeAll()
* returns via any code path so we do not need to cancel
* them here.
*/
writerFuture.cancel(true/* mayInterruptIfRunning */);
}
}
/**
* A key-range of the caller's keys (and optionally values) to be operated
* on.
*
* @author bryan
*/
private static class Batch extends Split {
/**
* The original keys and values. Code using a {@link Batch} MUST respect
* the {@link Split#fromIndex} when indexing into these data.
*/
private final IRaba keys, vals;
/**
*
* @param fromIndex
* The inclusive lower bound index into the original
* {@link IRaba}s (offset).
* @param toIndex
* The exclusive upper bound index into the original
* {@link IRaba}s.
* @param keys
* The original {@link IRaba} for the keys.
* @param vals
* The original {@link IRaba} for the values.
*/
Batch(final int fromIndex, final int toIndex, final IRaba keys, final IRaba vals) {
super(null/* pmd */, fromIndex, toIndex);
this.keys = keys;
this.vals = vals;
}
private Batch() {
super(null/* pmd */, 0, 0);
this.keys = this.vals = null;
}
/**
* Singleton instance is used to signal the end of service for a queue.
*/
final private static Batch POISON_PILL = new Batch();
}
/**
* Task applies the index procedure to a specific key range.
*
* @author bryan
*/
private class WriterTask implements Callable<T> {
private final ReentrantReadWriteLock lock;
private final IIndex ndx;
private final LinkedBlockingQueue<Batch> queue;
private final IResultHandler<T, T> resultHandler;
private final Stats stats;
/**
*
* @param lock
* Lock used to allow concurrent readers on the index or a
* single thread that applies mutation to the index.
* @param queue
* Queue used to hand off work to the writer.
* @param view
* The index against which the procedure will be applied.
* @param resultHandler
* Used to combine the intermediate results from the
* application of the index procedure to each {@link Batch}.
*/
WriterTask(final ReentrantReadWriteLock lock, final LinkedBlockingQueue<Batch> queue,
final IIndex view, final IResultHandler<T, T> resultHandler, final Stats stats) {
if (lock == null)
throw new IllegalArgumentException();
if (view == null)
throw new IllegalArgumentException();
if (queue == null)
throw new IllegalArgumentException();
if (resultHandler == null)
throw new IllegalArgumentException();
if (stats == null)
throw new IllegalArgumentException();
this.lock = lock;
this.queue = queue;
this.ndx = view;
this.resultHandler = resultHandler;
this.stats = stats;
if (isReadOnly()) {
// No point. Other code path is used.
throw new UnsupportedOperationException();
}
}
@Override
public T call() throws Exception {
while (true) {
// blocking take
final Batch batch = queue.take();
if (batch == Batch.POISON_PILL)
break;
if (batch.ntuples == 0)
throw new AssertionError("Empty batch");
/*
* Setup sub-range for keys and values and invoke the index
* procedure on that sub-range.
*/
final IRaba keysView = new SubRangeRaba(batch.keys, batch.fromIndex, batch.toIndex);
final IRaba valsView = batch.vals == null ? null
: new SubRangeRaba(batch.vals, batch.fromIndex, batch.toIndex);
/*
* Acquire write lock to avoid concurrent mutation errors in the
* B+Tree.
*/
final T aResult;
lock.writeLock().lock();
try {
// invoke index procedure on sub-range.
aResult = applyOnce(ndx, keysView, valsView);
} finally {
lock.writeLock().unlock();
}
// aggregate results.
resultHandler.aggregate(aResult, batch);
stats.writerBatchCount.incrementAndGet();
}
return resultHandler.getResult();
}
}
/**
* Read a key-range of the index into a sequence of
* {@link Batch}es and drop each one in turn onto the caller's queue.
*
* @author bryan
*/
static private class ReaderTask implements Callable<Void> {
private final boolean readOnly;
private final ReentrantReadWriteLock lock;
private final LinkedBlockingQueue<Batch> queue;
private final Future<?> writerFuture;
private final IIndex view;
private final Batch batch;
/**
*
* @param lock
* Lock used to allow concurrent readers on the index or a
* single thread that applies mutation to the index.
* @param queue
* Queue used to hand off work to the writer.
* @param view
* The index against which the procedure will be applied.
* @param batch
* A batch of keys (and optionally values). The reader uses
* the keys in the batch to pre-fetch the associated index
* page(s) and then drops a batch (which might have only a
* subset of those keys) onto the queue. The reader may
* choose to break up batches when the keys do not have good
* locality in the index.
*/
ReaderTask(final boolean readOnly, final ReentrantReadWriteLock lock, final LinkedBlockingQueue<Batch> queue,
final Future<?> writerFuture, final IIndex view, final Batch batch) {
if (lock == null)
throw new IllegalArgumentException();
if (queue == null)
throw new IllegalArgumentException();
if (writerFuture == null)
throw new IllegalArgumentException();
if (view == null)
throw new IllegalArgumentException();
if (batch == null)
throw new IllegalArgumentException();
this.readOnly = readOnly;
this.lock = lock;
this.queue = queue;
this.writerFuture = writerFuture;
this.view = view;
this.batch = batch;
}
@Override
public Void call() throws Exception {
if (view instanceof UnisolatedReadWriteIndex
|| (view instanceof ILocalBTreeView && ((ILocalBTreeView) view).getSourceCount() == 1)) {
// A single B+Tree object.
if (!(view instanceof ILinearList))
throw new AssertionError("Unexpected index type: " + view.getClass().getName()
+ " does not implement " + ILinearList.class.getName());
// Note: Can't take the lock here. Queue.put() will block if writer has lock.
doSimpleBTree(lock, view, batch, queue);
} else if (view instanceof ILocalBTreeView) {
// A fused view of a mutable BTree and one or more additional B+Tree objects.
doFusedView((ILocalBTreeView) view, batch, queue);
} else {
throw new AssertionError("Unexpected index type: " + view.getClass().getName());
}
return null;
}
/**
* This is the complex case. The index is not a single B+Tree, but some
* sort of fused ordered view of 2 or more B+Tree objects. For this case
* we do not have access to the {@link ILinearList} API.
*
* @param view
*
* TODO How can we explicitly test this case and assess the
* performance impact? Perhaps for a tx? This code path is
* not used by scale-out since we are not parallelizing
* within the thread for scale-out at this time (because the
* {@link IResultHandler} is not available.)
*/
static private void doFusedView(final ILocalBTreeView view, final Batch batch,
final LinkedBlockingQueue<Batch> queue) {
if (view == null)
throw new IllegalArgumentException();
if (batch == null)
throw new IllegalArgumentException();
if (queue == null)
throw new IllegalArgumentException();
// * @param firstKey
// * The first key for this reader and never null (the keys are
// * ordered but not necessarily dense and may not be null).
// * @param lastKey
// * The last key for this reader and never null (the keys are
// * ordered but not necessarily dense and may not be null).
//
// // Start at the beginning. Proceed until then end. Then stop.
// final byte[] firstKey = batch.keysView.get(0); // firstKey (inclusive)
// final byte[] lastKey = batch.keysView.get(batch.keysView.size() - 1); // lastKey (inclusive)
//
// TODO Auto-generated method stub
// final AbstractBTree sources[] = view.getSources();
// final int effectiveBranchingFactor;
// final IndexMetadata md = ndx.getIndexMetadata();
// if (ndx instanceof ILocalBTreeView) {
// int tmp = 0;
// final int nsources = ((ILocalBTreeView) ndx).getSourceCount();
// for (AbstractBTree btree : ((ILocalBTreeView) ndx).getSources()) {
// tmp += btree.getBranchingFactor();
// }
// tmp /= nsources;
// effectiveBranchingFactor = tmp;
// } else {
// effectiveBranchingFactor = md.getIndexSegmentBranchingFactor();
// }
// // Iterator used to seek along the index.
// final ITupleCursor<?> itr = (ITupleCursor<?>) ndx.rangeIterator(firstKey, null/* toKey */, 1/* capacity */,
// IRangeQuery.KEYS | IRangeQuery.VALS | IRangeQuery.CURSOR, null/* filterCtor */);
//
// // Note: itr.hasNext() will force in the page for the current key.
// while (itr.hasNext()) {
throw new UnsupportedOperationException();
}
/**
* This is the simple case. We have either an
* {@link UnisolatedReadWriteIndex} or an {@link AbstractBTree}.
*
* @param ndx
* Either an {@link UnisolatedReadWriteIndex} or an
* {@link AbstractBTree}.
*
* @throws InterruptedException
*/
private void doSimpleBTree(final ReentrantReadWriteLock lock, final IIndex ndx, final Batch batch,
final LinkedBlockingQueue<Batch> queue) throws InterruptedException {
if (lock == null)
throw new IllegalArgumentException();
if (ndx == null)
throw new IllegalArgumentException();
if (!(ndx instanceof UnisolatedReadWriteIndex) && !(ndx instanceof AbstractBTree)) {
throw new IllegalArgumentException("Index may be either: " + UnisolatedReadWriteIndex.class.getName()
+ " or " + AbstractBTree.class.getName() + ", but have " + ndx.getClass().getName());
}
if (batch == null)
throw new IllegalArgumentException();
/*
* The maximum #of keys in a leaf.
*/
final int branchingFactor = ndx.getIndexMetadata().getBranchingFactor();
final long evictRange = branchingFactor
* (spannedRangeMultiplier == 0 ? branchingFactor : spannedRangeMultiplier);
// The linear list position into the B+Tree for the current key.
long firstIndex = -1;
// The index into the raba for the start of the batch.
int firstRabaIndex = batch.fromIndex;
// The current index into the raba.
int currentRabaIndex = firstRabaIndex;
// Loop over the current index into the raba.
for (; currentRabaIndex < batch.toIndex; currentRabaIndex += skipCount) {
final byte[] currentKey = batch.keys.get(currentRabaIndex);
/*
* Advance index to the next caller's key.
*
* Note: the return from indexOf() will be negative if the key
* is not found in the index. If it is negative it is
* converted into an insert position. Either way it indicates
* where in the index the key exists / would be inserted.
*/
final long indexOf;
lock.readLock().lock();
try {
if (writerFuture.isDone()) {
/*
* If the writer hits an error condition, then the index
* can be left in an inconsistent state. At this point
* we MUST NOT read on the index.
*/
throw new RuntimeException("Writer is dead?");
}
long n = ((ILinearList) ndx).indexOf(currentKey);
if (n < 0) {
// Convert to an insert position.
n = -(n + 1);
}
indexOf = n;
} finally {
lock.readLock().unlock();
}
if (firstIndex == -1) {
firstIndex = indexOf;
}
/*
* The #of tuples that lie between the first key accepted and
* the current key (or the insert position for the current key).
*/
final long spannedRange = indexOf - firstIndex;
assert spannedRange >= 0; // should not be negative.
if (spannedRange >= evictRange) {
// Evict a batch (blocking put).
putOnQueue(new Batch(firstRabaIndex, currentRabaIndex, batch.keys, batch.vals));
// start a new batch.
firstRabaIndex = currentRabaIndex;
}
}
if ((currentRabaIndex - firstRabaIndex) > 0) {
// Last batch (blocking put).
putOnQueue(new Batch(firstRabaIndex, batch.toIndex, batch.keys, batch.vals));
}
}
/**
* Evict a batch (blocking put, but spins to look for an error in the
* writer {@link Future}).
*
* @param batch
* A batch.
*
* @throws InterruptedException
*/
private void putOnQueue(final Batch batch) throws InterruptedException {
AbstractKeyArrayIndexProcedure.putOnQueue(writerFuture, queue, batch);
}
} // ReaderTask
/**
* Evict a batch (blocking put, but spins to look for an error in the
* writerFuture to avoid a deadlock if the writer fails).
*
* @param writerFuture
* The {@link Future} of the {@link WriterTask} (required).
* @param queue
* The queue onto which the batches are being transferred
* (required).
* @param batch
* A batch (required).
*
* @throws InterruptedException
*/
private static void putOnQueue(final Future<?> writerFuture, final LinkedBlockingQueue<Batch> queue,
final Batch batch) throws InterruptedException {
while (!writerFuture.isDone()) {
if (queue.offer(batch, 100L, TimeUnit.MILLISECONDS)) {
return;
}
}
if (writerFuture.isDone()) {
/*
* This is most likely to indicate either an error or interrupt in the writer.
*/
throw new RuntimeException("Writer is done, but reader still working?");
}
}
/**
* Apply the procedure to the specified key range of the index.
*
* @param ndx
* The index.
* @param keys
* The keys to be processed.
* @param vals
* The values (optional, co-indexed with the keys).
*
* @return The result.
*/
abstract protected T applyOnce(final IIndex ndx, final IRaba keys, final IRaba vals);
@Override
final public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
readMetadata(in);
final boolean haveVals = in.readBoolean();
{
// the keys.
final int len = in.readInt();
final byte[] a = new byte[len];
in.readFully(a);
keys = keysCoder.decode(FixedByteArrayBuffer.wrap(a));
// keys = new MutableValuesRaba(0, 0, new byte[n][]);
//
// getKeysCoder().read(in, keys);
}
if (haveVals) {
/*
* Wrap the coded values.
*/
// the byte length of the coded values.
final int len = in.readInt();
// allocate backing array.
final byte[] a = new byte[len];
// read the coded record into the array.
in.readFully(a);
// wrap the coded record.
vals = valsCoder.decode(FixedByteArrayBuffer.wrap(a));
// vals = new MutableValuesRaba( 0, 0, new byte[n][] );
//
// getValuesCoder().read(in, vals);
} else {
vals = null;
}
}
@Override
final public void writeExternal(final ObjectOutput out) throws IOException {
writeMetadata(out);
out.writeBoolean(vals != null); // haveVals
final DataOutputBuffer buf = new DataOutputBuffer();
{
// code the keys
final AbstractFixedByteArrayBuffer slice = keysCoder.encode(keys,
buf);
// The #of bytes in the coded keys.
out.writeInt(slice.len());
// The coded keys.
slice.writeOn(out);
}
if (vals != null) {
// reuse the buffer.
buf.reset();
// code the values.
final AbstractFixedByteArrayBuffer slice = valsCoder.encode(vals,
buf);
// The #of bytes in the coded values.
out.writeInt(slice.len());
// The coded values.
slice.writeOn(out);
}
}
/**
* Reads metadata written by {@link #writeMetadata(ObjectOutput)}.
*
* @param in
*
* @throws IOException
* @throws ClassNotFoundException
*/
protected void readMetadata(final ObjectInput in) throws IOException,
ClassNotFoundException {
final byte version = in.readByte();
switch (version) {
case VERSION0:
break;
default:
throw new IOException();
}
// fromIndex = 0;
//
// toIndex = (int) LongPacker.unpackLong(in);
keysCoder = (IRabaCoder) in.readObject();
valsCoder = (IRabaCoder) in.readObject();
}
/**
* Writes metadata (not the keys or values, but just other metadata used by
* the procedure).
*
* The default implementation writes out the {@link #getKeysCoder()} and the
* {@link #getValuesCoder()}.
*
* @param out
*
* @throws IOException
*/
protected void writeMetadata(final ObjectOutput out) throws IOException {
out.write(VERSION0);
// final int n = toIndex - fromIndex;
//
// LongPacker.packLong(out, n);
out.writeObject(keysCoder);
out.writeObject(valsCoder);
}
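/*
* Extension sketch (hypothetical subclass fields, hedged): a derived
* procedure with extra state should extend both metadata hooks
* symmetrically, always delegating to super first:
*
* @Override
* protected void writeMetadata(final ObjectOutput out) throws IOException {
*     super.writeMetadata(out); // version, keys coder, vals coder.
*     out.writeBoolean(myFlag); // subclass-specific metadata (hypothetical).
* }
*
* @Override
* protected void readMetadata(final ObjectInput in) throws IOException,
*         ClassNotFoundException {
*     super.readMetadata(in);
*     myFlag = in.readBoolean();
* }
*/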
private static final byte VERSION0 = 0x00;
/**
* A class useful for sending some kinds of data back from a remote
* procedure call (those readily expressed as a <code>byte[][]</code>).
*
* @author Bryan Thompson
*/
public static class ResultBuffer implements Externalizable {
/**
*
*/
private static final long serialVersionUID = 3545214696708412869L;
private IRaba vals;
private IRabaCoder valsCoder;
/**
* De-serialization ctor.
*
*/
public ResultBuffer() {
}
/**
*
* @param n
* #of values in <i>a</i> containing data.
* @param a
* The data.
* @param valsCoder
* The data are serialized using this object. Typically
* this is the value returned by
* {@link ITupleSerializer#getLeafValuesCoder()}.
*/
public ResultBuffer(final int n, final byte[][] a,
final IRabaCoder valsCoder) {
assert n >= 0;
assert a != null;
assert valsCoder != null;
this.vals = new ReadOnlyValuesRaba(0/* fromIndex */, n/* toIndex */, a);
this.valsCoder = valsCoder;
}
public IRaba getValues() {
return vals;
}
/**
* @deprecated by {@link #getValues()}
*/
public int getResultCount() {
return vals.size();
}
/**
* @deprecated by {@link #getValues()}
*/
public byte[] getResult(final int index) {
return vals.get(index);
}
@Override
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
final byte version = in.readByte();
switch (version) {
case VERSION0:
break;
default:
throw new IOException();
}
// final int n = in.readInt();
// The values coder.
valsCoder = (IRabaCoder) in.readObject();
// The #of bytes in the coded values.
final int len = in.readInt();
final byte[] b = new byte[len];
in.readFully(b);
// Wrap the coded values.
vals = valsCoder.decode(FixedByteArrayBuffer.wrap(b));
// a = new MutableValuesRaba(0/* fromIndex */, 0/* toIndex */,
// n/* capacity */, new byte[n][]);
//
// valSer.read(in, a);
}
@Override
public void writeExternal(final ObjectOutput out) throws IOException {
out.writeByte(VERSION0);
// out.writeInt(a.size());
// The values coder.
out.writeObject(valsCoder);
// Code the values.
final AbstractFixedByteArrayBuffer slice = valsCoder.encode(vals,
new DataOutputBuffer());
// The #of bytes in the coded values.
out.writeInt(slice.len());
// The coded values.
slice.writeOn(out);
// valSer.write(out, a);
}
private static final byte VERSION0 = 0x00;
}
/**
* A class useful for sending a logical <code>boolean[]</code> back from a
* remote procedure call.
*
* @todo provide run-length coding for bits?
*
* @todo use {@link LongArrayBitVector} for more compact storage?
*
* @author Bryan Thompson
*/
public static class ResultBitBuffer implements Externalizable {
/**
*
*/
private static final long serialVersionUID = 1918403771057371471L;
private int n;
/**
* @todo represent using a {@link BitVector}. {@link LongArrayBitVector}
* when allocating. Directly write the long[] backing bits
* (getBits()) onto the output stream. Reconstruct from backing
* long[] when reading. Hide the boolean[] from the API by
* modifying {@link #getResult()} to accept the index of the bit
* of interest or to return the {@link BitVector} directly.
*/
// private BitVector a;
private boolean[] a;
transient private int onCount;
/**
* De-serialization ctor.
*/
public ResultBitBuffer() {
}
/**
*
* @param n
* #of values in <i>a</i> containing data.
* @param a
* The data.
* @param onCount
* The #of bits which were on in the array.
*/
public ResultBitBuffer(final int n, final boolean[] a, final int onCount) {
if (n < 0)
throw new IllegalArgumentException();
if (a == null)
throw new IllegalArgumentException();
if (onCount < 0 || onCount > n)
throw new IllegalArgumentException();
this.n = n;
this.a = a;
/*
* Note: The onCount is a required parameter because this class is
* used in non-RMI contexts as well where it is not deserialized and
* hence onCount will not be set unless it is done in this
* constructor.
*/
this.onCount = onCount;
}
public int getResultCount() {
return n;
}
/**
* Return the boolean[] containing the results.
*/
public boolean[] getResult() {
return a;
}
/**
* Return the #of bits which are "on" (aka true).
*/
public int getOnCount() {
return onCount;
}
@Override
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
final byte version = in.readByte();
switch (version) {
case VERSION0:
break;
default:
throw new UnsupportedOperationException("Unknown version: "
+ version);
}
@SuppressWarnings("resource")
final InputBitStream ibs = new InputBitStream((InputStream) in,
0/* unbuffered */, false/* reflectionTest */);
n = ibs.readNibble();
// a = LongArrayBitVector.getInstance(n);
a = new boolean[n];
for (int i = 0; i < n; i++) {
final boolean bit = ibs.readBit() == 1;
// a.set(i, bit);
if (a[i] = bit)
onCount++;
}
}
@Override
public void writeExternal(final ObjectOutput out) throws IOException {
out.writeByte(VERSION);
@SuppressWarnings("resource")
final OutputBitStream obs = new OutputBitStream((OutputStream) out,
0/* unbuffered! */, false/*reflectionTest*/);
obs.writeNibble(n);
// obs.write(a.iterator());
for (int i = 0; i < n; i++) {
obs.writeBit(a[i]);
}
obs.flush();
}
/**
* The initial version.
*/
private static final transient byte VERSION0 = 0;
/**
* The current version.
*/
private static final transient byte VERSION = VERSION0;
}
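/*
* Round-trip sketch (illustrative): ResultBitBuffer is Externalizable, so
* it can be exercised with plain Java object streams; the bits travel as a
* version byte, a nibble-coded length, and then one bit per element:
*
* final ResultBitBuffer b = new ResultBitBuffer(3, new boolean[] { true, false, true }, 2);
* final ByteArrayOutputStream baos = new ByteArrayOutputStream();
* try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
*     oos.writeObject(b);
* }
* try (ObjectInputStream ois = new ObjectInputStream(
*         new ByteArrayInputStream(baos.toByteArray()))) {
*     final ResultBitBuffer b2 = (ResultBitBuffer) ois.readObject();
*     assert b2.getOnCount() == 2;
* }
*/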
/**
* Knows how to aggregate {@link ResultBuffer} objects.
*
* @author Bryan Thompson
*/
public static class ResultBufferHandler implements
IResultHandler<ResultBuffer, ResultBuffer> {
private final byte[][] results;
private final IRabaCoder valsCoder;
public ResultBufferHandler(final int nkeys, final IRabaCoder valsCoder) {
this.results = new byte[nkeys][];
this.valsCoder = valsCoder;
}
@Override
public void aggregate(final ResultBuffer result, final Split split) {
final IRaba src = result.getValues();
for (int i = 0, j = split.fromIndex; i < split.ntuples; i++, j++) {
results[j] = src.get(i);
}
}
/**
* The aggregated results.
*/
@Override
public ResultBuffer getResult() {
return new ResultBuffer(results.length, results, valsCoder);
}
}
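/*
* Usage sketch (hypothetical names): each split writes its values at the
* split's offset into the shared result array, and the caller collects the
* total once every split has been aggregated:
*
* final ResultBufferHandler handler = new ResultBufferHandler(nkeys, valsCoder);
* handler.aggregate(partialResult, split); // once per split, in any order.
* final ResultBuffer total = handler.getResult(); // after all splits.
*/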
/**
* Knows how to aggregate {@link ResultBitBuffer} objects.
*/
public static class ResultBitBufferHandler implements
IResultHandler<ResultBitBuffer, ResultBitBuffer> {
private final boolean[] results;
/**
* I added this so I could encode information about tuple modification
* that takes more than one boolean to encode. For example, SPOs can
* be: INSERTED, REMOVED, UPDATED, NO_OP (2 bits).
*/
private final int multiplier;
private final AtomicInteger onCount = new AtomicInteger();
public ResultBitBufferHandler(final int nkeys) {
this(nkeys, 1);
}
public ResultBitBufferHandler(final int nkeys, final int multiplier) {
results = new boolean[nkeys * multiplier];
this.multiplier = multiplier;
}
@Override
public void aggregate(final ResultBitBuffer result, final Split split) {
System.arraycopy(result.getResult(), 0, results,
split.fromIndex*multiplier,
split.ntuples*multiplier);
onCount.addAndGet(result.getOnCount());
}
/**
* The aggregated results.
*/
@Override
public ResultBitBuffer getResult() {
return new ResultBitBuffer(results.length, results, onCount.get());
}
}
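/*
* Usage sketch (hypothetical): with multiplier = 2 each tuple owns two
* adjacent bits, enough to encode one of four per-tuple outcomes (e.g.
* INSERTED / REMOVED / UPDATED / NO_OP):
*
* final ResultBitBufferHandler h = new ResultBitBufferHandler(nkeys, 2);
* // Each aggregated ResultBitBuffer must then carry split.ntuples * 2 bits.
*/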
/**
* Counts the #of <code>true</code> bits in the {@link ResultBitBuffer}(s).
*/
public static class ResultBitBufferCounter implements
IResultHandler<ResultBitBuffer, Long> {
private final AtomicLong ntrue = new AtomicLong();
public ResultBitBufferCounter() {
}
@Override
public void aggregate(final ResultBitBuffer result, final Split split) {
int delta = 0;
for (int i = 0; i < result.n; i++) {
if (result.a[i])
delta++;
}
this.ntrue.addAndGet(delta);
}
/**
* The #of <code>true</code> values observed in the aggregated
* {@link ResultBitBuffer}s.
*/
@Override
public Long getResult() {
return ntrue.get();
}
}
}