
com.bigdata.relation.accesspath.AccessPath Maven / Gradle / Ivy
Show all versions of bigdata-core Show documentation
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jun 19, 2008
*/
package com.bigdata.relation.accesspath;
import java.util.Iterator;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RejectedExecutionException;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BufferAnnotations;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.ap.filter.SameVariableConstraint;
import com.bigdata.bop.cost.BTreeCostModel;
import com.bigdata.bop.cost.DiskCostModel;
import com.bigdata.bop.cost.IndexSegmentCostModel;
import com.bigdata.bop.cost.ScanCostReport;
import com.bigdata.bop.join.BaseJoinStats;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IBTreeStatistics;
import com.bigdata.btree.IBloomFilter;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.Tuple;
import com.bigdata.btree.UnisolatedReadWriteIndex;
import com.bigdata.btree.isolation.IsolatedFusedView;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.proc.ISimpleIndexProcedure;
import com.bigdata.btree.view.FusedView;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Journal;
import com.bigdata.journal.NoSuchIndexException;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.relation.AbstractResource;
import com.bigdata.relation.IRelation;
import com.bigdata.service.AbstractClient;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.ndx.IClientIndex;
import com.bigdata.service.ndx.IScaleOutClientIndex;
import com.bigdata.striterator.ChunkedArrayIterator;
import com.bigdata.striterator.ChunkedWrappedIterator;
import com.bigdata.striterator.EmptyChunkedIterator;
import com.bigdata.striterator.IChunkedIterator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
import cutthecrap.utils.striterators.FilterBase;
import cutthecrap.utils.striterators.ICloseableIterator;
import cutthecrap.utils.striterators.IFilter;
import cutthecrap.utils.striterators.NOPFilter;
import cutthecrap.utils.striterators.Striterator;
/**
* Abstract base class for type-specific {@link IAccessPath} implementations.
*
* Note: Filters should be specified when the {@link IAccessPath} is constructed
* so that they will be evaluated on the data service rather than materializing
* the elements and then filtering them. This can be accomplished by adding the
* filter as a constraint on the predicate when specifying the access path.
*
* @author Bryan Thompson
* @version $Id$
* @param <R>
* The generic type of the elements of the {@link IRelation}.
*
* @todo Add support for non-perfect access paths. This class should layer on an
* index local {@link IFilter} which rejects tuples which do not satisfy
* the {@link IPredicate}'s bindings. This will give the effect of a SCAN
* with an implied filter. The javadoc on
* {@link IRelation#getKeyOrder(IPredicate)} should also be updated to
* reflect the allowance for non-perfect access paths.
*/
public class AccessPath implements IAccessPath, IBindingSetAccessPath {
/** Logger shared by this class (registered under the {@link IAccessPath} name). */
static final protected Logger log = Logger.getLogger(IAccessPath.class);
/**
* Debug flag, captured once when this class is initialized. Later changes to
* the logger level are not observed through this constant.
*/
private static final boolean DEBUG = log.isDebugEnabled();
/** The relation. The constructor rejects a <code>null</code> relation. */
private final IRelation relation;
/** Access to the index, resource locator, executor service, etc. */
protected final IIndexManager indexManager;
/** Timestamp of the view. */
protected final long timestamp;
/** Predicate (the resource name on the predicate is the relation namespace). */
protected final IPredicate predicate;
/**
* The description of the index partition iff the {@link #predicate} is
* constrained to an index partition and <code>null</code> otherwise.
*/
final LocalPartitionMetadata pmd;
/**
* Index order (the relation namespace plus the index order and the optional
* partitionId constraint on the predicate identify the index).
*/
protected final IKeyOrder keyOrder;
/** The index. */
protected final IIndex ndx;
/** Iterator flags. */
protected final int flags;
/**
* The {@link BufferAnnotations#CHUNK_OF_CHUNKS_CAPACITY} property of the
* predicate (or its default). Used as the capacity of the buffer created
* for asynchronous reads.
*/
protected final int chunkOfChunksCapacity;
/**
* The {@link BufferAnnotations#CHUNK_CAPACITY} property of the predicate (or
* its default). Used as the chunk size for asynchronous reads.
*/
protected final int chunkCapacity;
/**
* The {@link IPredicate.Annotations#FULLY_BUFFERED_READ_THRESHOLD} property
* of the predicate (or its default). An unlimited read whose remaining range
* count is below this value is fully buffered.
*/
protected final int fullyBufferedReadThreshold;
/**
* <code>true</code> iff the {@link IPredicate} is fully bound for the
* {@link #keyOrder}.
*/
private final boolean isFullyBoundForKey;
/**
* <code>true</code> iff there is a filter for the access path (either local
* or remote).
*/
private final boolean hasFilter;
/**
 * Reports whether any filter applies to this access path.
 *
 * @return <code>true</code> iff there is a filter for the access path
 *         (either local or remote).
 */
public final boolean hasFilter() {
    return this.hasFilter;
}
/**
 * Reports whether the predicate is fully bound for the key.
 *
 * @return <code>true</code> iff all elements in the predicate which are
 *         required to generate the key are bound to constants.
 */
public boolean isFullyBoundForKey() {
    return this.isFullyBoundForKey;
}
/**
 * The chunk capacity configured for this access path.
 *
 * @see AbstractResource#getChunkCapacity()
 */
public int getChunkCapacity() {
    return this.chunkCapacity;
}
/**
 * The chunk-of-chunks capacity configured for this access path.
 *
 * @see AbstractResource#getChunkOfChunksCapacity()
 */
public int getChunkOfChunksCapacity() {
    return this.chunkOfChunksCapacity;
}
/**
* The maximum limit that is allowed for a fully-buffered read. A request to
* {@link #iterator(long, long, int)} with a limit above this value is
* rejected with an {@link UnsupportedOperationException}.
*
* FIXME Array limits in truth maintenance code. This should probably be
* close to the branching factor or chunk capacity. It has been temporarily
* raised to a very large value in order to support truth maintenance where
* the code assumes access to the fully buffered result. That change needs
* to be examined for an impact on query performance. It is effectively
* forcing all access path reads to be fully buffered rather than using an
* asynchronous iterator pattern.
*
* See the ticket "Array limits in truth maintenance code" (the link was not
* preserved in this copy of the source).
*/
protected static final int MAX_FULLY_BUFFERED_READ_LIMIT = 10000000;
/**
* <code>true</code> iff the {@link #timestamp} of the view is read-only. We
* cache some stuff for historical reads.
*
* Note: We cache results on a per-{@link IAccessPath} basis rather than a
* per-{@link IIndex} basis since range counts and range iterators are both
* constrained to a specific key range of interest for an
* {@link IAccessPath} while they would span the entire {@link IIndex}
* otherwise.
*
* @todo cache the {@link IAccessPath}s themselves so that we benefit from
* reuse of the cached data.
*
* @todo we could also cache small iterator result sets.
*/
private final boolean historicalRead;
/**
* For {@link #historicalRead}s only, the range count is cached once it is
* computed. It is also set if we discover using {@link #isEmpty()} or
* {@link #iterator(long, long, int)} that the {@link IAccessPath} is empty.
* Likewise, those methods test this field to see if we have proven the
* {@link IAccessPath} to be empty. The initial value of <code>-1</code>
* means that nothing has been cached yet.
*/
private long rangeCount = -1L;
/**
* The filter derived from optional
* {@link IPredicate.Annotations#INDEX_LOCAL_FILTER}. If there are shared
* variables in the {@link IPredicate} then a {@link SameVariableConstraint}
* is added regardless of whether the {@link IPredicate} specified a filter
* or not.
*/
final protected IFilter indexLocalFilter;
/**
* The filter derived from optional
* {@link IPredicate.Annotations#ACCESS_PATH_FILTER}.
*/
final protected IFilter accessPathFilter;
/**
* Used to detect failure to call {@link #init()}. Set to <code>true</code>
* by the first (and only permitted) call to {@link #init()}.
*/
private boolean didInit = false;
/**
* The from key computed by the {@link #keyOrder} for the {@link #predicate}
* (see {@link #getFromKey()}).
*/
private final byte[] fromKey;
/**
* The to key computed by the {@link #keyOrder} for the {@link #predicate}
* (see {@link #getToKey()}).
*/
private final byte[] toKey;
/**
 * The key corresponding to the inclusive lower bound for the
 * {@link IAccessPath} -or- <code>null</code> if there is no lower bound.
 *
 * @return The internal key array (not a copy), or <code>null</code>.
 */
final public byte[] getFromKey() {
    return this.fromKey;
}
/**
 * The key corresponding to the exclusive upper bound for the
 * {@link IAccessPath} -or- <code>null</code> if there is no upper bound.
 *
 * @return The internal key array (not a copy), or <code>null</code>.
 */
final public byte[] getToKey() {
    return this.toKey;
}
/**
 * The key order given to the constructor, i.e. the order in which elements
 * are visited by this access path.
 */
@Override
final public IKeyOrder getKeyOrder() {
    return this.keyOrder;
}
/**
* Creates the access path: selects the index manager, the timestamp of the
* view and the index (either a specific index partition or the index for
* the key order), reads the buffer related annotations from the predicate,
* assembles the filters and computes the from/to keys. {@link #init()} MUST
* be invoked on the new instance before it is used.
*
* @param relation
* The relation for the access path (required; the Java code below
* rejects a <code>null</code> relation).
* @param localIndexManager
* Access to the indices, resource locators, executor service,
* etc. Only consulted when the relation's index manager is an
* {@link IBigdataFederation} and the access path is not remote,
* in which case it MUST NOT be <code>null</code>.
* @param predicate
* The constraints on the access path.
* @param keyOrder
* The order in which the elements would be visited for this
* access path.
*
* @throws IllegalArgumentException
* if the relation, the predicate or the keyOrder is
* <code>null</code>.
* @throws RuntimeException
* if a local index manager is required but was not given, if a
* remote access path is combined with a partition identifier,
* if the index can not be found, or if the index found is not
* the requested index partition.
*/
public AccessPath(//
final IRelation relation,//
final IIndexManager localIndexManager, //
final IPredicate predicate,//
final IKeyOrder keyOrder //
) {
if (relation == null)
throw new IllegalArgumentException();
if (predicate == null)
throw new IllegalArgumentException();
if (keyOrder == null)
throw new IllegalArgumentException();
this.relation = relation;
// -1 means that the predicate is not constrained to an index partition.
final int partitionId = predicate.getPartitionId();
/*
* If the predicate is addressing a specific shard, then the default is
* to assume that it will not be using a remote access path. However, if
* a remote access path was explicitly requested and the partitionId was
* specified, then it will be an error (which is trapped below).
*/
final boolean remoteAccessPath = predicate
.getProperty(
IPredicate.Annotations.REMOTE_ACCESS_PATH,
partitionId == -1 ? IPredicate.Annotations.DEFAULT_REMOTE_ACCESS_PATH
: false);
/*
* Choose the right index manager. If relation.getIndexManager() is not
* a federation, then always use that index manager. Otherwise, if the AP
* is REMOTE use the relation's index manager. Otherwise, the
* localIndexManager MUST NOT be null and we will use it.
*/
if (!(relation.getIndexManager() instanceof IBigdataFederation)) {
this.indexManager = relation.getIndexManager();
} else if (remoteAccessPath) {
this.indexManager = relation.getIndexManager();
} else {
if (localIndexManager == null) {
throw new RuntimeException("Local index manager not given but"
+ " access path specifies local index: pred="+predicate);
}
this.indexManager = localIndexManager;
}
this.predicate = predicate;
this.keyOrder = keyOrder;
final int flags = predicate.getProperty(
IPredicate.Annotations.FLAGS,
IPredicate.Annotations.DEFAULT_FLAGS);
this.flags = flags;
/*
* Choose the timestamp of the view. If the request is for the
* unisolated index but the predicate was flagged as READONLY then
* automatically choose READ_COMMITTED instead.
*/
{
long timestamp = relation.getTimestamp();
timestamp = (timestamp == ITx.UNISOLATED
&& (flags & IRangeQuery.READONLY) != 0 ? ITx.READ_COMMITTED
: timestamp);
this.timestamp = timestamp;
}
// Note: [timestamp] is the field here (the local above is out of scope).
this.historicalRead = TimestampUtility.isReadOnly(timestamp);
// final int partitionId = predicate.getPartitionId();
final IIndex ndx;
if (partitionId != -1) {
if (remoteAccessPath) {
/*
* A request for a specific shard is not compatible with a
* request for a remote access path.
*/
throw new RuntimeException("Annotations are not compatible: "
+ IPredicate.Annotations.REMOTE_ACCESS_PATH + "="
+ remoteAccessPath + ", but "
+ IPredicate.Annotations.PARTITION_ID + "="
+ partitionId + " for "+predicate
);
}
final String namespace = relation.getNamespace();
// The name of the desired index partition.
final String name = DataService.getIndexPartitionName(namespace
+ "." + keyOrder.getIndexName(), partitionId);
try {
// MUST be a local index view.
ndx = (ILocalBTreeView) indexManager.getIndex(name, timestamp);
} catch (Throwable t) {
// Wrap with the predicate so the failing request can be identified.
throw new RuntimeException(predicate.toString(), t);
}
if (ndx == null) {
/*
* Some possible root causes for failing to find a shard on a DS
* are listed below. You should verify that the addressed shard
* was actually present on the addressed data service as of the
* effective read time of the request.
*
*
* - The as-bound predicate was mapped onto the wrong shard.
* Some subtle problems have been tracked back to this. See
* https://sourceforge.net/apps/trac/bigdata/ticket/457. There
* was also a problem where we were failing to use the as-bound
* predicate when mapping the predicate onto a shard.
*
* - A failure in IndexManager to locate the shard. This could
* include concurrency holes in the indexCache, the access to
* the journal for the appropriate commit time, a
* read-historical request without a read-lock (application
* error), etc.
*
* - The shard was moved (but this will be a
* StaleLocatorException and can only occur with the unisolated
* index view, at least until we implement shard caching as part
* of the hybrid shared disk / shared nothing architecture).
*/
// // For debugging only - comment this out.
// dumpMDI((AbstractScaleOutFederation) relation
// .getIndexManager(), relation.getNamespace(), timestamp,
// keyOrder);
throw new RuntimeException("No such index: relation="
+ relation.getNamespace() + ", timestamp=" + timestamp
+ ", keyOrder=" + keyOrder + ", pred=" + predicate
+ ", indexManager=" + indexManager);
}
/*
* An index partition constraint was specified, so verify that we
* were given a local index object and that the index object is for
* the correct index partition.
*/
pmd = ndx.getIndexMetadata().getPartitionMetadata();
if (pmd == null)
throw new RuntimeException("Not an index partition");
if (pmd.getPartitionId() != partitionId) {
throw new RuntimeException("Expecting partitionId="
+ partitionId + ", but have " + pmd.getPartitionId());
}
} else {
// The predicate is not constrained to an index partition.
pmd = null;
/*
* Obtain the index.
*
* FIXME The getIndex(IKeyOrder) code path is optimized by
* SPORelation and LexiconRelation. However, we should have
* automatic caching of the index references to avoid the
* significant penalty of going down to the commitRecordIndex and
* Name2Addr each time we need to resolve an index. (Scale-out has
* separate caching for this in IndexManager.)
*/
ndx = relation.getIndex(keyOrder);
// final String fqn = AbstractRelation.getFQN(relation, keyOrder);
//
// ndx = AbstractRelation.getIndex(indexManager, fqn, timestamp);
if (ndx == null) {
throw new RuntimeException("No such index: relation="
+ relation.getNamespace() + ", timestamp=" + timestamp
+ ", keyOrder=" + keyOrder + ", pred=" + predicate
+ ", indexManager=" + indexManager);
}
}
this.ndx = ndx;
/**
* See AST2BOpUtility.toPredicate(). It is responsible for copying these
* annotations from the StatementPatternNode onto the Predicate so they
* can influence the behavior of the AccessPath.
*
* See also the ticket "Clean up query hints" (the link was not preserved
* in this copy of the source).
*/
final int chunkOfChunksCapacity = predicate.getProperty(
BufferAnnotations.CHUNK_OF_CHUNKS_CAPACITY,
BufferAnnotations.DEFAULT_CHUNK_OF_CHUNKS_CAPACITY);
final int chunkCapacity = predicate.getProperty(
BufferAnnotations.CHUNK_CAPACITY,
BufferAnnotations.DEFAULT_CHUNK_CAPACITY);
final int fullyBufferedReadThreshold = predicate.getProperty(
IPredicate.Annotations.FULLY_BUFFERED_READ_THRESHOLD,
IPredicate.Annotations.DEFAULT_FULLY_BUFFERED_READ_THRESHOLD);
this.chunkOfChunksCapacity = chunkOfChunksCapacity;
this.chunkCapacity = chunkCapacity;
this.fullyBufferedReadThreshold = fullyBufferedReadThreshold;
this.isFullyBoundForKey = predicate.isFullyBound(keyOrder);
{
/*
* The filter to be evaluated at the index (optional).
*
* Note: This MUST be an implementation which is "aware" of the
* reuse of tuples within tuple iterators.
*
* @todo if not a perfect index then impose additional filter first
* to skip over tuples which do not satisfy the concrete asBound
* predicate. This allows us to use the "best" index, not just a
* "perfect" index.
*/
final IFilter indexLocalFilter = predicate.getIndexLocalFilter();
/*
* Optional constraint enforces the "same variable" constraint. The
* constraint will be null unless at least one variable appears in
* more than one position in the predicate.
*/
final SameVariableConstraint sameVarConstraint = SameVariableConstraint
.newInstance(predicate);
if (sameVarConstraint != null) {
/*
* Stack filters: the predicate's index local filter (if any)
* followed by the same variable constraint.
*/
final FilterBase tmp = new NOPFilter();
if (indexLocalFilter != null)
tmp.addFilter(indexLocalFilter);
tmp.addFilter(new SameVariableConstraintTupleFilter(
sameVarConstraint));
this.indexLocalFilter = tmp;
} else {
this.indexLocalFilter = indexLocalFilter;
}
}
// optional filter to be evaluated by the AccessPath.
this.accessPathFilter = predicate.getAccessPathFilter();
// true iff there is a filter (either local or remote). Note: the local
// variable above is out of scope, so [indexLocalFilter] is the field
// here (which includes any same variable constraint).
this.hasFilter = (indexLocalFilter != null || accessPathFilter != null);
// The key builder comes from the tuple serializer of the index.
final IKeyBuilder keyBuilder = ndx.getIndexMetadata()
.getTupleSerializer().getKeyBuilder();
fromKey = keyOrder.getFromKey(keyBuilder, predicate);
toKey = keyOrder.getToKey(keyBuilder, predicate);
}
/**
 * Human readable representation: the runtime class name followed by the
 * predicate, key order, iterator flags, from/to keys, filters and index
 * manager. Absent keys and filters are rendered as "n/a".
 */
@Override
public String toString() {
    final StringBuilder sb = new StringBuilder();
    sb.append(getClass().getName());
    sb.append("{predicate=").append(predicate);
    sb.append(", keyOrder=").append(keyOrder);
    sb.append(", flags=").append(Tuple.flagString(flags));
    sb.append(", fromKey=");
    sb.append(fromKey == null ? "n/a" : BytesUtil.toString(fromKey));
    sb.append(", toKey=");
    sb.append(toKey == null ? "n/a" : BytesUtil.toString(toKey));
    sb.append(", hasFilter=").append(hasFilter);
    sb.append(", indexLocalFilter=");
    sb.append(indexLocalFilter == null ? "n/a" : indexLocalFilter);
    sb.append(", accessPathFilter=");
    sb.append(accessPathFilter == null ? "n/a" : accessPathFilter);
    sb.append(", indexManager=").append(indexManager);
    sb.append("}");
    return sb.toString();
}
/**
 * Guard used by operations which require that {@link #init()} has run.
 *
 * @throws IllegalStateException
 *             unless {@link #init()} has been invoked.
 */
final protected void assertInitialized() {
    if (didInit) {
        return;
    }
    throw new IllegalStateException();
}
/**
 * Required post-ctor initialization. May be invoked at most once. When
 * debug logging was enabled, this also verifies that the from key is
 * ordered before the to key (if both exist) and logs this access path.
 *
 * @return this
 *
 * @throws IllegalStateException
 *             if this method was already invoked.
 */
public AccessPath init() {
    if (didInit) {
        throw new IllegalStateException();
    }
    didInit = true;
    if (!DEBUG) {
        return this;
    }
    final boolean bounded = fromKey != null && toKey != null;
    if (bounded && BytesUtil.compareBytes(fromKey, toKey) >= 0) {
        throw new AssertionError("keys are out of order: " + toString());
    }
    log.debug(toString());
    return this;
}
/** The relation given to the constructor. */
public IRelation getRelation() {
    return this.relation;
}
/** The index manager selected by the constructor for this access path. */
public IIndexManager getIndexManager() {
    return this.indexManager;
}
/** The timestamp of the view selected by the constructor. */
public long getTimestamp() {
    return this.timestamp;
}
/** The predicate given to the constructor. */
@Override
public IPredicate getPredicate() {
    return this.predicate;
}
/** The index resolved by the constructor for this access path. */
@Override
public IIndex getIndex() {
    return this.ndx;
}
/**
 * Tests whether the access path would visit anything by requesting an
 * iterator with a limit and capacity of ONE (1). For historical reads, a
 * cached range count is used when available and a proof of emptiness is
 * cached.
 *
 * @throws IllegalStateException
 *             unless {@link #init()} has been invoked.
 *
 * @todo for scale-out, it may be better to implement {@link #isEmpty()}
 *       without specifying a capacity of ONE (1) and then caching the
 *       returned iterator. This could avoid an expensive RMI test if we
 *       invoke {@link #iterator()} shortly after {@link #isEmpty()} returns
 *       <code>false</code>.
 */
@Override
public boolean isEmpty() {
    assertInitialized();
    final boolean haveCachedCount = historicalRead && rangeCount != -1;
    if (haveCachedCount) {
        // Historical read whose range count is already known.
        return rangeCount == 0L;
    }
    if (DEBUG) {
        log.debug(toString());
    }
    final IChunkedIterator probe = iterator(0L/* offset */, 1L/* limit */,
            1/* capacity */);
    try {
        if (probe.hasNext()) {
            return false;
        }
        if (historicalRead) {
            // the access path is known to be empty.
            rangeCount = 0L;
        }
        return true;
    } finally {
        probe.close();
    }
}
// /**
// * {@inheritDoc}
// *
// * @see https://sourceforge.net/apps/trac/bigdata/ticket/209 (Access path
// * should visit solutions for high level query).
// */
// public ICloseableIterator solutions(final BaseJoinStats stats) {
//
//// final IVariable[] vars = BOpUtility
//// .getDistinctArgumentVariables(predicate);
//
// return BOpContext.solutions(iterator(), predicate, /*vars,*/ stats);
//
// }
/**
 * {@inheritDoc}
 * <p>
 * The elements are read with a zero offset, the given limit and a zero
 * capacity, and are then handed to the context together with the predicate
 * and the statistics object.
 *
 * @see https://sourceforge.net/apps/trac/bigdata/ticket/209 (Access path
 *      should visit solutions for high level query).
 */
@Override
public ICloseableIterator solutions(final BOpContext context, final long limit,
        final BaseJoinStats stats) {
    final IChunkedOrderedIterator elements = iterator(0L/* offset */, limit,
            0/* capacity */);
    return context.solutions(elements, predicate, stats);
}
/**
 * Visits everything selected by the access path: zero offset, no limit and
 * a zero capacity.
 */
@Override
final public IChunkedOrderedIterator iterator() {
    final long noOffset = 0L;
    final long noLimit = 0L;
    return iterator(noOffset, noLimit, 0);
}
// final public IChunkedOrderedIterator iterator(final int limit,
// final int capacity) {
//
// return iterator(0L/* offset */, limit, capacity);
//
// }
/**
 * Returns an iterator visiting the elements selected by this access path,
 * optionally restricted to the slice described by the offset and limit.
 * <p>
 * The read is fully buffered (synchronous) when the predicate is fully
 * bound for the key, when a limit was given, or when the offset corrected
 * range count is below the fully buffered read threshold. Otherwise the
 * elements are produced asynchronously.
 *
 * @param offset
 *            The number of leading elements to skip (non-negative).
 * @param limit
 *            The maximum number of elements to visit (non-negative). Both
 *            ZERO (0) and {@link Long#MAX_VALUE} mean "no limit".
 * @param capacity
 *            The capacity passed to the range iterator. It is replaced
 *            whenever the read is fully buffered.
 *
 * @return The iterator (possibly empty).
 *
 * @throws IllegalArgumentException
 *             if the offset or the limit is negative.
 * @throws UnsupportedOperationException
 *             if the limit exceeds {@link #MAX_FULLY_BUFFERED_READ_LIMIT}.
 * @throws RejectedExecutionException
 *             if the iterator is run asynchronously and the
 *             {@link ExecutorService} is shutdown or has a maximum capacity
 *             and is saturated.
 *
 *             FIXME Support both offset and limit for asynchronous
 *             iterators. right now this will force the use of the
 *             {@link #synchronousIterator(long, long, Iterator)} when the
 *             offset or limit are non-zero, but that is only permitted up
 *             to a limit of {@link #MAX_FULLY_BUFFERED_READ_LIMIT}.
 *
 *             FIXME in order to support large limits we need to verify that
 *             the asynchronous iterator can correctly handle REMOVEALL and
 *             that incremental materialization up to the [limit] will not
 *             affect the semantics for REMOVEALL or the other iterator
 *             flags (per above). (In fact, the asynchronous iterator does
 *             not support either [offset] or [limit] at this time).
 *
 *             FIXME write unit tests for slice handling by this method and
 *             modify the SAIL integration to use it for SLICE on an
 *             {@link IAccessPath} scan. Note that there are several
 *             {@link IAccessPath} implementations and they all need to be
 *             tested with SLICE.
 *
 *             Those tests should be located in
 *             {@link com.bigdata.rdf.spo.TestSPOAccessPath}.
 *
 *             FIXME The offset and limit should probably be rolled into the
 *             predicate and removed from the {@link IAccessPath}. This way
 *             they will be correctly applied when {@link #isEmpty()} is
 *             implemented using the {@link #iterator()} to determine if any
 *             elements can be visited.
 */
@Override
@SuppressWarnings("unchecked")
final public IChunkedOrderedIterator iterator(final long offset,
        long limit, int capacity) {
    if (offset < 0) {
        throw new IllegalArgumentException();
    }
    if (limit < 0) {
        throw new IllegalArgumentException();
    }
    if (limit == Long.MAX_VALUE) {
        // treat MAX_VALUE as meaning NO limit.
        limit = 0L;
    }
    if (limit > MAX_FULLY_BUFFERED_READ_LIMIT) {
        // Note: remove constraint when async itr supports SLICE.
        throw new UnsupportedOperationException("limit=" + limit
                + " exceeds maximum fully buffered read limit: "
                + MAX_FULLY_BUFFERED_READ_LIMIT);
    }
    if (historicalRead && rangeCount >= 0L && ((rangeCount - offset) <= 0L)) {
        /*
         * The access path has already been proven to be empty (or to have
         * no elements beyond the offset).
         */
        if (DEBUG) {
            log.debug("Proven empty by historical range count");
        }
        return new EmptyChunkedIterator(keyOrder);
    }
    if (DEBUG) {
        log.debug("offset=" + offset + ", limit=" + limit + ", capacity="
                + capacity + ", accessPath=" + this);
    }
    final boolean fullyBufferedRead;
    // true iff a point test is a hit on the bloom filter.
    boolean bloomHit = false;
    if (isFullyBoundForKey) {
        if (DEBUG) {
            log.debug("Predicate is fully bound for the key.");
        }
        /*
         * If the predicate is fully bound then there can be at most one
         * element matched so we constrain the limit and capacity
         * accordingly.
         */
        if (offset > 0L) {
            // the iterator will be empty if the offset is GT zero.
            return new EmptyChunkedIterator(keyOrder);
        }
        capacity = 1;
        limit = 1L;
        fullyBufferedRead = true;
        /*
         * Note: Since this is a point test, we apply the bloom filter for
         * fast rejection. However, we can only apply the bloom filter if
         * (a) you are using the local index object (either a BTree or a
         * FusedView); and (b) the bloom filter exists (and is enabled).
         *
         * Note: The scale-out case is dealt with by pipelining the
         * intermediate binding sets to the data service on which the index
         * partition resides, at which point we again can apply the local
         * bloom filter efficiently.
         */
        if (ndx instanceof ILocalBTreeView) {
            final IBloomFilter filter = ((ILocalBTreeView) ndx)
                    .getBloomFilter();
            if (filter != null) {
                if (!filter.contains(fromKey)) {
                    // proven to not exist.
                    return new EmptyChunkedIterator(keyOrder);
                }
                // Remember the hit so a false positive can be reported.
                bloomHit = true;
            }
        }
    } else if (limit > 0L) {
        /*
         * A [limit] was specified.
         *
         * NOTE: When the [limit] is (GT ZERO) we MUST NOT let the
         * DataService layer iterator read more than [limit] elements at a
         * time.
         *
         * This is part of the contract for REMOVEALL - when you set the
         * [limit] and specify REMOVEALL you are only removing the 1st
         * [limit] elements in the traversal order.
         *
         * This is also part of the atomic queue operations contract - the
         * head and tail queue operations function by specifying [limit :=
         * 1] (tail also specifies the REVERSE traversal option).
         *
         * Note: When the [limit] is specified we always do a fully buffered
         * (aka synchronous) read. This simplifies the behavior of the
         * iterator and limits are generally quite small.
         */
        capacity = (int) limit;
        fullyBufferedRead = true;
    } else {
        /*
         * No limit was specified (limit is ZERO here).
         *
         * Range count the access path and use a synchronous read if the
         * rangeCount is LT the threshold.
         *
         * Note: the range count is corrected by the offset so that it gives
         * the effective remaining range count. When the effective remaining
         * range count is zero we know that the iterator will not visit
         * anything.
         *
         * @todo this kind of rangeCount might be replaced by an estimated
         * range count based on historical data and NOT requiring RMI.
         */
        final long rangeCountRemaining = rangeCount(false/* exact */)
                - offset;
        if (DEBUG) {
            log.debug("offset=" + offset + ", limit=" + limit
                    + ", rangeCountRemaining=" + rangeCountRemaining
                    + ", fullyBufferedReadThreashold="
                    + fullyBufferedReadThreshold);
        }
        if (rangeCountRemaining <= 0) {
            /*
             * Since the range count is an upper bound we KNOW that the
             * iterator would not visit anything.
             */
            if (DEBUG) {
                log.debug("No elements based on range count.");
            }
            return new EmptyChunkedIterator(keyOrder);
        }
        if (rangeCountRemaining < fullyBufferedReadThreshold) {
            /*
             * Limit the read to the #of remaining elements. The limit is
             * known to be ZERO on this code path (negative limits were
             * rejected and positive limits were handled above), so there is
             * nothing to take the minimum with.
             */
            limit = rangeCountRemaining;
            // adjust capacity to no more than the maximum capacity.
            capacity = (int) Math.min(MAX_FULLY_BUFFERED_READ_LIMIT, limit);
            fullyBufferedRead = true;
        } else {
            fullyBufferedRead = false;
        }
    }
    /*
     * Note: The [capacity] gets passed through to the DataService layer.
     *
     * Note: The ElementFilter on the IPredicate (if any) is encapsulated
     * within [indexLocalFilter] and is passed through to the DataService
     * layer. It MUST be Serializable and it will be executed right up
     * against the data.
     *
     * FIXME pass the offset and limit into the source iterator
     * (IRangeQuery, ITupleIterator). This will require a lot of changes to
     * the code as that gets used everywhere.
     */
    // The raw tuple iterator: the impl depends on the IIndex impl (BTree,
    // IndexSegment, ClientIndexView, or DataServiceIndexView).
    final ITupleIterator tupleItr = rangeIterator(capacity, flags,
            indexLocalFilter);
    // Wrap raw tuple iterator with resolver that materializes the elements
    // from the visited tuples.
    final Iterator src = new Striterator(tupleItr)
            .addFilter(new TupleObjectResolver());
    if (accessPathFilter != null) {
        /*
         * Chain in the optional access path filter stack.
         */
        ((Striterator) src).addFilter(accessPathFilter);
    }
    if (fullyBufferedRead) {
        /*
         * Synchronous fully buffered read of no more than [limit] elements.
         */
        final IChunkedOrderedIterator tmp = synchronousIterator(offset,
                limit, src);
        if (bloomHit) {
            if (!tmp.hasNext()) {
                // notify filter of a false positive.
                ((ILocalBTreeView) ndx).getBloomFilter().falsePos();
            }
        }
        return tmp;
    } else {
        /*
         * Asynchronous read (does not support either offset or limit for
         * now).
         */
        assert offset == 0L : "offset=" + offset;
        assert limit == 0L : "limit=" + limit;
        return asynchronousIterator(src);
    }
}
/**
 * Fully buffers the elements visited by the source iterator, after skipping
 * the first <i>offset</i> elements and reading no more than <i>limit</i>
 * elements.
 *
 * @param offset
 *            The first element that will be materialized (non-negative).
 * @param limit
 *            The maximum #of elements that will be materialized (must be
 *            positive, so use a range count before calling this method if
 *            there was no limit specified by the caller). This is also the
 *            size of the array which is allocated once the first element
 *            has been read.
 * @param src
 *            The source iterator visiting the elements.
 *
 * @return An iterator over the buffered elements. This is an empty iterator
 *         if no element remained after skipping the offset.
 *
 * @throws IllegalArgumentException
 *             if the offset is negative or the limit is not positive.
 *
 *             FIXME pass the offset and limit into the source iterator and
 *             remove them from this method's signature. This will require a
 *             change to the {@link IRangeQuery} API and
 *             {@link ITupleIterator} impls.
 */
@SuppressWarnings("unchecked")
final protected IChunkedOrderedIterator synchronousIterator(
        final long offset, final long limit, final Iterator src) {
    if (offset < 0) {
        throw new IllegalArgumentException();
    }
    if (limit <= 0) {
        throw new IllegalArgumentException();
    }
    // Note: iterator(long,long,int) accepts a limit equal to the maximum.
    assert limit <= MAX_FULLY_BUFFERED_READ_LIMIT : "limit=" + limit
            + ", max=" + MAX_FULLY_BUFFERED_READ_LIMIT;
    if (DEBUG) {
        log.debug("offset=" + offset + ", limit=" + limit);
    }
    // #of elements consumed from the source (a long since offset is a long).
    long nread = 0L;
    // #of elements copied into the buffer.
    int nused = 0;
    // skip past the offset elements.
    while (nread < offset && src.hasNext()) {
        src.next();
        nread++;
    }
    // read up to [limit] elements into the buffer.
    R[] buffer = null;
    while (nused < limit && src.hasNext()) {
        final R e = src.next();
        if (buffer == null) {
            // The component type is taken from the first element visited.
            buffer = (R[]) java.lang.reflect.Array.newInstance(
                    e.getClass(), (int) limit);
        }
        buffer[nused] = e;
        nused++;
        nread++;
    }
    if (DEBUG) {
        log.debug("Fully buffered: read=" + nread + ", used=" + nused
                + ", offset=" + offset + ", limit=" + limit);
    }
    if (nused == 0) {
        /*
         * Nothing was buffered. This covers both an empty source and a
         * source which was exhausted while skipping the offset. The buffer
         * was never allocated in either case, so it must not be handed to
         * the array iterator.
         */
        return new EmptyChunkedIterator(keyOrder);
    }
    return new ChunkedArrayIterator(nused, buffer, keyOrder);
}
/**
 * Asynchronous read using a {@link BlockingBuffer}. A
 * {@link ChunkConsumerTask} is submitted to the executor service of the
 * index manager. It copies chunks of elements from the source iterator onto
 * the buffer while the returned iterator drains that buffer.
 *
 * @param src
 *            The source iterator.
 *
 * @return The iterator draining the buffer.
 *
 * @throws IllegalArgumentException
 *             if the source iterator is <code>null</code>.
 * @throws RejectedExecutionException
 *             if the {@link ExecutorService} is shutdown or has a maximum
 *             capacity and is saturated.
 */
final protected IChunkedOrderedIterator asynchronousIterator(
        final Iterator src) {
    if (src == null) {
        throw new IllegalArgumentException();
    }
    if (DEBUG) {
        log.debug("");
    }
    /*
     * Note: The filter is applied by the ITupleIterator so that it gets
     * evaluated close to the data, not here where it would be evaluated
     * once the elements were materialized on the client.
     */
    final BlockingBuffer buffer = new BlockingBuffer(
            chunkOfChunksCapacity);
    // The task which fills the buffer from the source iterator.
    final ChunkConsumerTask filler = new ChunkConsumerTask(this, src, buffer);
    /*
     * Wrap the computation as a FutureTask and set it on the buffer
     * *before* starting the computation (see the ticket
     * "BlockingBuffer.close() does not unblock threads").
     */
    final FutureTask ft = new FutureTask(filler);
    buffer.setFuture(ft);
    // Start computation.
    indexManager.getExecutorService().submit(ft);
    return new ChunkConsumerIterator(buffer.iterator(), keyOrder);
}
/**
 * Consumes elements from the source iterator, converting them into chunks
 * on a {@link BlockingBuffer}. The consumer will drain the chunks from the
 * buffer.
 *
 * @author Bryan Thompson
 */
static private class ChunkConsumerTask implements Callable {
    static protected final Logger log = Logger.getLogger(ChunkConsumerTask.class);
    /** The access path (supplies the chunk capacity and the key order). */
    private final AccessPath accessPath;
    /** The source iterator visiting elements read from the relation. */
    private final Iterator src;
    /** The buffer onto which the chunks are written. */
    private final BlockingBuffer buffer;

    /**
     * @param accessPath
     *            The access path on whose behalf the elements are read.
     * @param src
     *            The source iterator visiting elements read from the
     *            relation.
     * @param buffer
     *            The buffer onto which chunks of those elements will be
     *            written.
     *
     * @throws IllegalArgumentException
     *             if any argument is <code>null</code>.
     */
    public ChunkConsumerTask(final AccessPath accessPath,
            final Iterator src, final BlockingBuffer buffer) {
        if (accessPath == null)
            throw new IllegalArgumentException();
        if (src == null)
            throw new IllegalArgumentException();
        if (buffer == null)
            throw new IllegalArgumentException();
        this.accessPath = accessPath;
        this.src = src;
        this.buffer = buffer;
    }

    /**
     * Copies chunks from the source iterator onto the buffer until the
     * source is exhausted, then closes both the buffer and the chunked
     * iterator (also when the copy fails).
     *
     * @return Always <code>null</code>.
     */
    @Override
    public Void call() throws Exception {
        /*
         * Chunked iterator reading from the ITupleIterator. The filter was
         * already applied by the ITupleIterator so we do not use it here.
         *
         * Note: The chunk size is determined by [chunkCapacity].
         *
         * Note: The BlockingBuffer can combine multiple chunks together
         * dynamically to provide a larger effective chunk size as long as
         * those chunks are available with little or no added latency.
         */
        final IChunkedOrderedIterator itr = new ChunkedWrappedIterator(
                src, accessPath.chunkCapacity, accessPath.keyOrder, null/* filter */);
        long nchunks = 0;
        long nelements = 0;
        try {
            while (src.hasNext()) {
                final R[] chunk = itr.nextChunk();
                nchunks++;
                nelements += chunk.length;
                // Guard with the logger which actually emits the message.
                if (log.isDebugEnabled())
                    log.debug("#chunks=" + nchunks + ", chunkSize="
                            + chunk.length + ", nelements=" + nelements);
                buffer.add(chunk);
            }
        } finally {
            try {
                if (log.isInfoEnabled())
                    log.info("Closing buffer: #chunks=" + nchunks
                            + ", #elements=" + nelements + ", accessPath="
                            + accessPath);
                buffer.close();
            } finally {
                // Always release the source, even if closing the buffer fails.
                itr.close();
            }
        }
        return null;
    }
}
/**
 * Report the number of elements in the key range of this access path.
 * <p>
 * When an exact count is requested and the access path has a filter, the
 * elements are visited through {@link #iterator()} and counted one by one;
 * that iterator is always closed, even if the scan fails part way.
 * Otherwise the count is delegated to the index. Estimated counts for a
 * historical read are cached by {@link #historicalRangeCount(byte[], byte[])}.
 *
 * @param exact
 *            <code>true</code> iff an exact count is required.
 *
 * @return The exact or estimated number of elements.
 */
@Override
final public long rangeCount(final boolean exact) {

    assertInitialized();

    long n = 0L;

    if (exact) {

        /*
         * @todo we can cache exact range counts also, but we can not return
         * a cached estimated range count when an exact range count is
         * requested.
         */

        if (hasFilter) {

            /*
             * If there is a filter, then we need to visit the elements and
             * apply the filter to those elements.
             *
             * FIXME If the filter is properly driven through to the indices
             * then the index should be able to enable the (KEYS,VALS) flags
             * locally and we can avoid sending back the full tuple when
             * just doing a range count. This could be done using a
             * rangeCount(exact,filter) method on IIndex.
             */

            final IChunkedOrderedIterator itr = iterator();

            try {

                while (itr.hasNext()) {

                    itr.next();

                    n++;

                }

            } finally {

                /*
                 * Always release the iterator. Without this an exception
                 * thrown while scanning would leave the iterator (and
                 * anything backing it) open.
                 */
                itr.close();

            }

        } else {

            n = ndx.rangeCountExact(fromKey, toKey);

        }

    } else {

        if (historicalRead) {

            // cachable.
            n = historicalRangeCount(fromKey, toKey);

        } else {

            // not cachable.
            n = ndx.rangeCount(fromKey, toKey);

        }

    }

    if (DEBUG) {

        log.debug("exact=" + exact + ", filter=" + hasFilter + ", n=" + n
                + " : " + toString());

    }

    return n;

}
/**
 * Return the fast range count for a historical read, querying the index
 * only the first time.
 * <p>
 * Note: the range count is cached for a historical read to reduce round
 * trips to the DataService. A cached value of <code>-1</code> means that
 * nothing has been cached yet.
 *
 * @param fromKey
 *            The first key of the range.
 * @param toKey
 *            The key bounding the end of the range.
 *
 * @return The (possibly cached) range count.
 */
final private long historicalRangeCount(final byte[] fromKey,
        final byte[] toKey) {

    if (rangeCount != -1L) {

        // cached value.
        return rangeCount;

    }

    // do query and cache the result.
    final long fresh = ndx.rangeCount(fromKey, toKey);

    rangeCount = fresh;

    return fresh;

}
// @Override
// final public ITupleIterator rangeIterator() {
//
// return rangeIterator(0/* capacity */, flags, indexLocalFilter);
//
// }
/**
 * Obtain a tuple iterator from the backing index for the key range of
 * this access path.
 *
 * @param capacity
 *            The capacity handed to the index.
 * @param flags
 *            The {@link IRangeQuery} flags handed to the index.
 * @param filter
 *            The filter handed to the index.
 *
 * @return The iterator returned by the index.
 */
@SuppressWarnings( { "unchecked" })
protected ITupleIterator rangeIterator(final int capacity,
        final int flags, final IFilter filter) {

    assertInitialized();

    if (DEBUG) {

        final String msg = this + " : capacity=" + capacity + ", flags="
                + flags + ", filter=" + filter;

        log.debug(msg);

    }

    return ndx.rangeIterator(fromKey, toKey, capacity, flags, filter);

}
/**
 * Remove from the backing index every tuple that this access path would
 * visit.
 * <p>
 * Note: If you are maintaining multiple indices then you MUST override this
 * method to remove the data from each of those indices.
 *
 * @return The number of tuples visited by the removing iterator.
 */
@Override
public long removeAll() {

    assertInitialized();

    if (DEBUG) {

        log.debug(toString());

    }

    /*
     * Remove everything in the key range which satisfies the filter. Do
     * not materialize keys or values.
     *
     * @todo if offset and limit are rolled into the access path then
     * they would also belong here.
     */
    final ITupleIterator itr = rangeIterator(0/* capacity */,
            IRangeQuery.REMOVEALL, indexLocalFilter);

    long removed = 0;

    // Each visited tuple counts as one removal.
    for (; itr.hasNext(); removed++) {

        itr.next();

    }

    return removed;

}
/**
 * Return an estimate of the cost of a scan on this access path. The
 * estimate is computed by a strategy chosen from the runtime type of the
 * backing index; the checks are made in the order listed in the code.
 *
 * @return The estimated cost of the scan.
 *
 * @throws UnsupportedOperationException
 *             if the backing index is not one of the supported types.
 */
public ScanCostReport estimateCost() {

    if (ndx instanceof UnisolatedReadWriteIndex) {

        // The index wrapper computes its own estimate.
        final UnisolatedReadWriteIndex rwIndex = (UnisolatedReadWriteIndex) ndx;

        return rwIndex.estimateCost(diskCostModel,
                rangeCount(false/* exact */));

    } else if (ndx instanceof BTree) {

        /*
         * Fast path for a local BTree.
         */

        // fast range count (may be cached by the access path).
        final long fastRangeCount = rangeCount(false/* exact */);

        return estimateCost(diskCostModel, (BTree) ndx, fastRangeCount);

    } else if (ndx instanceof ILocalBTreeView) {

        /*
         * A local view. This path is for both transactions and local
         * shards.
         */

        // fast range count (may be cached by the access path).
        final long fastRangeCount = rangeCount(false/* exact */);

        return estimateCost((ILocalBTreeView) ndx, fastRangeCount,
                fromKey, toKey);

    } else if (ndx instanceof IScaleOutClientIndex) {

        /*
         * A scale-out index is being addressed.
         */
        return estimateCost((IScaleOutClientIndex) ndx);

    }

    throw new UnsupportedOperationException("index=" + ndx);

}
/**
 * Return the estimated cost of an index scan on a local {@link BTree}.
 * The cost is modeled as a random seek per node/leaf.
 *
 * @param diskCostModel
 *            The cost model for the disk.
 * @param btree
 *            The {@link BTree}.
 * @param rangeCount
 *            The range count for the scan.
 *
 * @return The estimated cost of the scan.
 */
private ScanCostReport estimateCost(final DiskCostModel diskCostModel,
        final BTree btree, final long rangeCount) {

    // The BTree serves as its own statistics view.
    final IBTreeStatistics stats = btree;

    final BTreeCostModel model = new BTreeCostModel(diskCostModel);

    final double cost = model.rangeScan(rangeCount,
            stats.getBranchingFactor(), stats.getHeight(),
            stats.getUtilization().getLeafUtilization());

    return new ScanCostReport(rangeCount, cost);

}
/**
 * Return the estimated cost of a key-range scan for a local B+Tree view.
 * This handles both {@link IsolatedFusedView} (transactions) and
 * {@link FusedView} (shards). The reported cost is the sum of the costs
 * estimated for each source of the view.
 *
 * @param view
 *            The view.
 * @param rangeCount
 *            The range count reported back in the {@link ScanCostReport}.
 * @param fromKey
 *            The first key of the range (used for the per-source fast
 *            range counts).
 * @param toKey
 *            The key bounding the end of the range.
 *
 * @return The estimated cost.
 */
static private ScanCostReport estimateCost(final ILocalBTreeView view,
        final long rangeCount, final byte[] fromKey, final byte[] toKey) {

    double cost = 0d;

    final AbstractBTree[] sources = view.getSources();

    for (AbstractBTree source : sources) {

        final IBTreeStatistics stats = source.getStatistics();

        // fast range count on that source.
        final long sourceRangeCount = source.rangeCount(fromKey, toKey);

        if (source instanceof IndexSegment) {

            // Cost for an index segment based on multi-block IO.

            final IndexSegment seg = (IndexSegment) source;

            final long extentLeaves = seg.getStore().getCheckpoint().extentLeaves;

            final long leafCount = stats.getLeafCount();

            // Note: bytesPerLeaf is never more than an int32 value!
            final int bytesPerLeaf = (int) Math
                    .ceil(((double) extentLeaves) / leafCount);

            /*
             * Clamp rather than blindly narrow: a plain (int) cast of a
             * count above Integer.MAX_VALUE wraps around and would feed a
             * negative or otherwise bogus tuple count to the cost model.
             */
            final int clampedRangeCount = (int) Math.min(sourceRangeCount,
                    (long) Integer.MAX_VALUE);

            cost += new IndexSegmentCostModel(diskCostModel).rangeScan(
                    clampedRangeCount, stats.getBranchingFactor(),
                    bytesPerLeaf, DirectBufferPool.INSTANCE
                            .getBufferCapacity());

        } else {

            // Cost for a B+Tree based on random seek per node/leaf.
            cost += new BTreeCostModel(diskCostModel).rangeScan(
                    sourceRangeCount, stats.getBranchingFactor(), stats
                            .getHeight(), stats.getUtilization()
                            .getLeafUtilization());

        }

    }

    // @todo pass details per source back in the cost report.
    return new ScanCostReport(rangeCount, cost);

}
/**
 * Return the estimated cost of a key-range scan on a remote view of a
 * scale-out index.
 * <p>
 * When the key range touches exactly one index partition the estimate is
 * computed on the remote shard by an {@link EstimateShardScanCost}
 * procedure. When it touches several, a fixed statistical model is
 * assumed for every shard.
 *
 * @param ndx
 *            The scale-out index.
 *
 * @return The estimated cost of the scan.
 *
 * @throws NoSuchIndexException
 *             if the metadata index for the scale-out index can not be
 *             found for the timestamp of this access path.
 *
 * @todo Remote scans can be parallelized. If flags includes PARALLEL then
 *       the cost can be as little as the cost of scanning one shard.
 *       However, the {@link IClientIndex} has a configuration value which
 *       specifies the maximum parallelism of any given operation (this is
 *       self-reported if we cast to the implementation class). Further,
 *       even if we assume that the shards are evenly distributed over the
 *       nodes, when the #of shards is significantly larger than the #of
 *       nodes then the scan can interfere with itself. Finally, this should
 *       include an estimate of the RMI overhead.
 */
private ScanCostReport estimateCost(final IScaleOutClientIndex ndx) {

    final String name = ndx.getIndexMetadata().getName();

    final AbstractClient client = ndx.getFederation().getClient();

    // maximum parallelization by the client : @todo not used yet.
    final int maxParallel = client.getMaxParallelTasksPerRequest();

    // the metadata index for that scale-out index.
    final IMetadataIndex mdi = ndx.getFederation().getMetadataIndex(name,
            timestamp);

    if (mdi == null) {
        throw new NoSuchIndexException("name=" + name + "@"
                + TimestampUtility.toString(timestamp));
    }

    // #of index partitions to be scanned.
    final long partitionCount = mdi.rangeCount(fromKey, toKey);

    if (partitionCount == 0) {

        /*
         * SWAG in case zero partition count is reported (I am not sure that
         * this code path is possible).
         *
         * @todo This is proven possible. Now figure out why. Maybe this is
         * fromKey==toKey, in which case we can optimize that out.
         */
        return new ScanCostReport(0L/* rangeCount */, partitionCount, 100/* millis */);

    }

    // fast range count (may be cached by the access path).
    final long rangeCount = rangeCount(false/* exact */);

    if (partitionCount == 1) {

        /*
         * Delegate the operation to the remote shard. An empty key is
         * substituted for the procedure's target key when there is no
         * fromKey.
         */
        final byte[] shardKey;
        if (fromKey == null) {
            shardKey = BytesUtil.EMPTY;
        } else {
            shardKey = fromKey;
        }

        return (ScanCostReport) ndx.submit(shardKey,
                new EstimateShardScanCost(rangeCount, fromKey, toKey));

    }

    /*
     * Assume a statistical model. Each partition is comprised of 1 journal
     * with 50k tuples plus two index segments of 100M each.
     */

    // one journal per shard.
    final int njournals = 1;

    // two segments per shard.
    final int nsegments = 2;

    // Spread the range count evenly over every journal and segment.
    final long sourcesScanned = partitionCount * (njournals + nsegments);

    final long rangeCountOnJournal = rangeCount / sourcesScanned;

    final double costPerJournal = new BTreeCostModel(diskCostModel)
            .rangeScan(rangeCountOnJournal, //
                    mdi.getIndexMetadata().getBranchingFactor(), //
                    5,// height (SWAG)
                    70// leafUtilization (percent, SWAG).
            );

    final double costPerSegment = diskCostModel.seekTime + Bytes.megabyte
            * 100;

    final double costPerShard = costPerJournal + nsegments * costPerSegment;

    // @todo ignores potential parallelism.
    final double totalCost = costPerShard * partitionCount;

    return new ScanCostReport(rangeCount, partitionCount, totalCost);

}
/**
 * Read-only index procedure which estimates the cost of an index range
 * scan on a remote shard. The index handed to {@link #apply(IIndex)} is
 * cast to an {@link ILocalBTreeView}, and the estimate is computed by
 * {@link AccessPath#estimateCost(ILocalBTreeView, long, byte[], byte[])}.
 *
 * @author Bryan Thompson
 */
private static final class EstimateShardScanCost implements
        ISimpleIndexProcedure {

    private static final long serialVersionUID = 1L;

    /** The range count reported back in the cost report. */
    private final long rangeCount;

    /** The first key of the range to be scanned. */
    private final byte[] fromKey;

    /** The key bounding the end of the range to be scanned. */
    private final byte[] toKey;

    /**
     * @param rangeCount
     *            The range count for the scan.
     * @param fromKey
     *            The first key of the range.
     * @param toKey
     *            The key bounding the end of the range.
     */
    public EstimateShardScanCost(final long rangeCount,
            final byte[] fromKey, final byte[] toKey) {

        this.rangeCount = rangeCount;
        this.fromKey = fromKey;
        this.toKey = toKey;

    }

    /**
     * Estimate the scan cost against the given index.
     *
     * @param ndx
     *            The index on which the procedure is applied.
     *
     * @return The estimated cost of the scan.
     */
    @Override
    public ScanCostReport apply(final IIndex ndx) {

        final ILocalBTreeView localView = (ILocalBTreeView) ndx;

        return AccessPath.estimateCost(localView, rangeCount, fromKey,
                toKey);

    }

    /** This procedure does not modify the index. */
    @Override
    public boolean isReadOnly() {

        return true;

    }

}
/*
* Cost models.
*/
/**
 * The cost model associated with the disk on which the indices are stored.
 * For a {@link Journal}, this is just the cost model of the backing disk.
 * For the federation, this should be an average cost model.
 * <p>
 * This is currently a single constant ({@link DiskCostModel#DEFAULT})
 * shared by every access path.
 *
 * @todo This is not parameterized. A simple cost model is always assumed.
 *       The correct cost model is necessary in order to get the tradeoff
 *       point right for SCAN+FILTER versus SUBQUERY on SSD or RAID arrays
 *       with lots of spindles versus normal disk.
 *
 * @todo In a shared disk deployment, we might introduce one cost model for
 *       local SSD used to cache journals, one for local non-SSD disks used
 *       to cache index segments, and one for remote storage used to
 *       materialize historical journals and index segments for query.
 *
 * @todo In a federation, this should be reported out as metadata for the
 *       federation. Perhaps as a Jini attribute. Or we could self-publish
 *       this using a System property whose value was either the name of the
 *       desired cost model enum or a representation of the cost model which
 *       we could then parse.
 */
private static final DiskCostModel diskCostModel = DiskCostModel.DEFAULT;
// /**
// * Dumps the locators for an index of a relation.
// *
// * @param fed
// * @param namespace
// * The relation namespace.
// * @param timestamp
// * The timestamp of the view.
// * @param keyOrder
// * The index.
// */
// private static void dumpMDI(AbstractScaleOutFederation fed,
// final String namespace, final long timestamp,
// final IKeyOrder keyOrder) {
//
// final String name = namespace + "." + keyOrder.getIndexName();
//
// final Iterator itr = fed
// .locatorScan(name, timestamp, new byte[] {}/* fromKey */,
// null/* toKey */, false/* reverseScan */);
//
// System.err.println("name=" + name + " @ "
// + TimestampUtility.toString(timestamp));
// while (itr.hasNext()) {
// System.err.println(itr.next());
// }
//
// }
}