// com.bigdata.service.ndx.PartitionedTupleIterator Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.service.ndx;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.log4j.Logger;
import com.bigdata.btree.DelegateTuple;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.ResultSet;
import com.bigdata.btree.Tuple;
import com.bigdata.btree.proc.AbstractKeyRangeIndexProcedure;
import com.bigdata.journal.ITx;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.StaleLocatorException;
import com.bigdata.service.DataService;
import com.bigdata.service.IDataService;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.InnerCause;
import cutthecrap.utils.striterators.IFilter;
/**
* Class supports range query across one or more index partitions. Each
* partition is mapped onto a single {@link DataServiceTupleIterator} query. In
* turn, the {@link DataServiceTupleIterator} may make several queries to the
* data service per partition. The actual #of queries made to the data service
* depends on the #of index entries that are visited per partition and the
* capacity specified to the ctor. {@link StaleLocatorException}s are handled
* transparently by restarting the locator scan and continuing the range query
* request from the successor (or predecessor for a reverse scan) of the last
* key visited. When the operation is unisolated, this class will correctly
* complete the range iterator request even if the index partition is split,
* joined or moved during traversal.
*
* @author Bryan Thompson
* @version $Id$
*/
public class PartitionedTupleIterator implements ITupleIterator {
private static final transient Logger log = Logger
        .getLogger(PartitionedTupleIterator.class);
// protected static final boolean INFO = log.isInfoEnabled();
// protected static final boolean DEBUG = log.isDebugEnabled();
/**
 * The index on which the range query is being performed.
 */
private final IScaleOutClientIndex ndx;
/**
 * Iterator traversing the index partition locators spanned by the query.
 * It is replaced with a fresh locator scan whenever a
 * {@link StaleLocatorException} is handled.
 */
private Iterator<PartitionLocator> locatorItr;
/**
 * The timestamp from the ctor.
 */
private final long ts;
/**
 * <code>true</code> iff the {@link #ts} is a read-historical transaction
 * created specifically to give the iterator read-consistent semantics. When
 * <code>true</code>, this class will ensure that the transaction is
 * eventually aborted so that its read lock will be released.
 */
private final boolean isReadConsistentTx;
/**
 * The first key to visit -or- <code>null</code> iff no lower bound (from
 * the ctor).
 */
private final byte[] fromKey;
/**
 * The first key to NOT visit -or- <code>null</code> iff no upper bound
 * (from the ctor).
 */
private final byte[] toKey;
/**
 * This controls the #of results per data service query.
 */
private final int capacity;
/**
 * These flags control whether keys and/or values are requested. If
 * neither keys nor values are requested, then this is just a range
 * count operation and you might as well use rangeCount instead.
 */
private final int flags;
/**
 * The optional filter from the ctor; it is passed through unchanged to each
 * per-partition {@link DataServiceTupleIterator}.
 */
private final IFilter filter;
/**
 * <code>true</code> iff {@link IRangeQuery#REVERSE} was specified by the
 * caller. When {@link IRangeQuery#REVERSE} was specified then we will use a
 * reverse locator scan so that we proceed in reverse order over the index
 * partitions as well as in reverse order within each index partition.
 */
private final boolean reverseScan;
/**
 * The {@link #currentFromKey} and {@link #currentToKey} are updated each
 * time we formulate a query against the partitioned index, which can occur
 * one or more times per index partition. The update of the fields depends
 * on whether we are doing a forward or reverse scan. If we have to handle a
 * {@link StaleLocatorException} then these fields will be used to restart
 * the locator scan.
 * <p>
 * The {@link #currentFromKey} is initially set by the ctor to the
 * {@link #fromKey}.
 * <p>
 * For a reverse scan, the {@link #currentFromKey} remains unchanged.
 * <p>
 * For a forward scan the {@link #currentFromKey} is set to the last key
 * visited in each {@link ResultSet}.
 *
 * @todo {@link ResultSet#getLastKey()} will return <code>null</code> if
 *       there were no tuples to visit. This class does not appear to handle
 *       that condition, but it is able to pass a unit test where there is
 *       an empty partition in the middle of the key range scan for both
 *       forward and reverse traversal and a test where there is a non-empty
 *       partition whose tuples are filtered out such that it is effectively
 *       empty. See TestRangeQuery#test_reverseScan()
 *
 * @todo if we restart a locator scan and the index partition boundaries
 *       have changed, then what do we need to do in order to make sure that
 *       we are issuing the query?
 */
private byte[] currentFromKey;
/**
 * The {@link #currentToKey} is initially set by the ctor to the
 * {@link #toKey}.
 * <p>
 * For a forward scan, the {@link #currentToKey} remains unchanged.
 * <p>
 * For a reverse scan, the {@link #currentToKey} is set to the last key
 * visited in each {@link ResultSet}.
 */
private byte[] currentToKey;
/**
 * The metadata for the current index partition.
 */
private PartitionLocator locator = null;
/**
 * The last locator for which we received a {@link StaleLocatorException}.
 * We note this so that we can avoid an endless retry if the same locator is
 * reported when we attempt to restart the {@link #locatorItr}.
 */
private PartitionLocator lastStaleLocator = null;
/**
 * The #of index partitions that have been queried so far. There will be one
 * {@link DataServiceTupleIterator} query issued per partition.
 *
 * @deprecated The #of partitions is a bit tricky since splits and joins can
 *             change the #of index partitions dynamically (for unisolated
 *             or read-committed reads where read-consistent is not true).
 */
private int nparts = 0;
/**
 * The #of tuples visited so far.
 */
private long nvisited = 0L;
/**
 * The {@link DataServiceTupleIterator} reading from the current index
 * partition.
 */
private DataServiceTupleIterator src;
/**
 * When true, the entire key range specified by the client has been
 * visited and the iterator is exhausted (i.e., all done).
 *
 * @see #close()
 */
private boolean exhausted = false;
/**
 * Reports how many index partitions this iterator has issued a query
 * against up to now.
 *
 * @return The current value of the partition counter.
 *
 * @deprecated The #of partitions is a bit tricky since splits and joins can
 *             introduce new partitions unless you are using a
 *             read-consistent view.
 */
public int getPartitionCount() {
    return this.nparts;
}
/**
 * Reports how many entries have been handed to the caller up to now. This
 * is the #of entries visited, not the #of entries scanned; the latter can
 * be much greater when a filter is in use.
 *
 * @return The current value of the visited-tuple counter.
 */
public long getVisitedCount() {
    return this.nvisited;
}
/**
 * Sets up a range query over the index partitions spanned by the given key
 * range and starts the locator scan.
 * <p>
 * Note: The {@link PartitionedTupleIterator} uses a sequential scan (rather
 * than mapping across the index partitions in parallel) and always picks up
 * from the successor of the last key visited. Read-consistent is achieved
 * by specifying a commitTime for the timestamp rather than
 * {@link ITx#READ_COMMITTED}. The latter will use dirty reads (each time a
 * {@link ResultSet} is fetched it will be fetched from the most recently
 * committed state of the database).
 *
 * @param ndx
 *            The scale-out index to be queried (required).
 * @param ts
 *            The timestamp for the view (may be a transaction).
 * @param isReadConsistentTx
 *            <code>true</code> iff the caller specified timestamp is a
 *            read-historical transaction created specifically to give the
 *            iterator read-consistent semantics. When <code>true</code>,
 *            this class will ensure that the transaction is eventually
 *            aborted so that its read lock will be released. This is done
 *            eagerly when the iterator is exhausted and with a
 *            {@link #finalize()} method otherwise.
 * @param fromKey
 *            The first key to visit -or- <code>null</code> iff there is no
 *            lower bound.
 * @param toKey
 *            The first key to NOT visit -or- <code>null</code> iff there is
 *            no upper bound.
 * @param capacity
 *            The #of results per data service query (non-negative).
 * @param flags
 *            The {@link IRangeQuery} flags; {@link IRangeQuery#REVERSE}
 *            selects a reverse scan.
 * @param filter
 *            An optional filter handed to each per-partition iterator.
 *
 * @throws IllegalArgumentException
 *             if <i>ndx</i> is <code>null</code> or <i>capacity</i> is
 *             negative. NOTE(review): earlier documentation also promised
 *             this exception when read-consistent is requested for an
 *             {@link ITx#UNISOLATED} view; no such check is made here.
 */
public PartitionedTupleIterator(final IScaleOutClientIndex ndx,
        final long ts, final boolean isReadConsistentTx, final byte[] fromKey,
        final byte[] toKey, final int capacity, final int flags,
        final IFilter filter) {
    // argument validation.
    if (ndx == null || capacity < 0) {
        throw new IllegalArgumentException();
    }
    // immutable state from the caller.
    this.ndx = ndx;
    this.ts = ts;
    this.isReadConsistentTx = isReadConsistentTx;
    this.capacity = capacity;
    this.flags = flags;
    this.filter = filter;
    this.reverseScan = (flags & IRangeQuery.REVERSE) != 0;
    // the caller's bounds, plus the moving bounds which start out identical.
    this.fromKey = fromKey;
    this.currentFromKey = fromKey;
    this.toKey = toKey;
    this.currentToKey = toKey;
    // start locator scan
    this.locatorItr = ndx.locatorScan(ts, fromKey, toKey, this.reverseScan);
}
/**
 * Safety net: delegates to {@link #close()} so that an iterator which was
 * abandoned before it was exhausted still gets closed.
 */
protected void finalize() {
    this.close();
}
/**
 * Flags the iterator as {@link #exhausted} and, iff the {@link #ts} was
 * identified to the ctor as a transaction created specifically to provide
 * read-consistent semantics for this iterator (and hence ours to clean up),
 * aborts that transaction. Calls made after the first are NOPs.
 */
private synchronized void close() {
    if (!exhausted) {
        exhausted = true;
        if (isReadConsistentTx) {
            try {
                ndx.getFederation().getTransactionService().abort(ts);
            } catch (IOException abortFailure) {
                // log and ignore since the caller was not directly affected.
                log.error(ClientIndexView.ERR_ABORT_TX + ts, abortFailure);
            }
        }
    }
}
/**
 * Reports whether another tuple is available, advancing across index
 * partitions (and transparently restarting the locator scan after a
 * {@link StaleLocatorException}) as required.
 * <p>
 * The method loops rather than recursing so that the stack depth does not
 * grow with the #of consecutive empty index partitions or locator restarts.
 * The iterator is closed on every path which reports that it is exhausted,
 * including the case where the locator scan has nothing (more) to offer,
 * so that a read-consistent transaction is released eagerly.
 *
 * @return <code>true</code> iff the current source iterator has another
 *         tuple to visit.
 *
 * @throws RuntimeException
 *             if the same index partition is reported as stale twice in a
 *             row, or if the source iterator fails for a reason other than
 *             a {@link StaleLocatorException}.
 */
public boolean hasNext() {
    while (!exhausted) {
        if (locator == null) {
            // Setup query for the first partition (or first after a restart).
            if (!nextPartition()) {
                // Nothing to visit: release any resources right away.
                close();
                return false;
            }
        }
        assert src != null;
        try {
            if (src.hasNext()) {
                // More from the current source iterator.
                return true;
            }
        } catch (RuntimeException ex) {
            final StaleLocatorException cause = (StaleLocatorException) InnerCause
                    .getInnerCause(ex, StaleLocatorException.class);
            if (cause == null) {
                // Not a stale locator problem.
                throw ex;
            }
            /*
             * Handle StaleLocatorException. This exception indicates that
             * we have a stale index partition locator. This can happen when
             * index partitions are split, joined, or moved. It can only
             * happen for UNISOLATED or READ_COMMITTED operations since we
             * never change historical locators. Also, if the index view is
             * read-committed and read-consistent operations are specified,
             * then IIndexStore#getLastCommitTime() will be used so stale
             * locators will not occur for that case either.
             */
            if (lastStaleLocator != null
                    && lastStaleLocator.getPartitionId() == locator
                            .getPartitionId()) {
                /*
                 * This happens if we get a StaleLocatorException, restart
                 * the locator scan, and get another StaleLocatorException
                 * on the same index partition. Since a new index partition
                 * identifier is assigned every time there is a
                 * split/join/move this is a clear indication that something
                 * is wrong with either the locator, with the cached view of
                 * the metadata index used by the client, or the data
                 * service. For example, the client may have failed to
                 * refresh its cached view for the locator or the index
                 * partition might have been dropped on the data service (a
                 * no-no to be sure).
                 */
                throw new RuntimeException(
                        "Missing index partition on data service? "
                                + locator, ex);
            }
            // notify the client so that it can refresh its cache.
            ndx.staleLocator(ts, locator, cause);
            // save reference
            lastStaleLocator = locator;
            // clear since invalid.
            locator = null;
            // Re-start the locator scan.
            locatorItr = ndx.locatorScan(ts, currentFromKey,
                    currentToKey, reverseScan);
            // Try again against the refreshed locators.
            continue;
        }
        /*
         * The current index partition is empty (or has been consumed), but
         * there may be other index partitions left to query.
         *
         * Each source iterator reads from one index partition. (The source
         * iterator is itself a chunked iterator so it may issue multiple
         * remote requests to consume the data available on a given index
         * partition).
         */
        if (!nextPartition()) {
            /*
             * Exhausted.
             */
            close();
            return false;
        }
        // Loop since the next index partition might be empty as well.
    }
    return false;
}
/**
 * Advances to the next index partition locator, if any, and sets up a new
 * range query against it.
 *
 * @return <code>true</code> iff there was another locator and a query was
 *         set up for it; <code>false</code> iff the locator scan has
 *         nothing more to offer.
 *
 * @throws RuntimeException
 *             wrapping an {@link InterruptedException} iff the calling
 *             thread was interrupted.
 */
private boolean nextPartition() {
    assert !exhausted;
    // notice an interrupt no later than the next partition.
    final boolean interrupted = Thread.interrupted();
    if (interrupted) {
        throw new RuntimeException(new InterruptedException());
    }
    if (locatorItr.hasNext()) {
        locator = locatorItr.next();
        if (log.isInfoEnabled()) {
            log.info("locator=" + locator);
        }
        // submit query to the next partition.
        rangeQuery();
        assert src != null;
        return true;
    }
    if (log.isInfoEnabled()) {
        log.info("No more locators");
    }
    return false;
}
/**
 * Issues a range query against the current {@link #locator}.
 * <p>
 * The source iterator created here observes each {@link ResultSet} fetched
 * from the data service and advances {@link #currentFromKey} (forward scan)
 * or {@link #currentToKey} (reverse scan) to the last key visited. When a
 * {@link ResultSet} visited nothing, {@link ResultSet#getLastKey()} is
 * <code>null</code> and the bounds are left untouched: a <code>null</code>
 * bound means "unbounded", so storing it would make a later restart of the
 * locator scan cover keys outside of the remaining key range.
 *
 * @throws RuntimeException
 *             wrapping an {@link InterruptedException} iff the calling
 *             thread was interrupted, or wrapping any exception raised
 *             while the query was being set up.
 */
private void rangeQuery() {
    assert ! exhausted;
    assert locator != null;
    if (Thread.interrupted()) {
        // notice an interrupt no later than the next chunk.
        throw new RuntimeException(new InterruptedException());
    }
    try {
        /*
         * Note: The range query request is formed such that it addresses
         * only those keys that actually lie within the partition and also
         * within the caller's given key range. This has two benefits:
         *
         * (1) The data service can check the range and report an error for
         * clients that appear to be requesting data for index partitions
         * that have been relocated.
         *
         * (2) It avoids double-counting (or possible under-counting) when
         * an index partition join (or split) causes the partition bounds to
         * be greater than was originally anticipated.
         */
        final byte[] _fromKey = AbstractKeyRangeIndexProcedure
                .constrainFromKey(currentFromKey, locator);
        final byte[] _toKey = AbstractKeyRangeIndexProcedure
                .constrainToKey(currentToKey, locator);
        final int partitionId = locator.getPartitionId();
        if (log.isInfoEnabled())
            log.info("name=" + ndx.getName() //
                    + ", tx=" + ts //
                    + ", reverseScan=" + reverseScan //
                    + ", partition=" + partitionId //
                    + ", fromKey=" + BytesUtil.toString(_fromKey) //
                    + ", toKey=" + BytesUtil.toString(_toKey));
        /*
         * The data service for the current index partition.
         *
         * @todo this should failover.
         */
        final IDataService dataService = ndx.getDataService(locator);
        /*
         * Iterator will visit all data on that index partition.
         *
         * Note: This merely initializes the variables on the iterator, but
         * it DOES NOT send the request to the data service. That does not
         * happen until you call [src.hasNext()].
         */
        src = new DataServiceTupleIterator(ndx, dataService, DataService
                .getIndexPartitionName(ndx.getName(), partitionId),
                ts, _fromKey, _toKey, capacity, flags, filter) {
            /**
             * Overridden so that we observe each distinct result set
             * obtained from the DataService.
             */
            protected ResultSet getResultSet(final long timestamp,
                    final byte[] fromKey, final byte[] toKey,
                    final int capacity, final int flags,
                    final IFilter filter) {
                final ResultSet tmp = super.getResultSet(timestamp,
                        fromKey, toKey, capacity, flags, filter);
                final byte[] lastKey = tmp.getLastKey();
                if (INFO)
                    log.info("Got chunk: ntuples=" + tmp.getNumTuples()
                            + ", exhausted=" + tmp.isExhausted()
                            + ", lastKey="
                            + BytesUtil.toString(lastKey));
                if (lastKey == null) {
                    /*
                     * Nothing was visited by this request, so there is
                     * nothing to advance beyond. Keep the current bounds
                     * rather than replacing one of them with [null], which
                     * would be interpreted as "no bound" if the locator
                     * scan has to be restarted.
                     */
                    if (INFO)
                        log.info("No tuples visited: bounds unchanged.");
                } else if (reverseScan) {
                    /*
                     * We are moving backwards through the key order so we
                     * take the last key visited and use it to restrict our
                     * exclusive upper bound. Without this the iterator will
                     * not "advance".
                     */
                    currentToKey = lastKey;
                    if (INFO)
                        log.info("New exclusive upper bound: "
                                + BytesUtil.toString(currentToKey));
                } else {
                    /*
                     * We are moving forwards through the key order so we
                     * take the last key visited and use it to advance our
                     * inclusive lower bound. Without this the iterator will
                     * not advance.
                     */
                    currentFromKey = lastKey;
                    if (INFO)
                        log.info("New inclusive lower bound: "
                                + BytesUtil.toString(currentFromKey));
                }
                return tmp;
            }
        };
        // increment the #of partitions visited.
        nparts++;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
}
/**
 * Returns the next tuple from the current index partition, wrapped so that
 * its visit count reflects the #of tuples visited by this iterator across
 * all index partitions.
 * <p>
 * The source iterator is captured when the tuple is created so that the
 * tuple's {@link Object#toString()} keeps reporting the index partition it
 * was read from, even after this iterator has advanced to another
 * partition. The visited counter is only incremented once the source has
 * actually delivered a tuple.
 *
 * @return The next tuple.
 *
 * @throws NoSuchElementException
 *             if the iterator is exhausted.
 */
public ITuple next() {
    if (!hasNext()) {
        throw new NoSuchElementException();
    }
    // the partition source from which this tuple is being read.
    final DataServiceTupleIterator tupleSource = src;
    final ITuple sourceTuple = tupleSource.next();
    // count the tuple only after it has been obtained.
    final long visitCountSnapshot = ++nvisited;
    /*
     * Override the visitCount.
     */
    return new DelegateTuple( sourceTuple ) {
        public long getVisitCount() {
            return visitCountSnapshot;
        }
        public String toString() {
            return super.toString()+" : partition="+tupleSource.name;
        }
    };
}
/**
 * Delegates the removal to the iterator reading from the current index
 * partition (batch delete behind semantics).
 *
 * @throws IllegalStateException
 *             if no index partition has been queried yet.
 *
 * @see DataServiceTupleIterator#remove()
 */
public void remove() {
    final DataServiceTupleIterator current = src;
    if (current != null) {
        current.remove();
        return;
    }
    throw new IllegalStateException();
}
/**
 * Renders the ctor parameters followed by the dynamic state of the
 * iterator (visited count, exhausted flag, current and last stale locator,
 * and the class of the current per-partition source).
 */
public String toString() {
    final String fromKeyText = fromKey == null ? "n/a" : BytesUtil
            .toString(fromKey);
    final String toKeyText = toKey == null ? "n/a" : BytesUtil
            .toString(toKey);
    // Note: [src] is the per index partition source (dynamic state).
    final String srcText = src == null ? "N/A" : src.getClass().toString();
    final StringBuilder out = new StringBuilder();
    out.append(getClass().getSimpleName());
    out.append("{ flags=").append(Tuple.flagString(flags));
    out.append(", timestamp=").append(ts);
    out.append(", isReadConsistentTx=").append(isReadConsistentTx);
    out.append(", capacity=").append(capacity);
    out.append(", fromKey=").append(fromKeyText);
    out.append(", toKey=").append(toKeyText);
    out.append(", filter=").append(String.valueOf(filter));
    // dynamic state.
    out.append(", #visited=").append(nvisited);
    out.append(", exhausted=").append(exhausted);
    out.append(", locator=").append(String.valueOf(locator));
    out.append(", lastStaleLocator=").append(String.valueOf(lastStaleLocator));
    out.append(", src=").append(srcText);
    out.append("}");
    return out.toString();
}
}