// com.bigdata.service.ndx.PartitionedTupleIterator Maven / Gradle / Ivy
//
// Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.service.ndx;

import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.log4j.Logger;

import com.bigdata.btree.DelegateTuple;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.ResultSet;
import com.bigdata.btree.Tuple;
import com.bigdata.btree.proc.AbstractKeyRangeIndexProcedure;
import com.bigdata.journal.ITx;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.StaleLocatorException;
import com.bigdata.service.DataService;
import com.bigdata.service.IDataService;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.InnerCause;

import cutthecrap.utils.striterators.IFilter;

/**
 * Class supports range query across one or more index partitions. Each
 * partition is mapped onto a single {@link DataServiceTupleIterator} query. In
 * turn, the {@link DataServiceTupleIterator} may make several queries to the
 * data service per partition. The actual #of queries made to the data service
 * depends on the #of index entries that are visited per partition and the
 * capacity specified to the ctor. {@link StaleLocatorException}s are handled
 * transparently by restarting the locator scan and continuing the range query
 * request from the last key-range bound recorded for the scan (see
 * {@link #currentFromKey} and {@link #currentToKey}). When the operation is
 * unisolated, this class is intended to complete the range iterator request
 * even if the index partition is split, joined or moved during traversal.
 * <p>
 * The partitions are visited sequentially: {@link #hasNext()} drains the
 * source iterator for the current partition before it advances the locator
 * scan to the next partition.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class PartitionedTupleIterator implements ITupleIterator {

    private static final transient Logger log = Logger
            .getLogger(PartitionedTupleIterator.class);
    
//    protected static final boolean INFO = log.isInfoEnabled();
    
//    protected static final boolean DEBUG = log.isDebugEnabled();
    
    /**
     * The index on which the range query is being performed.
     */
    private final IScaleOutClientIndex ndx;
    
    /**
     * Iterator traversing the index partition locators spanned by the query.
     * It is replaced by a new locator scan each time a
     * {@link StaleLocatorException} is handled.
     */
    private Iterator<PartitionLocator> locatorItr;
    
    /**
     * The timestamp from the ctor.
     */
    private final long ts;

    /**
     * <code>true</code> iff the {@link #ts} is a read-historical
     * transaction created specifically to give the iterator read-consistent
     * semantics. When <code>true</code>, this class will ensure that the
     * transaction is eventually aborted so that its read lock will be released.
     */
    private final boolean isReadConsistentTx;
    
    /**
     * The first key to visit -or- <code>null</code> iff no lower bound (from
     * the ctor).
     */
    private final byte[] fromKey;
    
    /**
     * The first key to NOT visit -or- <code>null</code> iff no upper bound
     * (from the ctor).
     */
    private final byte[] toKey;

    /**
     * This controls the #of results per data service query.
     */
    private final int capacity;

    /**
     * These flags control whether keys and/or values are requested. If
     * neither keys nor values are requested, then this is just a range
     * count operation and you might as well use rangeCount instead.
     */
    private final int flags;
    
    /**
     * The optional filter given to the ctor. It is passed through unchanged to
     * each {@link DataServiceTupleIterator}.
     */
    private final IFilter filter;
    
    /**
     * <code>true</code> iff {@link IRangeQuery#REVERSE} was specified by the
     * caller. When {@link IRangeQuery#REVERSE} was specified then we will use a
     * reverse locator scan so that we proceed in reverse order over the index
     * partitions as well as in reverse order within each index partition.
     */
    private final boolean reverseScan;

    /**
     * The {@link #currentFromKey} and {@link #currentToKey} are updated each
     * time a {@link ResultSet} that visited at least one tuple is obtained
     * from a data service, which can occur one or more times per index
     * partition. The update of the fields depends on whether we are doing a
     * forward or reverse scan. If we have to handle a
     * {@link StaleLocatorException} then these fields will be used to restart
     * the locator scan.
     * <p>
     * The {@link #currentFromKey} is initially set by the ctor to the
     * {@link #fromKey}.
     * <p>
     * For a reverse scan, the {@link #currentFromKey} remains unchanged.
     * <p>
     * For a forward scan the {@link #currentFromKey} is set to the last key
     * visited in each {@link ResultSet}. When a {@link ResultSet} visited no
     * tuples ({@link ResultSet#getLastKey()} is <code>null</code>) the bound is
     * left unchanged so that a restarted locator scan never begins before the
     * key range which has already been consumed.
     * 
     * @todo if we restart a locator scan and the index partition boundaries
     *       have changed, then what do we need to do in order to make sure that
     *       we are issuing the query?
     */
    private byte[] currentFromKey;
    
    /**
     * The {@link #currentToKey} is initially set by the ctor to the
     * {@link #toKey}.
     * <p>
     * For a forward scan, the {@link #currentToKey} remains unchanged.
     * <p>
     * For a reverse scan, the {@link #currentToKey} is set to the last key
     * visited in each {@link ResultSet} (and is left unchanged when the
     * {@link ResultSet} visited no tuples).
     */
    private byte[] currentToKey;
    
    /**
     * The metadata for the current index partition. This is <code>null</code>
     * until the first partition has been selected and is cleared when a
     * {@link StaleLocatorException} invalidates it.
     */
    private PartitionLocator locator = null;

    /**
     * The last locator for which we received a {@link StaleLocatorException}.
     * We note this so that we can avoid an endless retry if the same locator is
     * reported when we attempt to restart the {@link #locatorItr}.
     */
    private PartitionLocator lastStaleLocator = null;
    
    /**
     * The #of index partitions that have been queried so far. There will be one
     * {@link DataServiceTupleIterator} query issued per partition.
     * 
     * @deprecated The #of partitions is a bit tricky since splits and joins can
     *             change the #of index partitions dynamically (for unisolated
     *             or read-committed reads where read-consistent is not true).
     */
    private int nparts = 0;
    
    /**
     * The #of tuples visited so far.
     */
    private long nvisited = 0L;
    
    /**
     * The {@link DataServiceTupleIterator} reading from the current index
     * partition.
     */
    private DataServiceTupleIterator src;
   
    /**
     * When <code>true</code>, the entire key range specified by the client has
     * been visited and the iterator is exhausted (i.e., all done).
     * 
     * @see #close()
     */
    private boolean exhausted = false;

    /**
     * The #of index partitions queried so far.
     * 
     * @deprecated The #of partitions is a bit tricky since splits and joins can
     *             introduce new partitions unless you are using a
     *             read-consistent view.
     */
    public int getPartitionCount() {
        
        return nparts;
        
    }
    
    /**
     * The #of entries visited so far (not the #of entries scanned, which
     * can be much greater if a filter is in use).
     */
    public long getVisitedCount() {
        
        return nvisited;
        
    }
    
    /**
     * Sets up the iterator and starts the locator scan for the given key
     * range. No request is sent to a data service until {@link #hasNext()} is
     * invoked.
     * <p>
     * Note: The {@link PartitionedTupleIterator} uses a sequential scan (rather
     * than mapping across the index partitions in parallel). Read-consistent
     * is achieved by specifying a commitTime for the timestamp rather than
     * {@link ITx#READ_COMMITTED}. The latter will use dirty reads (each time a
     * {@link ResultSet} is fetched it will be fetched from the most recently
     * committed state of the database).
     * 
     * @param ndx
     *            The index on which the range query is performed.
     * @param ts
     *            The timestamp for the view (may be a transaction).
     * @param isReadConsistentTx
     *            <code>true</code> iff the caller specified timestamp is a
     *            read-historical transaction created specifically to give the
     *            iterator read-consistent semantics. When <code>true</code>,
     *            this class will ensure that the transaction is eventually
     *            aborted so that its read lock will be released. This is done
     *            eagerly when the iterator is exhausted and with a
     *            {@link #finalize()} method otherwise.
     * @param fromKey
     *            The first key to visit -or- <code>null</code> iff there is no
     *            lower bound.
     * @param toKey
     *            The first key to NOT visit -or- <code>null</code> iff there is
     *            no upper bound.
     * @param capacity
     *            The #of results per data service query (must be
     *            non-negative).
     * @param flags
     *            The {@link IRangeQuery} flags. {@link IRangeQuery#REVERSE}
     *            selects a reverse scan.
     * @param filter
     *            An optional filter which is passed through to each data
     *            service query.
     * 
     * @throws IllegalArgumentException
     *             if <i>ndx</i> is <code>null</code>.
     * @throws IllegalArgumentException
     *             if <i>capacity</i> is negative.
     */
    public PartitionedTupleIterator(final IScaleOutClientIndex ndx,
            final long ts, final boolean isReadConsistentTx, final byte[] fromKey,
            final byte[] toKey, final int capacity, final int flags,
            final IFilter filter) {

        if (ndx == null) {

            throw new IllegalArgumentException("ndx is null");
            
        }
        
        if (capacity < 0) {

            throw new IllegalArgumentException("capacity is negative: "
                    + capacity);
            
        }
        
        this.ndx = ndx;
        this.ts = ts;
        this.isReadConsistentTx = isReadConsistentTx;
        this.fromKey = this.currentFromKey = fromKey;
        this.toKey = this.currentToKey = toKey;
        this.capacity = capacity;
        
        this.flags = flags;
                
        this.filter = filter;

        this.reverseScan = (flags & IRangeQuery.REVERSE) != 0;

        // start locator scan
        this.locatorItr = ndx.locatorScan(ts, fromKey, toKey, reverseScan);

    }

    /**
     * Safety net which invokes {@link #close()} so that a read-consistent
     * transaction is released even when the caller abandons the iterator
     * before it is exhausted.
     */
    @Override
    protected void finalize() {
        
        close();
        
    }

    /**
     * Marks the iterator as {@link #exhausted} and aborts the {@link #ts} iff
     * it was identified to the ctor as being created specifically to provide
     * read-consistent semantics for this iterator and hence our responsibility
     * to clean up. The method is a NOP once the iterator has been marked as
     * exhausted, so the transaction is aborted at most once.
     */
    private synchronized void close() {

        if (exhausted)
            return;

        exhausted = true;

        if (isReadConsistentTx) {

            try {

                ndx.getFederation().getTransactionService().abort(ts);

            } catch (IOException e) {

                // log and ignore since the caller was not directly affected.
                log.error(ClientIndexView.ERR_ABORT_TX + ts, e);

            }

        }

    }
    
    /**
     * Reports whether another tuple is available, advancing across index
     * partitions as necessary.
     * <p>
     * The method loops rather than recursing so that the stack depth does not
     * grow with the #of empty index partitions (or stale locator restarts)
     * which are crossed before the next tuple is found.
     * 
     * @return <code>true</code> iff another tuple can be obtained from
     *         {@link #next()}.
     * 
     * @throws RuntimeException
     *             if a {@link StaleLocatorException} is reported twice in a row
     *             for the same index partition identifier, or if the source
     *             iterator fails for any other reason.
     */
    public boolean hasNext() {

        while (true) {

            if (exhausted) {

                return false;

            }

            if (locator == null) {

                /*
                 * Setup query for the first partition (or for the first
                 * partition of a locator scan which was restarted after a
                 * StaleLocatorException).
                 */

                if (!nextPartition()) {

                    /*
                     * The locator scan did not report any index partition, so
                     * there is nothing (more) to visit. Close now so that a
                     * read-consistent tx is released eagerly rather than by
                     * the finalizer.
                     */

                    close();

                    return false;

                }

            }

            assert src != null;

            try {

                if (src.hasNext()) {

                    // More from the current source iterator.

                    return true;

                }

            } catch (RuntimeException ex) {

                final StaleLocatorException cause = (StaleLocatorException) InnerCause
                        .getInnerCause(ex, StaleLocatorException.class);

                if (cause == null) {

                    // Not a stale locator problem.
                    throw ex;

                }

                /*
                 * Handle StaleLocatorException. This exception indicates that
                 * we have a stale index partition locator. This can happen when
                 * index partitions are split, joined, or moved. It can only
                 * happen for UNISOLATED or READ_COMMITTED operations since we
                 * never change historical locators. Also, If the index view is
                 * read-committed and read-consistent operations are specified,
                 * then IIndexStore#getLastCommitTime() will be used so stale
                 * locators will not occur for that case either.
                 */

                if (lastStaleLocator != null
                        && lastStaleLocator.getPartitionId() == locator
                                .getPartitionId()) {

                    /*
                     * This happens if we get a StaleLocatorException, restart
                     * the locator scan, and get another StaleLocatorException
                     * on the same index partition. Since a new index partition
                     * identifier is assigned every time there is a
                     * split/join/move this is a clear indication that
                     * something is wrong with either the locator, with the
                     * cached view of the metadata index used by the client, or
                     * the data service. For example, the client may have
                     * failed to refresh its cached view for the locator or the
                     * index partition might have been dropped on the data
                     * service (a no-no to be sure).
                     */

                    throw new RuntimeException(
                            "Missing index partition on data service? "
                                    + locator, ex);

                }

                // notify the client so that it can refresh its cache.
                ndx.staleLocator(ts, locator, cause);

                // save reference
                lastStaleLocator = locator;

                // clear since invalid.
                locator = null;

                // Re-start the locator scan.
                locatorItr = ndx.locatorScan(ts, currentFromKey,
                        currentToKey, reverseScan);

                // Retry against the restarted locator scan.
                continue;

            }

            /*
             * The current index partition is empty (or has been consumed). If
             * there are other index partitions left to query then try again
             * since the next index partition might be empty as well.
             * 
             * Each source iterator reads from one index partition. (The source
             * iterator is itself a chunked iterator so it may issue multiple
             * remote requests to consume the data available on a given index
             * partition).
             */

            if (!nextPartition()) {

                /*
                 * Exhausted.
                 */

                close();

                return false;

            }

        }

    }

    /**
     * Issues a new range query against the next index partition.
     * 
     * @return <code>true</code> iff the locator scan reported another index
     *         partition, in which case {@link #locator} and {@link #src} have
     *         been updated.
     * 
     * @throws RuntimeException
     *             wrapping an {@link InterruptedException} if the current
     *             thread was interrupted.
     */
    private boolean nextPartition() {

        assert ! exhausted;
        
        // Note: Thread.interrupted() clears the interrupt status.
        if (Thread.interrupted()) {

            // notice an interrupt no later than the next partition.
            throw new RuntimeException(new InterruptedException());
            
        }

        if (!locatorItr.hasNext()) {

            if(log.isInfoEnabled())
                log.info("No more locators");

            return false;
            
        }
       
        locator = locatorItr.next();
        
        if (log.isInfoEnabled())
            log.info("locator=" + locator);
        
        // submit query to the next partition.
        rangeQuery();

        assert src != null;

        return true;

    }
    
    /**
     * Issues a range query against the current {@link #locator}. This
     * replaces {@link #src} with a new {@link DataServiceTupleIterator} whose
     * key range is the intersection of the current key-range bounds and the
     * index partition, and increments the #of partitions queried.
     * 
     * @throws RuntimeException
     *             wrapping an {@link InterruptedException} if the current
     *             thread was interrupted, or wrapping any exception thrown
     *             while the query was being set up.
     */
    private void rangeQuery() {

        assert ! exhausted;
        
        assert locator != null;

        // Note: Thread.interrupted() clears the interrupt status.
        if (Thread.interrupted()) {

            // notice an interrupt no later than the next chunk.
            throw new RuntimeException(new InterruptedException());
            
        }
        
        try {

            /*
             * Note: The range query request is formed such that it addresses
             * only those keys that actually lie within the partition and also
             * within the caller's given key range. This has two benefits:
             * 
             * (1) The data service can check the range and report an error for
             * clients that appear to be requesting data for index partitions
             * that have been relocated.
             * 
             * (2) It avoids double-counting (or possible under-counting) when
             * an index partition join (or split) causes the partition bounds to
             * be greater than was originally anticipated.
             */

            final byte[] _fromKey = AbstractKeyRangeIndexProcedure
                    .constrainFromKey(currentFromKey, locator);

            final byte[] _toKey = AbstractKeyRangeIndexProcedure
                    .constrainToKey(currentToKey, locator);
            
            final int partitionId = locator.getPartitionId();
            
            if (log.isInfoEnabled())
                log.info("name=" + ndx.getName() //
                        + ", tx=" + ts //
                        + ", reverseScan=" + reverseScan //
                        + ", partition=" + partitionId //
                        + ", fromKey=" + BytesUtil.toString(_fromKey) //
                        + ", toKey=" + BytesUtil.toString(_toKey));
            
            /*
             * The data service for the current index partition.
             * 
             * @todo this should failover.
             */
            final IDataService dataService = ndx.getDataService(locator);
            
            /*
             * Iterator will visit all data on that index partition.
             * 
             * Note: This merely initializes the variables on the iterator, but
             * it DOES NOT send the request to the data service. That does not
             * happen until you call [src.hasNext()].
             */
            
            src = new DataServiceTupleIterator(ndx, dataService, DataService
                    .getIndexPartitionName(ndx.getName(), partitionId),
                    ts, _fromKey, _toKey, capacity, flags, filter) {
                
                /**
                 * Overridden so that we observe each distinct result set
                 * obtained from the DataService and record how far the scan
                 * has progressed.
                 */
                protected ResultSet getResultSet(final long timestamp,
                        final byte[] fromKey, final byte[] toKey,
                        final int capacity, final int flags,
                        final IFilter filter) {

                    final ResultSet tmp = super.getResultSet(timestamp,
                            fromKey, toKey, capacity, flags, filter);

                    final byte[] lastKey = tmp.getLastKey();

                    if (log.isInfoEnabled())
                        log.info("Got chunk: ntuples=" + tmp.getNumTuples()
                                + ", exhausted=" + tmp.isExhausted()
                                + ", lastKey="
                                + BytesUtil.toString(lastKey));

                    if (lastKey == null) {

                        /*
                         * No tuples were visited by this chunk. Leave the
                         * bounds alone: replacing a bound with [null] would
                         * mean "unbounded" and a locator scan restarted after
                         * a StaleLocatorException would then revisit key
                         * ranges which have already been consumed.
                         */

                        if (log.isInfoEnabled())
                            log.info("Empty chunk: key-range bounds unchanged");

                    } else if (reverseScan) {

                        /*
                         * We are moving backwards through the key order so we
                         * take the last key visited and use it to restrict our
                         * exclusive upper bound. Without this the iterator will
                         * not "advance".
                         */
                        
                        currentToKey = lastKey;

                        if (log.isInfoEnabled())
                            log.info("New exclusive upper bound: "
                                    + BytesUtil.toString(currentToKey));

                    } else {

                        /*
                         * We are moving forwards through the key order so we
                         * take the last key visited and use it to advance our
                         * inclusive lower bound. Without this the iterator will
                         * not advance.
                         * 
                         * NOTE(review): the bound is inclusive, so a locator
                         * scan restarted from it would appear to re-request
                         * the last key visited -- confirm against
                         * DataServiceTupleIterator.
                         */

                        currentFromKey = lastKey;

                        if (log.isInfoEnabled())
                            log.info("New inclusive lower bound: "
                                    + BytesUtil.toString(currentFromKey));

                    }
                    
                    return tmp;
                    
                }
                
            };
            
            // increment the #of partitions visited.
            nparts++;

        } catch (Exception ex) {

            throw new RuntimeException(ex);

        }

    }
    
    /**
     * Returns the next tuple, wrapped so that its visit count reflects the
     * #of tuples visited by this iterator across all index partitions rather
     * than by the per-partition source iterator.
     * 
     * @throws NoSuchElementException
     *             if {@link #hasNext()} reports <code>false</code>.
     */
    public ITuple next() {

        if (!hasNext()) {

            throw new NoSuchElementException();
            
        }

        /*
         * Pin the source which produces this tuple. The [src] field is
         * replaced when the iterator advances to another index partition, so
         * reading the field lazily could report the wrong partition.
         */
        final DataServiceTupleIterator sourceForTuple = src;

        final ITuple sourceTuple = sourceForTuple.next();

        // Counted only once the source has actually delivered the tuple.
        final long visitCountForTuple = ++nvisited;

        /*
         * Override the visitCount.
         */
        return new DelegateTuple( sourceTuple ) {
            
            public long getVisitCount() {
                
                return visitCountForTuple;
                
            }
            
            public String toString() {
                
                return super.toString()+" : partition="+sourceForTuple.name;
                
            }
            
        };
        
    }

    /**
     * Batch delete behind semantics. The request is delegated to the source
     * iterator for the current index partition.
     * 
     * @throws IllegalStateException
     *             if no index partition has been queried yet.
     * 
     * @see DataServiceTupleIterator#remove()
     */
    public void remove() {
        
        if (src == null)
            throw new IllegalStateException();
        
        src.remove();
        
    }

    /**
     * Human readable representation of the ctor parameters followed by the
     * dynamic state of the iterator.
     */
    @Override
    public String toString() {
        
        final StringBuilder sb = new StringBuilder();
        
        sb.append(getClass().getSimpleName());

        sb.append("{ flags=" + Tuple.flagString(flags));

        sb.append(", timestamp=" + ts);

        sb.append(", isReadConsistentTx=" + isReadConsistentTx);

        sb.append(", capacity=" + capacity);

        sb.append(", fromKey="
                + (fromKey == null ? "n/a" : BytesUtil.toString(fromKey)));

        sb.append(", toKey="
                + (toKey == null ? "n/a" : BytesUtil.toString(toKey)));

        sb.append(", filter=" + filter);

        // dynamic state.

        sb.append(", #visited=" + nvisited);

        sb.append(", exhausted=" + exhausted);

        sb.append(", locator=" + locator);

        sb.append(", lastStaleLocator=" + lastStaleLocator);

        // Note: [src] is the per index partition source (dynamic state).
        sb.append(", src=" + (src == null ? "N/A" : src.getClass()));

        sb.append("}");

        return sb.toString();

    }

}