/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Sep 13, 2008
*/
package com.bigdata.service;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeoutException;
import org.apache.log4j.Logger;
import com.bigdata.btree.ILinearList;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexSegment;
import com.bigdata.journal.ITransactionService;
import com.bigdata.journal.ITx;
import com.bigdata.journal.NoSuchIndexException;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.StoreManager;
import com.bigdata.service.AbstractScaleOutClient.MetadataIndexCachePolicy;
import com.bigdata.service.AbstractScaleOutClient.Options;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.util.BytesUtil;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
/**
* Abstract base class for federation implementations using the scale-out index
* architecture (federations that support key-range partitioned indices).
*
* @author Bryan Thompson
* @version $Id$
* @param <T>
* The generic type of the client or service.
*/
public abstract class AbstractScaleOutFederation<T> extends AbstractFederation<T> {
/**
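* Creates the index and metadata index caches and reads the
* {@link Options#METADATA_INDEX_CACHE_POLICY} from the client's properties.
* <p>
* A minimal configuration sketch (how the properties reach the client depends
* on the concrete {@link IBigdataClient} implementation; the value shown is
* simply the documented default):
* <pre>
* final Properties properties = new Properties();
*
* properties.setProperty(Options.METADATA_INDEX_CACHE_POLICY,
*         Options.DEFAULT_METADATA_INDEX_CACHE_POLICY);
* </pre>
*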
* @param client
*/
public AbstractScaleOutFederation(final IBigdataClient client) {
super(client);
indexCache = new IndexCache(this, client.getIndexCacheCapacity(),
client.getIndexCacheTimeout());
metadataIndexCache = new MetadataIndexCache(this, client
.getIndexCacheCapacity(), client.getIndexCacheTimeout());
final Properties properties = client.getProperties();
metadataIndexCachePolicy = MetadataIndexCachePolicy.valueOf(properties
.getProperty(Options.METADATA_INDEX_CACHE_POLICY,
Options.DEFAULT_METADATA_INDEX_CACHE_POLICY));
if (log.isInfoEnabled())
log.info(Options.METADATA_INDEX_CACHE_POLICY + "="
+ metadataIndexCachePolicy);
}
protected final MetadataIndexCachePolicy metadataIndexCachePolicy;
/**
* Strengthens the return type.
*
* {@inheritDoc}
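* <p>
* For example (hypothetical index name; <code>fed</code> stands for this
* federation object):
* <pre>
* final ClientIndexView ndx = fed.getIndex("myIndex", ITx.READ_COMMITTED);
* </pre>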
*/
public ClientIndexView getIndex(final String name, final long timestamp) {
return (ClientIndexView) super.getIndex(name, timestamp);
}
/**
* {@inheritDoc}
*
* Extended to clear the various caches.
*/
@Override
public synchronized void shutdown() {
super.shutdown();
indexCache.shutdown();
metadataIndexCache.shutdown();
}
/**
* {@inheritDoc}
*
* Extended to clear the various caches.
*/
@Override
public synchronized void shutdownNow() {
super.shutdownNow();
indexCache.shutdown();
metadataIndexCache.shutdown();
}
/**
* Return a read-only view onto an {@link IMetadataIndex}.
*
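* <p>
* For example, a point lookup of the locator for the index partition that
* would span a given key might look like this (hypothetical index name, key,
* and timestamp; <code>fed</code> stands for this federation object):
* <pre>
* final IMetadataIndex mdi = fed.getMetadataIndex("myIndex", timestamp);
*
* final PartitionLocator locator = mdi.find(key);
* </pre>
*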
* @param name
* The name of the scale-out index.
* @param timestamp
* The timestamp for the view.
*
* @todo The easiest way to have the view be correct is for the operations
* to all run against the remote metadata index (no caching).
*
* There are three kinds of queries that we do against the metadata
* index: (1) get(key); (2) find(key); and (3)
* locatorScan(fromKey,toKey). The first is only used by the unit
* tests. The second is used when we start a locator scan, when we
* split a batch operation against the index partitions, and when we
* map an index procedure over a key range or use a key range
* iterator. This is the most costly of the queries, but it is also
* the one that is the least easy to cache. The locator scan itself is
* heavily buffered - a cache would only help for frequently scanned
* and relatively small key ranges. For this purpose, it may be better
* to cache the iterator result itself locally to the client (for
* historical reads or transactional reads).
*
* The difficulty with caching find(key) is that we need to use the
* {@link ILinearList} API to locate the appropriate index partition.
* However, since it is a cache, there can be cache misses. These
* would show up as a "gap" in the (leftSeparator, rightSeparator)
* coverage.
*
* If we do not cache access to the remote metadata index then we will
* impose additional latency on clients, traffic on the network, and
* demands on the metadata service. However, high client
* concurrency mitigates the increase in access latency to the
* metadata index.
*
* @todo Use a weak-ref cache with an LRU (or hard reference cache) to evict
* cached {@link PartitionLocator}s. The client needs access by {
* indexName, timestamp, key }. We need to eventually evict the cached
* locators to prevent the client from building up too much state
* locally. Also the cached locators can not be shared across
* different timestamps, so clients will build up a locator cache when
* working on a transaction but then never go back to that cache once
* the transaction completes.
*
* While it may be possible to share cached locators between
* historical reads and transactions for the same point in history, we
* do not have enough information on hand to make those decisions.
* What we would need to know is the historical commit time
* corresponding to an assigned transaction startTime. This is not
* one-to-one since the start times for transactions must be unique
* (among those in play). See {@link ITransactionService#newTx(long)}
* for more on this.
*
* @todo cache leased information about index partitions of interest to the
* client. The cache will be a little tricky since we need to know
* when the client does not possess a partition definition. Index
* partitions are defined by the separator key - the first key that
* lies beyond that partition. The danger then is that a client will
* presume that any key before the first leased partition is part of
* that first partition. To guard against that, the client needs to
* know the separator keys that represent both the upper and lower
* bounds of each partition. If a lookup in the cache falls outside of
* any known partition's upper and lower bounds then it is a cache miss
* and we have to ask the metadata service for a lease on the
* partition. The cache itself is just a btree data structure with the
* proviso that some cache entries represent missing partition
* definitions (aka the lower bounds for known partitions where the
* left sibling partition is not known to the client).
*
* With even a modest #of partitions, a locator scan against the MDS
* will be cheaper than attempting to fill multiple "gaps" in a local
* locator cache, so such a cache might be reserved for point tests.
* Such point tests are used by the sparse row store for its row local
* operations (vs scans) but are less common for JOINs.
*
* @todo Just create cache view when MDI is large and then cache on demand.
*
* @todo If the {@link IMetadataIndex#get(byte[])} and
* {@link IMetadataIndex#find(byte[])} methods are to be invoked
* remotely then we should return the byte[] rather than the
* de-serialized {@link PartitionLocator} so that we don't
* de-serialize them from the index only to serialize them for RMI and
* then de-serialize them again on the client.
*
* @todo The easiest way to handle a scale-out metadata index is to make it
* hash-partitioned (vs range-partitioned). We can just flood queries
* to the hash partitioned index. For the iterator, we have to buffer
* the results and place them back into order. A fused view style
* iterator could be used to merge the iterator results from each
* partition into a single totally ordered iterator.
*/
public IMetadataIndex getMetadataIndex(final String name,
final long timestamp) {
if (log.isInfoEnabled())
log.info("name=" + name + " @ " + timestamp);
assertOpen();
return getMetadataIndexCache().getIndex(name, timestamp);
}
/**
* Returns an iterator that will visit the {@link PartitionLocator}s for the
* specified scale-out index key range.
*
* The method fetches a chunk of locators at a time from the metadata index.
* Unless the #of index partitions spanned is very large, this will be an
* atomic read of locators from the metadata index. When the #of index
* partitions spanned is very large, then this will allow a chunked
* approach.
*
* Note: It is possible that a split, join or move could occur during the
* process of mapping the procedure across the index partitions. When the
* view is {@link ITx#UNISOLATED} or {@link ITx#READ_COMMITTED} this could
* make the set of mapped index partitions inconsistent in the sense that it
* might double count some parts of the key range or that it might skip some
* parts of the key range. In order to avoid this problem the caller MUST
* use read-consistent semantics. If the {@link ClientIndexView} is
* not already isolated by a transaction, then the caller MUST create a
* read-only transaction using the global last commit time of the federation.
*
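* <p>
* A minimal usage sketch (hypothetical index name and key range;
* <code>fed</code> stands for this federation object and <code>tx</code> is
* assumed to identify a read-consistent view such as a read-only
* transaction):
* <pre>
* final Iterator&lt;PartitionLocator&gt; itr = fed.locatorScan("myIndex", tx,
*         fromKey, toKey, false);
*
* while (itr.hasNext()) {
*
*     final PartitionLocator locator = itr.next();
*
*     // Dispatch work for the key range covered by this index partition.
*
* }
* </pre>
*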
* @param name
* The name of the scale-out index.
* @param timestamp
* The timestamp of the view. It is the responsibility of the
* caller to choose the timestamp so as to provide
* read-consistent semantics for the locator scan.
* @param fromKey
* The scale-out index first key that will be visited
* (inclusive). When <code>null</code> there is no lower bound.
* @param toKey
* The first scale-out index key that will NOT be visited
* (exclusive). When <code>null</code> there is no upper bound.
* @param reverseScan
* <code>true</code> if you need to visit the index partitions in
* reverse key order (this is done when the partitioned iterator
* is scanning backwards).
*
* @return The iterator.
*
* @throws IllegalArgumentException
* if name is null.
* @throws NoSuchIndexException
* if there is no such scale-out index at the specified
* timestamp
*/
@SuppressWarnings("unchecked")
public Iterator<PartitionLocator> locatorScan(final String name,
final long timestamp, final byte[] fromKey, final byte[] toKey,
final boolean reverseScan) {
if (name == null)
throw new IllegalArgumentException();
if (log.isInfoEnabled())
log.info("Querying metadata index: name=" + name + ", timestamp="
+ timestamp + ", reverseScan=" + reverseScan + ", fromKey="
+ BytesUtil.toString(fromKey) + ", toKey="
+ BytesUtil.toString(toKey));
final IMetadataIndex mdi = getMetadataIndex(name, timestamp);
if (mdi == null)
throw new NoSuchIndexException("name=" + name + "@"
+ TimestampUtility.toString(timestamp));
final ITupleIterator itr;
// the values are the locators (keys are not required).
final int flags = IRangeQuery.VALS;
if (reverseScan) {
/*
* Reverse locator scan.
*
* The first locator visited will be the first index partition whose
* leftSeparator is LT the optional toKey. (If the toKey falls on an
* index partition boundary then we use the prior index partition).
*/
itr = mdi.rangeIterator(//
fromKey,//
toKey, //
0, // capacity
flags | IRangeQuery.REVERSE,
null // filter
);
} else {
/*
* Forward locator scan.
*
* Note: The scan on the metadata index needs to start at the index
* partition in which the fromKey would be located. Therefore, when
* the fromKey is specified, we replace it with the leftSeparator of
* the index partition which would contain that fromKey.
*/
final byte[] _fromKey = fromKey == null //
? null //
: mdi.find(fromKey).getLeftSeparatorKey()//
;
itr = mdi.rangeIterator(//
_fromKey,//
toKey, //
0, // capacity
flags,//
null // filter
);
}
return new Striterator(itr).addFilter(new Resolver(){
private static final long serialVersionUID = 7874887729130530971L;
@Override
protected Object resolve(Object obj) {
final ITuple tuple = (ITuple) obj;
return tuple.getObject();
}
});
}
/**
* Return <code>true</code>.
*/
final public boolean isScaleOut() {
return true;
}
private final IndexCache indexCache;
private final MetadataIndexCache metadataIndexCache;
protected IndexCache getIndexCache() {
return indexCache;
}
/**
* Return the cache for {@link IMetadataIndex} objects.
*/
protected MetadataIndexCache getMetadataIndexCache() {
return metadataIndexCache;
}
/**
* Await the availability of an {@link IMetadataService} and the specified
* minimum #of {@link IDataService}s.
*
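* <p>
* For example, to block until the metadata service and at least two data
* services have been discovered, failing after ten seconds
* (<code>fed</code> stands for this federation object):
* <pre>
* final UUID[] dataServices = fed.awaitServices(2, 10000);
* </pre>
*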
* @param minDataServices
* The minimum #of data services.
* @param timeout
* The timeout (ms).
*
* @return An array of the {@link UUID}s of the {@link IDataService}s
* that have been discovered by this client. Note that at
* least minDataServices elements will be present in this
* array but that ALL discovered data services MAY be reported.
*
* @throws IllegalArgumentException
* if minDataServices is non-positive.
* @throws IllegalArgumentException
* if timeout is non-positive.
* @throws IllegalStateException
* if the client is not connected to the federation.
* @throws InterruptedException
* if this thread is interrupted while awaiting the availability
* of the {@link MetadataService} or the specified #of
* {@link DataService}s.
* @throws TimeoutException
* If a timeout occurs.
*
* @todo We should await critical services during connect() {MDS, TS, LS}.
* The LBS is not critical, but we should either have it on hand to
* notice our service join or we should notice its JOIN and then
* notice it ourselves. That would leave this method with the
* responsibility for awaiting the join of at least N data services
* (and perhaps verifying that the other services are still joined).
*
* FIXME This should be rewritten in the JiniFederation subclass to use the
* ServiceDiscoveryListener interface implemented by that class.
*/
public UUID[] awaitServices(final int minDataServices, final long timeout)
throws InterruptedException, TimeoutException {
assertOpen();
if (minDataServices <= 0)
throw new IllegalArgumentException();
if (timeout <= 0)
throw new IllegalArgumentException();
final long begin = System.currentTimeMillis();
// sleep interval if not ready (ms).
final long interval = Math.min(100, timeout / 10);
int ntries = 0;
// updated each time through the loop.
IMetadataService metadataService = null;
// updated each time through the loop.
UUID[] dataServiceUUIDs = null;
while (true) {
// set on entry each time through the loop.
metadataService = getMetadataService();
// set on entry each time through the loop.
dataServiceUUIDs = getDataServiceUUIDs(0/* all */);
if ((System.currentTimeMillis() - begin) >= timeout
|| (metadataService != null && dataServiceUUIDs.length >= minDataServices)) {
/*
* Either a timeout or we have the MDS and enough DS.
*
* Either way, we are done so break out of the loop.
*/
break;
}
ntries++;
if (log.isInfoEnabled())
log.info("Waiting : ntries=" + ntries + ", metadataService="
+ (metadataService == null ? "not " : "")
+ " found; #dataServices=" + dataServiceUUIDs.length
+ " out of " + minDataServices + " required : "
+ Arrays.toString(dataServiceUUIDs));
// @todo the way this is written can sleep longer than the remaining time
Thread.sleep(interval);
continue;
}
if (log.isInfoEnabled())
log.info("MDS=" + (metadataService != null) + ", #dataServices="
+ dataServiceUUIDs.length);
if (metadataService != null
&& dataServiceUUIDs.length >= minDataServices) {
// success.
return dataServiceUUIDs;
}
throw new TimeoutException("elapsed="
+ (System.currentTimeMillis() - begin) + "ms: metadataService="
+ (metadataService != null) + ", dataServices="
+ dataServiceUUIDs.length+" but require "+minDataServices);
}
/**
* Force overflow of each data service in the scale-out federation (only
* scale-out federations support overflow processing). This method is
* synchronous. It will not return until all {@link DataService}s have
* initiated and completed overflow processing. Any unused resources (as
* determined by the {@link StoreManager}) will have been purged.
*
* This is a relatively fast operation when
* <code>compactingMerge := false</code>. By specifying both
* <code>compactingMerge := false</code> and
* <code>truncateJournal := false</code> you can cause the data services to
* close out their current journals against further writes. While this is
* not a global synchronous operation, it can provide a basis to obtain a
* "near synchronous" snapshot from the federation consisting of all writes
* up to the point where overflow was triggered on each data service.
*
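* <p>
* For example, to close out the live journals without paying for a
* compacting merge or truncating the journals (the "near synchronous"
* snapshot pattern described above; <code>fed</code> stands for this
* federation object):
* <pre>
* fed.forceOverflow(false, false);
* </pre>
*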
* @param compactingMerge
* When <code>true</code>, each shard on each
* {@link IDataService} will undergo a compacting merge.
* Synchronous parallel compacting merge of all shards is an
* expensive operation. This parameter should normally be
* <code>false</code> unless you are requesting a compacting
* merge for specific purposes, such as benchmarking when all
* data is known to exist in one {@link IndexSegment} per shard.
* @param truncateJournal
* When <code>true</code>, the live journal will be truncated to
* its minimum extent (all writes will be preserved but there
* will be no free space left in the journal). This may be used
* to force the {@link DataService} to its minimum possible
* footprint.
*
* @todo when overflow processing is enabled for the {@link MetadataService}
* we will have to modify this to also trigger overflow for those
* services.
*/
public void forceOverflow(final boolean compactingMerge, final boolean truncateJournal) {
// find UUID for each data service.
final UUID[] dataServiceUUIDs = getDataServiceUUIDs(0/* maxCount */);
final int ndataServices = dataServiceUUIDs.length;
log.warn("Forcing overflow: #dataServices=" + ndataServices + ", now=" + new Date());
final List<Callable<Void>> tasks = new ArrayList<Callable<Void>>(ndataServices);
for (UUID serviceUUID : dataServiceUUIDs) {
tasks.add(new ForceOverflowTask(getDataService(serviceUUID),
compactingMerge, truncateJournal));
}
if(truncateJournal) {
/*
* @todo The metadata service does not yet support overflow (it does
* not support partitioned metadata indices) so it only has a live
* journal. Therefore all we can do is truncate the live journal for
* the metadata service.
*/
tasks.add(new PurgeResourcesTask(getMetadataService(),
truncateJournal));
}
final List<Future<Void>> futures;
try {
futures = getExecutorService().invokeAll(tasks);
} catch (InterruptedException ex) {
throw new RuntimeException(ex);
}
int nok = 0;
for (Future<Void> f : futures) {
try {
f.get();
nok++;
} catch (InterruptedException ex) {
log.warn(ex.getLocalizedMessage());
continue;
} catch (ExecutionException ex) {
log.error(ex.getLocalizedMessage(), ex);
}
}
log.warn("Did overflow: #ok=" + nok + ", #dataServices="
+ ndataServices + ", now=" + new Date());
if (nok != tasks.size()) {
throw new RuntimeException(
"Errors during overflow processing: #ok=" + nok
+ ", #tasks=" + tasks.size());
}
}
/**
* Task directs a {@link DataService} to purge any unused resources and to
* optionally truncate the extent of the live journal.
*
* @author Bryan Thompson
* @version $Id$
*/
public static class PurgeResourcesTask implements Callable<Void> {
protected static final Logger log = Logger
.getLogger(PurgeResourcesTask.class);
private final IDataService dataService;
private final boolean truncateJournal;
public PurgeResourcesTask(final IDataService dataService,
final boolean truncateJournal) {
if (dataService == null)
throw new IllegalArgumentException();
this.dataService = dataService;
this.truncateJournal = truncateJournal;
}
public Void call() throws Exception {
if (log.isInfoEnabled())
log.info("dataService: " + dataService.getServiceName());
if (!dataService.purgeOldResources(5000/* ms */, truncateJournal)) {
log.warn("Could not pause write service - resources will not be purged.");
}
return null;
}
}
/**
* Task forces immediate overflow of the specified data service, returning
* once both synchronous AND asynchronous overflow are complete.
*
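* <p>
* A usage sketch (this is essentially what
* {@link AbstractScaleOutFederation#forceOverflow(boolean, boolean)} does for
* each discovered data service; <code>fed</code> and
* <code>dataService</code> are hypothetical references):
* <pre>
* final Future&lt;Void&gt; f = fed.getExecutorService().submit(
*         new ForceOverflowTask(dataService, false, false));
*
* f.get();
* </pre>
*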
* @author Bryan Thompson
* @version $Id$
*/
public static class ForceOverflowTask implements Callable<Void> {
protected static final Logger log = Logger
.getLogger(ForceOverflowTask.class);
private final IDataService dataService;
private final boolean compactingMerge;
private final boolean truncateJournal;
public ForceOverflowTask(final IDataService dataService,
final boolean compactingMerge, final boolean truncateJournal) {
if (dataService == null)
throw new IllegalArgumentException();
this.dataService = dataService;
this.compactingMerge = compactingMerge;
this.truncateJournal = truncateJournal;
}
public Void call() throws Exception {
if(log.isInfoEnabled())
log.info("dataService: " + dataService.getServiceName());
// returns once synchronous overflow is complete.
dataService.forceOverflow(true/* immediate */, compactingMerge);
if (log.isInfoEnabled())
log.info("Synchronous overflow is done: "
+ dataService.getServiceName());
// wait until overflow processing is done.
while (dataService.isOverflowActive()) {
Thread.sleep(100/* ms */);
}
if (log.isInfoEnabled())
log.info("Asynchronous overflow is done: "
+ dataService.getServiceName());
/*
* Note: Old resources are automatically released as the last step
* of asynchronous overflow processing. Therefore all we are really
* doing here is issuing a request to truncate the journal. However,
* we use the same method to accomplish both ends.
*/
if (truncateJournal) {
if (!dataService
.purgeOldResources(5000/* ms */, true/* truncateJournal */)) {
log.warn("Could not pause write service - resources will not be purged.");
}
}
return null;
}
}
}