// com.bigdata.mdi.MetadataIndex Maven / Gradle / Ivy
//
// Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.mdi;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.UUID;


import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.view.FusedView;
import com.bigdata.io.LongPacker;
import com.bigdata.journal.ICommitter;
import com.bigdata.journal.IResourceManager;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.service.MetadataService;

/**
 * A metadata index for the partitions of a distributed index. There is one
 * metadata index for each distributed index. The keys of the metadata index are
 * the first key that would be directed into the corresponding index segment,
 * e.g., a separator key (this is just the standard btree semantics).
 * The values are serialized {@link PartitionLocator} objects.
 * 
 * Note: At this time the recommended scale-out approach for the metadata index
 * is to place the metadata indices on a {@link MetadataService} (the same
 * {@link MetadataService} may be used for an arbitrary #of scale-out indices)
 * and to replicate the state for the {@link MetadataService} onto
 * failover {@link MetadataService}s. Since the {@link MetadataIndex} may grow
 * without bound, you simply need to have enough disk on hand for it (the size
 * requirements are quite modest). Further, the {@link MetadataService} MUST NOT
 * be used to hold the data for the scale-out indices themselves since the
 * {@link MetadataIndex} can not undergo {@link IResourceManager#overflow()}.
 * <p>
 * One advantage of this approach is that the {@link MetadataIndex} is
 * guaranteed to hold all historical states of the partition definitions for
 * each index - effectively it is an immortal store for the partition metadata.
 * On the other hand it is not possible to compact the metadata index without
 * taking the database offline.
 * 
 * @author Bryan Thompson
 * @version $Id$
 * 
 * @todo The {@link MetadataIndex} does NOT support either overflow (it may NOT
 *       be a {@link FusedView}) NOR key-range splits. There are several issues
 *       involved:
 *       <p>
 *       (a) How to track the next partition identifier to be assigned to an
 *       index partition for the managed index. Currently this value is written
 *       in the {@link MetadataIndexCheckpoint} record and is propagated to the
 *       new backing store on overflow. However, if the metadata index is split
 *       into partitions then additional care MUST be taken to use only the
 *       value of that field on the 'meta-meta' index.
 *       <p>
 *       (b) how to locate the partitions of the metadata index itself.
 * 
 * @todo One way to locate the metadata-index partitions is to hash partition
 *       the metadata index and flood range queries to all partitions. The #of
 *       metadata service nodes can be changed by a suitable broadcast event in
 *       which clients have to change to the new hash basis. This feature can
 *       be generalized to provide hash partitioned indices as well as
 *       key-range partitioned indices.
 * 
 * @todo A metadata index can be recovered by a distributed process running over
 *       the data services. Each data service reports all index partitions. The
 *       reports are collected and the index is rebuilt from the reports. Much
 *       like a map/reduce job.
 */
public class MetadataIndex extends BTree implements IMetadataIndex {

    /**
     * Helper to which {@link #find(byte[])} and
     * {@link #staleLocator(PartitionLocator)} delegate. It is created once in
     * the constructor (wrapping this index) and is declared transient.
     */
    private transient final MetadataIndexView view;
    
    /**
     * Returns the index metadata of this metadata index, narrowed to
     * {@link MetadataIndexMetadata}.
     * 
     * @return The value reported by the superclass accessor, cast to
     *         {@link MetadataIndexMetadata}.
     */
    public MetadataIndexMetadata getIndexMetadata() {

        final MetadataIndexMetadata narrowed = (MetadataIndexMetadata) super
                .getIndexMetadata();

        return narrowed;

    }
    
    /**
     * Returns the metadata template for the managed scale-out index.
     * 
     * @return The value of
     *         {@link MetadataIndexMetadata#getManagedIndexMetadata()} for the
     *         metadata record returned by {@link #getIndexMetadata()}.
     */
    public IndexMetadata getScaleOutIndexMetadata() {

        final IndexMetadata managed = getIndexMetadata()
                .getManagedIndexMetadata();

        return managed;

    }
    
    /**
     * Hands out the partition identifier to be assigned to the next partition
     * created on this {@link MetadataIndex} and then advances the counter.
     * Note that, despite the method name, the value returned is the one held
     * by the counter <em>before</em> it is incremented. The counter will be
     * made restart-safe iff the index is dirty, the index is registered as an
     * {@link ICommitter}, and the store on which the index is stored is
     * committed.
     * <p>
     * Note: The metadata index uses a 32-bit partition identifier rather than
     * the {@link #getCounter()}. The reason is that the index counter uses the
     * partition identifier in the high word and a partition local counter in
     * the low word. Therefore we have to centralize the assignment of the
     * partition identifier, even when the metadata index is itself split into
     * partitions. Requests for partition identifiers need to be directed to the
     * root partition (L0) for the {@link MetadataIndex}.
     * 
     * @return The partition identifier to assign to the next partition.
     */
    public int incrementAndGetNextPartitionId() {

        // Remember the identifier being handed out, then advance the counter.
        final int assigned = nextPartitionId;

        nextPartitionId = assigned + 1;

        // Notify listener that the index is dirty.
        fireDirtyEvent();

        return assigned;

    }

    /**
     * The next partition identifier to be handed out. It is seeded from the
     * {@link MetadataIndexCheckpoint} in the constructor and copied back into
     * a new {@link MetadataIndexCheckpoint} when one is created from this
     * index.
     */
    private int nextPartitionId;
    
    /**
     * Create a new {@link MetadataIndex}.
     * <p>
     * The index metadata for the new index is configured with delete markers
     * enabled, with {@link MetadataIndex} as the B+Tree implementation class,
     * with {@link MetadataIndexCheckpoint} as the checkpoint record class and
     * with a {@link PartitionLocatorTupleSerializer} as the tuple serializer.
     * 
     * @param store
     *            The backing store.
     * @param indexUUID
     *            The unique identifier for the metadata index.
     * @param managedIndexMetadata
     *            The metadata template for the managed scale-out index.
     * 
     * @return The new {@link MetadataIndex}.
     * 
     * @throws IllegalArgumentException
     *             if <i>managedIndexMetadata</i> is <code>null</code>.
     */
    public static MetadataIndex create(IRawStore store, UUID indexUUID,
            IndexMetadata managedIndexMetadata) {

        if (managedIndexMetadata == null) {

            /*
             * Reject a null template up front. Otherwise the getName() call
             * below fails with a NullPointerException before the
             * MetadataIndexMetadata constructor gets a chance to apply its own
             * IllegalArgumentException check.
             */
            throw new IllegalArgumentException("managedIndexMetadata is null");

        }

        final MetadataIndexMetadata metadata = new MetadataIndexMetadata(
                managedIndexMetadata.getName(), indexUUID, managedIndexMetadata);

        /*
         * Delete markers are enabled so that compacting merges can be done on
         * the metadata index.
         * 
         * @todo this is really moot until we support overflow of the
         * metadataService and partitioning of the metadata index.
         */
        metadata.setDeleteMarkers(true);

        /*
         * Override the implementation class.
         */
        metadata.setBTreeClassName(MetadataIndex.class.getName());

        /*
         * Override the checkpoint record implementation class.
         */
        metadata.setCheckpointClassName(MetadataIndexCheckpoint.class.getName());

        /*
         * Tuple serializer.
         */
        metadata.setTupleSerializer(PartitionLocatorTupleSerializer.newInstance());

        return (MetadataIndex) BTree.create(store, metadata);

    }

    /**
     * Required ctor.
     * 
     * @param store
     * @param checkpoint
     * @param metadata
     */
    public MetadataIndex(IRawStore store, Checkpoint checkpoint, IndexMetadata metadata, boolean readOnly) {
        
        super(store, checkpoint, metadata, readOnly);

        /*
         * copy the initial value from the checkpoint record.
         */

        nextPartitionId = ((MetadataIndexCheckpoint)checkpoint).getNextPartitionId();
        
        view = new MetadataIndexView(this);
        
    }
    
    /**
     * Extended to also require a checkpoint when the in-memory next partition
     * identifier differs from the one recorded in the current checkpoint,
     * i.e., when {@link #incrementAndGetNextPartitionId()} has been invoked
     * since that checkpoint was taken.
     * 
     * @return <code>true</code> if the counter has changed or if the
     *         superclass reports that a checkpoint is needed.
     */
    public boolean needsCheckpoint() {

        final int persisted = ((MetadataIndexCheckpoint) getCheckpoint())
                .getNextPartitionId();

        // The superclass is consulted only when the counter is unchanged.
        return nextPartitionId != persisted || super.needsCheckpoint();

    }
    
    /**
     * Extends the {@link Checkpoint} record to store the next partition
     * identifier to be assigned by the metadata index.
     * <p>
     * The external form is the superclass state followed by a packed version
     * number and the next partition identifier written as a 32-bit integer.
     * 
     * @author Bryan Thompson
     * @version $Id$
     */
    public static class MetadataIndexCheckpoint extends Checkpoint {

        private static final long serialVersionUID = 6482587101150014793L;

        /**
         * The next partition identifier as of this checkpoint. It is assigned
         * by the constructors and by {@link #readExternal(ObjectInput)}; there
         * is no setter.
         */
        private int nextPartitionId;

        /**
         * The value of the nextPartitionId counter stored in this checkpoint
         * record.
         */
        public int getNextPartitionId() {

            return nextPartitionId;

        }

        /**
         * De-serialization constructor.
         */
        public MetadataIndexCheckpoint() {

        }

        /**
         * Create a checkpoint record from the current state of a
         * {@link MetadataIndex}.
         * 
         * @param btree
         *            The index. It is cast unconditionally to
         *            {@link MetadataIndex} in order to copy its current next
         *            partition identifier.
         */
        public MetadataIndexCheckpoint(BTree btree) {

            super(btree);

            nextPartitionId = ((MetadataIndex) btree).nextPartitionId;

        }

        /**
         * Create the initial checkpoint record for the initial metadata index.
         * 
         * @param metadata
         *            The index metadata record.
         */
        public MetadataIndexCheckpoint(IndexMetadata metadata) {

            super(metadata);

            // The first partitionId is zero(0).
            nextPartitionId = 0;

        }

        /**
         * Create the initial checkpoint record when the metadata index
         * overflows onto a new backing store.
         * 
         * @param metadata
         *            The index metadata record.
         * @param oldCheckpoint
         *            The last checkpoint on the old backing store. It is cast
         *            unconditionally to {@link MetadataIndexCheckpoint}.
         */
        public MetadataIndexCheckpoint(IndexMetadata metadata, Checkpoint oldCheckpoint) {

            super(metadata, oldCheckpoint);

            // propagate the value of this field onto the new backing store.
            nextPartitionId = ((MetadataIndexCheckpoint) oldCheckpoint).nextPartitionId;

        }

        /**
         * The initial (and only) version of the state added by this class.
         */
        private static final transient int VERSION0 = 0x0;

        /**
         * Reads the superclass state, then the packed version number, then the
         * next partition identifier.
         * 
         * @throws IOException
         *             if the version number is not {@link #VERSION0}.
         */
        @Override
        public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {

            super.readExternal(in);

            /*
             * Compare the version as a long. Narrowing it to an int first
             * would discard the high bits, so a value whose low 32 bits are
             * zero would be mistaken for VERSION0.
             */
            final long version = LongPacker.unpackLong(in);

            if (version != VERSION0)
                throw new IOException("Unknown version: " + version);

            nextPartitionId = in.readInt();

        }

        /**
         * Writes the superclass state, then the packed version number, then
         * the next partition identifier.
         */
        @Override
        public void writeExternal(ObjectOutput out) throws IOException {

            super.writeExternal(out);

            LongPacker.packLong(out, VERSION0);

            out.writeInt(nextPartitionId);

        }

    }

    /**
     * Extends the {@link IndexMetadata} record to hold the metadata template
     * for the managed scale-out index.
     * <p>
     * The external form is the superclass state followed by a packed version
     * number and the serialized metadata template of the managed index.
     * 
     * @author Bryan Thompson
     * @version $Id$
     */
    public static class MetadataIndexMetadata extends IndexMetadata implements Externalizable {

        private static final long serialVersionUID = -7309267778881420043L;

        /**
         * The metadata template for the managed scale-out index. It is
         * assigned by the first time constructor and by
         * {@link #readExternal(ObjectInput)}.
         */
        private IndexMetadata scaleOutIndexMetadata;

        /**
         * The managed index metadata
         */
        public final IndexMetadata getManagedIndexMetadata() {

            return scaleOutIndexMetadata;

        }

        /**
         * De-serialization constructor.
         */
        public MetadataIndexMetadata() {

        }

        /**
         * First time constructor.
         * 
         * @param managedIndexName
         *            The name of the managed index. The name of the metadata
         *            index is given by
         *            {@link MetadataService#getMetadataIndexName(String)}
         * @param indexUUID
         *            The UUID of the metadata index.
         * @param managedIndexMetadata
         *            The metadata template for the managed index.
         * 
         * @throws IllegalArgumentException
         *             if <i>managedIndexMetadata</i> is <code>null</code>.
         */
        public MetadataIndexMetadata(String managedIndexName, UUID indexUUID, IndexMetadata managedIndexMetadata) {

            super(MetadataService.getMetadataIndexName(managedIndexName), indexUUID);

            if (managedIndexMetadata == null) {

                throw new IllegalArgumentException();

            }

            this.scaleOutIndexMetadata = managedIndexMetadata;

        }

        /**
         * The initial (and only) version of the state added by this class.
         */
        private static final transient int VERSION0 = 0x0;

        /**
         * Reads the superclass state, then the packed version number, then the
         * metadata template for the managed index.
         * 
         * @throws IOException
         *             if the version number is not {@link #VERSION0}.
         */
        @Override
        public void readExternal(ObjectInput in) throws IOException,
                ClassNotFoundException {

            super.readExternal(in);

            /*
             * Compare the version as a long. Narrowing it to an int first
             * would discard the high bits, so a value whose low 32 bits are
             * zero would be mistaken for VERSION0.
             */
            final long version = LongPacker.unpackLong(in);

            if (version != VERSION0) {

                throw new IOException("Unknown version: version=" + version);

            }

            scaleOutIndexMetadata = (IndexMetadata) in.readObject();

        }

        /**
         * Writes the superclass state, then the packed version number, then
         * the metadata template for the managed index.
         */
        @Override
        public void writeExternal(ObjectOutput out) throws IOException {

            super.writeExternal(out);

            LongPacker.packLong(out, VERSION0);

            out.writeObject(scaleOutIndexMetadata);

        }

    }

    /**
     * Looks up the entry stored under the given key and returns it as a
     * {@link PartitionLocator}.
     * 
     * @param key
     *            The key.
     * 
     * @return The result of the superclass lookup, cast to
     *         {@link PartitionLocator}.
     */
    public PartitionLocator get(final byte[] key) {

        /*
         * The key is passed as an Object on purpose: that form of lookup does
         * automatic de-serialization using the ITupleSerializer. Do not drop
         * the cast; doing so would presumably bind to a byte[] overload that
         * returns the serialized value instead.
         */
        final Object value = super.lookup((Object) key);

        return (PartitionLocator) value;

    }

    /**
     * Finds the {@link PartitionLocator} for the given key by delegating to
     * the {@link MetadataIndexView} wrapping this index; see that class for
     * the search semantics.
     * 
     * @param key
     *            The key.
     * 
     * @return Whatever the view reports for that key.
     */
    public PartitionLocator find(final byte[] key) {

        final PartitionLocator located = view.find(key);

        return located;

    }

    /**
     * Tuple serializer used to (de-)serialize the {@link PartitionLocator}s
     * stored in the {@link MetadataIndex}. Beyond the superclass state, the
     * external form consists of a single version byte.
     * 
     * @author Bryan Thompson
     * @version $Id$
     */
    public static class PartitionLocatorTupleSerializer extends
            DefaultTupleSerializer {

        private static final long serialVersionUID = -4178430896409893596L;

        /**
         * Factory for an instance configured with the default key builder
         * factory.
         */
        public static PartitionLocatorTupleSerializer newInstance() {

            final IKeyBuilderFactory defaultFactory = getDefaultKeyBuilderFactory();

            return new PartitionLocatorTupleSerializer(defaultFactory);

        }

        /**
         * De-serialization ctor only.
         */
        public PartitionLocatorTupleSerializer() {

        }

        /**
         * @param keyBuilderFactory
         *            Passed through to the superclass constructor.
         */
        public PartitionLocatorTupleSerializer(
                IKeyBuilderFactory keyBuilderFactory) {

            super(keyBuilderFactory);

        }

        /**
         * The initial version (no additional persistent state).
         */
        private final static transient byte VERSION0 = 0;

        /**
         * The current version.
         */
        private final static transient byte VERSION = VERSION0;

        /**
         * Reads the superclass state followed by the version byte.
         * 
         * @throws UnsupportedOperationException
         *             if the version byte is not a known version.
         */
        public void readExternal(final ObjectInput in) throws IOException,
                ClassNotFoundException {

            super.readExternal(in);

            final byte version = in.readByte();

            // VERSION0 carries no additional state, so there is nothing more
            // to read once the version has been validated.
            if (version != VERSION0) {

                throw new UnsupportedOperationException("Unknown version: "
                        + version);

            }

        }

        /**
         * Writes the superclass state followed by the current version byte.
         */
        public void writeExternal(final ObjectOutput out) throws IOException {

            super.writeExternal(out);

            out.writeByte(VERSION);

        }

    } // PartitionLocatorTupleSerializer

    /**
     * Forwards a stale locator notice to the {@link #view}, which caches
     * de-serialized locators and needs to drop them from its cache if they
     * become stale.
     * 
     * @param locator
     *            The locator that is being reported as stale.
     */
    public void staleLocator(final PartitionLocator locator) {

        // This class keeps no locator cache of its own; the view owns it.
        view.staleLocator(locator);

    }

}