// com.bigdata.mdi.MetadataIndex Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.mdi;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.UUID;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.view.FusedView;
import com.bigdata.io.LongPacker;
import com.bigdata.journal.ICommitter;
import com.bigdata.journal.IResourceManager;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.service.MetadataService;
/**
* A metadata index for the partitions of a distributed index. There is one
* metadata index for each distributed index. The keys of the metadata index are
* the first key that would be directed into the corresponding index segment,
* e.g., a separator key (this is just the standard btree semantics).
* The values are serialized {@link PartitionLocator} objects.
*
* Note: At this time the recommended scale-out approach for the metadata index
* is to place the metadata indices on a {@link MetadataService} (the same
* {@link MetadataService} may be used for an arbitrary #of scale-out indices)
* and to replicate the state for the {@link MetadataService} onto
* failover {@link MetadataService}s. Since the {@link MetadataIndex} may grow
* without bound, you simply need to have enough disk on hand for it (the size
* requirements are quite modest). Further, the {@link MetadataService} MUST NOT
* be used to hold the data for the scale-out indices themselves since the
* {@link MetadataIndex} can not undergo {@link IResourceManager#overflow()}.
*
* One advantage of this approach is that the {@link MetadataIndex} is
* guaranteed to hold all historical states of the partition definitions for
* each index - effectively it is an immortal store for the partition metadata.
* On the other hand it is not possible to compact the metadata index without
* taking the database offline.
*
* @author Bryan Thompson
* @version $Id$
*
* @todo The {@link MetadataIndex} does NOT support either overflow (it may NOT
* be a {@link FusedView}) NOR key-range splits. There are several issues
* involved:
*
* (a) How to track the next partition identifier to be assigned to an
* index partition for the managed index. Currently this value is written
* in the {@link MetadataIndexCheckpoint} record and is propagated to the
* new backing store on overflow. However, if the metadata index is split
* into partitions then additional care MUST be taken to use only the
* value of that field on the 'meta-meta' index.
*
* (b) how to locate the partitions of the metadata index itself.
*
* @todo One way to locate the metadata-index partitions is to hash partition
* the metadata index; range queries can then be flooded to all
* partitions. The #of metadata service nodes can be changed by a suitable
* broadcast event in which clients have to change to the new hash basis.
* This feature can be generalized to provide hash partitioned indices as
* well as key-range partitioned indices.
*
* @todo A metadata index can be recovered by a distributed process running over
* the data services. Each data service reports all index partitions. The
* reports are collected and the index is rebuilt from the reports. Much
* like a map/reduce job.
*/
public class MetadataIndex extends BTree implements IMetadataIndex {
/**
 * View over this index. {@link #find(byte[])} is answered by delegating to
 * it, and {@link #staleLocator(PartitionLocator)} notices are forwarded to
 * it. The field is transient and is assigned once by the constructor.
 */
private final transient MetadataIndexView view;
/**
 * Returns the index metadata reported by the superclass, narrowed to
 * {@link MetadataIndexMetadata}.
 *
 * @return The metadata record for this metadata index.
 */
public MetadataIndexMetadata getIndexMetadata() {
    final IndexMetadata inherited = super.getIndexMetadata();
    return (MetadataIndexMetadata) inherited;
}
/**
 * Returns the metadata template for the managed scale-out index, as held
 * by this index's {@link MetadataIndexMetadata} record.
 *
 * @return The managed index metadata.
 */
public IndexMetadata getScaleOutIndexMetadata() {
    final MetadataIndexMetadata ownMetadata = getIndexMetadata();
    return ownMetadata.getManagedIndexMetadata();
}
/**
 * Hands out the partition identifier for the next partition created on
 * this {@link MetadataIndex}. The current counter value is returned and
 * the counter is then advanced by one, so (despite the method name) the
 * caller receives the value the counter had <em>before</em> the increment.
 * <p>
 * Every call fires a dirty event. The updated counter will be made
 * restart-safe iff the index is dirty, the index is registered as an
 * {@link ICommitter}, and the store on which the index is stored is
 * committed.
 * <p>
 * Note: The metadata index uses a 32-bit partition identifier rather than
 * the {@link #getCounter()}. The reason is that the {@link Counter} uses
 * the partition identifier in the high word and a partition local counter
 * in the low word. Therefore the assignment of the partition identifier
 * has to be centralized, even when the metadata index is itself split into
 * partitions. Requests for partition identifiers need to be directed to
 * the root partition (L0) for the {@link MetadataIndex}.
 *
 * @return The partition identifier to assign, i.e. the counter value prior
 *         to this call.
 */
public int incrementAndGetNextPartitionId() {
    // Capture the value to hand out, then advance the counter.
    final int assigned = nextPartitionId;
    nextPartitionId = assigned + 1;
    // Notify listener that the index is dirty.
    fireDirtyEvent();
    return assigned;
}

/**
 * The next partition identifier to be handed out by
 * {@link #incrementAndGetNextPartitionId()}.
 */
private int nextPartitionId;
/**
 * Factory for a new {@link MetadataIndex} on the given store.
 * <p>
 * The metadata record for the new index is derived from the name of the
 * managed index and is configured to use {@link MetadataIndex} as the
 * B+Tree class, {@link MetadataIndexCheckpoint} as the checkpoint record
 * class, and {@link PartitionLocatorTupleSerializer} as the tuple
 * serializer.
 *
 * @param store
 *            The backing store.
 * @param indexUUID
 *            The unique identifier for the metadata index.
 * @param managedIndexMetadata
 *            The metadata template for the managed scale-out index.
 *
 * @return The new metadata index.
 */
public static MetadataIndex create(IRawStore store, UUID indexUUID,
        IndexMetadata managedIndexMetadata) {

    final String managedIndexName = managedIndexMetadata.getName();

    final MetadataIndexMetadata mdiMetadata = new MetadataIndexMetadata(
            managedIndexName, indexUUID, managedIndexMetadata);

    /*
     * Delete markers are enabled so that compacting merges could be done on
     * the metadata index. That is really moot until overflow of the
     * metadataService and partitioning of the metadata index are supported.
     */
    mdiMetadata.setDeleteMarkers(true);

    // Override the implementation class.
    mdiMetadata.setBTreeClassName(MetadataIndex.class.getName());

    // Override the checkpoint record implementation class.
    mdiMetadata.setCheckpointClassName(MetadataIndexCheckpoint.class.getName());

    // Tuple serializer for the partition locators.
    mdiMetadata.setTupleSerializer(PartitionLocatorTupleSerializer.newInstance());

    return (MetadataIndex) BTree.create(store, mdiMetadata);
}
/**
* Required ctor.
*
* @param store
* @param checkpoint
* @param metadata
*/
public MetadataIndex(IRawStore store, Checkpoint checkpoint, IndexMetadata metadata, boolean readOnly) {
super(store, checkpoint, metadata, readOnly);
/*
* copy the initial value from the checkpoint record.
*/
nextPartitionId = ((MetadataIndexCheckpoint)checkpoint).getNextPartitionId();
view = new MetadataIndexView(this);
}
/**
 * Extended to also require a checkpoint whenever the in-memory next
 * partition identifier differs from the value in the current checkpoint
 * record, i.e. when {@link #incrementAndGetNextPartitionId()} has been
 * invoked since that record was written. Otherwise the decision is left to
 * the superclass.
 *
 * @return <code>true</code> iff a checkpoint is required.
 */
public boolean needsCheckpoint() {
    final MetadataIndexCheckpoint lastCheckpoint = (MetadataIndexCheckpoint) getCheckpoint();
    final boolean partitionIdAdvanced = nextPartitionId != lastCheckpoint.getNextPartitionId();
    // Short-circuit: the superclass is only consulted when the counter is unchanged.
    return partitionIdAdvanced || super.needsCheckpoint();
}
/**
 * A {@link Checkpoint} record that additionally stores the next partition
 * identifier to be assigned by the metadata index.
 *
 * @author Bryan Thompson
 * @version $Id$
 */
public static class MetadataIndexCheckpoint extends Checkpoint {

    private static final long serialVersionUID = 6482587101150014793L;

    /**
     * The only known version of the state appended by this class to the
     * serialized {@link Checkpoint} record.
     */
    private static final transient int VERSION0 = 0x0;

    /**
     * The next partition identifier as captured by this record.
     */
    private int nextPartitionId;

    /**
     * The value of the nextPartitionId counter stored in this checkpoint
     * record.
     */
    public int getNextPartitionId() {
        return this.nextPartitionId;
    }

    /**
     * De-serialization constructor.
     */
    public MetadataIndexCheckpoint() {
    }

    /**
     * Creates a checkpoint record for the given index, capturing its
     * current next partition identifier.
     *
     * @param btree
     *            The index, which is cast to {@link MetadataIndex}.
     */
    public MetadataIndexCheckpoint(BTree btree) {
        super(btree);
        final MetadataIndex mdi = (MetadataIndex) btree;
        this.nextPartitionId = mdi.nextPartitionId;
    }

    /**
     * Create the initial checkpoint record for the initial metadata index.
     * The first partition identifier is zero(0).
     *
     * @param metadata
     *            The index metadata record.
     */
    public MetadataIndexCheckpoint(IndexMetadata metadata) {
        super(metadata);
        this.nextPartitionId = 0;
    }

    /**
     * Create the initial checkpoint record when the metadata index
     * overflows onto a new backing store. The next partition identifier is
     * carried over from the old checkpoint record.
     *
     * @param metadata
     *            The index metadata record.
     * @param oldCheckpoint
     *            The prior checkpoint record, which is cast to
     *            {@link MetadataIndexCheckpoint}.
     */
    public MetadataIndexCheckpoint(IndexMetadata metadata, Checkpoint oldCheckpoint) {
        super(metadata, oldCheckpoint);
        final MetadataIndexCheckpoint previous = (MetadataIndexCheckpoint) oldCheckpoint;
        this.nextPartitionId = previous.nextPartitionId;
    }

    /**
     * Reads the superclass state, then a packed version number, then the
     * next partition identifier as an int.
     *
     * @throws IOException
     *             if the version number is not recognized.
     */
    @Override
    public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
        super.readExternal(in);
        final int version = (int) LongPacker.unpackLong(in);
        if (version != VERSION0) {
            throw new IOException("Unknown version: " + version);
        }
        this.nextPartitionId = in.readInt();
    }

    /**
     * Writes the superclass state, then the packed version number, then
     * the next partition identifier as an int.
     */
    @Override
    public void writeExternal(ObjectOutput out) throws IOException {
        super.writeExternal(out);
        LongPacker.packLong(out, VERSION0);
        out.writeInt(this.nextPartitionId);
    }
}
/**
 * An {@link IndexMetadata} record that additionally holds the metadata
 * template for the managed scale-out index.
 *
 * @author Bryan Thompson
 * @version $Id$
 */
public static class MetadataIndexMetadata extends IndexMetadata implements Externalizable {

    private static final long serialVersionUID = -7309267778881420043L;

    /**
     * The only known version of the state appended by this class to the
     * serialized {@link IndexMetadata} record.
     */
    private static final transient int VERSION0 = 0x0;

    /**
     * The metadata template for the managed index.
     */
    private IndexMetadata scaleOutIndexMetadata;

    /**
     * The managed index metadata.
     */
    public final IndexMetadata getManagedIndexMetadata() {
        return this.scaleOutIndexMetadata;
    }

    /**
     * De-serialization constructor.
     */
    public MetadataIndexMetadata() {
    }

    /**
     * First time constructor.
     *
     * @param managedIndexName
     *            The name of the managed index. The name of the metadata
     *            index is given by
     *            {@link MetadataService#getMetadataIndexName(String)}
     * @param indexUUID
     *            The UUID of the metadata index.
     * @param managedIndexMetadata
     *            The metadata template for the managed index.
     *
     * @throws IllegalArgumentException
     *             if <i>managedIndexMetadata</i> is <code>null</code>.
     */
    public MetadataIndexMetadata(String managedIndexName, UUID indexUUID, IndexMetadata managedIndexMetadata) {
        super(MetadataService.getMetadataIndexName(managedIndexName), indexUUID);
        if (null == managedIndexMetadata) {
            throw new IllegalArgumentException();
        }
        this.scaleOutIndexMetadata = managedIndexMetadata;
    }

    /**
     * Reads the superclass state, then a packed version number, then the
     * managed index metadata as a serialized object.
     *
     * @throws IOException
     *             if the version number is not recognized.
     */
    public void readExternal(ObjectInput in) throws IOException,
            ClassNotFoundException {
        super.readExternal(in);
        final int version = (int) LongPacker.unpackLong(in);
        switch (version) {
        case VERSION0:
            break;
        default:
            throw new IOException("Unknown version: version=" + version);
        }
        this.scaleOutIndexMetadata = (IndexMetadata) in.readObject();
    }

    /**
     * Writes the superclass state, then the packed version number, then
     * the managed index metadata as a serialized object.
     */
    public void writeExternal(ObjectOutput out) throws IOException {
        super.writeExternal(out);
        LongPacker.packLong(out, VERSION0);
        out.writeObject(this.scaleOutIndexMetadata);
    }
}
/**
 * Looks up the given key via the superclass and returns the result cast
 * to a {@link PartitionLocator}.
 *
 * @param key
 *            The key.
 *
 * @return The value found under that key, as a {@link PartitionLocator}.
 */
public PartitionLocator get(final byte[] key) {
    /*
     * The key is deliberately typed as Object so that the lookup(Object)
     * variant is invoked; de-serialization of the value is then automatic
     * using the ITupleSerializer.
     */
    final Object keyAsObject = key;
    final Object value = super.lookup(keyAsObject);
    return (PartitionLocator) value;
}
/**
 * Delegates to the {@link MetadataIndexView} over this index.
 *
 * @param key
 *            The key.
 *
 * @return Whatever the view's <code>find(key)</code> reports.
 */
public PartitionLocator find(final byte[] key) {
    final PartitionLocator located = this.view.find(key);
    return located;
}
/**
 * Tuple serializer used for the {@link PartitionLocator}s stored in the
 * {@link MetadataIndex}. Beyond the state of its superclass it persists
 * only a single version byte.
 *
 * @author Bryan Thompson
 * @version $Id$
 */
public static class PartitionLocatorTupleSerializer extends
        DefaultTupleSerializer {

    private static final long serialVersionUID = -4178430896409893596L;

    /**
     * The initial version (no additional persistent state).
     */
    private static final transient byte VERSION0 = 0;

    /**
     * The current version.
     */
    private static final transient byte VERSION = VERSION0;

    /**
     * Factory returning an instance configured with the default key
     * builder factory.
     */
    public static PartitionLocatorTupleSerializer newInstance() {
        final IKeyBuilderFactory defaultFactory = getDefaultKeyBuilderFactory();
        return new PartitionLocatorTupleSerializer(defaultFactory);
    }

    /**
     * De-serialization ctor only.
     */
    public PartitionLocatorTupleSerializer() {
    }

    /**
     * @param keyBuilderFactory
     *            The key builder factory, handed to the superclass.
     */
    public PartitionLocatorTupleSerializer(
            IKeyBuilderFactory keyBuilderFactory) {
        super(keyBuilderFactory);
    }

    /**
     * Reads the superclass state followed by the version byte.
     *
     * @throws UnsupportedOperationException
     *             if the version byte is not recognized.
     */
    public void readExternal(final ObjectInput in) throws IOException,
            ClassNotFoundException {
        super.readExternal(in);
        final byte version = in.readByte();
        // VERSION0 carries no additional state, so there is nothing more to read.
        if (version != VERSION0) {
            throw new UnsupportedOperationException("Unknown version: "
                    + version);
        }
    }

    /**
     * Writes the superclass state followed by the current version byte.
     */
    public void writeExternal(final ObjectOutput out) throws IOException {
        super.writeExternal(out);
        out.writeByte(VERSION);
    }

} // PartitionLocatorTupleSerializer
/**
 * Forwards the stale-locator notice to the {@link #view}, which caches
 * de-serialized locators and needs to drop them from its cache if they
 * become stale.
 *
 * @param locator
 *            The locator that has been reported as stale.
 */
public void staleLocator(PartitionLocator locator) {
    this.view.staleLocator(locator);
}
}