// com.bigdata.mdi.LocalPartitionMetadata Maven / Gradle / Ivy
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.mdi;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.Arrays;
import java.util.UUID;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.io.LongPacker;
import com.bigdata.io.ShortPacker;
import com.bigdata.journal.Journal;
import com.bigdata.service.DataService;
import com.bigdata.util.BytesUtil;
/**
* An immutable object providing metadata about a local index partition,
* including the partition identifier, the left and right separator keys
* defining the half-open key range of the index partition, and optionally
* defining the {@link IResourceMetadata}[] required to materialize a view of
* that index partition.
*
* @author Bryan Thompson
* @version $Id$
*/
public class LocalPartitionMetadata implements IPartitionMetadata,
Externalizable {
/**
*
*/
private static final long serialVersionUID = -1511361004851335936L;
/**
* The maximum length of the history string (4kb).
*
* Note: The history is written each time the {@link IndexMetadata} is
* written and is read each time it is read so this can be the main driver
* of the size of the {@link IndexMetadata} record.
*
* @deprecated
*/
protected final static transient int MAX_HISTORY_LENGTH = 0;//4 * Bytes.kilobyte32;
/**
* The unique partition identifier.
*/
private int partitionId;
/**
*
* @see #getSourcePartitionId()
*
* @deprecated MoveTask manages without this field (it was required by the
* previous MOVE implementation).
*/
private int sourcePartitionId;
/**
*
*/
private byte[] leftSeparatorKey;
private byte[] rightSeparatorKey;
/**
* Description of the resources required to materialize a view of the index
* partition (optional - not stored when the partition metadata is stored on
* an {@link IndexSegmentStore}).
*
* The entries in the array reflect the creation time of the resources. The
* earliest resource is listed first. The most recently created resource is
* listed last.
*
* When present, the #of sources in the index partition view includes: the
* mutable {@link BTree}, any {@link BTree}s on historical journal(s)
* still incorporated into the view, and any {@link IndexSegment}s
* incorporated into the view.
*/
private IResourceMetadata[] resources;
/**
* The reason why an index partition was created together with some metadata
* about when it was created.
*/
private IndexPartitionCause cause;
// /**
// * A history of operations giving rise to the current partition metadata.
// * E.g., register(timestamp), copyOnOverflow(timestamp), split(timestamp),
// * join(partitionId,partitionId,timestamp), etc. This is truncated when
// * serialized to keep it from growing without bound.
// *
// * @deprecated See {@link #getHistory()}
// */
// private String history;
//
// /**
// * If the history string exceeds {@link #MAX_HISTORY_LENGTH} characters then
// * truncates it to the last {@link #MAX_HISTORY_LENGTH}-3 characters,
// * prepends "...", and returns the result. Otherwise returns the entire
// * history string.
// *
// * @deprecated See {@link #history}
// */
// protected String getTruncatedHistory() {
//
// if (MAX_HISTORY_LENGTH == 0)
// return "";
//
// String history = this.history;
//
// if(history.length() > MAX_HISTORY_LENGTH) {
//
// /*
// * Truncate the history.
// */
//
// final int len = history.length();
//
// final int fromIndex = len - (MAX_HISTORY_LENGTH - 3);
//
// assert fromIndex > 0 : "len=" + len + ", fromIndex=" + fromIndex
// + ", maxHistoryLength=" + MAX_HISTORY_LENGTH;
//
// history = "..." + history.substring(fromIndex, len);
//
// }
//
// return history;
//
// }
/**
* De-serialization constructor.
*/
public LocalPartitionMetadata() {
}
/**
*
* @param partitionId
* The unique partition identifier assigned by the
* {@link MetadataIndex}.
* @param sourcePartitionId
* -1
unless this index partition is the target
* for a move, in which case this is the partition identifier of
* the source index partition.
* @param leftSeparatorKey
* The first key that can enter this index partition. The left
* separator key for the first index partition is always
* new byte[]{}
. The left separator key MAY NOT
* be null
.
* @param rightSeparatorKey
* The first key that is excluded from this index partition or
* null
iff there is no upper bound.
* @param resources
* A description of each {@link Journal} or {@link IndexSegment}
* resource(s) required to compose a view of the index partition
* (optional).
*
* The entries in the array reflect the creation time of the
* resources. The earliest resource is listed first. The most
* recently created resource is listed last.
*
* Note: This is required if the {@link LocalPartitionMetadata}
* record will be saved on the {@link IndexMetadata} of a
* {@link BTree}. It is NOT recommended when it will be saved on
* the {@link IndexMetadata} of an {@link IndexSegment}. When
* the {@link IndexMetadata} is sent to a remote
* {@link DataService} this field MUST be null
and
* the remote {@link DataService} will fill it in on arrival.
* @param cause
* The underlying cause for the creation of the index partition.
*/
// * @param history
// * A human interpretable history of the index partition. The
// * history is a series of whitespace delimited records each of
// * more or less the form foo(x,y,z)
. The history
// * gets truncated when the {@link LocalPartitionMetadata} is
// * serialized in order to prevent it from growing without bound.
public LocalPartitionMetadata(//
final int partitionId,//
final int sourcePartitionId,//
final byte[] leftSeparatorKey,//
final byte[] rightSeparatorKey,//
final IResourceMetadata[] resources,//
final IndexPartitionCause cause
// final String history
) {
/*
* Set fields first so that toString() can be used in thrown exceptions.
*/
this.partitionId = partitionId;
this.sourcePartitionId = sourcePartitionId;
this.leftSeparatorKey = leftSeparatorKey;
this.rightSeparatorKey = rightSeparatorKey;
this.resources = resources;
this.cause = cause;
// this.history = history;
/*
* Test arguments.
*/
if (leftSeparatorKey == null)
throw new IllegalArgumentException("leftSeparatorKey");
// Note: rightSeparatorKey MAY be null.
if (rightSeparatorKey != null) {
final int cmp = BytesUtil.compareBytes(leftSeparatorKey,
rightSeparatorKey);
if (cmp >= 0) {
throw new IllegalArgumentException("Separator keys are "
+ (cmp == 0 ? "equal" : "out of order") + " : " + this);
}
}
if (resources != null) {
if (resources.length == 0) {
throw new IllegalArgumentException("Empty resources array.");
}
for (IResourceMetadata t : resources) {
if (t == null)
throw new IllegalArgumentException(
"null value in resources[]");
}
/*
* This is the "live" journal.
*
* Note: The "live" journal is still available for writes. Index
* segments created off of this journal will therefore have a
* createTime that is greater than the firstCommitTime of this
* journal while being LTE to the lastCommitTime of the journal and
* strictly LT the commitTime of any other resource for this index
* partition.
*/
if (!resources[0].isJournal()) {
throw new RuntimeException(
"Expecting a journal as the first resource: " + this);
}
/*
* Scan from 1 to n-1 - these are historical resources
* (non-writable). resources[0] is always the live journal. The
* other resources may be either historical journals or index
* segments.
*
* The order of the array is the order of the view. Reads on a
* FusedView will process the resources in the order in which they
* are specified by this array.
*
* The live journal gets listed first since it can continue to
* receive writes and therefore logically comes before any other
* resource in the ordering since any writes on the live index on
* the journal will be more recent than the data on the index
* segment.
*
* Normally, each successive entry in the resources[] will have an
* earlier createTime (smaller number) than the one that follows it.
* However, there is one exception. The createTime of the live
* journal MAY be less than the createTime of index segments created
* from that journal - this will be true if those indices are
* created from a historical view found on that journal and put into
* play while the journal is still the live journal. To work around
* this we start at the 2nd entry in the array.
*/
/*
* Note: The practice of sending and index segment generated on one
* data service to another data service introduces another way in
* which the resource timestamp order can be broken. During the next
* synchronous overflow event you can see things like this:
*
* resourceMetadata=[
* JournalMetadata{filename=journal28417.jnl,uuid=add43d12-29b5-44e5-b26a-ae1b0694f67d,createTime=1236974533730},
* JournalMetadata{filename=journal28409.jnl,uuid=b954caf8-431b-42ae-9453-4c009398bec2,createTime=1236974293720},
* SegmentMetadata{filename=U8000_spo_OSP_part00050_28412.seg,uuid=cd954860-76fa-41ff-b788-e73a21b2c306,createTime=1236974525108},
* SegmentMetadata{filename=U8000_spo_OSP_part00050_28411.seg,uuid=35840589-6fb5-4691-b271-cf660186cd4b,createTime=1236974523976} ]
*
* This is in fact well-formed. However, because the index segments
* were generated on a different host, the create times get out of
* wack. For that reason, I have disabled checking here.
*/
final boolean checkCreateTimes = false;
if (checkCreateTimes && resources.length > 2) {
long lastTimestamp = resources[1/*2ndEntry*/].getCreateTime();
for (int i = 2/* 3rd entry */; i < resources.length; i++) {
// createTime of the resource.
final long thisTimestamp = resources[i].getCreateTime();
if (lastTimestamp <= thisTimestamp) {
throw new RuntimeException(
"Resources out of timestamp order @ index="
+ i + " : "+ this);
}
lastTimestamp = resources[i].getCreateTime();
}
}
}
}
final public int getPartitionId() {
return partitionId;
}
/**
* -1
unless this index partition is the target for a move,
* in which case this is the partition identifier of the source index
* partition and the move operation has not been completed. This property is
* used to prevent the target data service from de-defining the index
* partition using a split, join or move operation while the MOVE operation
* is proceeding. The property is cleared to -1
(which is an
* invalid index partition identifier) once the move has been completed
* successfully.
*
* @deprecated MoveTask manages without this field (it was required by the
* previous MOVE implementation).
*/
final public int getSourcePartitionId() {
return sourcePartitionId;
}
final public byte[] getLeftSeparatorKey() {
return leftSeparatorKey;
}
final public byte[] getRightSeparatorKey() {
return rightSeparatorKey;
}
/**
* Description of the resources required to materialize a view of the index
* partition (optional, but required for a {@link BTree}).
*
* The entries in the array reflect the creation time of the resources. The
* earliest resource is listed first. The most recently created resource is
* listed last. The order of the resources corresponds to the order in which
* a fused view of the index partition will be read. Reads begin with the
* most "recent" data for the index partition and stop as soon as there is a
* "hit" on one of the resources (including a hit on a deleted index entry).
*
* When present, the #of sources in the index partition view includes: the
* mutable {@link BTree}, any {@link BTree}s on historical journal(s)
* still incorporated into the view, and any {@link IndexSegment}s
* incorporated into the view.
*
* Note: the {@link IResourceMetadata}[] is only available when the
* {@link LocalPartitionMetadata} is attached to the {@link IndexMetadata}
* of a {@link BTree} and is NOT defined when the
* {@link LocalPartitionMetadata} is attached to an {@link IndexSegment}.
* The reason is that the index partition view is always described by the
* {@link BTree} and that view evolves as journals overflow. On the other
* hand, {@link IndexSegment}s are used as resources in index partition
* views but exist in a one to many relationship to those views.
*/
final public IResourceMetadata[] getResources() {
return resources;
}
/**
* The reason why an index partition was created together with some metadata
* about when it was created.
*/
final public IndexPartitionCause getIndexPartitionCause() {
return cause;
}
// /**
// * A history of the changes to the index partition.
// *
// * @deprecated I've essentially disabled the history (it is always empty
// * when it is persisted). I found it nearly impossible to read.
// * There are much saner ways to track what is going on in the
// * federation. An analysis of the {@link Event} log is much more
// * useful. If nothing else, you could examine the index
// * partition in the metadata index by scanning the commit points
// * and reading its state in each commit and reporting all state
// * changes.
// */
// final public String getHistory() {
//
// return history;
//
// }
final public int hashCode() {
// per the interface contract.
return partitionId;
}
// Note: used by assertEquals in the test cases.
public boolean equals(final Object o) {
if (this == o)
return true;
final LocalPartitionMetadata o2 = (LocalPartitionMetadata) o;
if (partitionId != o2.partitionId)
return false;
if (!BytesUtil.bytesEqual(leftSeparatorKey, o2.leftSeparatorKey)) {
return false;
}
if (rightSeparatorKey == null) {
if (o2.rightSeparatorKey != null)
return false;
} else {
if (!BytesUtil.bytesEqual(rightSeparatorKey, o2.rightSeparatorKey)) {
return false;
}
}
if (resources.length != o2.resources.length)
return false;
for (int i = 0; i < resources.length; i++) {
if (!resources[i].equals(o2.resources[i]))
return false;
}
return true;
}
public String toString() {
return
"{ partitionId="+partitionId+
(sourcePartitionId!=-1?", sourcePartitionId="+sourcePartitionId:"")+
", leftSeparator="+BytesUtil.toString(leftSeparatorKey)+
", rightSeparator="+BytesUtil.toString(rightSeparatorKey)+
", resourceMetadata="+Arrays.toString(resources)+
", cause="+cause+
// ", history="+history+
"}"
;
}
/*
* Externalizable
*/
private static final transient short VERSION0 = 0x0;
/**
* This version adds support for {@link IResourceMetadata#getCommitTime()},
* but that field is only serialized for a journal.
*/
private static final transient short VERSION1 = 0x1;
/**
* This version serializes the {@link #partitionId} as 32-bits clean and
* gets rid of the history
field.
*/
private static final transient short VERSION2 = 0x2;
/**
* The current version.
*/
private static final transient short VERSION = VERSION2;
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
final short version = ShortPacker.unpackShort(in);
switch (version) {
case VERSION0:
case VERSION1:
case VERSION2:
break;
default:
throw new IOException("Unknown version: " + version);
}
if (version < VERSION2) {
partitionId = (int) LongPacker.unpackLong(in);
} else {
partitionId = in.readInt();
}
sourcePartitionId = in.readInt(); // MAY be -1.
final int nresources = ShortPacker.unpackShort(in);
final int leftLen = (int) LongPacker.unpackLong(in);
final int rightLen = (int) LongPacker.unpackLong(in);
leftSeparatorKey = new byte[leftLen];
in.readFully(leftSeparatorKey);
if (rightLen != 0) {
rightSeparatorKey = new byte[rightLen];
in.readFully(rightSeparatorKey);
} else {
rightSeparatorKey = null;
}
cause = (IndexPartitionCause)in.readObject();
if (version < VERSION2) {
/* history = */in.readUTF();
}
resources = nresources>0 ? new IResourceMetadata[nresources] : null;
for (int j = 0; j < nresources; j++) {
final boolean isIndexSegment = in.readBoolean();
final String filename = in.readUTF();
// long nbytes = LongPacker.unpackLong(in);
final UUID uuid = new UUID(in.readLong()/*MSB*/,in.readLong()/*LSB*/);
final long createTime = in.readLong();
long commitTime = 0L;
if (version >= VERSION1 && !isIndexSegment) {
commitTime = in.readLong();
}
resources[j] = (isIndexSegment //
? new SegmentMetadata(filename, /*nbytes,*/ uuid, createTime) //
: new JournalMetadata(filename, /*nbytes,*/ uuid, createTime, commitTime) //
);
}
}
public void writeExternal(final ObjectOutput out) throws IOException {
ShortPacker.packShort(out, VERSION);
if (VERSION < VERSION2) {
LongPacker.packLong(out, partitionId);
} else {
out.writeInt(partitionId);
}
out.writeInt(sourcePartitionId); // MAY be -1.
final int nresources = (resources == null ? 0 : resources.length);
assert nresources < Short.MAX_VALUE;
ShortPacker.packShort(out, (short) nresources);
LongPacker.packLong(out, leftSeparatorKey.length);
LongPacker.packLong(out, rightSeparatorKey == null ? 0
: rightSeparatorKey.length);
out.write(leftSeparatorKey);
if (rightSeparatorKey != null) {
out.write(rightSeparatorKey);
}
out.writeObject(cause);
if (VERSION < VERSION2) {
out.writeUTF("");// getTruncatedHistory()
}
/*
* Note: we serialize using the IResourceMetadata interface so that we
* can handle different subclasses and then special case the
* deserialization based on the boolean flag. This is significantly more
* compact than using an Externalizable for each ResourceMetadata object
* since we do not have to write the class names for those objects.
*/
for (int j = 0; j < nresources; j++) {
final IResourceMetadata rmd = resources[j];
final boolean isSegment = rmd.isIndexSegment();
out.writeBoolean(isSegment);
out.writeUTF(rmd.getFile());
// LongPacker.packLong(out,rmd.size());
final UUID resourceUUID = rmd.getUUID();
out.writeLong(resourceUUID.getMostSignificantBits());
out.writeLong(resourceUUID.getLeastSignificantBits());
out.writeLong(rmd.getCreateTime());
if (!isSegment) {
out.writeLong(rmd.getCommitTime());
}
}
}
}