// com.bigdata.sparse.KeyDecoder Maven / Gradle / Ivy
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.sparse;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.util.BytesUtil;
/**
* A utility class that decodes a key in a {@link SparseRowStore} into the
* {@link KeyType} for the primary key, the column name, and the timestamp. Note
* that the exact schema name itself is not recoverable since it is encoded
* using a non-reversible algorithm (it is a sort key generated by a Unicode
* collator). Likewise, the primary key can be decoded for primitive data types,
* but while we can identify the bytes corresponding to the primary key for a
* Unicode {@link KeyType} we can not decode them (it is also a sort key
* generated by a Unicode collator). The column name is NOT stored with Unicode
* compression so that we can decode it without loss (it is encoded into bytes
* using UTF-8 and those bytes are written directly into the key). This means
* that column names are NOT ordered according to the Unicode collator. In
* practice this is not a problem since we never assume order for that part of
* the key. The {@link SparseRowStore} only relies on {columnName,timestamp}
* defining the semantics of distinct keys for a given {schema,primaryKey}
* prefix.
*
* The encoded schema name is followed by the {@link KeyType#getByteCode()} and
* then by a <code>nul</code> byte. By searching for the <code>nul</code> byte
* we can identify the end of the encoded schema name and also the data type of
* the primary key. Most kinds of primary keys have a fixed length encoding,
* e.g., {@link Long}, {@link Double}, etc.
*
* Unicode primary keys have a variable length encoding which makes life more
* complex. For Unicode primary keys, we break with the collation order and use
* the UTF8 encoding of the key. This means that the primary key can be decoded
* and preserves hierarchical namespace clustering within the row store but does
* not impose a total sort order per Unicode sort key semantics. The only
* reasonable approach is to append a byte sequence to the key that never occurs
* within the generated Unicode sort keys. Again, we use a <code>nul</code> byte
* to mark the end of the Unicode primary key since it is not emitted by most
* Unicode collation implementations as it would cause grief for C-language
* strings. (However, see {@link SparseRowStore.Options#PRIMARY_KEY_UNICODE_CLEAN}
* for information on backward compatibility.)
*
* @see Schema#fromKey(IKeyBuilder, Object)
* @see KeyType#getKeyType(byte)
* @see AtomicRowWriteRead
* @see AtomicRowRead
*
* @author Bryan Thompson
* @version $Id$
*
* @todo The key is now 100% decodable. The package should be updated to take
* advantage of that.
*/
public class KeyDecoder {

    /**
     * The key that was specified to the ctor. The array is retained by
     * reference (no copy is made).
     */
    private final byte[] key;

    /**
     * The #of bytes in the encoded schema name (does not include either the
     * byte encoding the {@link KeyType} of the primary key or the
     * <code>nul</code> byte that terminates the schema component in the key).
     */
    private final int schemaBytesLength;

    /**
     * Offset of the byte that encoded the {@link KeyType} for the primary key.
     * This is basically part of the schema component of the key but it is
     * distinct from the bytes returned by {@link #getSchemaBytes()}.
     */
    private final int primaryKeyTypeOffset;

    /**
     * The offset of the first byte in the encoded primary key.
     */
    private final int primaryKeyOffset;

    /**
     * The #of bytes in the encoded primary key (does not include the
     * <code>nul</code> byte that terminates variable length primary keys).
     */
    private final int primaryKeyLength;

    /**
     * The decoded value of the primary key -or- <code>null</code> if it is
     * not possible to decode the {@link KeyType}.
     */
    private final Object primaryKey;

    /**
     * The offset of the first byte in the encoded column name.
     */
    private final int columnNameOffset;

    /**
     * The #of bytes in the encoded column name (does not include the
     * <code>nul</code> byte that terminates the column name).
     */
    private final int columnNameLength;

    /**
     * The offset of the first byte in the timestamp (it is always 8 bytes
     * long).
     */
    private final int timestampOffset;

    /**
     * The decoded {@link KeyType} for the primary key.
     */
    private final KeyType primaryKeyType;

    /**
     * The decoded column name.
     */
    private final String col;

    /**
     * The decoded timestamp on the column value.
     */
    public final long timestamp;

    /**
     * The bytes from the key that represent the encoded name of the
     * {@link Schema}.
     *
     * @return A new array containing a copy of the leading
     *         {@link #schemaBytesLength} bytes of the key.
     */
    public byte[] getSchemaBytes() {

        final byte[] a = new byte[schemaBytesLength];

        System.arraycopy(key, 0, a, 0, schemaBytesLength);

        return a;

    }

    /**
     * Return the schema name, decoded from the schema bytes using the
     * {@link SparseRowStore#UTF8} encoding (the same encoding that is used to
     * decode the column name and Unicode primary keys) rather than the
     * platform default character set.
     *
     * @return The decoded schema name.
     *
     * @throws UnsupportedOperationException
     *             unless {@link SparseRowStore#schemaNameUnicodeClean} is
     *             <code>true</code>.
     */
    public String getSchemaName() {

        if (!SparseRowStore.schemaNameUnicodeClean) {

            throw new UnsupportedOperationException();

        }

        try {

            // Decode in place: the schema bytes are the first
            // [schemaBytesLength] bytes of the key.
            return new String(key, 0, schemaBytesLength, SparseRowStore.UTF8);

        } catch (UnsupportedEncodingException ex) {

            throw new RuntimeException(
                    "Could not decode the schema name: schemaBytesLength="
                            + schemaBytesLength + ", key="
                            + BytesUtil.toString(key), ex);

        }

    }

    /**
     * The decoded {@link KeyType} for the primary key.
     */
    public final KeyType getPrimaryKeyType() {

        return primaryKeyType;

    }

    /**
     * The decoded primary key.
     *
     * @throws UnsupportedOperationException
     *             if the primary key can not be decoded.
     */
    public Object getPrimaryKey() {

        if (primaryKey == null) {

            throw new UnsupportedOperationException("Can not decode: keyType="
                    + primaryKeyType);

        }

        return primaryKey;

    }

    /**
     * The decoded column name.
     */
    public final String getColumnName() {

        return col;

    }

    /**
     * The decoded timestamp on the column value. The semantics of the
     * timestamp depend entirely on the application. When the application
     * provides timestamps, they are application defined long integers. When
     * the application requests auto-timestamps, they are generated by the
     * data service.
     */
    public long getTimestamp() {

        return timestamp;

    }

    /**
     * Decodes a key into its components: the schema bytes, the {@link KeyType}
     * of the primary key, the primary key itself (when it can be decoded), the
     * column name and the timestamp.
     *
     * @param key
     *            The key to be decoded.
     *
     * @throws IllegalArgumentException
     *             if <i>key</i> is <code>null</code>.
     * @throws RuntimeException
     *             if one of the <code>nul</code> boundary markers can not be
     *             located in the key.
     * @throws ArrayIndexOutOfBoundsException
     *             if the first byte of the key is a <code>nul</code> (see the
     *             note in the body).
     */
    public KeyDecoder(final byte[] key) {

        if (key == null) {

            throw new IllegalArgumentException();

        }

        this.key = key;

        /*
         * Find the end of the encoded schema name. This also gives us the type
         * of the primary key and the offset of the primary key.
         *
         * Note: the KeyType byte occurs after the schema name bytes and before
         * the [nul].
         */
        final int schemaEnd = indexOfNul(key, 0);

        if (schemaEnd == -1) {

            throw new RuntimeException(
                    "Could not locate the end of the encoded schema name: key="
                            + BytesUtil.toString(key));

        }

        // The byte immediately before the [nul] is the KeyType byte, so it is
        // not counted as part of the schema name.
        this.schemaBytesLength = schemaEnd - 1;

        this.primaryKeyTypeOffset = schemaEnd - 1;

        // The primary key starts immediately after the [nul].
        this.primaryKeyOffset = schemaEnd + 1;

        /*
         * Note: ArrayIndexOutOfBounds with index==-1 is an indication that
         * the schema name or a Unicode primary key contained embedded nul
         * bytes. This should no longer be possible when using the unicode
         * clean options on the SparseRowStore which encoded those data as
         * UTF8 rather than as Unicode sort keys. Historically, these were
         * encoded as Unicode sort keys. However, the JDK CollatorEnum
         * option does not support compressed Unicode sort keys and embeds
         * nul bytes in its generated sort keys. We rely on nul bytes as
         * boundary markers when decoding the row store keys. The presence
         * of those nul bytes within the schema and/or a Unicode primary key
         * was causing the ArrayIndexOutOfBoundsException here.
         */
        this.primaryKeyType = KeyType.getKeyType(KeyBuilder
                .decodeByte(key[primaryKeyTypeOffset]));

        /*
         * Find the end of the primary key. For some key types the primary key
         * has a fixed length and we just skip that many bytes. For variable
         * length keys we scan to the next [nul] byte.
         */
        if (primaryKeyType.isFixedLength()) {

            this.primaryKeyLength = primaryKeyType.getEncodedLength();

            this.columnNameOffset = primaryKeyOffset + primaryKeyLength;

        } else {

            final int primaryKeyEnd = indexOfNul(key, primaryKeyOffset);

            if (primaryKeyEnd == -1) {

                throw new RuntimeException(
                        "Could not locate the end of the primary key: keyType="
                                + primaryKeyType + ", key="
                                + BytesUtil.toString(key));

            }

            this.primaryKeyLength = primaryKeyEnd - primaryKeyOffset;

            // Note: also skips the [nul] byte terminating the primary key.
            this.columnNameOffset = primaryKeyEnd + 1;

        }

        this.primaryKey = decodePrimaryKey(key, primaryKeyType,
                primaryKeyOffset, primaryKeyLength);

        /*
         * Decode the column name. All bytes until the next [nul] are the column
         * name.
         *
         * Note: The column name is NOT compressed using Unicode compression so
         * that we can decode it without loss.
         */
        final int columnNameEnd = indexOfNul(key, columnNameOffset);

        if (columnNameEnd == -1) {

            /*
             * Could not unpack the column name from the key!
             */
            throw new RuntimeException(
                    "Could not locate the end of the column name: keyType="
                            + primaryKeyType + ", columnNameOffset="
                            + columnNameOffset + ", key="
                            + BytesUtil.toString(key));

        }

        this.columnNameLength = columnNameEnd - columnNameOffset;

        // The timestamp follows the [nul] terminating the column name.
        this.timestampOffset = columnNameEnd + 1;

        try {

            this.col = new String(key, columnNameOffset, columnNameLength,
                    SparseRowStore.UTF8);

        } catch (UnsupportedEncodingException ex) {

            throw new RuntimeException(
                    "Could not decode the column name: keyType="
                            + primaryKeyType + ", columnNameOffset="
                            + columnNameOffset + ", columnNameLength="
                            + columnNameLength + ", key="
                            + BytesUtil.toString(key), ex);

        }

        /*
         * Decode the timestamp.
         */
        this.timestamp = KeyBuilder.decodeLong(key, timestampOffset);

    }

    /**
     * Return the index of the first <code>nul</code> byte in <i>key</i> at or
     * after <i>fromIndex</i>.
     *
     * @param key
     *            The key to be scanned.
     * @param fromIndex
     *            The index of the first byte to be examined.
     *
     * @return The index of the <code>nul</code> byte -or- <code>-1</code> if
     *         none was found (including when <i>fromIndex</i> lies at or
     *         beyond the end of the key).
     */
    private static int indexOfNul(final byte[] key, final int fromIndex) {

        for (int i = fromIndex; i < key.length; i++) {

            if (key[i] == (byte) 0) {

                return i;

            }

        }

        return -1;

    }

    /**
     * Decode the primary key found in a slice of the key.
     *
     * @param key
     *            The key.
     * @param keyType
     *            The {@link KeyType} of the primary key.
     * @param offset
     *            The offset of the first byte of the encoded primary key.
     * @param length
     *            The #of bytes in the encoded primary key (only consulted for
     *            the variable length key types).
     *
     * @return The decoded primary key -or- <code>null</code> for a Unicode
     *         primary key when {@link SparseRowStore#primaryKeyUnicodeClean}
     *         is <code>false</code> (decode is not possible in that case).
     */
    private static Object decodePrimaryKey(final byte[] key,
            final KeyType keyType, final int offset, final int length) {

        switch (keyType) {
        case Integer:
            return KeyBuilder.decodeInt(key, offset);
        case Long:
            return KeyBuilder.decodeLong(key, offset);
        case Double:
            return KeyBuilder.decodeDouble(key, offset);
        case Float:
            return KeyBuilder.decodeFloat(key, offset);
        case Unicode:
            if (!SparseRowStore.primaryKeyUnicodeClean) {
                /*
                 * Note: Decode is not possible for this case.
                 */
                return null;
            }
            try {
                return new String(key, offset, length, SparseRowStore.UTF8);
            } catch (UnsupportedEncodingException ex) {
                throw new RuntimeException(
                        "Could not decode the primary key"
                                + ": primaryKeyOffset="
                                + offset
                                + ", primaryKeyLength="
                                + length + ", key="
                                + BytesUtil.toString(key), ex);
            }
        case ASCII:
            return KeyBuilder.decodeASCII(key, offset, length);
        case Date:
            return new Date(KeyBuilder.decodeLong(key, offset));
        default:
            throw new AssertionError("Unknown keyType=" + keyType);
        }

    }

    /**
     * Returns the head of the key corresponding to the encoded schema name, the
     * primary key's {@link KeyType}, and the primary key (including any
     * terminating <code>nul</code> byte).
     *
     * @return A new array containing a copy of the bytes of the key up to (but
     *         not including) the first byte of the column name.
     */
    public byte[] getPrefix() {

        final int n = columnNameOffset;

        final byte[] b = new byte[n];

        System.arraycopy(key, 0, b, 0, n);

        return b;

    }

    /**
     * Returns the length of the prefix corresponding to the encoded schema
     * name, the primary key's {@link KeyType}, and the primary key (including
     * any terminating <code>nul</code> byte).
     *
     * @return The #of bytes in the prefix, which is the offset of the first
     *         byte of the column name.
     */
    public int getPrefixLength() {

        return columnNameOffset;

    }

    /**
     * Shows some of the data that is extracted. The schema name and the
     * primary key are only included when the corresponding unicode clean
     * option on the {@link SparseRowStore} is <code>true</code>.
     */
    @Override
    public String toString() {

        return "KeyDecoder{"
                + (SparseRowStore.schemaNameUnicodeClean ? "schema="
                        + getSchemaName() + "," : "")
                + "primaryKeyType=" + primaryKeyType
                + (SparseRowStore.primaryKeyUnicodeClean ? ",primaryKey="
                        + getPrimaryKey() : "")
                + ",col=" + col
                + ",timestamp=" + timestamp
                + ",key=" + BytesUtil.toString(key)
                + "}";

    }

}