com.bigdata.rdf.model.BigdataValueSerializer Maven / Gradle / Ivy
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Aug 12, 2008
*/
package com.bigdata.rdf.model;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataInputBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.ShortPacker;
import com.bigdata.io.compression.NoCompressor;
import com.bigdata.io.compression.UnicodeHelper;
import com.bigdata.rdf.lexicon.ITermIndexCodes;
/**
* Helper class provides efficient stand-off serialization of RDF {@link Value}
* objects.
*
* @author Bryan Thompson
* @version $Id$
*/
public class BigdataValueSerializer {
/*
* Serialization.
*/
/**
* Version zero(0) of the serialization format. This version could not write
* out very large Unicode strings (64k character limit).
*/
// * Note: This inefficiency has been fixed.
// * Also, there were inefficiencies in {@link DataOutputBuffer} when writing
// * UTF8 which caused a performance hit. See
// * {@link DataOutputBuffer#writeUTF(String)}.
private static final short VERSION0 = 0x0;
/**
* Version ONE(1) of the serialization format. This version supports very
* large Unicode strings using the {@link UnicodeHelper} class.
*/
private static final short VERSION1 = 0x1;
/**
* The current serialization version.
*
* Note: Changing back to {@link #VERSION0}. It looks like it was
* significantly more efficient (though it might be possible to optimize the
* code paths for {@link #VERSION1}).
*
* Note: {@link #VERSION0} can not be used with the BLOBS index since it can
* not handle very large {@link Value}s. When changing this to
* {@link #VERSION0}, that change needs to be exclusive of the version used
* for the BLOBS index (ie, it is good for ID2TERM and related indices such
* as the ivCache, but not for the BLOBS index and related indices such as
* the blobsCache).
*
* @see
* Load, closure and query performance in 1.1.x versus 1.0.x
*/
// private final static short currentVersion = VERSION0;
private static final short getVersion(final Value val) {
if (BigdataValueSerializer.getStringLength(val) < Short.MAX_VALUE) {
/*
* This version is faster, but can only be used with UTF data LT 64k
* in length.
*/
return VERSION0;
} else {
/*
* This version can be used with very large UTF strings (BLOBS).
*/
return VERSION1;
}
}
/**
* Error message indicates that the version code in the serialized
* record did not correspond to a known serialization version for an RDF
* value.
*/
protected static final String ERR_VERSION = "Bad version";
/**
* Error message indicates that the term code in the serialized record
* did not correspond to a known term code for an RDF value of the
* appropriate type (e.g., a URI code where an Literal code was
* expected). The codes are defined by {@link ITermIndexCodes}.
*/
protected static final String ERR_CODE = "Bad term code";
/**
* Factory used to de-serialize {@link Value}s.
*/
private final ValueFactory valueFactory;
/**
* Used to compress Unicode strings.
*/
private final UnicodeHelper uc;
/**
* Create an instance that will materialize objects using the caller's
* factory.
*
* @param valueFactory
* The value factory.
*/
public BigdataValueSerializer(final ValueFactory valueFactory) {
if (valueFactory == null)
throw new IllegalArgumentException();
this.valueFactory = valueFactory;
// this.uc = new UnicodeHelper(new BOCU1Compressor());
this.uc = new UnicodeHelper(new NoCompressor());
// this.uc = new UnicodeHelper(new SCSUCompressor());
}
// public byte getTermCode(final InputStream is) throws IOException {
// final short version = ShortPacker.unpackShort(is);
// switch(version){
// case VERSION0:
// case VERSION1:
// final int b = is.read();
// if(b == -1)
// throw new EOFException();
// return (byte)(0xff & b);
// default:
// throw new AssertionError();
// }
// }
/**
* Return the term code as defined by {@link ITermIndexCodes} for this type
* of term. This is used to places URIs, different types of literals, and
* bnodes into disjoint parts of the key space for sort orders.
*
* @see ITermIndexCodes
*/
private byte getTermCode(final Value val) {
if (val == null)
throw new IllegalArgumentException();
if (val instanceof URI) {
return ITermIndexCodes.TERM_CODE_URI;
} else if (val instanceof Literal) {
final Literal lit = (Literal) val;
if (lit.getLanguage() != null)
return ITermIndexCodes.TERM_CODE_LCL;
if (lit.getDatatype() != null)
return ITermIndexCodes.TERM_CODE_DTL;
return ITermIndexCodes.TERM_CODE_LIT;
} else if (val instanceof BNode) {
return ITermIndexCodes.TERM_CODE_BND;
} else {
throw new IllegalArgumentException("class="+val.getClass().getName());
}
}
/**
* Routine for efficient serialization of an RDF {@link Value}.
*
* @return The byte[] containing the serialized data record.
*
* @throws RuntimeException
* if there is a IO problem
*
* @see {@link #deserialize(byte[])}
*/
public byte[] serialize(final V val) {
return serialize(val, new DataOutputBuffer(128), null/* lazilyAllocated */);
}
/**
* Variant which permits reuse of the same buffer. This has the advantage
* that the buffer is reused on each invocation and swiftly grows to its
* maximum extent.
*
* @param val
* The value.
* @param out
* The buffer - the caller is responsible for resetting the
* buffer before each invocation.
* @param tmp
* A buffer used to compress the component Unicode strings. This
* will be reset as necessary by this method. It will be lazily
* allocated if null
.
*
* @return The byte[] containing the serialized data record. This array is
* newly allocated so that a series of invocations of this method
* return distinct byte[]s.
*/
public byte[] serialize(final V val, final DataOutputBuffer out, final
ByteArrayBuffer tmp) {
serialize2(val, out, tmp);
return out.toByteArray();
}
/**
* Variant which permits reuse of the same buffer and avoids copying the
* data once it has been formated onto the caller's {@link DataOutputBuffer}
* (core impl).
*
* @param val
* The value.
* @param out
* The buffer - the caller is responsible for resetting the
* buffer before each invocation.
* @param tmp
* A buffer used to compress the component Unicode strings. This
* will be reset as necessary by this method. It will be lazily
* allocated if null
.
*/
public void serialize2(final V val, final DataOutputBuffer out,
ByteArrayBuffer tmp) {
try {
final short version = getVersion(val);
ShortPacker.packShort(out, version);
switch (version) {
case VERSION0:
serializeVersion0(val, version, out);
break;
case VERSION1: {
if(tmp == null) {
/*
* Allocate lazily on the code path where it is necessary.
*/
tmp = new ByteArrayBuffer(128);
}
serializeVersion1(val, version, out, tmp);
break;
}
default:
throw new UnsupportedOperationException(ERR_VERSION);
}
// return out.toByteArray();
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
/**
* Routine for efficient de-serialization of an RDF {@link Value}.
*
* Note: This automatically uses the {@link BigdataValueFactoryImpl} to create
* the {@link BigdataValue}s from the de-serialized state so the factory
* reference is always set on the returned {@link BigdataValueImpl}.
*
* @param b
* The byte[] containing the serialized data record.
*
* @return The {@link BigdataValue}.
*
* @throws RuntimeException
* if there is an IO problem.
*
* @see {@link #serialize()}
*/
public V deserialize(final byte[] b) {
return deserialize(new DataInputBuffer(b), new StringBuilder(b.length));
}
/**
* Routine for efficient de-serialization of a {@link BigdataValue}.
*
* Note: This automatically uses the {@link BigdataValueFactoryImpl} to
* create the {@link BigdataValue}s from the de-serialized state so the
* factory reference is always set on the returned {@link BigdataValueImpl}.
*
* @param b
* An input stream from which the serialized data may be read.
* @param tmp
* A buffer used to decode the component Unicode strings. The
* length of the buffer will be reset as necessary by this
* method.
*
* @return The {@link BigdataValue}.
*
* @throws RuntimeException
* if there is an IO problem.
*
* @see {@link #serialize()}
*/
public V deserialize(final DataInputBuffer in, final StringBuilder tmp) {
try {
final short version = ShortPacker.unpackShort((DataInput)in);//in.unpackShort();
switch (version) {
case VERSION0:
return deserializeVersion0(version, in);
case VERSION1:
return deserializeVersion1(version, in, tmp);
default:
throw new UnsupportedOperationException(ERR_VERSION + " : "
+ version);
}
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
/**
* Implements the serialization of a Literal, URI, or BNode.
*
* @param val
* The {@link Value}.
* @param version
* The serialization version number (which has already been
* written on out by the caller).
* @param out
* The data are written here.
*
* @throws IOException
*/
private void serializeVersion0(final V val, final short version,
final DataOutput out) throws IOException {
final byte termCode = getTermCode(val);
/*
* Note: VERSION0 writes the termCode immediately after the packed
* version identifier. Other versions MAY do something else.
*
* Note: This method requires the DataOutput interface for the
* writeUTF() method. If we can get those _exact_ semantics elsewhere
* (for backward compatibility they have to be exact) then we could
* relax the API and pass in an OutputStream.
*/
out.writeByte(termCode);
switch(termCode) {
case ITermIndexCodes.TERM_CODE_BND: {
out.writeUTF(((BNode) val).getID());
break;
}
case ITermIndexCodes.TERM_CODE_URI: {
// Serialize as UTF.
out.writeUTF(((URI)val).stringValue());
break;
}
case ITermIndexCodes.TERM_CODE_LIT:
out.writeUTF(((Literal)val).getLabel());
break;
case ITermIndexCodes.TERM_CODE_LCL:
/*
* Note: This field is ASCII [A-Za-z0-9] and "-". However, this
* method writes using UTF-8 so it will generate one byte per
* character and it is probably more work to write the data
* directly as ASCII bytes.
*/
out.writeUTF(((Literal)val).getLanguage());
out.writeUTF(((Literal)val).getLabel());
break;
case ITermIndexCodes.TERM_CODE_DTL:
out.writeUTF(((Literal)val).getDatatype().stringValue());
out.writeUTF(((Literal)val).getLabel());
break;
default:
throw new IOException(ERR_CODE + " : " + termCode);
}
}
/**
* Implements the de-serialization of a Literal, URI, or BNode.
*
* Note: This automatically uses the {@link BigdataValueFactoryImpl} to create
* the {@link BigdataValue}s from the de-serialized state so the factory
* reference is always set on the returned {@link BigdataValueImpl}.
*
* @param version
* The serialization version number (which has already been read
* by the caller).
* @param in
* The data are read from here.
*
* @throws IOException
*/
@SuppressWarnings("unchecked")
private V deserializeVersion0(final short version, final DataInput in)
throws IOException {
/*
* Note: The term code immediately follows the packed version
* code for VERSION0 - this is not necessarily true for other
* serialization versions.
*/
final byte termCode = in.readByte();
switch(termCode) {
case ITermIndexCodes.TERM_CODE_BND: {
return (V) valueFactory.createBNode(in.readUTF());
}
case ITermIndexCodes.TERM_CODE_URI: {
return (V) valueFactory.createURI(in.readUTF());
}
case ITermIndexCodes.TERM_CODE_LIT: {
final String label = in.readUTF();
return (V)valueFactory.createLiteral(label);
}
case ITermIndexCodes.TERM_CODE_LCL: {
final String language = in.readUTF();
final String label = in.readUTF();
return (V)valueFactory.createLiteral(label, language);
}
case ITermIndexCodes.TERM_CODE_DTL: {
final String datatype = in.readUTF();
final String label = in.readUTF();
return (V) valueFactory.createLiteral(label, valueFactory
.createURI(datatype));
}
default:
throw new IOException(ERR_CODE + " : " + termCode);
}
}
/**
* Implements the serialization of a Literal, URI, or BNode.
*
* @param val
* The {@link Value}.
* @param version
* The serialization version number (which has already been
* written on out by the caller).
* @param out
* The data are written here.
* @param tmp
* A buffer used to compress the component Unicode strings.
*
* @throws IOException
*/
private void serializeVersion1(final V val, final short version,
final OutputStream out, final ByteArrayBuffer tmp)
throws IOException {
final byte termCode = getTermCode(val);
/*
* Note: VERSION1 writes the termCode immediately after the packed
* version identifier. Other versions MAY do something else.
*/
out.write/*Byte*/(termCode);
switch(termCode) {
case ITermIndexCodes.TERM_CODE_BND:
uc.encode(((BNode) val).getID(), out, tmp);
break;
case ITermIndexCodes.TERM_CODE_URI:
uc.encode(((URI)val).stringValue(), out, tmp);
break;
case ITermIndexCodes.TERM_CODE_LIT:
uc.encode(((Literal)val).getLabel(), out, tmp);
break;
case ITermIndexCodes.TERM_CODE_LCL:
/*
* Note: This field is ASCII [A-Za-z0-9] and "-". However, this
* method writes using UTF-8 so it will generate one byte per
* character and it is probably more work to write the data
* directly as ASCII bytes.
*/
uc.encode(((Literal) val).getLanguage(), out, tmp);
uc.encode(((Literal) val).getLabel(), out, tmp);
break;
case ITermIndexCodes.TERM_CODE_DTL:
uc.encode(((Literal) val).getDatatype().stringValue(), out, tmp);
uc.encode(((Literal) val).getLabel(), out, tmp);
break;
default:
throw new IOException(ERR_CODE + " : " + termCode);
}
}
/**
* Implements the de-serialization of a Literal, URI, or BNode.
*
* Note: This automatically uses the {@link BigdataValueFactoryImpl} to
* create the {@link BigdataValue}s from the de-serialized state so the
* factory reference is always set on the returned {@link BigdataValueImpl}.
*
* @param version
* The serialization version number (which has already been read
* by the caller).
* @param in
* The data are read from here.
* @param tmp
* Buffer used to extract bytes to be decompressed.
* @param sb
* Buffer used to decompress bytes.
*
* @throws IOException
*/
@SuppressWarnings("unchecked")
private V deserializeVersion1(final short version,
final DataInputBuffer in, final StringBuilder tmp)
throws IOException {
/*
* Note: The term code immediately follows the packed version code for
* VERSION0 - this is not necessarily true for other serialization
* versions.
*/
final byte termCode = in.readByte();
switch (termCode) {
case ITermIndexCodes.TERM_CODE_BND:
return (V) valueFactory.createBNode(uc.decode1(in, tmp));
case ITermIndexCodes.TERM_CODE_URI:
return (V) valueFactory.createURI(uc.decode1(in, tmp));
case ITermIndexCodes.TERM_CODE_LIT:
return (V) valueFactory.createLiteral(uc.decode1(in, tmp));
case ITermIndexCodes.TERM_CODE_LCL: {
final String language = uc.decode1(in, tmp);
final String label = uc.decode1(in, tmp);
return (V) valueFactory.createLiteral(label, language);
}
case ITermIndexCodes.TERM_CODE_DTL: {
final String datatype = uc.decode1(in, tmp);
final String label = uc.decode1(in, tmp);
return (V) valueFactory.createLiteral(label, valueFactory
.createURI(datatype));
}
default:
throw new IOException(ERR_CODE + " : " + termCode);
}
}
/**
* Return the total #of characters in the RDF {@link Value}.
*
* @param v
* The {@link Value}.
*
* @return The character length of the data in the RDF {@link Value}.
*/
static public long getStringLength(final Value v) {
if (v == null)
throw new IllegalArgumentException();
if (v instanceof URI) {
return ((URI) v).stringValue().length();
} else if (v instanceof Literal) {
final Literal value = (Literal) v;
final String label = value.getLabel();
final int datatypeLength = value.getDatatype() == null ? 0 : value
.getDatatype().stringValue().length();
final int languageLength = value.getLanguage() == null ? 0 : value
.getLanguage().length();
final long totalLength = label.length() + datatypeLength
+ languageLength;
return totalLength;
} else if (v instanceof BNode) {
return ((BNode) v).getID().length();
} else {
throw new UnsupportedOperationException();
}
}
}