
// Class: com.bigdata.rdf.internal.encoder.IVSolutionSetEncoder (bigdata-core artifact; Maven / Gradle / Ivy).
// (Repository page links: "Show all versions of bigdata-core", "Show documentation".)
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Feb 15, 2012
*/
package com.bigdata.rdf.internal.encoder;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import org.openrdf.model.Value;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.bindingSet.ListBindingSet;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.htree.HTree;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVCache;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.util.BytesUtil;
/**
* This class provides fast, efficient serialization for solution sets. Each
* solution must be an {@link IBindingSet} whose bound values are {@link IV}s
* and their cached {@link BigdataValue}s. The {@link IV}s and the cached
* {@link BigdataValue}s are efficiently and compactly represented in a format
* suitable for chunked messages or streaming. Decode is a fast online process.
* Both encode and decode require the maintenance of a map from the {@link IV}s
* having cached {@link BigdataValue}s to those cached values.
*
* <h2>Record Format</h2>
*
* The format is as follows:
*
* <pre>
* nbound
* nvars
* ncached
* (namespace)
* var[0]...var[nvars-1]
* bitmap-for-bound-variables
* bitmap-for-IV-with-cached-Values
* IV[0] ... IV[nbound-1]
* Value[0] ... Value[ncached-1]
* </pre>
*
* where <code>nbound</code> is the #of bindings in the binding set. When zero,
* the rest of the record is omitted.
*
* where <code>nvars</code> is the #of new variables in this binding set. The
* "schema" used to encode the bindings is based on the ordered set of variables
* for which bindings are observed. The encoder writes this information out
* incrementally. The decoder builds up this information as it decodes
* solutions.
*
* where <code>ncached</code> is the #of bindings in the binding set for which
* there is a cached {@link BigdataValue} which has not already been written
* into a previous record. Even if the {@link IV} has a cached
* {@link BigdataValue}, if the {@link IV} has been previously written into a
* record then the {@link IV} is NOT recorded in this record with a cached
* Value. Further, if the {@link IV} appears more than once in a given record,
* the cached value is only marked in the bitmap for the first such occurrence
* and the cached value is only written into the record once.
*
* where <code>namespace</code> is the namespace of the lexicon relation. This
* is written out for the first solution having an {@link IVCache} association.
* It is assumed that all {@link Value}s are {@link BigdataValue}s for the same
* lexicon relation. If no solutions have an {@link IVCache} association, then
* the namespace will never be written into the encoded output.
*
* where <code>var</code> is the name of a variable for which a binding was
* first observed for the current solution. The names of the variables are
* written in the order in which they are first observed. This forms the
* implicit "schema" required to decode the {@link IV}[].
*
* where <code>bitmap-for-bound-variables</code> is zero or more bytes providing
* a bit map indicating those variables which are bound in this solution out of
* the total set of variables which have been observed in the solutions
* presented to this encoder.
*
* where <code>bitmap-for-IV-with-cached-Values</code> is zero or more bytes
* providing a bit map indicating which IVs are associated with cached values
* written into the record. Whether or not an IV has a cached value must be
* decided by the caller after processing the record and consulting an
* (IV,Value) cache which they maintain over the set of records processed to
* date. Cached values are written out (and the bit set) only the first time a
* given IV with a cached Value is observed.
*
* where <code>IV[n]</code> is an {@link IV} as encoded by {@link IVUtility}.
*
* where {@link BigdataValue} is an RDF Value serialized using the
* {@link BigdataValueSerializer} for the namespace of the lexicon.
*
*
* <h2>Decode</h2>
*
* The namespace of the lexicon is required in order to obtain the
* {@link BigdataValueFactory} and {@link BigdataValueSerializer} used to decode
* and materialize the cached {@link BigdataValue}s. This information can be
* sent before the records if it is not known to the caller.
*
* The decoder materializes the cached values into a map (either a HashMap or
* HTree, as appropriate for the data scale) as the records are processed. Only
* one solution needs to be decoded at a time, but the decoder must maintain the
* (IV,Value) cache across all decoded records. There is no need to indicate the
* #of records, but IChunkMessage#getSolutionCount() in fact reports exactly
* that information.
*
* Each solution can be turned into an {@link IBindingSet} at the time that it
* is decoded. If we use a standard {@link ListBindingSet}, then we need to
* resolve each {@link IV} against the {@link IV} cache, setting its RDF Value
* as a side effect before returning the IBindingSet to the caller. If we do a
* custom {@link IBindingSet} implementation, then the cached
* {@link BigdataValue} could be lazily materialized by hooking
* {@link IVCache#getValue()}. Either way, the life cycle of the materialized
* objects will be very short unless they are propagated into new solutions.
* Short life cycle objects entail very little heap burden.
*
* @author Bryan Thompson
* @version $Id: IVSolutionSetEncoder.java 6032 2012-02-16 12:48:04Z thompsonbry
* $
*
* @see "Optimize serialization for query messages on cluster"
*
* TODO There could be a completely different encoding when only a single
* variable is bound (column projection), especially if there are likely to
* be duplicate IVs. However, we still have to pass through the cached
* Value associations, which this does pretty efficiently.
*/
public class IVSolutionSetEncoder implements IBindingSetEncoder {
private static final Logger log = Logger
.getLogger(IVSolutionSetEncoder.class);
/**
* The schema provides the order in which the {@link IV}[] for solutions
* stored in the hash index are encoded in the {@link HTree}. {@link IV} s
* which are not bound are modeled by a {@link TermId#NullIV}.
*
* Note: In order to be able to encode/decode the schema based on the lazy
* identification of the variables which appear in solutions the
* {@link HTree} must store variable length {@link IV}[]s since new
* variables may be discovered at any point.
*/
private final LinkedHashSet> schema;
/**
* Used to store the {@link IVCache} associations. This allows us to elide
* {@link BigdataValue}s which have already been written by this encoder
* instance.
*/
private final Map, BigdataValue> cache;
/**
* Used to encode the {@link IV}s.
*/
private final IKeyBuilder keyBuilder;
/**
* Used to format the encoded records (reset each time).
*/
private final DataOutputBuffer out;
/**
* Temporary buffer used by {@link BigdataValueSerializer}.
*/
private final ByteArrayBuffer tmp;
/**
* The initial version.
*/
static final int VERSION0 = 0x0;
/**
* The version used by this encoder.
*/
private final int version = VERSION0;
/**
* The #of solutions encoded to date.
*/
private int nsolutions = 0;
/*
* Set when the first solution having a bound value is processed.
*/
/**
* The namespace of the lexicon relation. This is discovered from the first
* {@link IVCache} association and written out into the stream at that
* point. If there are no {@link IVCache} associations then it is never set.
*/
private String namespace;
/**
* Used to de-serialize the {@link BigdataValue}s for {@link IVCache}
* associations.
*/
private BigdataValueSerializer valueSer;
/**
 * Reports the namespace, the schema, the size of the value cache and the
 * #of solutions encoded so far, prefixed by the default
 * {@link Object#toString()} representation.
 * <p>
 * Note: The schema and the cache are read without any synchronization, so
 * this method is not thread-safe.
 */
@Override
public String toString() {
    return new StringBuilder()
            .append(super.toString())
            .append("{namespace=").append(namespace)
            .append(",schema=").append(schema) // Not thread-safe.
            .append(",cacheSize=").append(cache.size()) // Not thread-safe.
            .append(",nsolutions=").append(nsolutions)
            .append("}")
            .toString();
}
/**
 * Creates an encoder with an empty schema, an empty {@link IV} to
 * {@link BigdataValue} cache and freshly allocated working buffers. The
 * namespace and the value serializer are not set here; they are discovered
 * while encoding.
 */
public IVSolutionSetEncoder() {
    // The ordered set of variables for which bindings have been observed.
    this.schema = new LinkedHashSet<IVariable<?>>();
    // The IV -> BigdataValue cache
    this.cache = new HashMap<IV<?, ?>, BigdataValue>();
    // Used to encode the IVs.
    this.keyBuilder = new ASCIIKeyBuilderFactory(128).getKeyBuilder();
    // Used to format the encoded records (reset each time).
    this.out = new DataOutputBuffer();
    // Temp buffer used by the BigdataValueSerializer
    this.tmp = new ByteArrayBuffer();
}
/**
 * Encodes the solution and appends the resulting record to the caller's
 * stream.
 *
 * @param out
 *            The stream onto which the encoded record is appended. Note
 *            that this parameter shadows the internal buffer of the same
 *            name.
 * @param bset
 *            The solution.
 */
public void encodeSolution(final DataOutputBuffer out,
        final IBindingSet bset) {
    // Format the record first, then copy it onto the caller's stream.
    final byte[] record = encodeSolution(bset);
    out.append(record);
}
/**
 * {@inheritDoc}
 * <p>
 * Delegates to {@link #encodeSolution(IBindingSet, boolean)}, passing
 * <code>true</code> for the cache flag.
 */
@Override
public byte[] encodeSolution(final IBindingSet bset) {
    final boolean updateCache = true;
    return encodeSolution(bset, updateCache);
}
/**
* {@inheritDoc}
*
* TODO We typically use a {@link ListBindingSet}. If the
* {@link IBindingSet} is large enough, then it would be more efficient to
* create an {@link IVariable} to {@link IV} map within this method since we
* have to lookup bindings by variables more than once.
*/
@Override
public byte[] encodeSolution(final IBindingSet bset,
final boolean updateCacheIsIgnored) {
if (bset == null)
throw new IllegalArgumentException();
final boolean trace = log.isTraceEnabled();
// Reset internal buffers.
keyBuilder.reset();
out.reset();
if (0 == nsolutions++) {
// Write out the version number before the first solution.
out.packLong(version);
}
/*
* Before we can encode the binding set, we need to update the schema
* such that it captures any variables used in the binding set (plus any
* variables which have been observed in previous binding sets). We also
* note which variables are present for the first time in this solution.
* We will need to write those onto the wire.
*/
final List> newVars = new LinkedList>();
{
/*
* Note: Changed to use bset.iterator() based on a hot spot in the
* profiler.
*/
@SuppressWarnings("rawtypes")
// final Iterator vitr = bset.vars();
// while (vitr.hasNext()) {
// final IVariable v = vitr.next();
// if (schema.add(v)) {
// newVars.add(v);
// }
// }
final Iterator> itr = bset
.iterator();
while (itr.hasNext()) {
final IVariable v = itr.next().getKey();
if (schema.add(v)) {
newVars.add(v);
}
}
}
/*
* Encode the binding set. Bindings will appear in the same order that
* they were added to the schema.
*/
// Ordered set of new IV -> Value associations. List entry is [null] if
// no association for IV at that ordinal position in the solution.
final List values = new LinkedList();
// #of bindings with non-null IVs.
int numBindings = 0;
// #of new IVCache associations.
int newCached = 0;
boolean discoveredNamespace = false;
{
// For each variable in the [schema] order.
final Iterator> vitr = schema.iterator();
while (vitr.hasNext()) {
final IVariable v = vitr.next();
// Lookup binding for that variable in the solution.
@SuppressWarnings("unchecked")
final IConstant> c = bset.get(v);
if (c == null)
continue;
// Variable is bound in this solution.
final IV iv = c.get();
// Encode binding into buffer.
IVUtility.encode(keyBuilder, iv);
if (iv.hasValue() && (iv.isNullIV() || !cache.containsKey(iv))) {
// New IV => Value association (all NullIVs are "new").
final BigdataValue value = iv.getValue();
if (namespace == null) {
// Note: The namespace is discovered here!!!
namespace = value.getValueFactory().getNamespace();
valueSer = BigdataValueFactoryImpl.getInstance(
namespace).getValueSerializer();
discoveredNamespace = true;
}
if (!iv.isNullIV()) {
/*
* We can not lookup Null IVs in the cache on the
* decoder side since ties are broken by comparing the
* IVCache association, which is what we are trying to
* resolve. Therefore we always inline the IVCache
* assocation if TermId.isNull() is true.
*/
cache.put(iv, value);
}
values.add(value);
newCached++;
} else {
values.add(null/* no-value */);
}
numBindings++;
}
}
// * nbound
// * nvars
// * ncached
// * var[0]...var[nvars-1]
// * bitmap-for-bound-variables
// * bitmap-for-IV-with-cached-Values
// * IV[0] ... IV[nbound-1]
// * Value[0] ... Value[ncached-1]
out.packLong(numBindings);
if (numBindings == 0) {
// Return formatted record.
return out.toByteArray();
}
out.packLong(newVars.size());
out.packLong(newCached);
if (discoveredNamespace) {
out.writeUTF2(namespace);
}
if (trace) {
log.trace("schemaSize=" + schema.size() + ", cacheSize="
+ cache.size() + ", namespace=" + namespace);
log.trace("newVars=" + newVars.size() + ", numBindings="
+ numBindings + ", newCached=" + newCached);
}
// write newly declared variable names.
for (IVariable var : newVars) {
out.writeUTF2(var.getName());
}
/*
* Write out a bit map for the variables which are bound in this
* solution.
*
* Note: This is more compact than using TermId.NULL to indicate an
* unbound variable. TermId.NULL is 9 bytes. 9 bytes is enough for a bit
* map for 9*8=72 variables, so this is a win if there is even one
* unbound variable.
*/
if (numBindings > 0) {
// #of bytes required for the bit flags (one per declared var to date)
final int nbytes = BytesUtil.bitFlagByteLength(schema.size());
// current buffer position as bit index.
int bitIndex = out.pos() << 3;
if (trace)
log.trace("varbitmap: beginBitOffset=" + bitIndex + ", nbytes="
+ nbytes);
// pre-extend the buffer, zeroing the bitmap.
out.ensureFree(nbytes);
for (int i = 0; i < nbytes; i++) {
out.append((byte) 0);
}
for (IVariable var : schema) {
if (bset.isBound(var)) {
BytesUtil.setBit(out.array(), bitIndex, true);
}
bitIndex++;
}
}
/*
* Write out bit map for IVs for which we have newly observed a cached
* Value.
*/
if (newCached > 0) {
// #of bytes required for the bit flags.
final int nbytes = BytesUtil.bitFlagByteLength(numBindings);
// current buffer position as bit index.
int bitIndex = out.pos() << 3;
// pre-extend the buffer, zeroing the bitmap.
out.ensureFree(nbytes);
for (int i = 0; i < nbytes; i++) {
out.append((byte) 0);
}
if (trace)
log.trace("cachebitmap: beginBitOffset=" + bitIndex
+ ", nbytes=" + nbytes);
for (BigdataValue value : values) {
if (value != null) {
BytesUtil.setBit(out.array(), bitIndex, true);
}
bitIndex++;
}
}
// write IV[].
if (trace)
log.trace("IV[]: off=" + out.pos() + ", numBindings=" + numBindings
+ ", byteLength=" + keyBuilder.len());
out.append(keyBuilder.array(), 0/* off */, keyBuilder.len());
/*
* Write Value[]. Each Value is written directly into [out].
*/
if (newCached > 0) {
if(trace)
log.trace("cache[]: off=" + out.pos() + ", newCached="
+ newCached);
for (BigdataValue value : values) {
if (value != null) {
valueSer.serialize2(value, out, tmp);
}
}
}
if(trace)
log.trace("done: off=" + out.pos());
// Return formatted record.
return out.toByteArray();
}
/**
 * {@inheritDoc}
 * <p>
 * Clears the schema, the value cache and the working buffers, and resets
 * the solution counter so that the version number is written again ahead
 * of the next encoded solution.
 * <p>
 * The discovered namespace and its value serializer are discarded as well.
 * The namespace is only written into the record in which it is discovered,
 * so leaving it set here would mean that records encoded after a release
 * never carry the namespace, even though the version header and the schema
 * start over.
 */
@Override
public void release() {
    cache.clear();
    schema.clear();
    out.clear();
    tmp.clear();
    nsolutions = 0;
    // Forget the lexicon so it is rediscovered (and rewritten) on reuse.
    namespace = null;
    valueSer = null;
}
/**
 * {@inheritDoc}
 * <p>
 * This implementation does nothing.
 */
@Override
public void flush() {
// NOP
}
/**
* {@inheritDoc}
*
* Always returns <code>true</code>.
*/
@Override
public boolean isValueCache() {
return true;
}
}