/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Oct 22, 2010
*/
package com.bigdata.bop.fed;
import java.nio.ByteBuffer;
import java.rmi.RemoteException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpEvaluationContext;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IShardwisePipelineOp;
import com.bigdata.bop.engine.IChunkHandler;
import com.bigdata.bop.engine.IChunkMessage;
import com.bigdata.bop.engine.IQueryPeer;
import com.bigdata.bop.engine.IRunningQuery;
import com.bigdata.bop.engine.LocalChunkMessage;
import com.bigdata.bop.engine.StandaloneChunkHandler;
import com.bigdata.bop.fed.shards.MapBindingSetsOverShardsBuffer;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.DirectBufferPoolAllocator.IAllocationContext;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.rule.eval.pipeline.DistributedJoinTask;
/**
* The base class is extended to organize the output from one operator in
* order to make it available to another operator running on a different node.
* There are several cases which have to be handled and which are identified by
* the {@link BOp#getEvaluationContext()}. In addition, we need to handle low
* latency and high data volume queries somewhat differently. Except for
* {@link BOpEvaluationContext#ANY}, all of these cases wind up writing the
* intermediate results onto a direct {@link ByteBuffer} and notifying the
* receiving service that there are intermediate results which it can pull when
* it is ready to process them. This pattern allows the receiver to impose flow
* control on the producer.
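*
* A minimal usage sketch (the runningQuery, bopId, sinkId, and chunk values
* below are hypothetical; in practice the query engine invokes the handler
* internally when an operator emits a chunk of intermediate solutions):
*
* <pre>
* // NIO transfers are effectively disabled by the very large threshold.
* final IChunkHandler handler = new FederationChunkHandler(
*         Integer.MAX_VALUE, false);
*
* // Route one chunk of solutions to the operator identified by sinkId and
* // report the number of IChunkMessages which were sent.
* final int nmessages = handler.handleChunk(runningQuery, bopId, sinkId, chunk);
* </pre>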
*
* @author Bryan Thompson
* @version $Id: FederationChunkHandler.java 6038 2012-02-17 17:43:26Z
* thompsonbry $
* @param <E>
* The generic type of the objects in the relation.
*
* @see Vectored query engine messages per node
*/
public class FederationChunkHandler<E> extends StandaloneChunkHandler {
private final static Logger log = Logger
.getLogger(FederationChunkHandler.class);
/**
* FIXME Debug the NIO chunk message materialization logic (it is currently
* disabled by the setting of the nioThreshold parameter to the
* constructor).
*
* @see ResourceService should use NIO for file and buffer transfers
*
* @see Support NIO solution set interchange on the cluster
*/
@SuppressWarnings("rawtypes")
public static final IChunkHandler INSTANCE = new FederationChunkHandler(
Integer.MAX_VALUE/* nioThreshold */, false/*usePOJO*/);
/**
* Instance used by some test suites to avoid a dependency on the RDF data
* model. All messages will use {@link LocalChunkMessage} which uses POJO
* serialization.
*/
@SuppressWarnings("rawtypes")
public static final IChunkHandler TEST_INSTANCE = new FederationChunkHandler(
Integer.MAX_VALUE/* nioThreshold */, true/*usePOJO*/);
/**
* The threshold above which the intermediate solutions are shipped using
* NIO rather than RMI.
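*
* A sketch of how the threshold is applied in sendChunkMessage() when the
* target is a remote service and usePOJO is false:
*
* <pre>
* if (source.length > nioThreshold) {
*     // Marshal the solutions onto direct buffers and send a thin NIOChunkMessage.
* } else {
*     // Inline the serialized solutions within a ThickChunkMessage sent by RMI.
* }
* </pre>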
*
* @see ThickChunkMessage
* @see NIOChunkMessage
*
* @see Invalid byte: 3 in ResourceService (StatusEnum) on cluster
*/
private final int nioThreshold;
/**
* When true, the {@link LocalChunkMessage} will be used for
* all messages. This allows the test cases to avoid RDF specific logic
* in the {@link IChunkMessage} serialization.
*/
private final boolean usePOJO;
/**
*
* @param nioThreshold
* The threshold above which the intermediate solutions are
* shipped using NIO rather than RMI. This is ignored if
* usePOJO:=true.
* @param usePOJO
* When true, the {@link LocalChunkMessage} will be
* used for all messages. This allows the test cases to avoid RDF
* specific logic in the {@link IChunkMessage} serialization.
*/
public FederationChunkHandler(final int nioThreshold, final boolean usePOJO) {
this.nioThreshold = nioThreshold;
this.usePOJO = usePOJO;
}
/**
* {@inheritDoc}
*
* @todo Figure out how (or if) we will combine binding set streams emerging
* from concurrent tasks executing on a given node destined for the
* same shard/node. (There is code in the {@link DistributedJoinTask}
* which does this for the same shard, but it does it on the receiver
* side.) Pay attention to the #of threads running in the join, the
* potential concurrency of threads targeting the same (bopId,shardId)
* and how to best combine their data together.
*/
@Override
public int handleChunk(final IRunningQuery query, final int bopId,
final int sinkId, final IBindingSet[] chunk) {
if (query == null)
throw new IllegalArgumentException();
if (chunk == null)
throw new IllegalArgumentException();
if (chunk.length == 0)
return 0;
final FederatedRunningQuery q = (FederatedRunningQuery) query;
final BOp targetOp = q.getBOpIndex().get(sinkId);
if (targetOp == null)
throw new IllegalStateException("Not found: " + sinkId);
if(log.isTraceEnabled())
log.trace("queryId=" + query.getQueryId() + ", sourceBopId="+bopId+", sink=" + sinkId);
switch (targetOp.getEvaluationContext()) {
case ANY: {
/*
* This operator may be evaluated anywhere.
*/
return super.handleChunk(query, bopId, sinkId, chunk);
}
case HASHED: {
/*
* @todo The sink must use annotations to describe the nodes over
* which the binding sets will be mapped and the hash function to be
* applied. Look up those annotations and apply them to distribute
* the binding sets across the nodes.
*/
throw new UnsupportedOperationException();
}
case SHARDED: {
/*
* The sink must read or write on a shard so we map the binding sets
* across the access path for the sink.
*
* @todo Set the capacity of the "map" buffer to the size of the
* data contained in the sink (in fact, we should just process the
* sink data in place using an expanded IChunkAccessor interface).
*
* @todo high volume operators will need different capacity
* parameters.
*
* FIXME the chunkSize will limit us to RMI w/ the payload inline
* when it is the same as the threshold for NIO chunk transfers.
* This needs to be adaptive and responsive to the actual data scale
* of the operator's outputs. [Actually, we wind up re-combining the
* chunks into a single chunk per target shard below.]
*/
@SuppressWarnings("unchecked")
final IPredicate<E> pred = ((IShardwisePipelineOp<E>) targetOp).getPredicate();
final long timestamp = pred.getTimestamp();
final int capacity = 1000;// @todo
// FIXME Capacity is unbounded to prevent deadlock. See the note below.
final int chunkOfChunksCapacity = Integer.MAX_VALUE;
final int chunkSize = 100;// @todo modest chunks.
final MapBindingSetsOverShardsBuffer<IBindingSet, E> mapper = new MapBindingSetsOverShardsBuffer<IBindingSet, E>(
q.getFederation(), pred, timestamp, capacity) {
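/*
 * One output buffer is created per target shard (see newBuffer() below).
 * Each buffer is closed and drained into a single chunk further down, and
 * that chunk is sent to the shard's data service as one IChunkMessage.
 */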
@Override
protected IBuffer<IBindingSet[]> newBuffer(final PartitionLocator locator) {
return new BlockingBuffer<IBindingSet[]>(
chunkOfChunksCapacity,//
chunkSize,//
BlockingBuffer.DEFAULT_CONSUMER_CHUNK_TIMEOUT,//
BlockingBuffer.DEFAULT_CONSUMER_CHUNK_TIMEOUT_UNIT//
);
}
};
/*
* Map the binding sets over shards.
*/
{
// final IAsynchronousIterator itr = sink
// .iterator();
// try {
// while (itr.hasNext()) {
// final IBindingSet[] chunk = itr.next();
// for (IBindingSet bset : chunk) {
// mapper.add(bset);
// }
// }
// } finally {
// itr.close();
// mapper.flush();
// }
for (IBindingSet bset : chunk) {
mapper.add(bset);
}
mapper.flush();
}
/*
* The allocation context.
*
* @todo use (queryId, serviceId, sinkId) when the target bop is a
* high volume operator (this requires annotation by the query
* planner of the operator tree).
*/
final IAllocationContext allocationContext = q
.getAllocationContext(new QueryContext(q.getQueryId()));
/*
* Generate the output chunks and notify the receivers.
*
* FIXME If the output buffer has a bounded capacity then this can
* deadlock when the buffer fills up because we are not draining the
* buffer until the chunk has been fully mapped. This stage should
* probably be integrated with the stage which maps the binding sets
* over the shards (immediately above) to minimize copying or
* visiting in the data. This could be done by hooking the method
* which outputs a chunk to instead directly send the IChunkMessage.
* We could also simplify the API from IBlockingBuffer to something
* much thinner, such as add(IBindingSet[] chunk).
*
* TODO We should report the time spent mapping chunks out to the
* QueryLog. That could be done through an extension of BOpStats.
*/
int messageSendCount = 0;
for (Map.Entry<PartitionLocator, IBuffer<IBindingSet[]>> e : mapper
.getSinks().entrySet()) {
final PartitionLocator locator = e.getKey();
/*
* Note: newBuffer() above creates a BlockingBuffer so this
* cast is safe.
*/
final IBlockingBuffer<IBindingSet[]> shardSink = (IBlockingBuffer<IBindingSet[]>) e
.getValue();
// close buffer now that nothing is left to map onto it.
shardSink.close();
// drain buffer to a single chunk.
final IBindingSet[] a;
{
int n = 0;
final List<IBindingSet[]> lst = new LinkedList<IBindingSet[]>();
final IAsynchronousIterator<IBindingSet[]> itr = shardSink.iterator();
try {
while (itr.hasNext()) {
final IBindingSet[] t = itr.next();
lst.add(t);
n += t.length;
}
} finally {
itr.close();
}
a = new IBindingSet[n];
int i = 0;
for (IBindingSet[] t : lst) {
System.arraycopy(t/* src */, 0/* srcPos */, a/* dest */,
i/* destPos */, t.length/* length */);
i += t.length;
}
}
if (a.length > 0) {
/*
* Send message.
*
* Note: This avoids sending empty chunks.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/492
* (Empty chunk in ThickChunkMessage (cluster))
*/
// send message.
sendChunkMessage(q, locator.getDataServiceUUID(), sinkId,
locator.getPartitionId(), allocationContext, a);
// #of messages sent.
messageSendCount++;
}
}
return messageSendCount;
}
case CONTROLLER: {
/*
* Format the binding sets onto a ByteBuffer and publish that
* ByteBuffer as a manager resource for the query and notify the
* query controller that data is available for it.
*/
final IAllocationContext allocationContext = q.getAllocationContext(new QueryContext(
q.getQueryId()));
sendChunkMessage(q, q.queryControllerUUID, sinkId,
-1/* partitionId */, allocationContext, chunk);
return 1;
}
default:
throw new AssertionError(targetOp.getEvaluationContext());
}
}
/**
* Create and send an {@link IChunkMessage} from some intermediate results.
* Various optimizations are employed depending on the amount of data to be
* moved and whether or not the target is this service.
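*
* In outline (a summary of the decision logic in the method body; the
* lower case message variables are placeholders for the messages that are
* actually constructed below):
*
* <pre>
* if (thisService) {
*     msg = localChunkMessage;   // dropped directly onto the local work queue, no RMI
* } else if (usePOJO) {
*     msg = localChunkMessage;   // POJO serialization over RMI (test support)
* } else if (source.length > nioThreshold) {
*     msg = nioChunkMessage;     // thin RMI message; data is pulled via NIO from the ResourceService
* } else {
*     msg = thickChunkMessage;   // serialized solutions are inline within the RMI message
* }
* </pre>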
*
* @param serviceUUID
* The {@link UUID} of the {@link IQueryPeer} who is the
* recipient.
* @param sinkId
* The identifier of the target {@link BOp}.
* @param allocationContext
* The allocation context within which the {@link ByteBuffer}s
* will be managed for this {@link NIOChunkMessage}.
* @param source
* The binding sets to be formatted onto a buffer.
*
* @return The {@link NIOChunkMessage}.
*
* @todo This is basically a factory for creating {@link IChunkMessage}s.
* That factory pattern is combined with the logic to send the message
* so we can do within-JVM handoffs. We could break these things apart
* using {@link IChunkMessage#isMaterialized()} to detect inline
* cases. That would let us send out the messages in parallel, which
* could help to cut latency when an operator has a large fan out (in
* scale-out when mapping over shards or nodes).
*
* @todo Release the allocations associated with each output chunk once it
* is received by the remote service.
*
* When the query terminates all output chunks targeting any node
* EXCEPT the query controller should be immediately dropped.
*
* If there is an error during query evaluation, then the output
* chunks for the query controller should be immediately dropped.
*
* If the iterator draining the results on the query controller is
* closed, then the output chunks for the query controller should be
* immediately dropped.
*
* @todo There are a few things for which the resource must be made
* available to more than one operator evaluation phase. The best
* examples are temporary graphs for parallel closure and large
* collections of graphIds for SPARQL "NAMED FROM DATA SET"
* extensions.
*
* @todo Rethink the multiplicity relationship between chunks output from an
* operator, chunks output from mapping the operator over shards or
* nodes, RMI messages concerning buffers available for the sink
* operator on the various nodes, and the #of allocations per RMI
* message on both the sender and the receiver.
*
* I am pretty sure that none of these are strongly coupled, e.g.,
* they are not 1:1. Some stages can combine chunks. Multiple
* allocations could be required on either the sender or the receiver
* purely due to where the slices fall on the backing direct
* {@link ByteBuffer}s in the {@link DirectBufferPool} and the sender
* and receiver do not need to use the same allocation context or have
* the same projection of slices onto the backing buffers.
*
* The one thing which is critical is that the query controller is
* properly informed of the #of chunks made available to an operator
* and consumed by that operator, that those reports must be in the
* same units, and that the reports must be delivered back to the
* query controller in a manner which does not transiently violate the
* termination conditions of the query.
*/
protected void sendChunkMessage(
final FederatedRunningQuery q,
final UUID serviceUUID,
final int sinkId,
final int partitionId,
final IAllocationContext allocationContext,
final IBindingSet[] source) {
if (serviceUUID == null)
throw new IllegalArgumentException();
if (allocationContext == null)
throw new IllegalArgumentException();
if (source == null)
throw new IllegalArgumentException();
// if (source.isEmpty())
// throw new RuntimeException();
// The peer to whom we send the message.
final IQueryPeer peerProxy = q.getQueryPeer(serviceUUID);
if (peerProxy == null)
throw new RuntimeException("Not found: serviceId=" + serviceUUID);
// true iff the target is this service (no proxy, no RMI).
final boolean thisService = peerProxy == q.getQueryEngine();
if(thisService) {
/*
* Leave the chunk as Java objects and drop it directly onto the
* query engine.
*/
final IChunkMessage<IBindingSet> msg = new LocalChunkMessage(
q.getQueryController(), //
q.getQueryId(), //
sinkId, //
partitionId, //
source);
if (log.isDebugEnabled())
log.debug("Sending local message: " + msg);
/*
* The message is fully materialized and will not cross a machine
* boundary. Drop it directly onto the work queue on this
* QueryEngine rather than handing it off to the QueryEngine. This
* is more efficient and also prevents counting messages which
* target the same query controller as inter-controller messages.
*/
// drop the message onto the IRunningQuery
q.acceptChunk(msg);
// drop the message onto the QueryEngine.
// q.getQueryEngine().bufferReady(msg);
return;
}
/*
* We will be notifying another service (RMI) that a chunk is available.
*
* Note: Depending on how much data is involved, we may move it with the
* RMI message or out of band using NIO. This decision affects how we
* serialize the chunk.
*/
final IChunkMessage<IBindingSet> msg;
if (usePOJO) {
msg = new LocalChunkMessage(q.getQueryController(), q.getQueryId(),
sinkId, partitionId, source);
} else {
if (source.length <= nioThreshold) {
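/*
 * Small chunk: the serialized solutions are inlined within the RMI
 * message itself so no out-of-band NIO transfer is required.
 */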
msg = new ThickChunkMessage(
q.getQueryController(), q.getQueryId(), sinkId,
partitionId, source);
} else {
/*
* Marshall the data onto direct ByteBuffer(s) and send a thin
* message by RMI. The receiver will retrieve the data using NIO
* against the ResourceService.
*/
msg = new NIOChunkMessage(q.getQueryController(),
q.getQueryId(), sinkId, partitionId, allocationContext,
source, q.getQueryEngine().getResourceService()
.getAddr());
}
}
if (log.isDebugEnabled())
log.debug("Sending remote message: " + msg);
// Update counters since message will cross machine boundary.
final FederatedQueryEngineCounters c = q.getQueryEngine()
.getQueryEngineCounters();
c.chunksOut.increment();
c.solutionsOut.add(source.length);
try {
peerProxy.bufferReady(msg);
} catch (RemoteException e) {
throw new RuntimeException(e);
}
}
}