/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Oct 20, 2015
*/
package com.bigdata.bop.join;
import java.util.Map;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.NV;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.PipelineOp.Annotations;
import com.bigdata.bop.controller.INamedSolutionSetRef;
import com.bigdata.bop.controller.SubqueryAnnotations;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpUtility;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.UnsyncLocalOutputBuffer;
import cutthecrap.utils.striterators.ICloseableIterator;
/**
* Operator for pipelined hash index construction and subsequent join. Note that
* this operator need not be combined with a solution set hash join; instead, it
* gets the subquery/subgroup passed in as a parameter and can thus be
* considered an "all-in-one" build-hash-index-and-join operation.
*
* The operator is designed for single-threaded use (as is the case for the
* non-pipelined hash index & join operators). Its processing scheme is
* illustrated by the following example. Assume we have a query such as
*
*
* <pre>
* SELECT * WHERE {
*   ?s <http://p1> ?o1
*   OPTIONAL {
*     ?s <http://p2> ?o2 .
*     ?s <http://p3> ?o3 .
*   }
* } LIMIT 10
* </pre>
*
* to be evaluated over the data set
*
* <pre>
* <http://s1> <http://p1> <http://o1> .
* <http://s2> <http://p1> <http://o2> .
* <http://s1> <http://p1> <http://o3> .
* <http://s1> <http://p2> <http://o4> .
* <http://s1> <http://p3> <http://o5> .
* <http://s2> <http://p2> <http://o6> .
* </pre>
*
* where the OPTIONAL part is considered a complex group that is translated
* using a hash join pattern. The approach taken by this operator is that the
* OPTIONAL pattern is treated as a subquery that is passed in to this operator
* via Annotations.SUBQUERY. The operator logically proceeds as follows:
*
* 1. Incoming are the bindings from outside, i.e. in our example the bindings
*    for the triple pattern "?s <http://p1> ?o1". Given the input data at
*    hand, this means we have the following binding set
*
*    {
*      { ?s -> <http://s1>, ?o1 -> <http://o1> },
*      { ?s -> <http://s2>, ?o1 -> <http://o2> },
*      { ?s -> <http://s1>, ?o1 -> <http://o3> }
*    }
*
*    coming in. For the sake of this example, assume the solutions drop in
*    one after the other (chunk size = 1), so that we have three iterations;
*    note, however, that the implementation follows a generalized, vectored
*    approach, processing the input chunk by chunk.
*
* 2. The join variable set is { ?s }. We compute the distinct projection over
*    ?s on the incoming bindings:
*
*    - In the 1st iteration, the distinct projection is { ?s -> <http://s1> },
*    - In the 2nd iteration, the distinct projection is { ?s -> <http://s2> },
*    - In the 3rd iteration, the distinct projection is { ?s -> <http://s1> }
*      again.
*
* 3. For each of these distinct projections, we decide whether the subquery
*    has been evaluated with the distinct projection as input before.
*    If the subquery has already been evaluated, we take the fast path and
*    proceed with step 5. If it has not been evaluated yet, we proceed to
*    step 4 (namely its evaluation and buffering of the solution in the
*    hash index).
*
*    For our example this means: in the first iteration we proceed to step
*    (4) with { ?s -> <http://s1> } as input; in the second iteration, we
*    proceed to step (4) with { ?s -> <http://s2> } as input; in the third
*    iteration, we can directly proceed to step (5): the distinct projection
*    { ?s -> <http://s1> } has already been encountered in iteration 1.
*
* 4. Evaluate the subquery for the incoming binding. The result is stored in
*    a hash index (this hash index is used, in subsequent iterations, to
*    decide whether the result has already been computed for the distinct
*    projection in step 3; note that, in addition to the hash index, we also
*    record those distinct projections for which the subquery has been
*    evaluated without producing a result, to avoid unnecessary
*    re-computation).
*
*    In the first iteration, we compute the subquery result
*    { ?s -> <http://s1>, ?o2 -> <http://o4>, ?o3 -> <http://o5> } for
*    the input binding { ?s -> <http://s1> }; in the second iteration, we
*    compute { ?s -> <http://s2> } for the input binding { ?s -> <http://s2> }
*    (i.e., the OPTIONAL subquery does not match, leaving the input
*    unmodified); there is no third iteration step.
*
* 5. Join the original bindings (from which the distinct projection was
*    obtained) against the hash index and output the results. The operator
*    supports all kinds of joins (Normal, Exists, NotExists, etc.).
*
*    We thus obtain the following matches with the hash index:
*    - Iteration 1: { ?s -> <http://s1>, ?o1 -> <http://o1> } JOIN
*                   { ?s -> <http://s1>, ?o2 -> <http://o4>, ?o3 -> <http://o5> }
*    - Iteration 2: { ?s -> <http://s2>, ?o1 -> <http://o2> } JOIN
*                   { ?s -> <http://s2> }
*    - Iteration 3: { ?s -> <http://s1>, ?o1 -> <http://o3> } JOIN
*                   { ?s -> <http://s1>, ?o2 -> <http://o4>, ?o3 -> <http://o5> }
*
* This gives us the expected final result:
*
* {
*   { ?s -> <http://s1>, ?o1 -> <http://o1>, ?o2 -> <http://o4>, ?o3 -> <http://o5> },
*   { ?s -> <http://s2>, ?o1 -> <http://o2> },
*   { ?s -> <http://s1>, ?o1 -> <http://o3>, ?o2 -> <http://o4>, ?o3 -> <http://o5> }
* }
*
* Note that this strategy is pipelined in the sense that all results from the
* left are emitted as soon as the subquery result for their distinct
* projection has been calculated.
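*
* To make the scheme concrete, the following is a minimal, self-contained
* sketch of steps 2-5 in plain Java (a hypothetical model only: binding sets
* are modeled as maps, evaluateSubquery() is a placeholder for the real
* subquery evaluation, and compatibility checks for shared non-join
* variables are omitted; the actual logic lives in the
* PipelinedHashJoinUtility implementations):
*
* <pre>
* import java.util.*;
*
* class PipelinedJoinSketch {
*     // step 4 bookkeeping: subquery results per distinct projection ...
*     final Map<Map<String, String>, List<Map<String, String>>> index = new HashMap<>();
*     // ... and projections for which the subquery produced no result
*     final Set<Map<String, String>> noResult = new HashSet<>();
*     final Set<String> joinVars;
*
*     PipelinedJoinSketch(final Set<String> joinVars) { this.joinVars = joinVars; }
*
*     List<Map<String, String>> process(final Map<String, String> bs) {
*         // step 2: distinct projection onto the join variables
*         final Map<String, String> proj = new HashMap<>(bs);
*         proj.keySet().retainAll(joinVars);
*         // step 3: evaluate the subquery only for unseen projections
*         if (!index.containsKey(proj) && !noResult.contains(proj)) {
*             final List<Map<String, String>> res = evaluateSubquery(proj);
*             if (res.isEmpty()) noResult.add(proj); // remember "no match"
*             else index.put(proj, res);             // step 4: buffer result
*         }
*         // step 5: join against the buffered solutions and emit; OPTIONAL
*         // semantics: if there is no match, bs passes through unmodified
*         final List<Map<String, String>> out = new ArrayList<>();
*         for (Map<String, String> rhs
*                 : index.getOrDefault(proj, List.of(Map.of()))) {
*             final Map<String, String> joined = new HashMap<>(bs);
*             joined.putAll(rhs);
*             out.add(joined);
*         }
*         return out;
*     }
*
*     List<Map<String, String>> evaluateSubquery(final Map<String, String> proj) {
*         return List.of(); // placeholder: run the OPTIONAL group with proj
*     }
* }
* </pre>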
*
* # Further notes:
* 1.) Vectored processing: the implementation uses a vectored processing
* approach: instead of processing the mappings one by one, we collect
* unseen distinct projections and their associated incoming mappings in two
* buffers, namely the distinctProjectionBuffer and the incomingBindingsBuffer.
*
* The buffer sizes are controlled via the annotations
* Annotations.INCOMING_BINDINGS_BUFFER_THRESHOLD and
* Annotations.DISTINCT_PROJECTION_BUFFER_THRESHOLD.
* If, after processing a chunk, one of these thresholds is exceeded for the
* respective buffer (or, alternatively, if we're in the last pass), the
* mappings in the distinctProjectionBuffer are provided as input to
* the subquery "in batch" (cf. step 4) and subsequently the bindings from
* the incomingBindingsBuffer are joined (and released). At that point, the
* buffers become empty again.
*
* Effectively, the thresholds bound the internal buffer space. For now, we
* go with the default values, but we may want to expose these thresholds via
* query hints or the like at some point.
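*
* For illustration, a sketch of how the thresholds could be supplied at
* construction time (the values are hypothetical, left and optionalSubquery
* are placeholders, and a real instance additionally needs the usual
* HashIndexOp annotations, e.g. the named-set reference and join type):
*
* <pre>
* final PipelineOp op = new PipelinedHashIndexAndSolutionSetJoinOp(
*     new BOp[] { left },                  // upstream pipeline operator
*     new NV(Annotations.MAX_PARALLEL, 1), // required: single-threaded
*     new NV(Annotations.SUBQUERY, optionalSubquery),
*     new NV(Annotations.DISTINCT_PROJECTION_BUFFER_THRESHOLD, 100),
*     new NV(Annotations.INCOMING_BINDINGS_BUFFER_THRESHOLD, 2000));
* </pre>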
*
* 2.) Alternative source: the strategy sketched above illustrates how the op
* works for subqueries. An alternative way of using the operator is to set
* Annotations.BINDING_SETS_SOURCE *instead of* the subquery. This is used
* when computing a hash join against a VALUES clause; in that case, the
* binding sets provided by the VALUES clause are considered the "inner
* query". As a notable difference, this set of solutions is static and does
* not need to be re-evaluated each time: it is submitted to the hash index
* once, at the beginning, and joined with every incoming binding.
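*
* Schematically, the VALUES case sets the static solutions instead of a
* subquery (again a sketch; valuesSolutions is a placeholder for the
* IBindingSet[] obtained from the VALUES clause):
*
* <pre>
* final PipelineOp op = new PipelinedHashIndexAndSolutionSetJoinOp(
*     new BOp[] { left },
*     new NV(Annotations.MAX_PARALLEL, 1),
*     // static inner solutions; note that *no* SUBQUERY is set
*     new NV(Annotations.BINDING_SETS_SOURCE, valuesSolutions));
* </pre>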
*
* 3.) There exist two "implementations" of the operator, namely the
* {@link JVMPipelinedHashJoinUtility} and the {@link HTreePipelinedHashJoinUtility}.
*
* # Other remarks:
* There are some more technicalities, such as support for ASK_VAR (which is
* used by the FILTER (NOT) EXISTS translation scheme); these work in
* principle in the same way as they do for the standard hash join.
*
* Usage: the pipelined hash join operator is preferably used for queries
* containing LIMIT but *no* ORDER BY. It can also be enabled globally via
* the system parameter {@link QueryHints#PIPELINED_HASH_JOIN} or per query
* via query hints. See {@link AST2BOpUtility#usePipelinedHashJoin} for the
* method implementing its selection strategy.
*
* @see JVMPipelinedHashJoinUtility for implementation
*
* @author Michael Schmidt
*/
public class PipelinedHashIndexAndSolutionSetJoinOp extends HashIndexOp {
private static final long serialVersionUID = 3473675701742394157L;
public interface Annotations extends HashIndexOp.Annotations, SubqueryAnnotations {
/**
* The variables that are projected into the inner subgroup. Typically,
* these are identical to the join variables. There are, however,
* exceptions where we need to project in a superset. For instance, for
* the query
*
* select * where {
* ?a :knows ?b .
* OPTIONAL {
* ?b :knows ?c .
* ?c :knows ?d .
* filter(?a != :paul) # Note: filter applies to *outer* variable
* }
* }
*
* we have joinVars={?b} and projectInVars={?a, ?b}, because variables
* from outside are visible in the inner filter.
*/
String PROJECT_IN_VARS =
PipelinedHashIndexAndSolutionSetJoinOp.class.getName() + ".projectInVars";
/**
* The threshold defining when to release the distinctProjectionBuffer.
* Note that releasing this buffer means releasing the
* incomingBindingsBuffer at the same time.
*/
String DISTINCT_PROJECTION_BUFFER_THRESHOLD =
PipelinedHashIndexAndSolutionSetJoinOp.class.getName()
+ ".distinctProjectionBufferThreshold";
// default: release the buffers once 50 distinct projections are buffered
int DEFAULT_DISTINCT_PROJECTION_BUFFER_THRESHOLD = 50;
/**
* The threshold defining when to release the incomingBindingsBuffer.
* Note that releasing this buffer means releasing the
* distinctProjectionBuffer at the same time.
*/
String INCOMING_BINDINGS_BUFFER_THRESHOLD =
PipelinedHashIndexAndSolutionSetJoinOp.class.getName()
+ ".incomingBindingsBuffer";
// having buffered 1000 incoming bindings, we release both buffers;
// this might happen if we observe 1000 incoming mappings with fewer
// than DISTINCT_PROJECTION_BUFFER_THRESHOLD distinct projections
int DEFAULT_INCOMING_BINDINGS_BUFFER_THRESHOLD = 1000;
}
/**
* Deep copy constructor.
*/
public PipelinedHashIndexAndSolutionSetJoinOp(final PipelinedHashIndexAndSolutionSetJoinOp op) {
super(op);
// max parallel must be one
if (getMaxParallel() != 1)
throw new IllegalArgumentException(Annotations.MAX_PARALLEL + "=" + getMaxParallel());
}
/**
* Shallow copy constructor.
*
* @param args
* @param annotations
*/
public PipelinedHashIndexAndSolutionSetJoinOp(final BOp[] args, final Map<String, Object> annotations) {
super(args, annotations);
// max parallel must be one
if (getMaxParallel() != 1)
throw new IllegalArgumentException(Annotations.MAX_PARALLEL + "=" + getMaxParallel());
}
public PipelinedHashIndexAndSolutionSetJoinOp(final BOp[] args, final NV... annotations) {
this(args, NV.asMap(annotations));
}
@Override
protected ChunkTaskBase createChunkTask(final BOpContext<IBindingSet> context) {
/**
 * The operator offers two ways to generate the hash index for the input
 * stream: either via a subquery or via a binding set that is passed in.
 * Exactly one of the two *must* be provided.
 */
final PipelineOp subquery =
(PipelineOp)getProperty(Annotations.SUBQUERY);
final IBindingSet[] bsFromBindingsSetSource =
(IBindingSet[]) getProperty(Annotations.BINDING_SETS_SOURCE);
if (subquery==null && bsFromBindingsSetSource==null) {
throw new IllegalArgumentException(
"Neither subquery nor binding set source provided.");
} else if (subquery!=null && bsFromBindingsSetSource!=null) {
throw new IllegalArgumentException(
"Both subquery and binding set source provided.");
}
final IVariable<?> askVar =
    (IVariable<?>) getProperty(HashJoinAnnotations.ASK_VAR);
final IVariable<?>[] projectInVars =
    (IVariable<?>[]) getProperty(Annotations.PROJECT_IN_VARS);
final int distinctProjectionBufferThreshold =
getProperty(
Annotations.DISTINCT_PROJECTION_BUFFER_THRESHOLD,
Annotations.DEFAULT_DISTINCT_PROJECTION_BUFFER_THRESHOLD);
final int incomingBindingsBufferThreshold =
getProperty(
Annotations.INCOMING_BINDINGS_BUFFER_THRESHOLD,
Annotations.DEFAULT_INCOMING_BINDINGS_BUFFER_THRESHOLD);
return new ChunkTask(this, context, subquery,
bsFromBindingsSetSource, projectInVars, askVar,
distinctProjectionBufferThreshold, incomingBindingsBufferThreshold);
}
/**
* A chunk task. See outer class for explanation of parameters.
*/
private static class ChunkTask extends com.bigdata.bop.join.HashIndexOp.ChunkTask {
final PipelineOp subquery;
final IBindingSet[] bsFromBindingsSetSource;
final IConstraint[] joinConstraints;
final IVariable<?> askVar;
final IVariable<?>[] projectInVars;
final int distinctProjectionBufferThreshold;
final int incomingBindingsBufferThreshold;
public ChunkTask(final PipelinedHashIndexAndSolutionSetJoinOp op,
final BOpContext<IBindingSet> context,
final PipelineOp subquery,
final IBindingSet[] bsFromBindingsSetSource,
final IVariable<?>[] projectInVars,
final IVariable<?> askVar,
final int distinctProjectionBufferThreshold,
final int incomingBindingsBufferThreshold) {
super(op, context);
joinConstraints = BOpUtility.concat(
(IConstraint[]) op.getProperty(Annotations.CONSTRAINTS),
state.getConstraints());
// exactly one of the two will be non-null
this.subquery = subquery;
this.bsFromBindingsSetSource = bsFromBindingsSetSource;
this.projectInVars = projectInVars;
this.askVar = askVar;
this.distinctProjectionBufferThreshold = distinctProjectionBufferThreshold;
this.incomingBindingsBufferThreshold = incomingBindingsBufferThreshold;
}
/**
* Evaluate.
*/
@Override
public Void call() throws Exception {
try {
if (sourceIsPipeline) {
// Buffer all source solutions.
acceptAndOutputSolutions();
if (context.isLastInvocation()) {
// Done. Release the allocation context.
state.release();
}
} else {
if(first) {
// Accept ALL solutions.
acceptAndOutputSolutions();
// Done. Release the allocation context.
state.release();
}
// Copy all solutions from the pipeline to the sink.
BOpUtility.copy(context.getSource(), context.getSink(),
null/* sink2 */, null/* mergeSolution */,
null/* selectVars */, null/* constraints */, stats);
// Flush solutions to the sink.
context.getSink().flush();
}
// Done.
return null;
} finally {
context.getSource().close();
context.getSink().close();
}
}
/**
* Output the buffered solutions.
*/
private void acceptAndOutputSolutions() {
// default sink
final IBlockingBuffer<IBindingSet[]> sink = context.getSink();
final UnsyncLocalOutputBuffer<IBindingSet> unsyncBuffer =
    new UnsyncLocalOutputBuffer<IBindingSet>(
        op.getChunkCapacity(), sink);
final ICloseableIterator<IBindingSet[]> src;
if (sourceIsPipeline) {
src = context.getSource();
} else if (op.getProperty(Annotations.NAMED_SET_SOURCE_REF) != null) {
/*
 * Metadata used to identify the optional *source* solution set. When
 * null, the hash index is built from the solutions flowing through
 * the pipeline. When non-null, the hash index is built from the
 * solutions in the identified solution set.
 */
final INamedSolutionSetRef namedSetSourceRef = (INamedSolutionSetRef) op
.getRequiredProperty(Annotations.NAMED_SET_SOURCE_REF);
src = context.getAlternateSource(namedSetSourceRef);
} else if (bsFromBindingsSetSource != null) {
/**
 * We handle the BINDING_SETS_SOURCE case as follows: the binding
 * sets on the source are treated as input. Given that in this case
 * no inner query is set, we consider the BINDING_SETS_SOURCE as the
 * result of the inner query instead. It is extracted in
 * createChunkTask() and passed in as a parameter.
 */
src = context.getSource();
} else {
throw new UnsupportedOperationException(
"Source was not specified");
}
((PipelinedHashJoinUtility)state).acceptAndOutputSolutions(
unsyncBuffer, src, stats, joinConstraints, subquery,
bsFromBindingsSetSource, projectInVars, askVar,
context.isLastInvocation(), distinctProjectionBufferThreshold,
incomingBindingsBufferThreshold, context);
unsyncBuffer.flush();
sink.flush();
}
} // ChunkTask
}