// com.bigdata.bop.join.IHashJoinUtility (Maven / Gradle / Ivy)
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Nov 8, 2011
*/
package com.bigdata.bop.join;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.htree.HTree;
import com.bigdata.relation.accesspath.IBuffer;
import com.ibm.icu.util.BytesTrie.Iterator;
import cutthecrap.utils.striterators.ICloseableIterator;
/**
* Interface for hash index build and hash join operations.
*
* Use cases
*
* For a JOIN, there are two core steps, plus one additional step if the join is
* optional. The hash join logically has a Left Hand Side (LHS) and a
* Right Hand Side (RHS). The RHS is used to build up a hash index which is then
* probed for each LHS solution. The LHS is generally an access path scan, which
* is done once. A hash join therefore provides an alternative to a nested index
* join in which we visit the access path once, probing the hash index for
* solutions which join.
*
* <dl>
* <dt>Accept solutions</dt>
* <dd>This step builds the hash index, also known as the RHS (Right Hand
* Side).</dd>
* <dt>Hash join</dt>
* <dd>The hash join considers each left solution in turn and outputs solutions
* which join. If optionals are required, this step also builds a hash index
* (the <i>joinSet</i>) over the right solutions which did join.</dd>
* <dt>Output optionals</dt>
* <dd>The RHS hash index is scanned and the <i>joinSet</i> is probed to
* identify right solutions which did not join with any left solution. Those
* solutions are output as "optionals".</dd>
* </dl>
*
*
* This class also supports DISTINCT SOLUTIONS filters. For this use case, the
* caller uses the
* {@link #filterSolutions(ICloseableIterator, BOpStats, IBuffer)} method.
*
* @author Bryan Thompson
* @version $Id$
*/
public interface IHashJoinUtility {
/**
* Return the type safe enumeration indicating what kind of operation is to
* be performed.
*/
JoinTypeEnum getJoinType();
/**
* The variable bound based on whether or not a solution survives an
* "EXISTS" graph pattern (optional).
*
* @see HashJoinAnnotations#ASK_VAR
*/
IVariable> getAskVar();
/**
* The join variables.
*
* @see HashJoinAnnotations#JOIN_VARS
*/
IVariable>[] getJoinVars();
/**
* The variables to be retained (optional, all variables are retained if
* not specified).
*
* @see JoinAnnotations#SELECT
*/
IVariable>[] getSelectVars();
/**
* Returns true if the projection outputs the distinct join vars (in
* that case, the variables delivered by {{@link #getSelectVars()} will
* be ignored, might even be uninitialized). See
* {@link HashJoinAnnotations#OUTPUT_DISTINCT_JVs}.
*/
public boolean isOutputDistinctJoinVars();
/**
* The join constraints (optional).
*
* @see JoinAnnotations#CONSTRAINTS
*/
IConstraint[] getConstraints();
/**
* Return true
iff there are no solutions in the hash index.
*/
boolean isEmpty();
/**
* Return the #of solutions in the hash index.
*/
long getRightSolutionCount();
/**
* Discard the hash index.
*/
void release();
/**
* Buffer solutions on a hash index.
*
* When optional:=true
, solutions which do not have a binding
* for one or more of the join variables will be inserted into the hash
* index anyway using hashCode:=1
. This allows the solutions to
* be discovered when we scan the hash index and the set of solutions which
* did join to identify the optional solutions.
*
* @param itr
* The source from which the solutions will be drained.
* @param stats
* The statistics to be updated as the solutions are buffered on
* the hash index.
*
* @return The #of solutions that were buffered.
*/
long acceptSolutions(ICloseableIterator itr,
BOpStats stats);
/**
* Filter solutions, writing only the DISTINCT solutions onto the sink.
*
* @param itr
* The source solutions.
* @param stats
* The stats to be updated.
* @param sink
* The sink.
*
* @return The #of source solutions which pass the filter.
*/
long filterSolutions(ICloseableIterator itr,
BOpStats stats, IBuffer sink);
/**
* Do a hash join between a stream of source solutions (left) and a hash
* index (right). For each left solution, the hash index (right) is probed
* for possible matches (solutions whose as-bound values for the join
* variables produce the same hash code). Possible matches are tested for
* consistency and the constraints (if any) are applied. Solutions which
* join are written on the caller's buffer.
*
* Note: Some {@link JoinTypeEnum}s have side-effects on the join state. For
* this joins, once method has been invoked for the final time, you must
* then invoke either {@link #outputOptionals(IBuffer)} (Optional or
* NotExists) or {@link #outputJoinSet(IBuffer)} (Exists).
*
* @param leftItr
* A stream of chunks of solutions to be joined against the hash
* index (left).
* @param stats
* The statistics to be updated as solutions are drained from the
* leftItr (optional). When left
is the
* pipeline, {@link BOpStats#chunksIn} and
* {@link BOpStats#unitsIn} should be updated by passing in the
* {@link BOpStats} object. When left
is a hash
* index (i.e., for a hash join against an access path), you
* should pass null
since the chunksIn and unitsIn
* are updated as the {@link HashIndexOp} builds the hash index
* rather than when it executes the join against the access
* path).
* @param outputBuffer
* Where to write the solutions which join.
*/
void hashJoin(//
ICloseableIterator leftItr,//
BOpStats stats,//
IBuffer outputBuffer//
);
/**
* Variant hash join method allows the caller to impose different
* constraints or additional constraints. This is used to impose join
* constraints when a solution set is joined back into a query based on the
* join filters in the join group in which the solution set is included.
*
* Note: Some {@link JoinTypeEnum}s have side-effects on the join state. For
* this joins, once method has been invoked for the final time, you must
* then invoke either {@link #outputOptionals(IBuffer)} (Optional or
* NotExists) or {@link #outputJoinSet(IBuffer)} (Exists).
*
* @param leftItr
* A stream of chunks of solutions to be joined against the hash
* index (left).
* @param stats
* The statistics to be updated as solutions are drained from the
* leftItr.
* @param outputBuffer
* Where to write the solutions which join.
* @param constraints
* Constraints attached to this join (optional). Any constraints
* specified here are combined with those specified in the
* constructor.
*/
void hashJoin2(//
ICloseableIterator leftItr,//
BOpStats stats,//
IBuffer outputBuffer,//
IConstraint[] constraints//
);
/**
* Perform an N-way merge join. For an OPTIONAL join, this instance
* is understood to be the index having the "required" solutions.
*
* The merge join takes a set of solution sets in the some order and having
* the same join variables. It examines the next solution in order for each
* solution set and compares them. For each solution set which reported a
* solution having the same join variables as that earliest solution, it
* outputs the cross product and advances the iterator on that solution set.
*
* The iterators draining the source solution sets need to be synchronized
* such that we consider only solutions having the same hash code in each
* cycle of the MERGE JOIN. The synchronization step is different depending
* on whether or not the MERGE JOIN is OPTIONAL.
*
* If the MERGE JOIN is REQUIRED, then we want to synchronize the source
* solution iterators on the next lowest key (aka hash code) which they all
* have in common.
*
* If the MERGE JOIN is OPTIONAL, then we want to synchronize the source
* solution iterators on the next lowest key (aka hash code) which appears
* for any source iterator. Solutions will not be drawn from iterators not
* having that key in that pass.
*
* Note that each hash code may be an alias for solutions having different
* values for their join variables. Such solutions will not join. However,
* only solutions having the same values for the hash code can join. Thus,
* by proceeding with synchronized iterators and operating only on solutions
* having the same hash code in each round, we will consider all solutions
* which COULD join with one another in each round.
*
* Note: If the solutions are not in a stable and mutually consistent order
* by hash code in the hash indices then the solutions in each hash index
* MUST be SORTED before proceeding. (The {@link HTree} maintains solutions
* in such an order but the JVM collections do not.)
*
* @param others
* The other solution sets to be joined. All instances must be of
* the same concrete type as this.
* @param outputBuffer
* Where to write the solutions.
* @param constraints
* The join constraints.
* @param optional
* true
iff the join is optional.
*/
void mergeJoin(//
IHashJoinUtility[] others,//
IBuffer outputBuffer,//
IConstraint[] constraints,//
boolean optional//
);
/**
* Checkpoint the generated hash index such that it becomes safe for
* concurrent readers.
*/
void saveSolutionSet();
/**
* Identify and output the optional solutions. This is used with OPTIONAL
* and NOT EXISTS.
*
* Optionals are identified using a joinSet containing each right
* solution which joined with at least one left solution. The total set of
* right solutions is then scanned once. For each right solution, we probe
* the joinSet. If the right solution did not join, then it is output
* now as an optional join.
*
* @param outputBuffer
* Where to write the optional solutions.
*/
void outputOptionals(IBuffer outputBuffer);
/**
* Output the solutions buffered in the hash index. This is used when an
* operator is building a hash index for use by a downstream operator.
*
* @param out
* Where to write the solutions.
*/
void outputSolutions(IBuffer out);
/**
* Return an {@link Iterator} that visits all solutions in the index (index
* scan). The visited solutions MAY contain variables that would not be
* projected out of the hash join.
*
* Note: This is very nearly the same as {@link #outputSolutions(IBuffer)}
* except that the latter only outputs the projected variables and it writes
* onto an {@link IBuffer} rather than returning an
* {@link ICloseableIterator}.
*
* @return The {@link Iterator}.
*/
ICloseableIterator indexScan();
/**
* Output the solutions which joined. This is used with EXISTS.
*
* @param out
* Where to write the solutions.
*/
void outputJoinSet(IBuffer out);
}