// com.bigdata.bop.join.IHashJoinUtility Maven / Gradle / Ivy
//
// Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General License for more details.

You should have received a copy of the GNU General License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
/*
 * Created on Nov 8, 2011
 */

package com.bigdata.bop.join;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.htree.HTree;
import com.bigdata.relation.accesspath.IBuffer;
import com.ibm.icu.util.BytesTrie.Iterator;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * Interface for hash index build and hash join operations.
 * 
 * Use cases
 * 
 * For a JOIN, there are two core steps, plus one additional step if the join is
 * optional. The hash join logically has a Left Hand Side (LHS) and a
 * Right Hand Side (RHS). The RHS is used to build up a hash index which is then
 * probed for each LHS solution. The LHS is generally an access path scan, which
 * is done once. A hash join therefore provides an alternative to a nested index
 * join in which we visit the access path once, probing the hash index for
 * solutions which join.
 * 
 * <dl>
 * <dt>Accept solutions</dt>
 * <dd>This step builds the hash index, also known as the RHS (Right Hand
 * Side).</dd>
 * <dt>Hash join</dt>
 * <dd>The hash join considers each left solution in turn and outputs solutions
 * which join. If optionals are required, this step also builds a hash index
 * (the <i>joinSet</i>) over the right solutions which did join.</dd>
 * <dt>Output optionals</dt>
 * <dd>The RHS hash index is scanned and the <i>joinSet</i> is probed to
 * identify right solutions which did not join with any left solution. Those
 * solutions are output as "optionals".</dd>
 * </dl>
 * 
 * 
 * This interface also supports DISTINCT SOLUTIONS filters. For this use case,
 * the caller uses the
 * {@link #filterSolutions(ICloseableIterator, BOpStats, IBuffer)} method.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public interface IHashJoinUtility {

    /**
     * Return the type safe enumeration indicating what kind of operation is to
     * be performed.
     */
    JoinTypeEnum getJoinType();

    /**
     * The variable bound based on whether or not a solution survives an
     * "EXISTS" graph pattern (optional).
     * 
     * @see HashJoinAnnotations#ASK_VAR
     */
    IVariable getAskVar();

    /**
     * The join variables.
     * 
     * @see HashJoinAnnotations#JOIN_VARS
     */
    IVariable[] getJoinVars();

    /**
     * The variables to be retained (optional, all variables are retained if
     * not specified).
     * 
     * @see JoinAnnotations#SELECT
     */
    IVariable[] getSelectVars();

    /**
     * Returns <code>true</code> if the projection outputs the distinct join
     * vars (in that case, the variables delivered by {@link #getSelectVars()}
     * will be ignored, and might even be uninitialized).
     * 
     * @see HashJoinAnnotations#OUTPUT_DISTINCT_JVs
     */
    boolean isOutputDistinctJoinVars();

    /**
     * The join constraints (optional).
     * 
     * @see JoinAnnotations#CONSTRAINTS
     */
    IConstraint[] getConstraints();

    /**
     * Return <code>true</code> iff there are no solutions in the hash index.
     */
    boolean isEmpty();

    /**
     * Return the #of solutions in the hash index.
     */
    long getRightSolutionCount();

    /**
     * Discard the hash index.
     */
    void release();

    /**
     * Buffer solutions on a hash index.
     * <p>
     * When <code>optional:=true</code>, solutions which do not have a binding
     * for one or more of the join variables will be inserted into the hash
     * index anyway using <code>hashCode:=1</code>. This allows the solutions
     * to be discovered when we scan the hash index and the set of solutions
     * which did join to identify the optional solutions.
     * 
     * @param itr
     *            The source from which the solutions will be drained.
     * @param stats
     *            The statistics to be updated as the solutions are buffered on
     *            the hash index.
     * 
     * @return The #of solutions that were buffered.
     */
    long acceptSolutions(ICloseableIterator itr,
            BOpStats stats);

    /**
     * Filter solutions, writing only the DISTINCT solutions onto the sink.
     * 
     * @param itr
     *            The source solutions.
     * @param stats
     *            The stats to be updated.
     * @param sink
     *            The sink.
     * 
     * @return The #of source solutions which pass the filter.
     */
    long filterSolutions(ICloseableIterator itr,
            BOpStats stats, IBuffer sink);

    /**
     * Do a hash join between a stream of source solutions (left) and a hash
     * index (right). For each left solution, the hash index (right) is probed
     * for possible matches (solutions whose as-bound values for the join
     * variables produce the same hash code). Possible matches are tested for
     * consistency and the constraints (if any) are applied. Solutions which
     * join are written on the caller's buffer.
     * <p>
     * Note: Some {@link JoinTypeEnum}s have side-effects on the join state.
     * For these joins, once this method has been invoked for the final time,
     * you must then invoke either {@link #outputOptionals(IBuffer)} (Optional
     * or NotExists) or {@link #outputJoinSet(IBuffer)} (Exists).
     * 
     * @param leftItr
     *            A stream of chunks of solutions to be joined against the hash
     *            index (left).
     * @param stats
     *            The statistics to be updated as solutions are drained from the
     *            <i>leftItr</i> (optional). When <i>left</i> is the pipeline,
     *            {@link BOpStats#chunksIn} and {@link BOpStats#unitsIn} should
     *            be updated by passing in the {@link BOpStats} object. When
     *            <i>left</i> is a hash index (i.e., for a hash join against an
     *            access path), you should pass <code>null</code> since the
     *            chunksIn and unitsIn are updated as the {@link HashIndexOp}
     *            builds the hash index rather than when it executes the join
     *            against the access path.
     * @param outputBuffer
     *            Where to write the solutions which join.
     */
    void hashJoin(//
            ICloseableIterator leftItr,//
            BOpStats stats,//
            IBuffer outputBuffer//
    );

    /**
     * Variant hash join method allows the caller to impose different
     * constraints or additional constraints. This is used to impose join
     * constraints when a solution set is joined back into a query based on the
     * join filters in the join group in which the solution set is included.
     * <p>
     * Note: Some {@link JoinTypeEnum}s have side-effects on the join state.
     * For these joins, once this method has been invoked for the final time,
     * you must then invoke either {@link #outputOptionals(IBuffer)} (Optional
     * or NotExists) or {@link #outputJoinSet(IBuffer)} (Exists).
     * 
     * @param leftItr
     *            A stream of chunks of solutions to be joined against the hash
     *            index (left).
     * @param stats
     *            The statistics to be updated as solutions are drained from the
     *            <i>leftItr</i>.
     * @param outputBuffer
     *            Where to write the solutions which join.
     * @param constraints
     *            Constraints attached to this join (optional). Any constraints
     *            specified here are combined with those specified in the
     *            constructor.
     */
    void hashJoin2(//
            ICloseableIterator leftItr,//
            BOpStats stats,//
            IBuffer outputBuffer,//
            IConstraint[] constraints//
    );

    /**
     * Perform an N-way merge join. For an OPTIONAL join, <i>this</i> instance
     * is understood to be the index having the "required" solutions.
     * <p>
     * The merge join takes a set of solution sets in the same order and having
     * the same join variables. It examines the next solution in order for each
     * solution set and compares them. For each solution set which reported a
     * solution having the same join variables as that earliest solution, it
     * outputs the cross product and advances the iterator on that solution set.
     * <p>
     * The iterators draining the source solution sets need to be synchronized
     * such that we consider only solutions having the same hash code in each
     * cycle of the MERGE JOIN. The synchronization step is different depending
     * on whether or not the MERGE JOIN is OPTIONAL.
     * <p>
     * If the MERGE JOIN is REQUIRED, then we want to synchronize the source
     * solution iterators on the next lowest key (aka hash code) which they all
     * have in common.
     * <p>
     * If the MERGE JOIN is OPTIONAL, then we want to synchronize the source
     * solution iterators on the next lowest key (aka hash code) which appears
     * for any source iterator. Solutions will not be drawn from iterators not
     * having that key in that pass.
     * <p>
     * Note that each hash code may be an alias for solutions having different
     * values for their join variables. Such solutions will not join. However,
     * only solutions having the same values for the hash code can join. Thus,
     * by proceeding with synchronized iterators and operating only on solutions
     * having the same hash code in each round, we will consider all solutions
     * which COULD join with one another in each round.
     * <p>
     * Note: If the solutions are not in a stable and mutually consistent order
     * by hash code in the hash indices then the solutions in each hash index
     * MUST be SORTED before proceeding. (The {@link HTree} maintains solutions
     * in such an order but the JVM collections do not.)
     * 
     * @param others
     *            The other solution sets to be joined. All instances must be of
     *            the same concrete type as <i>this</i>.
     * @param outputBuffer
     *            Where to write the solutions.
     * @param constraints
     *            The join constraints.
     * @param optional
     *            <code>true</code> iff the join is optional.
     */
    void mergeJoin(//
            IHashJoinUtility[] others,//
            IBuffer outputBuffer,//
            IConstraint[] constraints,//
            boolean optional//
    );

    /**
     * Checkpoint the generated hash index such that it becomes safe for
     * concurrent readers.
     */
    void saveSolutionSet();

    /**
     * Identify and output the optional solutions. This is used with OPTIONAL
     * and NOT EXISTS.
     * <p>
     * Optionals are identified using a <i>joinSet</i> containing each right
     * solution which joined with at least one left solution. The total set of
     * right solutions is then scanned once. For each right solution, we probe
     * the <i>joinSet</i>. If the right solution did not join, then it is
     * output now as an optional join.
     * 
     * @param outputBuffer
     *            Where to write the optional solutions.
     */
    void outputOptionals(IBuffer outputBuffer);

    /**
     * Output the solutions buffered in the hash index. This is used when an
     * operator is building a hash index for use by a downstream operator.
     * 
     * @param out
     *            Where to write the solutions.
     */
    void outputSolutions(IBuffer out);

    /**
     * Return an {@link ICloseableIterator} that visits all solutions in the
     * index (index scan). The visited solutions MAY contain variables that
     * would not be projected out of the hash join.
     * <p>
     * Note: This is very nearly the same as {@link #outputSolutions(IBuffer)}
     * except that the latter only outputs the projected variables and it writes
     * onto an {@link IBuffer} rather than returning an
     * {@link ICloseableIterator}.
     * 
     * @return The {@link ICloseableIterator}.
     */
    ICloseableIterator indexScan();

    /**
     * Output the solutions which joined. This is used with EXISTS.
     * 
     * @param out
     *            Where to write the solutions.
     */
    void outputJoinSet(IBuffer out);

}