
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com
     
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Oct 17, 2011
 */

package com.bigdata.bop.join;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Constant;
import com.bigdata.bop.HTreeAnnotations;
import com.bigdata.bop.HashMapAnnotations;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.controller.INamedSolutionSetRef;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.bop.join.JVMHashIndex.Bucket;
import com.bigdata.bop.join.JVMHashIndex.Key;
import com.bigdata.bop.join.JVMHashIndex.SolutionHit;
import com.bigdata.counters.CAT;
import com.bigdata.htree.HTree;
import com.bigdata.rdf.internal.impl.literal.XSDBooleanIV;
import com.bigdata.relation.accesspath.BufferClosedException;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.util.InnerCause;

import cutthecrap.utils.striterators.Expander;
import cutthecrap.utils.striterators.ICloseableIterator;
import cutthecrap.utils.striterators.IStriterator;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
import cutthecrap.utils.striterators.Visitor;

/**
 * Utility class supporting hash join against a Java hash collection.
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class JVMHashJoinUtility implements IHashJoinUtility {

    private static final Logger log = Logger.getLogger(JVMHashJoinUtility.class);

    /**
     * Singleton {@link IHashJoinUtilityFactory} that can be used to create a 
     * new {@link JVMHashJoinUtility}.
     */
    static public final IHashJoinUtilityFactory factory =
            new IHashJoinUtilityFactory() {

        private static final long serialVersionUID = 1L;

        public IHashJoinUtility create(//
                final BOpContext<IBindingSet[]> context,//
                final INamedSolutionSetRef namedSetRef,//
                final PipelineOp op,//
                final JoinTypeEnum joinType//
                ) {

            return new JVMHashJoinUtility(op, joinType);

        }
    };
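
    /*
     * Usage sketch (hypothetical; [context], [namedSetRef] and [op] would be
     * supplied by the query engine at evaluation time): an operator that
     * builds a hash index holds onto this factory and invokes it once per
     * query to obtain the per-query join state.
     *
     *   final IHashJoinUtility state = JVMHashJoinUtility.factory.create(
     *           context, namedSetRef, op, JoinTypeEnum.Normal);
     */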

    /**
     * true until the state is discarded by {@link #release()}.
     */
    protected final AtomicBoolean open = new AtomicBoolean(true);


    /**
     * The type of join to be performed.
     */
    protected final JoinTypeEnum joinType;
    
//    /**
//     * true iff the join is OPTIONAL.
//     */
//    private final boolean optional;
//    
//    /**
//     * true iff this is a DISTINCT filter.
//     */
//    private final boolean filter;
//    
//    /**
//     * true iff a solution having an unbound {@link #joinVars}
//     * should be dropped and false if it should be indexed anyway.
//     */
//    private final boolean dropSolutionsHavingUnboundJoinVars;

    /**
     * @see HashJoinAnnotations#ASK_VAR
     */
    protected final IVariable<?> askVar;
    
    /**
     * The join variables.
     */
    protected final IVariable<?>[] joinVars;

    /**
     * The variables to be retained, a.k.a. projected out (optional; all
     * variables are retained if not specified).
     */
    protected final IVariable<?>[] selectVars;

    /**
     * True if the hash join utility class is to output the distinct join
     * variables.
     */
    protected boolean outputDistinctJVs = false;
    
    /**
     * The join constraints (optional).
     */
    protected final IConstraint[] constraints;

    /**
     * The hash index.
     * <p>
     * Note: There is no separate "joinSet". Instead, the {@link SolutionHit}
     * class provides a join hit counter.
     */
    protected final AtomicReference<JVMHashIndex> rightSolutionsRef = new AtomicReference<JVMHashIndex>();

    /**
     * The #of solutions accepted into the hash index.
     */
    protected final CAT rightSolutionCount = new CAT();

    /**
     * The maximum #of (left,right) solution joins that will be considered
     * before failing the join. This is used IFF there are no join variables.
     */
    private final long noJoinVarsLimit = HashJoinAnnotations.DEFAULT_NO_JOIN_VARS_LIMIT;

    /**
     * The #of left solutions considered for a join.
     */
    protected final CAT nleftConsidered = new CAT();

    /**
     * The #of right solutions considered for a join.
     */
    protected final CAT nrightConsidered = new CAT();

    /**
     * The #of solution pairs considered for a join.
     */
    protected final CAT nJoinsConsidered = new CAT();

    /**
     * Human readable representation of the {@link IHashJoinUtility} metadata
     * (but not the solutions themselves).
     */
    @Override
    public String toString() {

        final StringBuilder sb = new StringBuilder();

        sb.append(getClass().getSimpleName());
        sb.append("{open=" + open);
        sb.append(",joinType=" + joinType);
//        sb.append(",optional=" + optional);
//        sb.append(",filter=" + filter);
        if (askVar != null)
            sb.append(",askVar=" + askVar);
        sb.append(",joinVars=" + Arrays.toString(joinVars));
        sb.append(",outputDistinctJVs=" + outputDistinctJVs);
        if (selectVars != null)
            sb.append(",selectVars=" + Arrays.toString(selectVars));
        if (constraints != null)
            sb.append(",constraints=" + Arrays.toString(constraints));
        sb.append(",size=" + getRightSolutionCount());
        sb.append(",considered(left=" + nleftConsidered + ",right="
                + nrightConsidered + ",joins=" + nJoinsConsidered + ")");
        sb.append("}");

        return sb.toString();

    }

    /**
     * @param op
     *            The operator whose annotations will inform construction of
     *            the hash index. The {@link HTreeAnnotations} may be specified
     *            for this operator and will control the initialization of the
     *            various {@link HTree} instances.
     * @param joinType
     *            The type of join to be performed.
     *
     * @see JVMHashJoinAnnotations
     */
    public JVMHashJoinUtility(final PipelineOp op, final JoinTypeEnum joinType) {

        if (op == null)
            throw new IllegalArgumentException();

        if (joinType == null)
            throw new IllegalArgumentException();

        this.joinType = joinType;

        /*
         * Note: This flag needs to be [true] if we allow solutions to be
         * stored in the hash index that have unbound variables for the
         * "joinVars". We do this for OPTIONAL because all solutions must be
         * indexed for an OPTIONAL join since solutions that do not join will
         * be reported.
         *
         * A DISTINCT FILTER does this as well. This is because "joinVars" is
         * really the list of projected variables for a SELECT DISTINCT for a
         * DISTINCT FILTER.
         *
         * By this reasoning, we should also do this for MINUS/NOT EXISTS. That
         * is, for everything except NORMAL and EXISTS joins.
         *
         * This suggests a historical bug (fixed below) in MINUS / NOT EXISTS
         * handling.
         *
         * TODO Write a unit test for that bug involving a source solution with
         * an unbound join variable and see whether the solution is dropped or
         * reported (I believe that it should be reported).
         */
//        final boolean optional = joinType == JoinTypeEnum.Optional;
        final boolean filter = joinType == JoinTypeEnum.Filter;
        final boolean indexSolutionsHavingUnboundJoinVars;
        switch (joinType) {
        case Normal:
        case Exists:
            indexSolutionsHavingUnboundJoinVars = false;
            break;
        case Optional: // OPTIONAL join.
        case NotExists: // NOT EXISTS and MINUS
        case Filter: // SELECT DISTINCT
            indexSolutionsHavingUnboundJoinVars = true;
            break;
        default:
            throw new UnsupportedOperationException();
        }

        // Optional variable used for (NOT) EXISTS.
        this.askVar = (IVariable<?>) op.getProperty(HashJoinAnnotations.ASK_VAR);

        // The join variables (required).
        this.joinVars = (IVariable<?>[]) op
                .getRequiredProperty(HashJoinAnnotations.JOIN_VARS);

        /*
         * The projected OUT variables (optional and equal to the join
         * variables iff this is a DISTINCT filter).
         */
        this.selectVars = filter ? joinVars : (IVariable<?>[]) op
                .getProperty(JoinAnnotations.SELECT);

        this.outputDistinctJVs = op.getProperty(
                HashIndexOp.Annotations.OUTPUT_DISTINCT_JVs, false);

        // The join constraints (optional).
        this.constraints = (IConstraint[]) op
                .getProperty(JoinAnnotations.CONSTRAINTS);

//        // Iff the join has OPTIONAL semantics.
//        this.optional = optional;
//
//        // Iff this is a DISTINCT filter.
//        this.filter = filter;

        /*
         * TODO Parameter for the map implementation class.
         *
         * - HashMap is going to be faster for insert and search.
         *
         * - LinkedHashMap will be faster for the iterator.
         *
         * - ConcurrentHashMap will be faster if (a) the JVMHashIndex is safe
         * for concurrent inserts; and (b) the JVMHashIndexOp is safe for
         * concurrent execution. [These conditions are not currently true.]
         *
         * Some join patterns do not require us to use an iterator at all, in
         * which case HashMap is the clear winner. (For example, a non-optional
         * hash join against an access path never uses the iterator over the
         * hash index.)
         */

        /*
         * The ordered variables for the keys in the hash index.
         *
         * For SELECT DISTINCT, use the [select] variables.
         *
         * Otherwise use the [joinVars].
         */
        final IVariable<?>[] keyVars = filter ? (IVariable<?>[]) op
                .getProperty(JoinAnnotations.SELECT) : joinVars;

        rightSolutionsRef.set(//
                new JVMHashIndex(//
                        keyVars,//
                        indexSolutionsHavingUnboundJoinVars,//
                        new LinkedHashMap<Key, Bucket>(op.getProperty(
                                HashMapAnnotations.INITIAL_CAPACITY,
                                HashMapAnnotations.DEFAULT_INITIAL_CAPACITY),//
                                op.getProperty(HashMapAnnotations.LOAD_FACTOR,
                                        HashMapAnnotations.DEFAULT_LOAD_FACTOR)//
                        )//
                ));

    }

    @Override
    public JoinTypeEnum getJoinType() {
        return joinType;
    }

    @Override
    public IVariable<?> getAskVar() {
        return askVar;
    }

    @Override
    public IVariable<?>[] getJoinVars() {
        return joinVars;
    }

    @Override
    public IVariable<?>[] getSelectVars() {
        return selectVars;
    }

    @Override
    public boolean isOutputDistinctJoinVars() {
        return outputDistinctJVs;
    }

    @Override
    public IConstraint[] getConstraints() {
        return constraints;
    }

    @Override
    public boolean isEmpty() {
        return getRightSolutionCount() == 0;
    }

    protected long getNoJoinVarsLimit() {
        return noJoinVarsLimit;
    }

    protected JVMHashIndex getRightSolutions() {
        return rightSolutionsRef.get();
    }

    @Override
    public long getRightSolutionCount() {
        /*
         * Note: This needs to be explicitly tracked and reported. Since each
         * entry in the rightSolutions map is a collision bucket, we can not
         * just report the size of the hash index. Instead we have to track and
         * report the #of solutions entered into the hash index in
         * acceptSolutions() and filterSolutions().
         */
        return rightSolutionCount.get();
    }

    @Override
    public void release() {

        if (open.compareAndSet(true/* expect */, false/* update */)) {
            // Already closed.
            return;
        }

        rightSolutionsRef.set(null);

    }

    @Override
    public long acceptSolutions(final ICloseableIterator<IBindingSet[]> itr,
            final BOpStats stats) {

        if (!open.get())
            throw new IllegalStateException();

        try {

            final JVMHashIndex index = getRightSolutions();

            final IBindingSet[] all = BOpUtility.toArray(itr, stats);

            if (log.isDebugEnabled())
                log.debug("Materialized: " + all.length + " source solutions.");

            long naccepted = 0;

            for (IBindingSet bset : all) {

                if (index.add(bset) == null) {
                    continue;
                }

                naccepted++;

            }

            if (log.isDebugEnabled())
                log.debug("There are " + index.bucketCount()
                        + " hash buckets, joinVars="
                        + Arrays.toString(joinVars));

            rightSolutionCount.add(naccepted);

            return naccepted;

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    /*
     * Note: This implementation is not used. The JVMDistinctFilter is based on
     * a ConcurrentHashMap and provides better throughput. For this reason, the
     * JVMDistinctFilter is used by the JVMDistinctBindingSetsOp.
     */
    @Override
    public long filterSolutions(final ICloseableIterator<IBindingSet[]> itr,
            final BOpStats stats, final IBuffer<IBindingSet> sink) {

        try {

            final JVMHashIndex index = getRightSolutions();

            final IBindingSet[] all = BOpUtility.toArray(itr, stats);

            if (log.isDebugEnabled())
                log.debug("Materialized: " + all.length + " source solutions.");

            for (IBindingSet bset : all) {

                /*
                 * Note: For a DISTINCT SOLUTIONS filter, we only consider the
                 * variables that are being projected. Further, all variables
                 * are used when computing the hash code. Therefore "joinVars"
                 * == "selectedVars" for a DISTINCT SOLUTIONS filter.
                 */
                bset = bset.copy(joinVars); // only consider the selected variables.

                /*
                 * Note: Solutions are NOT dropped if a variable is not bound
                 * in a given solution. The variable is simply not used when
                 * computing the hash code. Specifying optional:=true here
                 * causes makeKey() to have this behavior.
                 */
                if (index.addDistinct(bset)) {

                    // Write on the output sink.
                    sink.add(bset);

                }

            }

            if (log.isDebugEnabled())
                log.debug("There are " + index.bucketCount()
                        + " hash buckets, joinVars="
                        + Arrays.toString(joinVars));

            final long naccepted = all.length;

            rightSolutionCount.add(naccepted);

            return naccepted;

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    @Override
    public void hashJoin(//
            final ICloseableIterator<IBindingSet[]> leftItr,//
            final BOpStats stats,//
            final IBuffer<IBindingSet> outputBuffer//
            ) {

        hashJoin2(leftItr, stats, outputBuffer, constraints);

    }
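
    /*
     * Lifecycle sketch (illustrative only; [rightItr], [leftItr], [stats] and
     * [sink] are assumed to come from the operator's evaluation context): the
     * hash index is built once from the right-hand solutions and then probed
     * with chunks of left-hand solutions.
     *
     *   state.acceptSolutions(rightItr, stats);   // build the hash index
     *   state.hashJoin(leftItr, stats, sink);     // probe with left solutions
     *   if (state.getJoinType() == JoinTypeEnum.Optional)
     *       state.outputOptionals(sink);          // right solutions with no hits
     *   state.release();                          // discard the state
     */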

    /**
     * {@inheritDoc}
     * <p>
     * For each source solution materialized, the hash table is probed using
     * the as-bound join variables for that source solution. A join hit
     * counter is carried for each solution in the hash index and is used to
     * support OPTIONAL joins.
     */
    @Override
    public void hashJoin2(//
            final ICloseableIterator<IBindingSet[]> leftItr,//
            final BOpStats stats,
            final IBuffer<IBindingSet> outputBuffer,//
            final IConstraint[] constraints//
            ) {

        if (!open.get())
            throw new IllegalStateException();

        final JVMHashIndex rightSolutions = getRightSolutions();

        if (log.isInfoEnabled()) {
            log.info("rightSolutions: #buckets=" + rightSolutions.bucketCount()
                    + ",#solutions=" + getRightSolutionCount());
        }

        // true iff there are no join variables.
        final boolean noJoinVars = joinVars.length == 0;

        try {

            while (leftItr.hasNext()) {

                // Next chunk of solutions from left.
                final IBindingSet[] leftChunk = leftItr.next();

                if (stats != null) {
                    stats.chunksIn.increment();
                    stats.unitsIn.add(leftChunk.length);
                }

                for (IBindingSet left : leftChunk) {

                    nleftConsidered.increment();

                    if (log.isDebugEnabled())
                        log.debug("Considering " + left);

                    final Bucket bucket = rightSolutions.getBucket(left);

                    if (bucket == null)
                        continue;

                    final Iterator<SolutionHit> ritr = bucket.iterator();

                    while (ritr.hasNext()) {

                        final SolutionHit right = ritr.next();

                        nrightConsidered.increment();

                        if (log.isDebugEnabled())
                            log.debug("Join with " + right);

                        nJoinsConsidered.increment();

                        if (noJoinVars
                                && nJoinsConsidered.get() == noJoinVarsLimit) {

                            if (nleftConsidered.get() > 1
                                    && nrightConsidered.get() > 1) {

                                throw new UnconstrainedJoinException();

                            }

                        }

                        // See if the solutions join.
                        final IBindingSet outSolution = BOpContext.bind(//
                                right.solution,//
                                left,//
                                constraints,//
                                selectVars//
                                );

                        switch (joinType) {
                        case Normal: {
                            if (outSolution != null) {
                                // Output the solution.
                                outputSolution(outputBuffer, outSolution);
                            }
                            break;
                        }
                        case Optional: {
                            if (outSolution != null) {
                                // Output the solution.
                                outputSolution(outputBuffer, outSolution);
                                // Increment counter so we know not to output
                                // the rightSolution as an optional solution.
                                right.nhits.increment();
                            }
                            break;
                        }
                        case Exists: {
                            /*
                             * The right solution is output iff there is at
                             * least one left solution which joins with that
                             * right solution. Each right solution is output
                             * at most one time.
                             */
                            if (outSolution != null) {
//                                if (right.nhits.get() == 0L) {
//                                    // Output the solution.
//                                    outputSolution(outputBuffer, right.solution);
//                                }
                                // Increment counter so we know this solution joins.
                                right.nhits.increment();
                            }
                            break;
                        }
                        case NotExists: {
                            /*
                             * The right solution is output iff there does not
                             * exist any left solution which joins with that
                             * right solution. This is basically an optional
                             * join where the solutions which join are not
                             * output.
                             */
                            if (outSolution != null) {
                                // Increment counter so we know not to output
                                // the rightSolution as an optional solution.
                                right.nhits.increment();
                            }
                            break;
                        }
                        default:
                            throw new AssertionError();
                        }

                    } // while(ritr.hasNext())

                } // for(left : leftChunk)

            } // while(leftItr.hasNext())

        } catch (Throwable t) {

            throw launderThrowable(t);

        } finally {

            leftItr.close();

        }

    }
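
    /*
     * Illustrative note: the probe phase above writes solutions onto the
     * output buffer directly only for Normal and Optional joins. For Exists
     * and NotExists it merely increments SolutionHit.nhits; the output is
     * deferred to outputJoinSet() (nhits > 0) and outputOptionals()
     * (nhits == 0) respectively. A sketch of the calling pattern:
     *
     *   state.hashJoin2(leftItr, stats, sink, constraints);
     *   switch (state.getJoinType()) {
     *   case Exists:    state.outputJoinSet(sink);   break; // joined solutions
     *   case NotExists: state.outputOptionals(sink); break; // non-joining solutions
     *   default: break;
     *   }
     */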

    /**
     * {@inheritDoc}
     * <p>
     * This implementation is a NOP since the underlying Java collection class
     * is thread-safe for concurrent readers.
     */
    @Override
    public void saveSolutionSet() {
        // NOP
    }

    /**
     * Output a solution.
     *
     * @param outputBuffer
     *            Where to write the solution.
     * @param outSolution
     *            The solution.
     */
    protected void outputSolution(final IBuffer<IBindingSet> outputBuffer,
            final IBindingSet outSolution) {

        if (log.isDebugEnabled())
            log.debug("Output solution: " + outSolution);

        // Accept this binding set.
        outputBuffer.add(outSolution);

    }

    @Override
    public void outputOptionals(final IBuffer<IBindingSet> outputBuffer) {

        if (!open.get())
            throw new IllegalStateException();

        try {

            @SuppressWarnings({ "rawtypes", "unchecked" })
            final Constant f = askVar == null ? null : new Constant(
                    XSDBooleanIV.FALSE);

            final JVMHashIndex rightSolutions = getRightSolutions();

            final IVariable<?>[] selected = getSelectVars();

            if (log.isInfoEnabled())
                log.info("rightSolutions: #buckets="
                        + rightSolutions.bucketCount());

            /*
             * Note: when NO solutions joined for a given source binding set
             * AND the join is OPTIONAL then we output the _original_ binding
             * set to the sink join task(s) and DO NOT apply the CONSTRAINT(s).
             */
            final Iterator<Bucket> bitr = rightSolutions.buckets();

            while (bitr.hasNext()) {

                final Bucket b = bitr.next();

                for (SolutionHit hit : b) {

                    if (hit.nhits.get() > 0)
                        continue;

                    IBindingSet bs = hit.solution;

                    if (selected != null) {
                        // Drop variables which are not projected.
                        bs = bs.copy(selected);
                    }

                    if (f != null) {
                        if (bs == hit.solution)
                            bs = bs.clone();
                        bs.set(askVar, f);
                    }

                    outputBuffer.add(bs);

                    if (log.isDebugEnabled())
                        log.debug("Optional solution: " + bs);

                }

            }

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    @SuppressWarnings("unchecked")
    @Override
    public ICloseableIterator<IBindingSet> indexScan() {

        try {

//            /*
//             * The selected variables -or- null if all variables should be
//             * projected.
//             */
//            final IVariable<?>[] selected = getSelectVars();

            final JVMHashIndex rightSolutions = getRightSolutions();

            if (log.isInfoEnabled())
                log.info("rightSolutions: #buckets="
                        + rightSolutions.bucketCount());

            // Visit the buckets.
            IStriterator itr = new Striterator(rightSolutions.buckets());

            itr = itr.addFilter(new Expander() {

                private static final long serialVersionUID = 1L;

                /**
                 * Expand the bucket into the solutions in the bucket.
                 */
                @SuppressWarnings("rawtypes")
                @Override
                protected Iterator expand(final Object obj) {

                    final Bucket b = (Bucket) obj;

                    return b.iterator();

                }

            });

            /**
             * Copy only the variables that are projected.
             */
            itr = itr.addFilter(new Resolver() {

                private static final long serialVersionUID = 1L;

                @Override
                protected Object resolve(final Object obj) {

                    final IBindingSet bs = ((SolutionHit) obj).solution;

//                    if (selected != null) {
//
//                        // Drop variables which are not projected.
//                        bs = bs.copy(selected);
//
//                    }

                    return bs;

                }

            });

            return (ICloseableIterator<IBindingSet>) itr;

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    @Override
    public void outputSolutions(final IBuffer<IBindingSet> out) {

        if (!open.get())
            throw new IllegalStateException();

        try {

            final JVMHashIndex rightSolutions = getRightSolutions();

            final IVariable<?>[] selected = getSelectVars();

            if (log.isInfoEnabled())
                log.info("rightSolutions: #buckets="
                        + rightSolutions.bucketCount());

            // source.
            final Iterator<Bucket> bucketIterator = rightSolutions.buckets();

            while (bucketIterator.hasNext()) {

                final Bucket bucket = bucketIterator.next();

                // New hash bucket so new DISTINCT set.
                final HashSet<IBindingSet> distinctSet = outputDistinctJVs
                        ? new HashSet<IBindingSet>() // TODO Size estimate?
                        : null;

                for (SolutionHit solutionHit : bucket) {

                    IBindingSet bs = solutionHit.solution;

                    if (outputDistinctJVs) {

                        /*
                         * Output those solutions that are distinct on the join
                         * variables. We do this by laying a DISTINCT filter
                         * over the solutions drawn from each bucket that we
                         * visit. The DISTINCT filter does not need to consider
                         * solutions that fall into other buckets, just the
                         * current bucket.
                         */

                        // drop anything not in the join variables.
                        bs = bs.copy(joinVars);

                        if (!distinctSet.add(bs)) {
                            // Duplicate solution on JVs in this bucket.
                            continue;
                        }

//                        if (distinctFilter != null) {
//
//                            if ((bs = distinctFilter.accept(bs)) == null) {
//
//                                // Drop duplicate solutions.
//                                continue;
//
//                            }

                    } else if (selected != null) {

                        /*
                         * FIXME We should be using projectedInVars here since
                         * outputSolutions() is used to stream solutions into
                         * the child join group (at least for some kinds of
                         * joins, but there might be exceptions for joining
                         * with a named solution set).
                         */

                        // Drop variables which are not projected.
                        bs = bs.copy(selected);

                    }

                    out.add(bs);

                    if (log.isDebugEnabled())
                        log.debug("Output solution: " + bs);

                }

            }

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    @Override
    public void outputJoinSet(final IBuffer<IBindingSet> outputBuffer) {

        try {

            @SuppressWarnings({ "rawtypes", "unchecked" })
            final Constant t = askVar == null ? null : new Constant(
                    XSDBooleanIV.TRUE);

            final JVMHashIndex rightSolutions = getRightSolutions();

            final IVariable<?>[] selected = getSelectVars();

            if (log.isInfoEnabled())
                log.info("rightSolutions: #buckets="
                        + rightSolutions.bucketCount());

            final Iterator<Bucket> bitr = rightSolutions.buckets();

            while (bitr.hasNext()) {

                final Bucket b = bitr.next();

                for (SolutionHit hit : b) {

                    if (hit.nhits.get() == 0)
                        continue;

                    IBindingSet bs = hit.solution;

                    if (selected != null) {
                        // Drop variables which are not projected.
                        bs = bs.copy(selected);
                    }

                    if (t != null) {
                        if (bs == hit.solution)
                            bs = bs.clone();
                        bs.set(askVar, t);
                    }

                    outputBuffer.add(bs);

                    if (log.isDebugEnabled())
                        log.debug("Output solution: " + bs);

                }

            }

        } catch (Throwable t) {

            throw launderThrowable(t);

        }

    }

    /**
     * Combine constraints for each source with the given constraints.
     *
     * @param constraints
     *            Explicitly given constraints for this join.
     * @param all
     *            The sources for the join.
     *
     * @return The combined constraints and null iff there are no constraints.
     */
    static IConstraint[] combineConstraints(final IConstraint[] constraints,
            final IHashJoinUtility[] all) {

        final List<IConstraint> list = new LinkedList<IConstraint>();

        // For each source.
        for (int i = 0; i < all.length; i++) {

            final IHashJoinUtility tmp = all[i];

            if (tmp.getConstraints() != null) {

                list.addAll(Arrays.asList(tmp.getConstraints()));

            }

        }

        // The join constraints specified by the caller.
        if (constraints != null) {

            list.addAll(Arrays.asList(constraints));

        }

        return list.isEmpty() ? null : list
                .toArray(new IConstraint[list.size()]);

    }
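
    /*
     * For example (hypothetical constraints c1, c2, c3): if src1 was built
     * with constraint c1, src2 with c2, and the caller passes c3, then
     *
     *   combineConstraints(new IConstraint[] { c3 },
     *           new IHashJoinUtility[] { src1, src2 })
     *
     * returns [c1, c2, c3]. It returns null when neither the sources nor the
     * caller contribute any constraints.
     */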

    /**
     * Advance each other source to the first hash code GTE the hashCode for
     * the first source.
     * <p>
     * If the source does not have a bucket for the hash code in the first
     * bucket then either (a) if this is a required join, this method will
     * return false and the caller must advance to the next bucket in the
     * first source; or (b) if this is an optional join, there will be a null
     * in the currentBucket[] for that source.
     *
     * @param sortedSourceBuckets
     *            An array of {@link Bucket}[]s for each source. The vector of
     *            {@link Bucket}s for each source has been sorted. This means
     *            that we can scan down those vectors and observe
     *            {@link Bucket}s having strictly increasing hash codes for
     *            each source.
     * @param sourceIndex
     *            The next index into each source.
     * @param currentBucket
     *            The current bucket for each source.
     * @param optional
     *            true iff this is an optional join.
     *
     * @return true if we are on a bucket which might join. If this method
     *         returns false, then the caller should immediately advance to
     *         the next bucket from the first source without attempting a
     *         join.
     */
    static private boolean advanceOtherSources(//
            final Bucket[][] sortedSourceBuckets,//
            final int[] sourceIndex,//
            final Bucket[] currentBucket,//
            final boolean optional//
            ) {

        // The next collision bucket in hash code order from the 1st source.
        final Bucket firstBucket = sortedSourceBuckets[0][sourceIndex[0]];
        final int hashCode = firstBucket.hashCode();
        currentBucket[0] = firstBucket;

        for (int i = 1; i < sourceIndex.length; i++) {

            // Advance source to first bucket GTE hashCode.
            while (true) {

                // Next bucket index for the other source
                final int j = sourceIndex[i];

                final Bucket otherBucket;
                if (j >= sortedSourceBuckets[i].length) {
                    // This source is exhausted.
                    if (!optional) {
                        // Nothing is left which can join.
                        return false;
                    }
                    otherBucket = null;
                } else {
                    otherBucket = sortedSourceBuckets[i][j];
                }

                if (otherBucket == null) {
                    assert optional;
                    currentBucket[i] = null;
                    break;
                }

                if (otherBucket.hashCode() < hashCode) {
                    sourceIndex[i]++;
                    continue;
                }

                if (otherBucket.hashCode() > hashCode) {
                    if (!optional) {
                        // The bucket on the first source can not join.
                        return false;
                    } else {
                        // The bucket will be ignored.
                        currentBucket[i] = null;
                        // Exit the inner loop.
                        break;
                    }
                }

                currentBucket[i] = otherBucket;

                break;

            }

        }

        return true;

    }

    @Override
    public void mergeJoin(//
            final IHashJoinUtility[] others,//
            final IBuffer<IBindingSet> outputBuffer,//
            final IConstraint[] constraints,//
            final boolean optional//
            ) {

        /*
         * Validate arguments.
         */

        if (others == null)
            throw new IllegalArgumentException();

        if (others.length == 0)
            throw new IllegalArgumentException();

        if (outputBuffer == null)
            throw new IllegalArgumentException();

        final JVMHashJoinUtility[] all = new JVMHashJoinUtility[others.length + 1];
        {
            all[0] = this;
            for (int i = 0; i < others.length; i++) {
                final JVMHashJoinUtility o = (JVMHashJoinUtility) others[i];
                if (o == null)
                    throw new IllegalArgumentException();
                if (!Arrays.equals(this.joinVars, o.joinVars)) {
                    // Must have the same join variables.
                    throw new IllegalArgumentException();
                }
                all[i + 1] = o;
            }
        }

        if (isEmpty()) {
            return;
        }

        /*
         * Combine constraints for each source with the given constraints.
         */
        final IConstraint[] c = combineConstraints(constraints, all);

        /*
         * The JVM hash collections do not maintain the data in hash code
         * order. Therefore, we materialize and sort the collision buckets for
         * each hash index.
         */
        final Bucket[][] sortedSourceBuckets = new Bucket[all.length][];
        {

            for (int i = 0; i < all.length; i++) {

                // Fully materialize the solution set as a Bucket[].
                final Bucket[] t = all[i].getRightSolutions().toArray();

                /*
                 * Sort the array. Its natural sort order is by the hash code
                 * of the join variables.
                 */
                Arrays.sort(t);

                sortedSourceBuckets[i] = t;

            }

        }

        /*
         * Synchronize each source.
         */

        // The next index into each source (not used for the 1st source).
        final int[] sourceIndex = new int[all.length];

        // The current bucket for each source.
        final Bucket[] currentBucket = new Bucket[all.length];

        while (sourceIndex[0] < sortedSourceBuckets[0].length) {

            if (!optional) {
                /*
                 * If the join is not optional, then we are done as soon as
                 * any source is exhausted.
                 */
                for (int i = 1; i < sourceIndex.length; i++) {
                    if (sourceIndex[i] >= sortedSourceBuckets[i].length) {
                        // All done.
                        return;
                    }
                }
            }

            // Synchronize the other sources.
            if (advanceOtherSources(sortedSourceBuckets, sourceIndex,
                    currentBucket, optional)) {

//                log.error("sourceIndex[]=" + Arrays.toString(sourceIndex));

                // Join those buckets, outputting solutions which join.
                mergeJoin(currentBucket, c, optional, outputBuffer);

            }

            // Advance the first source to the next bucket.
            sourceIndex[0]++;

        }

    }
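
    /*
     * Worked example of the synchronization above (hypothetical hash codes):
     * given sorted bucket vectors whose hash codes are
     *
     *   source0: [ 3, 7, 9 ]
     *   source1: [ 3, 9 ]
     *
     * a required join visits hash code 3 (present in both sources), skips 7
     * (advanceOtherSources() returns false), and then visits 9. An optional
     * join also visits 7, but with currentBucket[1] == null, so the solutions
     * from source0's bucket are still reported.
     */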

    /**
     * MERGE JOIN
     * <p>
     * Join the solution sets from each source. This will consider the full
     * cross product of the solutions in each source bucket. All buckets will
     * have the same hash code. If this is an optional join, then some entries
     * in buckets[] MAY be null. However, the first entry is never null since
     * that is the primary source for the join.
     *
     * @param currentBucket
     *            The current {@link Bucket} from each source. The first entry
     *            in this array is the source from which optional solutions
     *            will be reported if the join is optional.
     * @param constraints
     * @param optional
     *            true iff the join is optional.
     * @param outputBuffer
     */
    static private void mergeJoin(//
            final Bucket[] currentBucket,//
            final IConstraint[] constraints,//
            final boolean optional,//
            final IBuffer<IBindingSet> outputBuffer) {

        final int nsources = currentBucket.length;

        // The bucket for the first source.
        final Bucket firstBucket = currentBucket[0];

        assert firstBucket != null; // never allowed for the 1st source.

        for (int i = 1; i < nsources; i++) {

            // A bucket having the same hash code for another source.
            final Bucket otherBucket = currentBucket[i];

            if (otherBucket == null) {

                assert optional; // only allowed if the join is optional.

                continue;

            }

            // Must be the same hash code.
            assert firstBucket.hashCode() == otherBucket.hashCode();

        }

        final SolutionHit[] set = new SolutionHit[nsources];

        final Striterator sols1 = new Striterator(firstBucket.iterator());

        sols1.addFilter(new Visitor() {

            private static final long serialVersionUID = 1L;

            @Override
            protected void visit(final Object obj) {
                set[0] = (SolutionHit) obj;
            }

        });

        // now add in Expanders and Visitors for each Bucket
        for (int i = 1; i < nsources; i++) {

            // A bucket having the same hash code for another source.
            final int slot = i;
            final Bucket otherBucket = currentBucket[i];

            // if optional then, if there are no solutions, do not try to
            // expand further.
            if (!(optional && (otherBucket == null || otherBucket.isEmpty()))) {

                sols1.addFilter(new Expander() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    protected Iterator expand(final Object obj) {
                        return otherBucket.iterator();
                    }

                });

                sols1.addFilter(new Visitor() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    protected void visit(final Object obj) {
                        set[slot] = (SolutionHit) obj;
                    }

                });

            }

        }

        while (sols1.hasNext()) {

            sols1.next();

            IBindingSet in = set[0].solution;

            for (int i = 1; i < set.length; i++) {

                // See if the solutions join.
                if (set[i] != null) {

                    in = BOpContext.bind(//
                            in,//
                            set[i].solution,//
                            constraints,// TODO constraint[][]
                            null//
                            );

                }

                if (in == null) {
                    // Join failed.
                    break;
                }

                if (log.isDebugEnabled())
                    log.debug("Output solution: " + in);

            }

            // Accept this binding set.
            if (in != null) {
                outputBuffer.add(in);
            }

//            // now clear set!
//            for (int i = 1; i < set.length; i++) {
//                set[i] = null;
//            }

        }

    }

    /**
     * Adds metadata about the {@link IHashJoinUtility} state to the stack
     * trace.
     *
     * @param t
     *            The thrown error.
     *
     * @return The laundered exception.
     *
     * @throws Exception
     *
     * @see http://sourceforge.net/apps/trac/bigdata/ticket/508 (LIMIT causes
     *      hash join utility to log errors)
     */
    private RuntimeException launderThrowable(final Throwable t) {

        final String msg = "cause=" + t + ", state=" + toString();

        if (!InnerCause.isInnerCause(t, InterruptedException.class)
                && !InnerCause.isInnerCause(t, BufferClosedException.class)) {

            /*
             * Some sort of unexpected exception.
             */

            log.error(msg, t);

        }

        return new RuntimeException(msg, t);

    }

}




