/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Oct 17, 2011
*/
package com.bigdata.bop.join;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Constant;
import com.bigdata.bop.HTreeAnnotations;
import com.bigdata.bop.HashMapAnnotations;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.controller.INamedSolutionSetRef;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.bop.join.JVMHashIndex.Bucket;
import com.bigdata.bop.join.JVMHashIndex.Key;
import com.bigdata.bop.join.JVMHashIndex.SolutionHit;
import com.bigdata.counters.CAT;
import com.bigdata.htree.HTree;
import com.bigdata.rdf.internal.impl.literal.XSDBooleanIV;
import com.bigdata.relation.accesspath.BufferClosedException;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.util.InnerCause;
import cutthecrap.utils.striterators.Expander;
import cutthecrap.utils.striterators.ICloseableIterator;
import cutthecrap.utils.striterators.IStriterator;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
import cutthecrap.utils.striterators.Visitor;
/**
* Utility class supporting hash join against a Java hash collection.
*
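* <p>
* A minimal usage sketch (hypothetical setup: <code>op</code>, the source
* iterators, <code>stats</code> and <code>outputBuffer</code> are assumed to
* be supplied by the query engine):
* </p>
*
* <pre>
* final JVMHashJoinUtility state = new JVMHashJoinUtility(op, JoinTypeEnum.Normal);
* try {
*     // Build the hash index from the right-hand solutions.
*     state.acceptSolutions(rightItr, stats);
*     // Probe the hash index with the left-hand solutions.
*     state.hashJoin(leftItr, stats, outputBuffer);
* } finally {
*     state.release();
* }
* </pre>
*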
* @author Bryan Thompson
* @version $Id$
*/
public class JVMHashJoinUtility implements IHashJoinUtility {
private static final Logger log = Logger.getLogger(JVMHashJoinUtility.class);
/**
* Singleton {@link IHashJoinUtilityFactory} that can be used to create a
* new {@link JVMHashJoinUtility}.
*/
static public final IHashJoinUtilityFactory factory =
new IHashJoinUtilityFactory() {
private static final long serialVersionUID = 1L;
public IHashJoinUtility create(//
final BOpContext<IBindingSet> context,//
final INamedSolutionSetRef namedSetRef,//
final PipelineOp op,//
final JoinTypeEnum joinType//
) {
return new JVMHashJoinUtility(op, joinType);
}
};
/**
* <code>true</code> until the state is discarded by {@link #release()}.
*/
protected final AtomicBoolean open = new AtomicBoolean(true);
/**
* The type of join to be performed.
*/
protected final JoinTypeEnum joinType;
// /**
//  * <code>true</code> iff the join is OPTIONAL.
//  */
// private final boolean optional;
//
// /**
//  * <code>true</code> iff this is a DISTINCT filter.
//  */
// private final boolean filter;
//
// /**
//  * <code>true</code> iff a solution having an unbound {@link #joinVars}
//  * should be dropped and <code>false</code> if it should be indexed anyway.
//  */
// private final boolean dropSolutionsHavingUnboundJoinVars;
/**
* @see HashJoinAnnotations#ASK_VAR
*/
protected final IVariable<?> askVar;
/**
* The join variables.
*/
protected final IVariable<?>[] joinVars;
/**
* The variables to be retained (aka projected out) (optional, all variables
* are retained if not specified).
*/
protected final IVariable<?>[] selectVars;
/**
* True if the hash join utility class is to output the distinct join
* variables.
*/
protected boolean outputDistinctJVs = false;
/**
* The join constraints (optional).
*/
protected final IConstraint[] constraints;
/**
* The hash index.
*
* Note: There is no separate "joinSet". Instead, the {@link SolutionHit}
* class provides a join hit counter.
*/
protected final AtomicReference<JVMHashIndex> rightSolutionsRef = new AtomicReference<JVMHashIndex>();
/**
* The #of solutions accepted into the hash index.
*/
protected final CAT rightSolutionCount = new CAT();
/**
* The maximum #of (left,right) solution joins that will be considered
* before failing the join. This is used IFF there are no join variables.
*/
private final long noJoinVarsLimit = HashJoinAnnotations.DEFAULT_NO_JOIN_VARS_LIMIT;
/**
* The #of left solutions considered for a join.
*/
protected final CAT nleftConsidered = new CAT();
/**
* The #of right solutions considered for a join.
*/
protected final CAT nrightConsidered = new CAT();
/**
* The #of solution pairs considered for a join.
*/
protected final CAT nJoinsConsidered = new CAT();
/**
* Human readable representation of the {@link IHashJoinUtility} metadata
* (but not the solutions themselves).
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append("{open=" + open);
sb.append(",joinType="+joinType);
// sb.append(",optional=" + optional);
// sb.append(",filter=" + filter);
if (askVar != null)
sb.append(",askVar=" + askVar);
sb.append(",joinVars=" + Arrays.toString(joinVars));
sb.append(",outputDistinctJVs=" + outputDistinctJVs);
if (selectVars != null)
sb.append(",selectVars=" + Arrays.toString(selectVars));
if (constraints != null)
sb.append(",constraints=" + Arrays.toString(constraints));
sb.append(",size=" + getRightSolutionCount());
sb.append(",considered(left=" + nleftConsidered + ",right="
+ nrightConsidered + ",joins=" + nJoinsConsidered + ")");
sb.append("}");
return sb.toString();
}
/**
*
* @param op
* The operator whose annotation will inform construction the
* hash index. The {@link HTreeAnnotations} may be specified for
* this operator and will control the initialization of the
* various {@link HTree} instances.
* @param joinType
* The type of join to be performed.
*
* @see JVMHashJoinAnnotations
*/
public JVMHashJoinUtility(final PipelineOp op, final JoinTypeEnum joinType) {
if (op == null)
throw new IllegalArgumentException();
if(joinType == null)
throw new IllegalArgumentException();
this.joinType = joinType;
/*
* Note: This flag needs to be [true] if we allow solutions to be stored
* in the hash index that have unbound variables for the "joinVars". We
* do this for OPTIONAL because all solutions must be indexed for an
* OPTIONAL join since solutions that do not join will be reported.
*
* A DISTINCT FILTER does this as well. This is because "joinVars" is
* really the list of projected variables for a SELECT DISTINCT for a
* DISTINCT FILTER.
*
* By this reasoning, we should also do this for MINUS/NOT EXISTS. That
* is, for everything except a NORMAL and EXISTS joins.
*
* This suggests a historical bug (fixed below) in MINUS / NOT EXISTS
* handling.
*
* TODO Write a unit test for that bug involving a source solution with
* an unbound join variable and see whether the solution is dropped or
* reported (I believe that it should be reported).
*/
// final boolean optional = joinType == JoinTypeEnum.Optional;
final boolean filter = joinType == JoinTypeEnum.Filter;
final boolean indexSolutionsHavingUnboundJoinVars;
switch(joinType) {
case Normal:
case Exists:
indexSolutionsHavingUnboundJoinVars = false;
break;
case Optional: // OPTIONAL join.
case NotExists: // NOT EXISTS and MINUS
case Filter: // SELECT DISTINCT
indexSolutionsHavingUnboundJoinVars = true;
break;
default:
throw new UnsupportedOperationException();
}
// Optional variable used for (NOT) EXISTS.
this.askVar = (IVariable<?>) op
.getProperty(HashJoinAnnotations.ASK_VAR);
// The join variables (required).
this.joinVars = (IVariable<?>[]) op
.getRequiredProperty(HashJoinAnnotations.JOIN_VARS);
/*
* The projected OUT variables (optional and equal to the join variables
* iff this is a DISTINCT filter).
*/
this.selectVars = filter ? joinVars : (IVariable<?>[]) op
.getProperty(JoinAnnotations.SELECT);
this.outputDistinctJVs =
op.getProperty(
HashIndexOp.Annotations.OUTPUT_DISTINCT_JVs, false);
// The join constraints (optional).
this.constraints = (IConstraint[]) op
.getProperty(JoinAnnotations.CONSTRAINTS);
// // Iff the join has OPTIONAL semantics.
// this.optional = optional;
//
// // Iff this is a DISTINCT filter.
// this.filter = filter;
/*
* TODO Parameter for the map implementation class.
*
* - HashMap is going to be faster for insert and search.
*
* - LinkedHashMap will be faster for the iterator.
*
* - ConcurrentHashMap will be faster if (a) the JVMHashIndex is safe
* for concurrent inserts; and (b) the JVMHashIndexOp is safe for
* concurrent execution. [These conditions are not currently true.]
*
* Some join patterns do not require us to use an iterator at all, in
* which case HashMap is the clear winner. (For example, a non-optional
* hash join against an access path never uses the iterator over the
* hash index.)
*/
/*
* The ordered variables for the keys in the hash index.
*
* For SELECT DISTINCT, use the [select] variables.
*
* Otherwise use the [joinVars].
*/
final IVariable<?>[] keyVars = filter ? (IVariable<?>[]) op
.getProperty(JoinAnnotations.SELECT) : joinVars;
rightSolutionsRef.set(//
new JVMHashIndex(//
keyVars,//
indexSolutionsHavingUnboundJoinVars,//
new LinkedHashMap<Key, Bucket>(op.getProperty(
HashMapAnnotations.INITIAL_CAPACITY,
HashMapAnnotations.DEFAULT_INITIAL_CAPACITY),//
op.getProperty(HashMapAnnotations.LOAD_FACTOR,
HashMapAnnotations.DEFAULT_LOAD_FACTOR)//
)//
));
}
@Override
public JoinTypeEnum getJoinType() {
return joinType;
}
@Override
public IVariable<?> getAskVar() {
return askVar;
}
@Override
public IVariable<?>[] getJoinVars() {
return joinVars;
}
@Override
public IVariable<?>[] getSelectVars() {
return selectVars;
}
@Override
public boolean isOutputDistinctJoinVars() {
return outputDistinctJVs;
}
@Override
public IConstraint[] getConstraints() {
return constraints;
}
@Override
public boolean isEmpty() {
return getRightSolutionCount() == 0;
}
protected long getNoJoinVarsLimit() {
return noJoinVarsLimit;
}
protected JVMHashIndex getRightSolutions() {
return rightSolutionsRef.get();
}
@Override
public long getRightSolutionCount() {
/*
* Note: This needs to be explicitly tracked and reported. Since each
* entry in the rightSolutions map is a collision bucket, we can not
* just report the size of the hash index. Instead we have to track and
* report the #of solutions entered into the hash index in
* acceptSolutions() and filterSolutions().
*/
return rightSolutionCount.get();
}
@Override
public void release() {
if (!open.compareAndSet(true/* expect */, false/* update */)) {
// Already closed.
return;
}
rightSolutionsRef.set(null);
}
@Override
public long acceptSolutions(final ICloseableIterator<IBindingSet[]> itr,
final BOpStats stats) {
if (!open.get())
throw new IllegalStateException();
try {
final JVMHashIndex index = getRightSolutions();
final IBindingSet[] all = BOpUtility.toArray(itr, stats);
if (log.isDebugEnabled())
log.debug("Materialized: " + all.length + " source solutions.");
long naccepted = 0;
for (IBindingSet bset : all) {
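/*
* Note: add() returns null iff the index could not form a key for the
* solution (e.g., an unbound join variable when such solutions are being
* dropped); only solutions actually entered into the index are counted.
*/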
if (index.add(bset) == null) {
continue;
}
naccepted++;
}
if (log.isDebugEnabled())
log.debug("There are " + index.bucketCount()
+ " hash buckets, joinVars="
+ Arrays.toString(joinVars));
rightSolutionCount.add(naccepted);
return naccepted;
} catch (Throwable t) {
throw launderThrowable(t);
}
}
/*
* Note: This implementation is not used. The JVMDistinctFilter is based on
* a ConcurrentHashMap and provides better throughput. For this reason, the
* JVMDistinctFilter is used by the JVMDistinctBindingSetsOp.
*/
@Override
public long filterSolutions(final ICloseableIterator<IBindingSet[]> itr,
final BOpStats stats, final IBuffer<IBindingSet> sink) {
try {
final JVMHashIndex index = getRightSolutions();
final IBindingSet[] all = BOpUtility.toArray(itr, stats);
if (log.isDebugEnabled())
log.debug("Materialized: " + all.length + " source solutions.");
for (IBindingSet bset : all) {
/*
* Note: For a DISTINCT SOLUTIONS filter, we only consider the
* variables that are being projected. Further, all variables
* are used when computing the hash code. Therefore "joinVars"
* == "selectedVars" for a DISTINCT SOLUTIONS filter.
*/
bset = bset.copy(joinVars); // only consider the selected variables.
/*
* Note: Solutions are NOT dropped if a variable is not bound in
* a given solution. The variable is simply not used when
* computing the hash code. Specifying optional:=true here
* causes makeKey() to have this behavior.
*/
if (index.addDistinct(bset)) {
// Write on the output sink.
sink.add(bset);
}
}
if (log.isDebugEnabled())
log.debug("There are " + index.bucketCount()
+ " hash buckets, joinVars="
+ Arrays.toString(joinVars));
final long naccepted = all.length;
rightSolutionCount.add(naccepted);
return naccepted;
} catch (Throwable t) {
throw launderThrowable(t);
}
}
@Override
public void hashJoin(//
final ICloseableIterator<IBindingSet[]> leftItr,//
final BOpStats stats,//
final IBuffer<IBindingSet> outputBuffer//
) {
hashJoin2(leftItr, stats, outputBuffer, constraints);
}
/**
* {@inheritDoc}
*
* For each source solution materialized, the hash table is probed using the
* as-bound join variables for that source solution. A join hit counter is
* carried for each solution in the hash index and is used to support
* OPTIONAL joins.
*/
@Override
public void hashJoin2(//
final ICloseableIterator<IBindingSet[]> leftItr,//
final BOpStats stats,
final IBuffer<IBindingSet> outputBuffer,//
final IConstraint[] constraints//
) {
if (!open.get())
throw new IllegalStateException();
final JVMHashIndex rightSolutions = getRightSolutions();
if (log.isInfoEnabled()) {
log.info("rightSolutions: #buckets=" + rightSolutions.bucketCount()
+ ",#solutions=" + getRightSolutionCount());
}
// true iff there are no join variables.
final boolean noJoinVars = joinVars.length == 0;
try {
while (leftItr.hasNext()) {
// Next chunk of solutions from left.
final IBindingSet[] leftChunk = leftItr.next();
if (stats != null) {
stats.chunksIn.increment();
stats.unitsIn.add(leftChunk.length);
}
for (IBindingSet left : leftChunk) {
nleftConsidered.increment();
if (log.isDebugEnabled())
log.debug("Considering " + left);
final Bucket bucket = rightSolutions.getBucket(left);
if (bucket == null)
continue;
final Iterator<SolutionHit> ritr = bucket.iterator();
while (ritr.hasNext()) {
final SolutionHit right = ritr.next();
nrightConsidered.increment();
if (log.isDebugEnabled())
log.debug("Join with " + right);
nJoinsConsidered.increment();
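/*
* When there are no join variables, every solution falls into the
* same bucket and the join degenerates into a full cross product.
* Fail the join once the limit on considered solution pairs is
* reached and both sides have contributed more than one solution.
*/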
if (noJoinVars
&& nJoinsConsidered.get() == noJoinVarsLimit) {
if (nleftConsidered.get() > 1
&& nrightConsidered.get() > 1) {
throw new UnconstrainedJoinException();
}
}
// See if the solutions join.
final IBindingSet outSolution = BOpContext.bind(//
right.solution,//
left,//
constraints,//
selectVars//
);
switch (joinType) {
case Normal: {
if (outSolution != null) {
// Output the solution.
outputSolution(outputBuffer, outSolution);
}
break;
}
case Optional: {
if (outSolution != null) {
// Output the solution.
outputSolution(outputBuffer, outSolution);
// Increment counter so we know not to output
// the rightSolution as an optional solution.
right.nhits.increment();
}
break;
}
case Exists: {
/*
* The right solution is output iff there is at
* least one left solution which joins with that
* right solution. Each right solution is output at
* most one time.
*/
if (outSolution != null) {
// if (right.nhits.get() == 0L) {
// // Output the solution.
// outputSolution(outputBuffer, right.solution);
// }
// Increment counter so we know this solution joins.
right.nhits.increment();
}
break;
}
case NotExists: {
/*
* The right solution is output iff there does not
* exist any left solution which joins with that
* right solution. This is basically an optional join
* where the solutions which join are not output.
*/
if (outSolution != null) {
// Increment counter so we know not to output
// the rightSolution as an optional solution.
right.nhits.increment();
}
break;
}
default:
throw new AssertionError();
}
} // while(ritr.hasNext())
} // for(left : leftChunk)
} // while(leftItr.hasNext())
} catch(Throwable t) {
throw launderThrowable(t);
} finally {
leftItr.close();
}
}
/**
* {@inheritDoc}
*
* This implementation is a NOP since the underlying Java collection class
* is thread-safe for concurrent readers.
*/
@Override
public void saveSolutionSet() {
// NOP
}
/**
* Output a solution.
*
* @param outputBuffer
* Where to write the solution.
* @param outSolution
* The solution.
*/
protected void outputSolution(final IBuffer<IBindingSet> outputBuffer,
final IBindingSet outSolution) {
if (log.isDebugEnabled())
log.debug("Output solution: " + outSolution);
// Accept this binding set.
outputBuffer.add(outSolution);
}
@Override
public void outputOptionals(final IBuffer<IBindingSet> outputBuffer) {
if (!open.get())
throw new IllegalStateException();
try {
@SuppressWarnings({ "rawtypes", "unchecked" })
final Constant f = askVar == null ? null : new Constant(
XSDBooleanIV.FALSE);
final JVMHashIndex rightSolutions = getRightSolutions();
final IVariable<?>[] selected = getSelectVars();
if (log.isInfoEnabled())
log.info("rightSolutions: #buckets="
+ rightSolutions.bucketCount());
/*
* Note: when NO solutions joined for a given source binding set AND
* the join is OPTIONAL then we output the _original_ binding set to
* the sink join task(s) and DO NOT apply the CONSTRAINT(s).
*/
final Iterator<Bucket> bitr = rightSolutions.buckets();
while (bitr.hasNext()) {
final Bucket b = bitr.next();
for (SolutionHit hit : b) {
if (hit.nhits.get() > 0)
continue;
IBindingSet bs = hit.solution;
if (selected != null) {
// Drop variables which are not projected.
bs = bs.copy(selected);
}
if (f != null) {
if (bs == hit.solution)
bs = bs.clone();
bs.set(askVar, f);
}
outputBuffer.add(bs);
if (log.isDebugEnabled())
log.debug("Optional solution: " + bs);
}
}
} catch (Throwable t) {
throw launderThrowable(t);
}
}
@SuppressWarnings("unchecked")
@Override
public ICloseableIterator<IBindingSet> indexScan() {
try {
// /*
// * The selected variables -or- <code>null</code> if all variables
// * should be projected.
// */
// final IVariable<?>[] selected = getSelectVars();
final JVMHashIndex rightSolutions = getRightSolutions();
if (log.isInfoEnabled())
log.info("rightSolutions: #buckets=" + rightSolutions.bucketCount());
// Visit the buckets.
IStriterator itr = new Striterator(rightSolutions.buckets());
itr = itr.addFilter(new Expander() {
private static final long serialVersionUID = 1L;
/**
* Expand the bucket into the solutions in the bucket.
*/
@SuppressWarnings("rawtypes")
@Override
protected Iterator expand(final Object obj) {
final Bucket b = (Bucket) obj;
return b.iterator();
}
});
/**
* Resolve each {@link SolutionHit} to its solution. (Projection onto
* the selected variables is currently disabled here; the full
* solution is passed through.)
*/
itr = itr.addFilter(new Resolver() {
private static final long serialVersionUID = 1L;
@Override
protected Object resolve(final Object obj) {
final IBindingSet bs = ((SolutionHit) obj).solution;
// if (selected != null) {
//
// // Drop variables which are not projected.
// bs = bs.copy(selected);
//
// }
return bs;
}
});
return (ICloseableIterator<IBindingSet>) itr;
} catch (Throwable t) {
throw launderThrowable(t);
}
}
@Override
public void outputSolutions(final IBuffer<IBindingSet> out) {
if (!open.get())
throw new IllegalStateException();
try {
final JVMHashIndex rightSolutions = getRightSolutions();
final IVariable<?>[] selected = getSelectVars();
if (log.isInfoEnabled())
log.info("rightSolutions: #buckets="
+ rightSolutions.bucketCount());
// source.
final Iterator<Bucket> bucketIterator = rightSolutions.buckets();
while (bucketIterator.hasNext()) {
final Bucket bucket = bucketIterator.next();
// New hash bucket so new DISTINCT set.
final HashSet<IBindingSet> distinctSet =
outputDistinctJVs ? new HashSet<IBindingSet>()// TODO Size estimate?
: null;
for (SolutionHit solutionHit : bucket) {
IBindingSet bs = solutionHit.solution;
if (outputDistinctJVs) {
/*
* Output those solutions that are distinct on the join
* variables. We do this by laying a DISTINCT filter
* over the solutions drawn from each bucket that we
* visit. The DISTINCT filter does not need to consider
* solutions that fall into other buckets, just the
* current bucket.
*/
// drop anything not in the join variables.
bs = bs.copy(joinVars);
if (!distinctSet.add(bs)) {
// Duplicate solution on JVs in this bucket.
continue;
}
// if (distinctFilter != null) {
//
// if ((bs = distinctFilter.accept(bs)) == null) {
//
// // Drop duplicate solutions.
// continue;
//
// }
} else if (selected != null) {
/*
* FIXME We should be using projectedInVars here since
* outputSolutions() is used to stream solutions into
* the child join group (at least for some kinds of
* joins, but there might be exceptions for joining with
* a named solution set).
*/
// Drop variables which are not projected.
bs = bs.copy(selected);
}
out.add(bs);
if (log.isDebugEnabled())
log.debug("Output solution: " + bs);
}
}
} catch (Throwable t) {
throw launderThrowable(t);
}
}
@Override
public void outputJoinSet(final IBuffer<IBindingSet> outputBuffer) {
try {
@SuppressWarnings({ "rawtypes", "unchecked" })
final Constant t = askVar == null ? null : new Constant(
XSDBooleanIV.TRUE);
final JVMHashIndex rightSolutions = getRightSolutions();
final IVariable<?>[] selected = getSelectVars();
if (log.isInfoEnabled())
log.info("rightSolutions: #buckets=" + rightSolutions.bucketCount());
final Iterator<Bucket> bitr = rightSolutions.buckets();
while(bitr.hasNext()) {
final Bucket b = bitr.next();
for (SolutionHit hit : b) {
if (hit.nhits.get() == 0)
continue;
IBindingSet bs = hit.solution;
if (selected != null) {
// Drop variables which are not projected.
bs = bs.copy(selected);
}
if (t != null) {
if (bs == hit.solution)
bs = bs.clone();
bs.set(askVar, t);
}
outputBuffer.add(bs);
if (log.isDebugEnabled())
log.debug("Output solution: " + bs);
}
}
} catch (Throwable t) {
throw launderThrowable(t);
}
}
/**
* Combine constraints for each source with the given constraints.
*
* @param constraints
* Explicitly given constraints for this join.
* @param all
* The sources for the join.
*
* @return The combined constraints and <code>null</code> iff there are no
* constraints.
*/
static IConstraint[] combineConstraints(final IConstraint[] constraints,
final IHashJoinUtility[] all) {
final List<IConstraint> list = new LinkedList<IConstraint>();
// For each source.
for (int i = 0; i < all.length; i++) {
final IHashJoinUtility tmp = all[i];
if (tmp.getConstraints() != null) {
list.addAll(Arrays.asList(tmp.getConstraints()));
}
}
// The join constraints specified by the caller.
if (constraints != null) {
list.addAll(Arrays.asList(constraints));
}
return list.isEmpty() ? null : list
.toArray(new IConstraint[list.size()]);
}
/**
* Advance each other source to the first hash code GTE the hashCode for the
* first source.
*
* If the source does not have a bucket for the hash code in the first
* bucket then either (a) if this is a required join, this method will
* return <code>false</code> and the caller must advance to the next bucket
* in the first source; or (b) if this is an optional join, there will be a
* <code>null</code> in the currentBucket[] for that source.
*
* @param sortedSourceBuckets
* An array of {@link Bucket}[]s for each source. The vector of
* {@link Bucket}s for each source has been sorted. This means
* that we can scan down those vectors and observe {@link Bucket}
* s having strictly increasing hash codes for each source.
* @param sourceIndex
* The next index into each source.
* @param currentBucket
* The current bucket for each source.
* @param optional
* <code>true</code> iff this is an optional join.
*
* @return <code>true</code> if we are on a bucket which might join. If this
* method returns <code>false</code>, then the caller should
* immediately advance to the next bucket from the first source
* without attempting a join.
*/
static private boolean advanceOtherSources(//
final Bucket[][] sortedSourceBuckets,//
final int[] sourceIndex,//
final Bucket[] currentBucket,//
final boolean optional//
) {
// The next collision bucket in hash code order from the 1st source.
final Bucket firstBucket = sortedSourceBuckets[0][sourceIndex[0]];
final int hashCode = firstBucket.hashCode();
currentBucket[0] = firstBucket;
for (int i = 1; i < sourceIndex.length; i++) {
// Advance source to first bucket GTE hashCode.
while (true) {
// Next bucket index for the other source
final int j = sourceIndex[i];
final Bucket otherBucket;
if (j >= sortedSourceBuckets[i].length) {
// This source is exhausted.
if (!optional) {
// Nothing is left which can join.
return false;
}
otherBucket = null;
} else {
otherBucket = sortedSourceBuckets[i][j];
}
if (otherBucket == null) {
assert optional;
currentBucket[i] = null;
break;
}
if (otherBucket.hashCode() < hashCode) {
sourceIndex[i]++;
continue;
}
if (otherBucket.hashCode() > hashCode) {
if (!optional) {
// The bucket on the first source can not join.
return false;
} else {
// The bucket will be ignored.
currentBucket[i] = null;
// Exit the inner loop.
break;
}
}
currentBucket[i] = otherBucket;
break;
}
}
return true;
}
@Override
public void mergeJoin(//
final IHashJoinUtility[] others,//
final IBuffer<IBindingSet> outputBuffer, //
final IConstraint[] constraints,//
final boolean optional//
) {
/*
* Validate arguments.
*/
if (others == null)
throw new IllegalArgumentException();
if (others.length == 0)
throw new IllegalArgumentException();
if (outputBuffer == null)
throw new IllegalArgumentException();
final JVMHashJoinUtility[] all = new JVMHashJoinUtility[others.length + 1];
{
all[0] = this;
for (int i = 0; i < others.length; i++) {
final JVMHashJoinUtility o = (JVMHashJoinUtility) others[i];
if (o == null)
throw new IllegalArgumentException();
if (!Arrays.equals(this.joinVars, o.joinVars)) {
// Must have the same join variables.
throw new IllegalArgumentException();
}
all[i + 1] = o;
}
}
if(isEmpty()) {
return;
}
/*
* Combine constraints for each source with the given constraints.
*/
final IConstraint[] c = combineConstraints(constraints, all);
/*
* The JVM hash collections do not maintain the data in hash code order.
* Therefore, we materialize and sort the collision buckets for each
* hash index.
*/
final Bucket[][] sortedSourceBuckets = new Bucket[all.length][];
{
for (int i = 0; i < all.length; i++) {
// Fully materialize the solution set as a Bucket[].
final Bucket[] t = all[i].getRightSolutions().toArray();
/*
* Sort the array. Its natural sort order is by the hash code
* of the join variables.
*/
Arrays.sort(t);
sortedSourceBuckets[i] = t;
}
}
/*
* Synchronize each source.
*/
// The next index into each source (not used for the 1st source).
final int[] sourceIndex = new int[all.length];
// The current bucket for each source.
final Bucket[] currentBucket = new Bucket[all.length];
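/*
* Sweep the buckets of the first source in hash code order. For each
* bucket, align the other sources on the same hash code (for an optional
* join, a source lacking that hash code is recorded as a null) and then
* join the aligned buckets.
*/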
while (sourceIndex[0] < sortedSourceBuckets[0].length) {
if (!optional) {
/*
* If the join is not optional, then we are done as soon as any
* source is exhausted.
*/
for (int i = 1; i < sourceIndex.length; i++) {
if (sourceIndex[i] >= sortedSourceBuckets[i].length) {
// All done.
return;
}
}
}
// Synchronize the other sources.
if (advanceOtherSources(sortedSourceBuckets, sourceIndex,
currentBucket, optional)) {
// log.error("sourceIndex[]=" + Arrays.toString(sourceIndex));
// Join those buckets, outputting solutions which join.
mergeJoin(currentBucket, c, optional, outputBuffer);
}
// Advance the first source to the next bucket.
sourceIndex[0]++;
}
}
/**
* MERGE JOIN
*
* Join the solution sets from each source. This will consider the full
* cross product of the solutions in each source bucket. All buckets will
* have the same hash code. If this is an optional join, then some entries
* in buckets[] MAY be <code>null</code>. However, the first entry is never
* <code>null</code> since that is the primary source for the join.
*
* @param currentBucket
* The current {@link Bucket} from each source. The first entry
* in this array is the source from which optional solutions will
* be reported if the join is optional.
* @param constraints
* @param optional
* <code>true</code> iff the join is optional.
* @param outputBuffer
*/
static private void mergeJoin(//
final Bucket[] currentBucket,//
final IConstraint[] constraints,//
final boolean optional,//
final IBuffer<IBindingSet> outputBuffer) {
final int nsources = currentBucket.length;
// The bucket for the first source.
final Bucket firstBucket = currentBucket[0];
assert firstBucket != null; // never allowed for the 1st source.
for (int i = 1; i < nsources; i++) {
// A bucket having the same hash code for another source.
final Bucket otherBucket = currentBucket[i];
if (otherBucket == null) {
assert optional; // only allowed if the join is optional.
continue;
}
// Must be the same hash code.
assert firstBucket.hashCode() == otherBucket.hashCode();
}
final SolutionHit[] set = new SolutionHit[nsources];
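/*
* Build an iterator pipeline over the cross product of the source
* buckets: the Visitor below records the current solution from the first
* bucket into set[0], and each additional source contributes an Expander
* over its bucket followed by a Visitor that records into set[i]. A
* single pass over sols1 then enumerates every combination of solutions.
*/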
final Striterator sols1 = new Striterator(firstBucket.iterator());
sols1.addFilter(new Visitor() {
private static final long serialVersionUID = 1L;
@Override
protected void visit(Object obj) {
set[0] = (SolutionHit) obj;
}
});
// now add in Expanders and Visitors for each Bucket
for (int i = 1; i < nsources; i++) {
// A bucket having the same hash code for another source.
final int slot = i;
final Bucket otherBucket = currentBucket[i];
// If optional and the other bucket is missing or empty, do not add an
// expander for it (set[slot] simply remains null).
if (!(optional && (otherBucket == null || otherBucket.isEmpty()))) {
sols1.addFilter(new Expander() {
private static final long serialVersionUID = 1L;
@Override
protected Iterator<?> expand(final Object obj) {
return otherBucket.iterator();
}
});
sols1.addFilter(new Visitor() {
private static final long serialVersionUID = 1L;
@Override
protected void visit(final Object obj) {
set[slot] = (SolutionHit) obj;
}
});
}
}
while (sols1.hasNext()) {
sols1.next();
IBindingSet in = set[0].solution;
for (int i = 1; i < set.length; i++) {
// See if the solutions join.
if (set[i] != null) {
in =
BOpContext.bind(//
in,//
set[i].solution,//
constraints,// TODO constraint[][]
null//
);
}
if (in == null) {
// Join failed.
break;
}
if (log.isDebugEnabled())
log.debug("Output solution: " + in);
}
// Accept this binding set.
if (in != null) {
outputBuffer.add(in);
}
// // now clear set!
// for (int i = 1; i < set.length; i++) {
// set[i] = null;
// }
}
}
/**
* Adds metadata about the {@link IHashJoinUtility} state to the stack
* trace.
*
* @param t
* The thrown error.
*
* @return The laundered exception.
*
* @throws Exception
*
* @see http://sourceforge.net/apps/trac/bigdata/ticket/508 (LIMIT causes
* hash join utility to log errors)
*/
private RuntimeException launderThrowable(final Throwable t) {
final String msg = "cause=" + t + ", state=" + toString();
if (!InnerCause.isInnerCause(t, InterruptedException.class)
&& !InnerCause.isInnerCause(t, BufferClosedException.class)) {
/*
* Some sort of unexpected exception.
*/
log.error(msg, t);
}
return new RuntimeException(msg, t);
}
}