com.bigdata.bop.join.JVMHashIndex Maven / Gradle / Ivy

Go to download
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.
Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Apr 29, 2013
 */
package com.bigdata.bop.join;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.solutions.JVMDistinctBindingSetsOp;
import com.bigdata.counters.CAT;

/**
 * A hash index for {@link IBindingSet}s that supports duplicate solutions and
 * hit counts. The hit counts are used to detect {@link IBindingSet}s that do
 * not join for OPTIONAL, MINUS, and related kinds of "negation" joins.
 * <p>
 * Solutions are grouped into {@link Bucket}s. The {@link Bucket} for a
 * solution is located using a {@link Key} formed from the as-bound values of
 * the key variables for that solution.
 * <p>
 * Note: The {@link JVMDistinctBindingSetsOp} does not use this class right now
 * because the {@link JVMDistinctFilter}, which is the backing implementation
 * for the {@link JVMDistinctBindingSetsOp}, enjoys better concurrency than the
 * {@link JVMHashIndex}.
 * <p>
 * Note: This class does not perform any synchronization. Neither the backing
 * map nor the {@link Bucket}s are safe for concurrent modification.
 * 
 * @see JVMDistinctFilter
 * 
 * @author Bryan Thompson
 */
public class JVMHashIndex {

    private static final Logger log = Logger.getLogger(JVMHashIndex.class);

    /**
     * The initial value used when computing the hash code of a {@link Key}.
     * <p>
     * Note: If the key variables are an empty array, then the solutions will
     * all hash to ONE (1).
     */
    private static final int ONE = 1;

    /**
     * Return the {@link Key} wrapping the as-bound values of the
     * {@link #keyVars} for the given solution.
     * <p>
     * The hash code of the {@link Key} is computed from the bound values only
     * (an unbound variable does not contribute to the hash code). However, an
     * unbound variable is recorded as a <code>null</code> in the array of
     * values wrapped by the {@link Key} and therefore does participate in
     * {@link Key#equals(Object)}.
     * 
     * @param bset
     *            The solution.
     * 
     * @return The {@link Key} for the as-bound values of the {@link #keyVars}
     *         for that solution -or- <code>null</code> if one or more of the
     *         {@link #keyVars} is not bound by the solution and
     *         {@link #indexSolutionsHavingUnboundJoinVars} is
     *         <code>false</code>.
     * 
     * @see #keyVars
     * @see #indexSolutionsHavingUnboundJoinVars
     */
    private Key makeKey(final IBindingSet bset) {

        final IConstant<?>[] vals = new IConstant<?>[keyVars.length];

        int h = ONE;

        // Single pass: collect the as-bound values and fold them into the hash.
        for (int i = 0; i < keyVars.length; i++) {

            final IVariable<?> v = keyVars[i];

            final IConstant<?> c = bset.get(v);

            vals[i] = c;

            if (c == null) {

                if (!indexSolutionsHavingUnboundJoinVars) {

                    /*
                     * Drop solution having an unbound join variable.
                     */

                    if (log.isDebugEnabled())
                        log.debug("Join variable is not bound: var=" + v
                                + ", solution=" + bset);

                    return null;

                }

                // Unbound variables do not contribute to the hash code.
                continue;

            }

            h = 31 * h + c.hashCode();

        }

        if (log.isTraceEnabled())
            log.trace("hashCode=" + h + ", joinVars="
                    + Arrays.toString(keyVars) + " : " + bset);

        return new Key(h, vals);

    }

    /**
     * Wrapper for the keys in the hash table. This is necessary for the hash
     * table to compare the keys as equal and also provides efficiencies in the
     * hash code and equals() methods.
     * <p>
     * The hash code is computed once when the {@link Key} is created. Two
     * {@link Key}s are equal iff they wrap the same number of values and the
     * values at each position are either both <code>null</code> or are equal.
     */
    public static class Key {

        /** The pre-computed hash code. */
        private final int hash;

        /**
         * The as-bound values (a <code>null</code> element indicates a
         * variable which was not bound).
         */
        private final IConstant<?>[] vals;

        private Key(final int hashCode, final IConstant<?>[] vals) {
            this.vals = vals;
            this.hash = hashCode;
        }

        @Override
        public int hashCode() {
            return hash;
        }

        @Override
        public boolean equals(final Object o) {
            if (this == o)
                return true;
            if (!(o instanceof Key)) {
                return false;
            }
            final Key t = (Key) o;
            if (vals.length != t.vals.length)
                return false;
            for (int i = 0; i < vals.length; i++) {
                // Same reference (including both null) : values agree.
                if (vals[i] == t.vals[i])
                    continue;
                // One side is null and the other is not.
                if (vals[i] == null)
                    return false;
                if (!vals[i].equals(t.vals[i]))
                    return false;
            }
            return true;
        }
    }

    /**
     * A solution and a hit counter as stored in the {@link JVMHashIndex}.
     */
    public static class SolutionHit {

        /**
         * The input solution.
         */
        final public IBindingSet solution;

        /**
         * The #of hits on that solution. This may be used to detect solutions
         * that did not join. E.g., by scanning and reporting out all solutions
         * where {@link #nhits} is ZERO (0L).
         */
        public final CAT nhits = new CAT();

        /**
         * @param solution
         *            The solution (required).
         * 
         * @throws IllegalArgumentException
         *             if the solution is <code>null</code>.
         */
        private SolutionHit(final IBindingSet solution) {

            if (solution == null)
                throw new IllegalArgumentException();

            this.solution = solution;

        }

        @Override
        public String toString() {

            return getClass().getName() + "{nhits=" + nhits + ",solution="
                    + solution + "}";

        }

    } // class SolutionHit

    /**
     * A group of solutions having the same as-bound values for the join vars.
     * Each solution is paired with a hit counter so we can support OPTIONAL
     * semantics for the join.
     */
    public static class Bucket implements Iterable<SolutionHit>,
            Comparable<Bucket> {

        /** The hash code for this collision bucket. */
        private final int hashCode;

        /**
         * A set of solutions (and their hit counters) which have the same
         * as-bound values for the join variables.
         */
        private final List<SolutionHit> solutions = new LinkedList<SolutionHit>();

        @Override
        public String toString() {
            return super.toString()
                    + //
                    "{hashCode=" + hashCode + ",#solutions=" + solutions.size()
                    + "}";
        }

        /**
         * Create a bucket holding a single solution.
         * 
         * @param hashCode
         *            The hash code for the bucket.
         * @param solution
         *            The first solution in the bucket (required).
         * 
         * @throws IllegalArgumentException
         *             if the solution is <code>null</code>.
         */
        public Bucket(final int hashCode, final IBindingSet solution) {

            this.hashCode = hashCode;

            add(solution);

        }

        /**
         * Add the solution to the bucket. Duplicate solutions are permitted.
         * 
         * @param solution
         *            The solution (required).
         * 
         * @throws IllegalArgumentException
         *             if the solution is <code>null</code>.
         */
        public void add(final IBindingSet solution) {

            if (solution == null)
                throw new IllegalArgumentException();

            solutions.add(new SolutionHit(solution));

        }

        /**
         * Add the solution to the bucket iff the solution is not already
         * present in the bucket.
         * <p>
         * Note: There is already a hash index in place on the join variables
         * when we are doing a DISTINCT filter. Further, only the "join"
         * variables are "selected" and participate in a DISTINCT filter.
         * Therefore, if we have a hash collision such that two solutions would
         * be directed into the same {@link Bucket} then we can not improve
         * matters but must simply scan the solutions in the bucket to decide
         * whether the new solution duplicates a solution which is already
         * present.
         * 
         * @param solution
         *            The solution (required).
         * 
         * @return <code>true</code> iff the bucket was modified by this
         *         operation.
         * 
         * @throws IllegalArgumentException
         *             if the solution is <code>null</code>.
         */
        public boolean addDistinct(final IBindingSet solution) {

            if (solution == null)
                throw new IllegalArgumentException();

            // Linear scan (a no-op when the bucket is empty).
            for (SolutionHit aSolution : solutions) {

                if (aSolution.solution.equals(solution)) {

                    // Solution already in this bucket.
                    return false;

                }

            }

            // This is a distinct solution.
            solutions.add(new SolutionHit(solution));

            return true;

        }

        /**
         * Visits the solutions (and their hit counters) in this bucket.
         * <p>
         * Note: The iterator is obtained directly from the internal list (it
         * is not wrapped as an unmodifiable view).
         */
        @Override
        final public Iterator<SolutionHit> iterator() {

            return solutions.iterator();

        }

        /**
         * Orders the buckets based on their hash codes.
         */
        @Override
        final public int compareTo(final Bucket o) {
            // Explicit comparisons (rather than subtraction) avoid overflow.
            if (hashCode > o.hashCode)
                return 1;
            if (hashCode < o.hashCode)
                return -1;
            return 0;
        }

        /**
         * The hash code specified to the constructor.
         */
        @Override
        final public int hashCode() {

            return hashCode;

        }

        /**
         * Return <code>true</code> iff this {@link Bucket} is empty (if there
         * are no solutions in the bucket).
         */
        final public boolean isEmpty() {

            return solutions.isEmpty();

        }

    } // Bucket

    /**
     * The join variables (required, but may be empty). The order of the entries
     * is used when forming the as-bound keys for the hash table. Duplicate
     * elements and null elements are not permitted. If no join variables are
     * specified, then the join will consider the N x M cross product, filtering
     * for solutions which join. This is very expensive when compared to a hash
     * join. Whenever possible you should identify one or more variables which
     * must be bound for the join and specify those as the join variables.
     */
    private final IVariable<?>[] keyVars;

    /**
     * When <code>true</code>, we allow solutions to be stored in the hash index
     * that have unbound variables for the {@link #keyVars}. When
     * <code>false</code>, such solutions are dropped.
     * <p>
     * Note: This must be <code>true</code> for DISTINCT, OPTIONAL, and NOT
     * EXISTS / MINUS since in each case we do not want to drop solutions
     * lacking a binding for some {@link #keyVars}. For DISTINCT, this is
     * because we want to project all solutions, regardless of unbound
     * variables. For OPTIONAL and NOT EXISTS / MINUS, this is because we must
     * index all solutions since we will report only those solutions that do not
     * join. Once all solutions that do join have been identified, the solutions
     * that do not join are identified by a scan of the hash index looking for
     * {@link SolutionHit#nhits} equals ZERO (0L).
     */
    private final boolean indexSolutionsHavingUnboundJoinVars;

    /**
     * The backing map - this is NOT thread safe.
     */
    private final Map<Key, Bucket> map;

    /**
     * @param keyVars
     *            The variables that are used to form the keys in the hash index
     *            (required, but may be empty). The order of the entries is used
     *            when forming the as-bound keys for the hash table. Duplicate
     *            elements and null elements are not permitted. If no join
     *            variables are specified, then the join will consider the N x M
     *            cross product, filtering for solutions which join. This is
     *            very expensive when compared to a hash join. Whenever possible
     *            you should identify one or more variables which must be bound
     *            for the join and specify those as the join variables.
     * @param indexSolutionsHavingUnboundJoinVars
     *            When <code>true</code>, we allow solutions to be stored in the
     *            hash index that have unbound variables for the
     *            {@link #keyVars}. When <code>false</code>, such solutions are
     *            dropped (they are not added to the index).
     * @param map
     *            The backing map (required). A {@link HashMap} should be faster
     *            for insert and search. A {@link LinkedHashMap} should be
     *            faster for scans. Some join patterns do not require us to use
     *            scans, in which case {@link HashMap} is the clear winner. (For
     *            example, a non-optional hash join against an access path never
     *            uses the iterator over the hash index.)
     * 
     * @throws IllegalArgumentException
     *             if <i>keyVars</i> is <code>null</code>.
     * @throws IllegalArgumentException
     *             if <i>map</i> is <code>null</code>.
     */
    public JVMHashIndex(final IVariable<?>[] keyVars,
            final boolean indexSolutionsHavingUnboundJoinVars,
            final Map<Key, Bucket> map) {

        if (keyVars == null) {

            throw new IllegalArgumentException("keyVars is null");

        }

        /*
         * Note: A ZERO LENGTH keyVars[] is permitted. It means that all
         * solutions will be in the same hash bucket. This can arise due to
         * poor assignment of join variables or simply because there are no
         * available join variables (full cross product join). Such joins are
         * very expensive.
         */

        if (map == null) {

            throw new IllegalArgumentException("map is null");

        }

        this.map = map;

        this.indexSolutionsHavingUnboundJoinVars = indexSolutionsHavingUnboundJoinVars;

        this.keyVars = keyVars;

    }

    /**
     * Add the solution to the index.
     * 
     * @param bset
     *            The {@link IBindingSet}.
     * 
     * @return The {@link Key} iff the solution was added to the index and
     *         <code>null</code> iff the solution was not added (because a
     *         {@link Key} could not be formed for the solution given the
     *         specified {@link #keyVars}).
     */
    public Key add(final IBindingSet bset) {

        final Key key = makeKey(bset);

        if (key == null) {

            // Drop solution.
            return null;

        }

        /*
         * TODO There is an opportunity for CONCURRENT hash map for at least the
         * DISTINCT SOLUTIONS filter and perhaps for others as well. However, to
         * do this with the DISTINCT SOLUTIONS filter we would have to make the
         * mutation operations on a Bucket atomic. E.g., using the synchronized
         * keyword. This would give us what amounts to per-hash code striped
         * locks.
         * 
         * Note: This pattern could be made thread safe. If the get() fails, use
         * a putIfAbsent() in a data race to create and insert the new bucket.
         * If the thread loses the data race, then it must use the other
         * thread's bucket and add its solution to that bucket.
         * 
         * The Bucket.addDistinct() could also be made thread safe by using the
         * monitor for the Bucket (or its Solutions List). This is necessary for
         * correctness, but note that we do not use addDistinct() and instead
         * rely on the more efficient JVMDistinctFilter. The JVMDistinctFilter
         * is more efficient because it is based on a ConcurrentHashMap and does
         * not require any explicit synchronization.
         * 
         * TODO This change would allow us to execute the JVMHashIndexOp
         * concurrently which could provide a substantial throughput gain.
         * However, we still are faced with the requirement to decide atomically
         * when the HashIndexOp is done (the isLastPass() test). It is possible
         * to decide when no more solutions will be available. If the thread
         * that executes the last pass awaits a latch to count down to ONE, then
         * it will know that it is (a) the last invocation, and (b) that all
         * other invocations are complete. This pattern would have to be
         * supported in the QueryEngine and PipelineOp since the latch would
         * have to be incremented by the QueryEngine in a critical section when
         * the new ChunkTask is created and then decremented in a critical
         * section when the ChunkTask ends. If the latch is then exposed to the
         * BOpContext, the operator can decide that it is the last invocation
         * and that no other task is running (or will run) for that operator and
         * then execute the post-processing step (flooding the solutions in the
         * hash index to the downstream operator in the query plan). [Actually,
         * we might not have to do that for the JVMHashIndexOp since we do not
         * have to checkpoint the JVMHashIndex and could incrementally pass
         * along the indexed solutions to the downstream operator, but this
         * would also mean that outputSolutions() would need to use sharedState
         * for its DISTINCT FILTER on the solutions flowing into the sub-group.
         * All of this could be done, but it might require us to specialize the
         * JVMHashIndexOp. We would also have to update AST2BOpUtility to
         * generate the appropriate annotations.]
         */
        final Bucket b = map.get(key);

        if (b == null) {

            // New bucket holding just this solution.
            map.put(key, new Bucket(key.hash, bset));

        } else {

            b.add(bset);

        }

        return key;

    }

    /**
     * Add the solution to the index iff the solution is not already present in
     * the index.
     * <p>
     * Note: This method asserts that a {@link Key} can be formed for the
     * solution. The caller is therefore expected to either have configured the
     * index to accept solutions having unbound {@link #keyVars} or to present
     * only solutions in which all {@link #keyVars} are bound.
     * 
     * @param bset
     *            The solution.
     * 
     * @return <code>true</code> iff the index was modified by this operation.
     */
    public boolean addDistinct(final IBindingSet bset) {

        final Key key = makeKey(bset);

        assert key != null;

        final Bucket b = map.get(key);

        if (b == null) {

            // New bucket holding just this solution.
            map.put(key, new Bucket(key.hash, bset));

            return true;

        }

        // true iff the existing bucket did not already have this solution.
        return b.addDistinct(bset);

    }

    /**
     * Return the hash {@link Bucket} into which the given solution is mapped.
     * <p>
     * Note: The {@link Bucket} is located using only the as-bound values of
     * the {@link #keyVars} for the probe. The caller must apply an appropriate
     * join constraint in order to correctly reject solutions in the
     * {@link Bucket} that violate the join contract.
     * 
     * @param left
     *            The probe.
     * 
     * @return The hash {@link Bucket} into which the given solution is mapped
     *         -or- <code>null</code> if there is no such hash bucket (including
     *         the case where a {@link Key} could not be formed for the probe).
     */
    public Bucket getBucket(final IBindingSet left) {

        final Key key = makeKey(left);

        if (key == null) {

            return null;

        }

        // Probe the hash map : May return [null]!
        return map.get(key);

    }

    /**
     * Visit all buckets in the hash index.
     */
    public Iterator<Bucket> buckets() {

        return map.values().iterator();

    }

    /**
     * The #of buckets in the hash index. There is one bucket for each distinct
     * {@link Key} in the backing map. Since {@link Key}s are compared using
     * their as-bound values, two buckets may report the same hash code when
     * there is a hash collision among distinct {@link Key}s.
     * 
     * @return The #of buckets in the hash index.
     */
    public int bucketCount() {

        return map.size();

    }

    /**
     * Export the {@link Bucket}s as an array.
     * 
     * @return A new array containing the {@link Bucket}s in the iteration
     *         order of the backing map.
     */
    public Bucket[] toArray() {

        return map.values().toArray(new Bucket[map.size()]);

    }

}