com.bigdata.bop.solutions.MemoryGroupByOp Maven / Gradle / Ivy

Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.bop.solutions;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.FutureTask;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Constant;
import com.bigdata.bop.ContextBindingSet;
import com.bigdata.bop.HashMapAnnotations;
import com.bigdata.bop.IBind;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IValueExpression;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.aggregate.IAggregate;
import com.bigdata.bop.bindingSet.ListBindingSet;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.htree.HTree;
import com.bigdata.rdf.error.SparqlTypeErrorException;
import com.bigdata.rdf.sparql.ast.FilterNode;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.util.InnerCause;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * An in-memory at-once generalized aggregation operator.
 * 
 * @author Bryan Thompson
 * @version $Id: DistinctElementFilter.java 3466 2010-08-27 14:28:04Z
 *          thompsonbry $
 */
public class MemoryGroupByOp extends GroupByOp {

    /**
     * 
     */
    private static final long serialVersionUID = 1L;

	private static final transient Logger log = Logger
			.getLogger(MemoryGroupByOp.class);
    
    /**
     * Annotations understood by this operator. The interface declares no
     * annotations of its own; it only combines those inherited from
     * {@link GroupByOp.Annotations} and {@link HashMapAnnotations}.
     */
    public interface Annotations extends GroupByOp.Annotations,
            HashMapAnnotations {

	}

    /**
     * {@inheritDoc}
     * <p>
     * Returns <code>false</code>. This is a generalized aggregation operator
     * and may be used to evaluate any aggregation request.
     * 
     * @return Always <code>false</code>.
     */
    @Override
    final public boolean isPipelinedAggregationOp() {

        return false;
        
    }
    
    /**
     * Constructor required for {@link com.bigdata.bop.BOpUtility#deepCopy(FilterNode)}.
     * Simply delegates to the copy constructor of the super class.
     * 
     * @param op
     *            The operator to be copied.
     */
    public MemoryGroupByOp(final MemoryGroupByOp op) {
        super(op);
    }

    /**
     * Required shallow copy constructor.
     * <p>
     * After delegating to the super class, the constructor validates the
     * operator configuration: the evaluation context must be
     * <code>CONTROLLER</code>, {@link #assertAtOnceJavaHeapOp()} is invoked,
     * and both the {@link Annotations#GROUP_BY_STATE} and the
     * {@link Annotations#GROUP_BY_REWRITE} annotations are looked up as
     * required properties.
     * 
     * @param args
     *            The operator arguments.
     * @param annotations
     *            The operator annotations.
     * 
     * @throws UnsupportedOperationException
     *             if the evaluation context is anything other than
     *             <code>CONTROLLER</code>.
     */
    public MemoryGroupByOp(final BOp[] args,
            final Map annotations) {

        super(args, annotations);

        // This operator may only be evaluated on the query controller.
        switch (getEvaluationContext()) {
		case CONTROLLER:
			break;
		default:
			throw new UnsupportedOperationException(
					Annotations.EVALUATION_CONTEXT + "="
							+ getEvaluationContext());
		}

        assertAtOnceJavaHeapOp();

        // Both annotations are read again by GroupByTask; they are looked up
        // here so that they are resolved when the operator is constructed.
        // NOTE(review): presumably getRequiredProperty() throws when the
        // annotation is missing - confirm against the super class.
        getRequiredProperty(Annotations.GROUP_BY_STATE);
        
        getRequiredProperty(Annotations.GROUP_BY_REWRITE);
        
	}
    
    /**
     * Return the initial capacity used when the task allocates its hash map
     * of solution groups.
     * 
     * @return The value of the {@link Annotations#INITIAL_CAPACITY}
     *         annotation as resolved by <code>getProperty()</code>, with
     *         {@link Annotations#DEFAULT_INITIAL_CAPACITY} supplied as the
     *         default.
     * 
     * @see Annotations#INITIAL_CAPACITY
     */
    public int getInitialCapacity() {

        final int initialCapacity = getProperty(Annotations.INITIAL_CAPACITY,
                Annotations.DEFAULT_INITIAL_CAPACITY);

        return initialCapacity;

    }

    /**
     * Return the load factor used when the task allocates its hash map of
     * solution groups.
     * 
     * @return The value of the {@link Annotations#LOAD_FACTOR} annotation as
     *         resolved by <code>getProperty()</code>, with
     *         {@link Annotations#DEFAULT_LOAD_FACTOR} supplied as the
     *         default.
     * 
     * @see Annotations#LOAD_FACTOR
     */
    public float getLoadFactor() {

        final float loadFactor = getProperty(Annotations.LOAD_FACTOR,
                Annotations.DEFAULT_LOAD_FACTOR);

        return loadFactor;

    }

    /**
     * Wrap a new {@link GroupByTask} for this operator and the given
     * evaluation context in a {@link FutureTask}. The task is only created
     * here; it is not started by this method.
     * 
     * @param context
     *            The evaluation context.
     * 
     * @return The {@link FutureTask} which will run the aggregation.
     */
    @Override
    public FutureTask eval(final BOpContext context) {

        final GroupByTask task = new GroupByTask(this, context);

        return new FutureTask(task);

    }

    /**
     * Hashable key identifying a solution group: the ordered values computed
     * for the GROUP_BY value expressions against a solution. Instances are
     * used as the keys of the {@link LinkedHashMap} maintained by the
     * {@link GroupByTask}.
     */
    private static class SolutionGroup {

        /** The hash code of {@link #vals}, computed once by the constructor. */
        private final int hash;

        /**
         * The computed values for the groupBy value expressions in the order
         * in which they were declared.
         */
        private final IConstant[] vals;

        private SolutionGroup(final IConstant[] vals) {
            this.vals = vals;
            this.hash = Arrays.hashCode(vals);
        }

        /**
         * Return a new {@link SolutionGroup} given the value expressions and
         * the binding set.
         * <p>
         * Note: Evaluating the value expressions has a side-effect on the
         * solution, which means that the solution needs to be mutable and the
         * modified solution is what gets stored. It might be nicer to NOT have
         * a side effect on the incoming solution: the GROUP_BY value
         * expressions could easily be re-computed when the aggregates are
         * evaluated over the solutions in a group. [These concerns are only
         * relevant when developing a generalized aggregation operator backed
         * by the HTree.]
         * 
         * @param groupBy
         *            The value expressions to be computed.
         * @param bset
         *            The binding set.
         * @param stats
         *            Used to report type errors.
         * 
         * @return The new {@link SolutionGroup} -or- <code>null</code> if any
         *         of the value expressions evaluates to a <code>null</code>
         *         -OR- throws a {@link SparqlTypeErrorException}.
         */
        @SuppressWarnings({ "rawtypes", "unchecked" })
        static SolutionGroup newInstance(final IValueExpression[] groupBy,
                final IBindingSet bset, final BOpStats stats) {

            final IConstant[] computed = new IConstant[groupBy.length];

            int k = 0;

            for (IValueExpression valueExpr : groupBy) {

                final Object value;

                try {

                    value = valueExpr.get(bset);

                } catch (SparqlTypeErrorException typeError) {

                    // Report the type error, then drop the solution.
                    TypeErrorLog.handleTypeError(typeError, valueExpr, stats);

                    return null;

                }

                if (value == null) {

                    // Not bound / not computable: drop the solution.
                    return null;

                }

                computed[k++] = new Constant(value);

            }

            return new SolutionGroup(computed);

        }

        @Override
        public int hashCode() {
            return hash;
        }

        /**
         * Two groups are equal when they have the same number of values and
         * the values are pairwise identical or equal.
         */
        @Override
        public boolean equals(final Object o) {

            if (o == this) {
                return true;
            }

            if (o instanceof SolutionGroup) {

                final IConstant[] other = ((SolutionGroup) o).vals;

                final int n = vals.length;

                if (n != other.length) {
                    return false;
                }

                for (int i = 0; i < n; i++) {

                    final IConstant mine = vals[i];

                    final IConstant theirs = other[i];

                    if (mine != theirs
                            && (mine == null || !mine.equals(theirs))) {
                        return false;
                    }

                }

                return true;

            }

            return false;

        }

        @Override
        public String toString() {

            final StringBuilder sb = new StringBuilder(super.toString());

            sb.append("{group=").append(Arrays.toString(vals)).append("}");

            return sb.toString();

        }

    } // SolutionGroup

    /**
     * A multiset of solutions associated with a {@link SolutionGroup}. The
     * solutions are retained in the order in which they were added and
     * duplicates are preserved.
     */
    private static class SolutionMultiSet {

        /**
         * The solutions in this group. The list is read directly by the
         * enclosing class when the aggregates for the group are computed.
         */
        private final List<IBindingSet> solutions = new LinkedList<IBindingSet>();

        /**
         * Add a solution to the multiset.
         * 
         * @param bset
         *            The solution.
         * 
         * @throws IllegalArgumentException
         *             if the argument is <code>null</code>.
         */
        public void add(final IBindingSet bset) {

            if(bset == null)
                throw new IllegalArgumentException();
            
            solutions.add(bset);
            
        }

    }
    
    /**
     * Task executing on the node.
     */
    static private class GroupByTask implements Callable {

        private final BOpContext context;

        /**
         * A map whose keys are the computed bindings on the GROUP_BY
         * expressions and whose values are the solution multisets which fall
         * into a given group.
         */
        private final LinkedHashMap map;

        private final IGroupByState groupByState;

        private final IGroupByRewriteState rewrite;
        
        private final IValueExpression[] groupBy;

        private final BOpStats stats;

        /**
         * Capture the evaluation context and the pre-computed GROUP_BY state
         * for the operator.
         * 
         * @param op
         *            The operator, from which the
         *            {@link Annotations#GROUP_BY_STATE} and
         *            {@link Annotations#GROUP_BY_REWRITE} annotations and the
         *            hash map configuration are read.
         * @param context
         *            The evaluation context.
         */
        GroupByTask(final MemoryGroupByOp op,
                final BOpContext context) {

            this.context = context;

            this.stats = context.getStats();

            this.groupByState = (IGroupByState) op
                    .getRequiredProperty(Annotations.GROUP_BY_STATE);

            this.rewrite = (IGroupByRewriteState) op
                    .getRequiredProperty(Annotations.GROUP_BY_REWRITE);

            this.groupBy = groupByState.getGroupByClause();

            if (groupBy == null) {

                // No GROUP_BY clause: all solutions form a single implicit
                // group and the map is not used.
                this.map = null;

            } else {

                this.map = new LinkedHashMap(op.getInitialCapacity(),
                        op.getLoadFactor());

            }

        }

        /**
         * Add the solution to the multiset for the appropriate group. If we can
         * not compute the GROUP_BY value expressions for a solution, then the
         * solution is dropped.
         * 
         * @param bset
         *            The solution.
         * 
         * @throws IllegalArgumentException
         *             if there is no GROUP_BY clause (<code>null</code> or
         *             empty) or if the solution is <code>null</code>.
         */
        private void accept(final IBindingSet bset) {

            if (groupBy == null || groupBy.length == 0)
                throw new IllegalArgumentException();

            if (bset == null)
                throw new IllegalArgumentException();

            final SolutionGroup s = SolutionGroup.newInstance(groupBy, bset,
                    stats);

            if (s == null) {

                // Drop the solution.

                if (log.isDebugEnabled())
                    log.debug("Dropping solution: " + bset);

                return;

            }

            /*
             * Typed view of the group map. Going through the typed view means
             * that the lookup below is type-correct regardless of whether the
             * field itself is declared with type parameters.
             */
            @SuppressWarnings("unchecked")
            final Map<SolutionGroup, SolutionMultiSet> groups = map;

            SolutionMultiSet m = groups.get(s);

            if (m == null) {

                // First solution observed for this group.
                groups.put(s, m = new SolutionMultiSet());

            }

            // Accept the solution.
            if (log.isTraceEnabled())
                log.trace("Accepting solution: " + bset);

            m.add(bset);

        }

        /**
         * Evaluate the aggregation "at once".
         * <p>
         * All chunks are drained from the source. When there is no GROUP_BY
         * clause, every solution is placed into one implicit group; otherwise
         * the solutions are grouped by {@link #accept(IBindingSet)}. One
         * aggregated solution is then computed per group by
         * {@link #aggregate(Iterable)}, and the aggregated solutions which
         * were not dropped are written onto the sink as a single chunk. The
         * sink is always closed before this method returns.
         * 
         * @return Always <code>null</code>.
         */
        @Override
        @SuppressWarnings("unchecked")
        public Void call() throws Exception {

            // NOTE(review): the source iterator is not closed by this method -
            // confirm that the caller is responsible for closing it.
            final ICloseableIterator<IBindingSet[]> itr = context
                    .getSource();

            final IBlockingBuffer<IBindingSet[]> sink = context.getSink();

            try {

                // The aggregated solutions to be output.
                final List<IBindingSet> accepted = new LinkedList<IBindingSet>();

                if (groupBy == null) {

                    /*
                     * Combine all solutions into a single multiset.
                     */
                    final SolutionMultiSet m = new SolutionMultiSet();

                    while (itr.hasNext()) {

                        final IBindingSet[] a = itr.next();

                        stats.chunksIn.increment();
                        stats.unitsIn.add(a.length);

                        for (IBindingSet bset : a) {

                            m.add(bset);

                        }

                    }

                    // Compute the aggregate for that group.
                    final IBindingSet bset = aggregate(m.solutions);

                    if (bset != null) {

                        if (log.isDebugEnabled())
                            log.debug("output: solution=" + bset);

                        accepted.add(bset);

                    } else {

                        if (log.isDebugEnabled())
                            log.debug("output : no solution.");

                    }

                } else {

                    /*
                     * Group the solutions.
                     */

                    while (itr.hasNext()) {

                        final IBindingSet[] a = itr.next();

                        stats.chunksIn.increment();
                        stats.unitsIn.add(a.length);

                        for (IBindingSet bset : a) {

                            accept(bset);

                        }

                    }

                    /*
                     * Typed view of the groups so that the iteration below is
                     * type-correct regardless of whether the map field itself
                     * is declared with type parameters.
                     */
                    final Set<Map.Entry<SolutionGroup, SolutionMultiSet>> groups = map
                            .entrySet();

                    for (Map.Entry<SolutionGroup, SolutionMultiSet> e : groups) {

                        final SolutionMultiSet m = e.getValue();

                        // Compute the aggregate for that group.
                        final IBindingSet bset = aggregate(m.solutions);

                        if (bset != null) {

                            if (log.isDebugEnabled())
                                log.debug("output: groupBy=" + e.getKey()
                                        + ", solution=" + bset);

                            accepted.add(bset);

                        } else {

                            if (log.isDebugEnabled())
                                log.debug("output: groupBy=" + e.getKey()
                                        + " : dropped.");

                        }

                    }

                    // discard the map.
                    map.clear();

                }

                /*
                 * Output the aggregated bindings for the accepted solutions.
                 */
                if (!accepted.isEmpty()) {

                    final IBindingSet[] b = accepted
                            .toArray(new IBindingSet[accepted.size()]);

                    sink.add(b);

                    // flush the output.
                    sink.flush();

                }

                // done.
                return null;

            } finally {

                sink.close();

            }

        } // call()

        /**
         * Compute the aggregate solution for a solution multiset (aka a group).
         * 
         * @return The aggregate solution -or- null if the solution
         *         for the group was dropped (type error or violated HAVING
         *         constraint).
         */
        private IBindingSet aggregate(final Iterable solutions) {
            /**
             * The intermediate solution with all bindings produced when
             * evaluating this solution group. Evaluation begins by binding any
             * bare variables or BINDs in the GROUP_BY clause, followed by
             * evaluating all aggregates, and then finally evaluating the
             * (rewritten) SELECT expressions. The rewritten HAVING clause (if
             * any) may then be then be trivially evaluated. If the solution is
             * not dropped, then only the SELECTed variables are projected out.
             */
            final IBindingSet aggregates = new ContextBindingSet(context,new ListBindingSet());

            /**
             * Propagate GROUP_BY expression onto [aggregates]. 
             */
            if (groupBy != null) {

                // The first solution in the group (must exist since the
                // group was observed).
                final IBindingSet aSolution = solutions.iterator().next();

                for (IValueExpression expr : groupBy) {

                    if (expr instanceof IVariable) {

                        /**
                         * Propagate bare variable used in GROUP_BY clause to
                         * [aggregates].
                         * 
                         * 
                         * GROUP BY ?x
                         * 
                         */

                        final IVariable var = (IVariable) expr;

                        // Note: MUST be a binding for each groupBy var.
                        @SuppressWarnings({ "rawtypes", "unchecked" })
                        final Constant val = new Constant(var.get(aSolution));

                        // Bind on [aggregates].
                        aggregates.set(var, val);

                    } else if (expr instanceof IBind) {

                        /**
                         * Propagate BIND declared by GROUP_BY clause to
                         * [aggregates].
                         * 
                         *                          * GROUP BY (2*?y as ?x)
                         * 
                         */

                        final IBind bindExpr = (IBind) expr;

                        // Compute value expression.
                        // Note: MUST be valid since group exists.
                        @SuppressWarnings({ "rawtypes", "unchecked" })
                        final Constant val = new Constant(
                                bindExpr.get(aSolution));

                        // Variable to be projected out by SELECT.
                        final IVariable ovar = ((IBind) expr).getVar();

                        // Bind on [aggregates].
                        aggregates.set(ovar, val);

                    }

                } // next GROUP_BY value expression

            } // if(groupBy != null)

            /**
             * Compute the aggregates.
             * 
             * TODO This can be further optimized by computing the column
             * projections of the different value expressions exactly once and
             * then applying the aggregation functions to those column
             * projections. As long as we adhere to the dependency ordering
             * among those aggregates, we can compute them all in a single pass
             * over the column projections.
             * 
             * TODO DISTINCT projections of columns projections can be modeled
             * in a bunch of different ways, but if we need the original column
             * projection as well as the DISTINCT of that column projection then
             * it makes sense to either form the DISTINCT projection while
             * building the column projection or as an after action.
             */
            {

                final boolean nestedAggregates = groupByState.isNestedAggregates();
                
                final Iterator, IVariable>> itr = rewrite
                        .getAggExpr().entrySet().iterator();

                while (itr.hasNext()) {
                
                    final Map.Entry, IVariable> e = itr.next();
                    
                    // Aggregate.
                    doAggregate(e.getKey(), e.getValue(), nestedAggregates,
                            aggregates, solutions, stats);
                    
                }
                
                if (log.isTraceEnabled())
                    log.trace("aggregates: " + aggregates);
                
            }

            // Evaluate SELECT expressions.
            for (IValueExpression expr : rewrite.getSelect2()) {

                try {
                    expr.get(aggregates);
                } catch (SparqlTypeErrorException ex) {
                    TypeErrorLog.handleTypeError(ex, expr, stats);
                    continue;
                } catch (IllegalArgumentException ex) {
                    /*
                     * Note: This is a hack turning an IllegalArgumentException
                     * which we presume is coming out of new Constant(null) into
                     * an (implicit) SPARQL type error so we can drop the
                     * binding for this SELECT expression. (Note that we are not
                     * trying to drop the entire group!)
                     */
                    TypeErrorLog.handleTypeError(ex, expr, stats);
                    continue;
                }

            }

            /*
             * Verify optional constraint(s).
             * 
             * TODO This could be done before fully computing the aggregates as
             * we only need to have on hand those computed aggregates on which
             * the HAVING clause depends.
             */
            {
                final boolean drop;
                final IConstraint[] having2 = rewrite.getHaving2();
                if (having2 != null
                        && !BOpUtility.isConsistent(having2, aggregates)) {
                    // drop this solution.
                    drop = true;
                } else {
                    drop = false;
                }

                if (log.isInfoEnabled())
                    log.info((drop ? "drop" : "keep") + " : " + aggregates);

                if (drop) {

                    // Drop this solution.
                    return null;

                }
            }

            // project out only selected variables.
            final IBindingSet out = aggregates.copy(groupByState
                    .getSelectVars().toArray(new IVariable[0]));

            return out;

        }

    } // GroupByTask

    /**
     * Apply the value expression to each solution in the group.
     * 
     * @param expr
     *            The {@link IAggregate} to be evaluated.
     * @param var
     *            The variable on which computed value of the {@link IAggregate}
     *            will be bound.
     * @param selectDependency
     *            When true, some aggregates bind variables which
     *            are relied on by other aggregates. In this case, this method
     *            must ensure that those bindings become visible.
     * @param aggregates
     *            The binding set on which the results are being bound (by the
     *            caller).
     * @param solutions
     *            The input solutions for a solution group across which we will
     *            compute the aggregate.
     * @param stats
     *            Used to report type errors.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private static void doAggregate(//
            final IAggregate expr,//
            final IVariable var,//
            final boolean selectDependency,//
            final IBindingSet aggregates,//
            final Iterable solutions,//
            final BOpStats stats//
            ) {
        
        try {

            IConstant c=null;

            if (expr.isWildcard() && expr.isDistinct()) {
                /**
                 * For a wildcard we basically need to operate on solution
                 * multisets. For example, COUNT(*) is the size of the solution
                 * multiset (aka group).
                 * 
                 * Note: It is possible to optimize COUNT(*) and COUNT(DISTINCT
                 * *) as the cardinality of the solution multiset / solution set
                 * respectively. However, we can not undertake this optimization
                 * when COUNT() is parameterized by an {@link IValueExpression},
                 * even a simple {@link IVariable}, since then we need to count
                 * the solutions where the value expression is non-
                 * null and NOT bind the result of the COUNT() for
                 * the group if the evaluation of the value expression results
                 * in an error for any solution in that group.
                 */
                
                // Set used to impose DISTINCT on the solution multiset.
                final LinkedHashSet set = new LinkedHashSet();

                expr.reset();
                for (IBindingSet bset : solutions) {

                    if (set.add(bset)) {
                    
                        if (selectDependency)
                            propagateAggregateBindings(aggregates, bset);
                        
                        // aggregate iff this is a new result.
                        expr.get(bset);
                        
                    }

                }

                final Object result = expr.done();
                if (result != null) {
                    c = new Constant(result);
                }
                
            } else if (expr.isDistinct()) {
                
                /*
                 * Apply aggregate function only to the distinct values which
                 * it's inner value expression takes on.
                 */
                
                // Set used to impose "DISTINCT" on value expression results.
                final Set set = new LinkedHashSet();
                
                expr.reset();
                
                for (IBindingSet bset : solutions) {
                    
                    final Object constants[] = new Object[expr.arity()];

                    for (int i=0;i> itr = aggregates
                .iterator();

        while (itr.hasNext()) {

            @SuppressWarnings("rawtypes")
            final Map.Entry e = itr.next();

            bset.set(e.getKey(), e.getValue());
            
        }

    }
    /**
     * Hashable wrapper around an ordered array of as-bound values.
     * <p>
     * Note: A similar class appears in different operators which use the
     * {@link HTree}. However, these classes differ in what bindings are
     * conceptually part of the key, in how they compute the hash code under
     * which the solution will be indexed, and in how they compare the
     * solutions for equality.
     * <p>
     * This implementation relies on an ordered array of values, which permits
     * a simpler {@link #equals(Object)} method than would be used if an
     * entire {@link IBindingSet} was being tested for equality (binding sets
     * do not consider order of the bindings when testing for equality).
     */
    private static class Solution implements Serializable {

        private static final long serialVersionUID = 1L;

        /** The hash code of {@link #vals}, computed once by the constructor. */
        private final int hash;

        /** The wrapped values. The array is retained, not copied. */
        private final Object[] vals;

        /**
         * Solution whose hash code is the hash code of the given array of
         * values.
         *
         * @param vals
         *            The values.
         */
        public Solution(final Object[] vals) {

            this.vals = vals;

            this.hash = java.util.Arrays.hashCode(vals);

        }

        @Override
        public int hashCode() {

            return hash;

        }

        /**
         * Two solutions are equal when they have the same number of values and
         * the values are pairwise identical or equal.
         */
        @Override
        public boolean equals(final Object o) {

            if (o == this) {
                return true;
            }

            if (o instanceof Solution) {

                final Object[] other = ((Solution) o).vals;

                final int n = vals.length;

                if (n != other.length) {
                    return false;
                }

                for (int i = 0; i < n; i++) {

                    final Object mine = vals[i];

                    final Object theirs = other[i];

                    if (mine != theirs
                            && (mine == null || !mine.equals(theirs))) {
                        return false;
                    }

                }

                return true;

            }

            return false;

        }

    } // class Solution
}