All downloads are free. The search and download functionality uses the official Maven repository.

com.bigdata.bop.solutions.MemoryGroupByOp Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.bop.solutions;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.FutureTask;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Constant;
import com.bigdata.bop.ContextBindingSet;
import com.bigdata.bop.HashMapAnnotations;
import com.bigdata.bop.IBind;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IValueExpression;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.aggregate.IAggregate;
import com.bigdata.bop.bindingSet.ListBindingSet;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.htree.HTree;
import com.bigdata.rdf.error.SparqlTypeErrorException;
import com.bigdata.rdf.sparql.ast.FilterNode;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.util.InnerCause;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * An in-memory at-once generalized aggregation operator.
 * 
 * @author Bryan Thompson
 * @version $Id: DistinctElementFilter.java 3466 2010-08-27 14:28:04Z
 *          thompsonbry $
 */
public class MemoryGroupByOp extends GroupByOp {

    /**
     * 
     */
    private static final long serialVersionUID = 1L;

	private static final transient Logger log = Logger
			.getLogger(MemoryGroupByOp.class);
    
    public interface Annotations extends GroupByOp.Annotations,
            HashMapAnnotations {

	}

    /**
     * {@inheritDoc}
     * 

* Returns false. This is a generalized aggregation operator * and may be used to evaluate any aggregation request. */ @Override final public boolean isPipelinedAggregationOp() { return false; } /** * Constructor required for {@link com.bigdata.bop.BOpUtility#deepCopy(FilterNode)}. */ public MemoryGroupByOp(final MemoryGroupByOp op) { super(op); } /** * Required shallow copy constructor. */ public MemoryGroupByOp(final BOp[] args, final Map annotations) { super(args, annotations); switch (getEvaluationContext()) { case CONTROLLER: break; default: throw new UnsupportedOperationException( Annotations.EVALUATION_CONTEXT + "=" + getEvaluationContext()); } assertAtOnceJavaHeapOp(); getRequiredProperty(Annotations.GROUP_BY_STATE); getRequiredProperty(Annotations.GROUP_BY_REWRITE); } /** * @see Annotations#INITIAL_CAPACITY */ public int getInitialCapacity() { return getProperty(Annotations.INITIAL_CAPACITY, Annotations.DEFAULT_INITIAL_CAPACITY); } /** * @see Annotations#LOAD_FACTOR */ public float getLoadFactor() { return getProperty(Annotations.LOAD_FACTOR, Annotations.DEFAULT_LOAD_FACTOR); } @Override public FutureTask eval(final BOpContext context) { return new FutureTask(new GroupByTask(this, context)); } /** * Wrapper used for the solution groups in the {@link ConcurrentHashMap}. */ private static class SolutionGroup { /** The hash code for {@link #vals}. */ private final int hash; /** * The computed values for the groupBy value expressions in the order in * which they were declared. */ private final IConstant[] vals; @Override public String toString() { return super.toString() + // "{group=" + Arrays.toString(vals) + // "}"; } /** * Return a new {@link SolutionGroup} given the value expressions and * the binding set. * * @param groupBy * The value expressions to be computed. * @param bset * The binding set. * * @return The new {@link SolutionGroup} -or- null if any * of the value expressions evaluates or a null * -OR- throws a {@link SparqlTypeErrorException}. 
*/ static SolutionGroup newInstance(final IValueExpression[] groupBy, final IBindingSet bset, final BOpStats stats) { final IConstant[] r = new IConstant[groupBy.length]; for (int i = 0; i < groupBy.length; i++) { final IValueExpression expr = groupBy[i]; final Object asBound; try { /* * Note: This has a side-effect on the solution, which means * that it needs to be mutable and we have to store the * modified solution. However, it might be nicer to NOT have * a side effect on the incoming solution. That means that * it can continue to be buffered in a read-only encoding on * the native heap and all we need to do here is associate * it with the appropriate group. We can easily re-compute * the GROUP_BY value expressions when we actually evaluate * the aggregates over the solutions in a solution group. At * that point, the solution is once again materialized in * memory on the JVM. [These concerns are only relevant when * developing a generalized aggregation operator backed by * the HTree.] */ asBound = expr.get(bset); } catch (SparqlTypeErrorException ex) { TypeErrorLog.handleTypeError(ex, expr, stats); // Drop solution. return null; } if (asBound == null) { // Drop solution. return null; } @SuppressWarnings({ "rawtypes", "unchecked" }) final IConstant x = new Constant(asBound); r[i] = x; } return new SolutionGroup(r); } private SolutionGroup(final IConstant[] vals) { this.vals = vals; this.hash = java.util.Arrays.hashCode(vals); } @Override public int hashCode() { return hash; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof SolutionGroup)) { return false; } final SolutionGroup t = (SolutionGroup) o; if (vals.length != t.vals.length) return false; for (int i = 0; i < vals.length; i++) { if (vals[i] == t.vals[i]) continue; if (vals[i] == null) return false; if (!vals[i].equals(t.vals[i])) return false; } return true; } } // SolutionGroup /** * A multiset of solutions associated with a {@link SolutionGroup}. 
*/ private static class SolutionMultiSet { private List solutions = new LinkedList(); public void add(final IBindingSet bset) { if(bset == null) throw new IllegalArgumentException(); solutions.add(bset); } } /** * Task executing on the node. */ static private class GroupByTask implements Callable { private final BOpContext context; /** * A map whose keys are the computed bindings on the GROUP_BY * expressions and whose values are the solution multisets which fall * into a given group. */ private final LinkedHashMap map; private final IGroupByState groupByState; private final IGroupByRewriteState rewrite; private final IValueExpression[] groupBy; private final BOpStats stats; GroupByTask(final MemoryGroupByOp op, final BOpContext context) { this.context = context; this.stats = context.getStats(); this.groupByState = (IGroupByState) op .getRequiredProperty(Annotations.GROUP_BY_STATE); this.rewrite = (IGroupByRewriteState) op .getRequiredProperty(Annotations.GROUP_BY_REWRITE); this.groupBy = groupByState.getGroupByClause(); // The map is only defined if a GROUP_BY clause was used. this.map = groupBy == null ? null : new LinkedHashMap( op.getInitialCapacity(), op.getLoadFactor()); } /** * Add the solution to the multiset for the appropriate group. If we can * not compute the GROUP_BY value expressions for a solution, then the * solution is dropped. * * @param bset * The solution. */ private void accept(final IBindingSet bset) { if (groupBy == null || groupBy.length == 0) throw new IllegalArgumentException(); if (bset == null) throw new IllegalArgumentException(); final SolutionGroup s = SolutionGroup.newInstance(groupBy, bset, stats); if (s == null) { // Drop the solution. if (log.isDebugEnabled()) log.debug("Dropping solution: " + bset); return; } SolutionMultiSet m = map.get(s); if (m == null) { map.put(s, m = new SolutionMultiSet()); } // Accept the solution. 
if (log.isTraceEnabled()) log.trace("Accepting solution: " + bset); m.add(bset); } @Override public Void call() throws Exception { final ICloseableIterator itr = context .getSource(); final IBlockingBuffer sink = context.getSink(); try { final List accepted = new LinkedList(); int naccepted = 0; if (groupBy == null) { /* * Combine all solutions into a single multiset. */ final SolutionMultiSet m = new SolutionMultiSet(); while (itr.hasNext()) { final IBindingSet[] a = itr.next(); stats.chunksIn.increment(); stats.unitsIn.add(a.length); for (IBindingSet bset : a) { m.add(bset); } } // Compute the aggregate for that group. final IBindingSet bset = aggregate(m.solutions); if (bset != null) { if (log.isDebugEnabled()) log.debug("output: solution=" + bset); accepted.add(bset); naccepted++; } else { if (log.isDebugEnabled()) log.debug("output : no solution."); } } else { /* * Group the solutions. */ while (itr.hasNext()) { final IBindingSet[] a = itr.next(); stats.chunksIn.increment(); stats.unitsIn.add(a.length); for (IBindingSet bset : a) { accept(bset); } } for (Map.Entry e : map .entrySet()) { final SolutionMultiSet m = e.getValue(); // Compute the aggregate for that group. final IBindingSet bset = aggregate(m.solutions); if (bset != null) { if (log.isDebugEnabled()) log.debug("output: groupBy=" + e.getKey() + ", solution=" + bset); accepted.add(bset); naccepted++; } else { if (log.isDebugEnabled()) log.debug("output: groupBy=" + e.getKey() + " : dropped."); } } // discard the map. map.clear(); } /* * Output the aggregated bindings for the accepted solutions. */ if (naccepted > 0) { final IBindingSet[] b = accepted .toArray(new IBindingSet[naccepted]); sink.add(b); // flush the output. sink.flush(); } // done. return null; } finally { sink.close(); } } // call() /** * Compute the aggregate solution for a solution multiset (aka a group). 
* * @return The aggregate solution -or- null if the solution * for the group was dropped (type error or violated HAVING * constraint). */ private IBindingSet aggregate(final Iterable solutions) { /** * The intermediate solution with all bindings produced when * evaluating this solution group. Evaluation begins by binding any * bare variables or BINDs in the GROUP_BY clause, followed by * evaluating all aggregates, and then finally evaluating the * (rewritten) SELECT expressions. The rewritten HAVING clause (if * any) may then be then be trivially evaluated. If the solution is * not dropped, then only the SELECTed variables are projected out. */ final IBindingSet aggregates = new ContextBindingSet(context,new ListBindingSet()); /** * Propagate GROUP_BY expression onto [aggregates]. */ if (groupBy != null) { // The first solution in the group (must exist since the // group was observed). final IBindingSet aSolution = solutions.iterator().next(); for (IValueExpression expr : groupBy) { if (expr instanceof IVariable) { /** * Propagate bare variable used in GROUP_BY clause to * [aggregates]. * *

                         * GROUP BY ?x
                         * 
*/ final IVariable var = (IVariable) expr; // Note: MUST be a binding for each groupBy var. @SuppressWarnings({ "rawtypes", "unchecked" }) final Constant val = new Constant(var.get(aSolution)); // Bind on [aggregates]. aggregates.set(var, val); } else if (expr instanceof IBind) { /** * Propagate BIND declared by GROUP_BY clause to * [aggregates]. * *
                         * GROUP BY (2*?y as ?x)
                         * 
*/ final IBind bindExpr = (IBind) expr; // Compute value expression. // Note: MUST be valid since group exists. @SuppressWarnings({ "rawtypes", "unchecked" }) final Constant val = new Constant( bindExpr.get(aSolution)); // Variable to be projected out by SELECT. final IVariable ovar = ((IBind) expr).getVar(); // Bind on [aggregates]. aggregates.set(ovar, val); } } // next GROUP_BY value expression } // if(groupBy != null) /** * Compute the aggregates. * * TODO This can be further optimized by computing the column * projections of the different value expressions exactly once and * then applying the aggregation functions to those column * projections. As long as we adhere to the dependency ordering * among those aggregates, we can compute them all in a single pass * over the column projections. * * TODO DISTINCT projections of columns projections can be modeled * in a bunch of different ways, but if we need the original column * projection as well as the DISTINCT of that column projection then * it makes sense to either form the DISTINCT projection while * building the column projection or as an after action. */ { final boolean nestedAggregates = groupByState.isNestedAggregates(); final Iterator, IVariable>> itr = rewrite .getAggExpr().entrySet().iterator(); while (itr.hasNext()) { final Map.Entry, IVariable> e = itr.next(); // Aggregate. doAggregate(e.getKey(), e.getValue(), nestedAggregates, aggregates, solutions, stats); } if (log.isTraceEnabled()) log.trace("aggregates: " + aggregates); } // Evaluate SELECT expressions. for (IValueExpression expr : rewrite.getSelect2()) { try { expr.get(aggregates); } catch (SparqlTypeErrorException ex) { TypeErrorLog.handleTypeError(ex, expr, stats); continue; } catch (IllegalArgumentException ex) { /* * Note: This is a hack turning an IllegalArgumentException * which we presume is coming out of new Constant(null) into * an (implicit) SPARQL type error so we can drop the * binding for this SELECT expression. 
(Note that we are not * trying to drop the entire group!) */ TypeErrorLog.handleTypeError(ex, expr, stats); continue; } } /* * Verify optional constraint(s). * * TODO This could be done before fully computing the aggregates as * we only need to have on hand those computed aggregates on which * the HAVING clause depends. */ { final boolean drop; final IConstraint[] having2 = rewrite.getHaving2(); if (having2 != null && !BOpUtility.isConsistent(having2, aggregates)) { // drop this solution. drop = true; } else { drop = false; } if (log.isInfoEnabled()) log.info((drop ? "drop" : "keep") + " : " + aggregates); if (drop) { // Drop this solution. return null; } } // project out only selected variables. final IBindingSet out = aggregates.copy(groupByState .getSelectVars().toArray(new IVariable[0])); return out; } } // GroupByTask /** * Apply the value expression to each solution in the group. * * @param expr * The {@link IAggregate} to be evaluated. * @param var * The variable on which computed value of the {@link IAggregate} * will be bound. * @param selectDependency * When true, some aggregates bind variables which * are relied on by other aggregates. In this case, this method * must ensure that those bindings become visible. * @param aggregates * The binding set on which the results are being bound (by the * caller). * @param solutions * The input solutions for a solution group across which we will * compute the aggregate. * @param stats * Used to report type errors. */ @SuppressWarnings({ "unchecked", "rawtypes" }) private static void doAggregate(// final IAggregate expr,// final IVariable var,// final boolean selectDependency,// final IBindingSet aggregates,// final Iterable solutions,// final BOpStats stats// ) { try { IConstant c=null; if (expr.isWildcard() && expr.isDistinct()) { /** * For a wildcard we basically need to operate on solution * multisets. For example, COUNT(*) is the size of the solution * multiset (aka group). 
* * Note: It is possible to optimize COUNT(*) and COUNT(DISTINCT * *) as the cardinality of the solution multiset / solution set * respectively. However, we can not undertake this optimization * when COUNT() is parameterized by an {@link IValueExpression}, * even a simple {@link IVariable}, since then we need to count * the solutions where the value expression is non- * null and NOT bind the result of the COUNT() for * the group if the evaluation of the value expression results * in an error for any solution in that group. */ // Set used to impose DISTINCT on the solution multiset. final LinkedHashSet set = new LinkedHashSet(); expr.reset(); for (IBindingSet bset : solutions) { if (set.add(bset)) { if (selectDependency) propagateAggregateBindings(aggregates, bset); // aggregate iff this is a new result. expr.get(bset); } } final Object result = expr.done(); if (result != null) { c = new Constant(result); } } else if (expr.isDistinct()) { /* * Apply aggregate function only to the distinct values which * it's inner value expression takes on. */ // Set used to impose "DISTINCT" on value expression results. final Set set = new LinkedHashSet(); expr.reset(); for (IBindingSet bset : solutions) { final Object constants[] = new Object[expr.arity()]; for (int i=0;i> itr = aggregates .iterator(); while (itr.hasNext()) { @SuppressWarnings("rawtypes") final Map.Entry e = itr.next(); bset.set(e.getKey(), e.getValue()); } } /** * Wrapper used for as bound solutions in the {@link HTree}. *

* Note: A similar class appears in different operators which use the * {@link HTree}. However, these classes differ in what bindings are * conceptually part of the key in the {@link HTree}, in how they compute * the hash code under which the solution will be indexed, and in how they * compare the solutions for equality. *

* This implementation relies on an ordered {@link IVariable}[] which * defines the bindings that are part of the key and permits a simpler * {@link #equals(Object)}s method that would be used if an entire * {@link IBindingSet} was being tested for equality (binding sets do not * consider order of the bindings when testing for equality). */ private static class Solution implements Serializable { private static final long serialVersionUID = 1L; private final int hash; private final Object[] vals; /** * Solution whose hash code is the hash code of the {@link IConstant}[]. * * @param vals * The values. */ public Solution(final Object[] vals) { this.vals = vals; this.hash = java.util.Arrays.hashCode(vals); } @Override public int hashCode() { return hash; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof Solution)) { return false; } final Solution t = (Solution) o; if (vals.length != t.vals.length) return false; for (int i = 0; i < vals.length; i++) { if (vals[i] == t.vals[i]) continue; if (vals[i] == null) return false; if (!vals[i].equals(t.vals[i])) return false; } return true; } } // class Solution }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy