// com.bigdata.bop.solutions.MemoryGroupByOp (Maven / Gradle / Ivy)
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.bop.solutions;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.FutureTask;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Constant;
import com.bigdata.bop.ContextBindingSet;
import com.bigdata.bop.HashMapAnnotations;
import com.bigdata.bop.IBind;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IValueExpression;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.aggregate.IAggregate;
import com.bigdata.bop.bindingSet.ListBindingSet;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.htree.HTree;
import com.bigdata.rdf.error.SparqlTypeErrorException;
import com.bigdata.rdf.sparql.ast.FilterNode;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.util.InnerCause;
import cutthecrap.utils.striterators.ICloseableIterator;
/**
* An in-memory at-once generalized aggregation operator.
*
* @author Bryan Thompson
* @version $Id: DistinctElementFilter.java 3466 2010-08-27 14:28:04Z
* thompsonbry $
*/
public class MemoryGroupByOp extends GroupByOp {
/** Serialization version identifier for this operator. */
private static final long serialVersionUID = 1L;

/** Logger used for debug/trace/info output by this operator and its task. */
private static final Logger log = Logger.getLogger(MemoryGroupByOp.class);
/**
* Annotations for this operator. Combines the annotations declared by
* {@link GroupByOp.Annotations} and {@link HashMapAnnotations}; it declares
* none of its own.
*/
public interface Annotations extends GroupByOp.Annotations,
HashMapAnnotations {
}
/**
 * {@inheritDoc}
 * <p>
 * Always returns <code>false</code>. This is a generalized aggregation
 * operator and may be used to evaluate any aggregation request.
 */
@Override
public final boolean isPipelinedAggregationOp() {
    return false;
}
/**
* Copy constructor; delegates directly to the superclass copy constructor.
* <p>
* Constructor required for
* {@link com.bigdata.bop.BOpUtility#deepCopy(FilterNode)}.
*
* @param op
* The operator to be copied.
*/
public MemoryGroupByOp(final MemoryGroupByOp op) {
super(op);
}
/**
 * Required shallow copy constructor.
 * <p>
 * After delegating to the superclass, the operator configuration is
 * validated: the evaluation context must be <code>CONTROLLER</code>, the
 * at-once / Java-heap assertion of the superclass must hold, and both the
 * {@link Annotations#GROUP_BY_STATE} and
 * {@link Annotations#GROUP_BY_REWRITE} annotations are looked up as
 * required properties.
 *
 * @param args
 *            The operands (passed through to the superclass).
 * @param annotations
 *            The operator annotations (passed through to the superclass).
 *
 * @throws UnsupportedOperationException
 *             if the evaluation context is anything other than
 *             <code>CONTROLLER</code>.
 */
public MemoryGroupByOp(final BOp[] args,
        final Map<String, Object> annotations) {
    super(args, annotations);
    switch (getEvaluationContext()) {
    case CONTROLLER:
        break;
    default:
        throw new UnsupportedOperationException(
                Annotations.EVALUATION_CONTEXT + "="
                        + getEvaluationContext());
    }
    assertAtOnceJavaHeapOp();
    // Look up the required annotations eagerly (presumably this fails
    // fast when one is missing -- confirm against getRequiredProperty()).
    getRequiredProperty(Annotations.GROUP_BY_STATE);
    getRequiredProperty(Annotations.GROUP_BY_REWRITE);
}
/**
 * Return the initial capacity used when the task allocates its map of
 * solution groups, falling back to the declared default when the
 * annotation is not set.
 *
 * @see Annotations#INITIAL_CAPACITY
 */
public int getInitialCapacity() {
    final int initialCapacity = getProperty(Annotations.INITIAL_CAPACITY,
            Annotations.DEFAULT_INITIAL_CAPACITY);
    return initialCapacity;
}
/**
 * Return the load factor used when the task allocates its map of solution
 * groups, falling back to the declared default when the annotation is not
 * set.
 *
 * @see Annotations#LOAD_FACTOR
 */
public float getLoadFactor() {
    final float loadFactor = getProperty(Annotations.LOAD_FACTOR,
            Annotations.DEFAULT_LOAD_FACTOR);
    return loadFactor;
}
/**
 * Wrap a new {@link GroupByTask} for this operator and the given context
 * in a {@link FutureTask}. The task is not started here; the caller is
 * responsible for running the returned future.
 *
 * @param context
 *            The evaluation context for this operator.
 *
 * @return The (not yet executed) future for the group-by task.
 */
@Override
public FutureTask<Void> eval(final BOpContext<IBindingSet> context) {
    return new FutureTask<Void>(new GroupByTask(this, context));
}
/**
* Wrapper used for the solution groups in the {@link ConcurrentHashMap}.
*/
private static class SolutionGroup {
/** The hash code for {@link #vals}. */
private final int hash;
/**
* The computed values for the groupBy value expressions in the order in
* which they were declared.
*/
private final IConstant>[] vals;
@Override
public String toString() {
return super.toString() + //
"{group=" + Arrays.toString(vals) + //
"}";
}
/**
* Return a new {@link SolutionGroup} given the value expressions and
* the binding set.
*
* @param groupBy
* The value expressions to be computed.
* @param bset
* The binding set.
*
* @return The new {@link SolutionGroup} -or- null
if any
* of the value expressions evaluates or a null
* -OR- throws a {@link SparqlTypeErrorException}.
*/
static SolutionGroup newInstance(final IValueExpression>[] groupBy,
final IBindingSet bset, final BOpStats stats) {
final IConstant>[] r = new IConstant>[groupBy.length];
for (int i = 0; i < groupBy.length; i++) {
final IValueExpression> expr = groupBy[i];
final Object asBound;
try {
/*
* Note: This has a side-effect on the solution, which means
* that it needs to be mutable and we have to store the
* modified solution. However, it might be nicer to NOT have
* a side effect on the incoming solution. That means that
* it can continue to be buffered in a read-only encoding on
* the native heap and all we need to do here is associate
* it with the appropriate group. We can easily re-compute
* the GROUP_BY value expressions when we actually evaluate
* the aggregates over the solutions in a solution group. At
* that point, the solution is once again materialized in
* memory on the JVM. [These concerns are only relevant when
* developing a generalized aggregation operator backed by
* the HTree.]
*/
asBound = expr.get(bset);
} catch (SparqlTypeErrorException ex) {
TypeErrorLog.handleTypeError(ex, expr, stats);
// Drop solution.
return null;
}
if (asBound == null) {
// Drop solution.
return null;
}
@SuppressWarnings({ "rawtypes", "unchecked" })
final IConstant> x = new Constant(asBound);
r[i] = x;
}
return new SolutionGroup(r);
}
private SolutionGroup(final IConstant>[] vals) {
this.vals = vals;
this.hash = java.util.Arrays.hashCode(vals);
}
@Override
public int hashCode() {
return hash;
}
@Override
public boolean equals(final Object o) {
if (this == o)
return true;
if (!(o instanceof SolutionGroup)) {
return false;
}
final SolutionGroup t = (SolutionGroup) o;
if (vals.length != t.vals.length)
return false;
for (int i = 0; i < vals.length; i++) {
if (vals[i] == t.vals[i])
continue;
if (vals[i] == null)
return false;
if (!vals[i].equals(t.vals[i]))
return false;
}
return true;
}
} // SolutionGroup
/**
 * A multiset of solutions associated with a {@link SolutionGroup}.
 * Solutions are retained in the order in which they were added and
 * duplicates are kept.
 */
private static class SolutionMultiSet {

    /** The solutions collected for the group. */
    private final List<IBindingSet> solutions = new LinkedList<IBindingSet>();

    /**
     * Add a solution to this multiset.
     *
     * @param bset
     *            The solution.
     *
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public void add(final IBindingSet bset) {
        if (bset == null) {
            throw new IllegalArgumentException();
        }
        solutions.add(bset);
    }

}
/**
* Task executing on the node.
*/
static private class GroupByTask implements Callable {
private final BOpContext context;
/**
* A map whose keys are the computed bindings on the GROUP_BY
* expressions and whose values are the solution multisets which fall
* into a given group.
*/
private final LinkedHashMap map;
private final IGroupByState groupByState;
private final IGroupByRewriteState rewrite;
private final IValueExpression>[] groupBy;
private final BOpStats stats;
GroupByTask(final MemoryGroupByOp op,
final BOpContext context) {
this.context = context;
this.stats = context.getStats();
this.groupByState = (IGroupByState) op
.getRequiredProperty(Annotations.GROUP_BY_STATE);
this.rewrite = (IGroupByRewriteState) op
.getRequiredProperty(Annotations.GROUP_BY_REWRITE);
this.groupBy = groupByState.getGroupByClause();
// The map is only defined if a GROUP_BY clause was used.
this.map = groupBy == null ? null
: new LinkedHashMap(
op.getInitialCapacity(), op.getLoadFactor());
}
/**
* Add the solution to the multiset for the appropriate group. If we can
* not compute the GROUP_BY value expressions for a solution, then the
* solution is dropped.
*
* @param bset
* The solution.
*/
private void accept(final IBindingSet bset) {
if (groupBy == null || groupBy.length == 0)
throw new IllegalArgumentException();
if (bset == null)
throw new IllegalArgumentException();
final SolutionGroup s = SolutionGroup.newInstance(groupBy, bset,
stats);
if (s == null) {
// Drop the solution.
if (log.isDebugEnabled())
log.debug("Dropping solution: " + bset);
return;
}
SolutionMultiSet m = map.get(s);
if (m == null) {
map.put(s, m = new SolutionMultiSet());
}
// Accept the solution.
if (log.isTraceEnabled())
log.trace("Accepting solution: " + bset);
m.add(bset);
}
@Override
public Void call() throws Exception {
final ICloseableIterator itr = context
.getSource();
final IBlockingBuffer sink = context.getSink();
try {
final List accepted = new LinkedList();
int naccepted = 0;
if (groupBy == null) {
/*
* Combine all solutions into a single multiset.
*/
final SolutionMultiSet m = new SolutionMultiSet();
while (itr.hasNext()) {
final IBindingSet[] a = itr.next();
stats.chunksIn.increment();
stats.unitsIn.add(a.length);
for (IBindingSet bset : a) {
m.add(bset);
}
}
// Compute the aggregate for that group.
final IBindingSet bset = aggregate(m.solutions);
if (bset != null) {
if (log.isDebugEnabled())
log.debug("output: solution=" + bset);
accepted.add(bset);
naccepted++;
} else {
if (log.isDebugEnabled())
log.debug("output : no solution.");
}
} else {
/*
* Group the solutions.
*/
while (itr.hasNext()) {
final IBindingSet[] a = itr.next();
stats.chunksIn.increment();
stats.unitsIn.add(a.length);
for (IBindingSet bset : a) {
accept(bset);
}
}
for (Map.Entry e : map
.entrySet()) {
final SolutionMultiSet m = e.getValue();
// Compute the aggregate for that group.
final IBindingSet bset = aggregate(m.solutions);
if (bset != null) {
if (log.isDebugEnabled())
log.debug("output: groupBy=" + e.getKey()
+ ", solution=" + bset);
accepted.add(bset);
naccepted++;
} else {
if (log.isDebugEnabled())
log.debug("output: groupBy=" + e.getKey()
+ " : dropped.");
}
}
// discard the map.
map.clear();
}
/*
* Output the aggregated bindings for the accepted solutions.
*/
if (naccepted > 0) {
final IBindingSet[] b = accepted
.toArray(new IBindingSet[naccepted]);
sink.add(b);
// flush the output.
sink.flush();
}
// done.
return null;
} finally {
sink.close();
}
} // call()
/**
* Compute the aggregate solution for a solution multiset (aka a group).
*
* @return The aggregate solution -or- null
if the solution
* for the group was dropped (type error or violated HAVING
* constraint).
*/
private IBindingSet aggregate(final Iterable solutions) {
/**
* The intermediate solution with all bindings produced when
* evaluating this solution group. Evaluation begins by binding any
* bare variables or BINDs in the GROUP_BY clause, followed by
* evaluating all aggregates, and then finally evaluating the
* (rewritten) SELECT expressions. The rewritten HAVING clause (if
* any) may then be then be trivially evaluated. If the solution is
* not dropped, then only the SELECTed variables are projected out.
*/
final IBindingSet aggregates = new ContextBindingSet(context,new ListBindingSet());
/**
* Propagate GROUP_BY expression onto [aggregates].
*/
if (groupBy != null) {
// The first solution in the group (must exist since the
// group was observed).
final IBindingSet aSolution = solutions.iterator().next();
for (IValueExpression> expr : groupBy) {
if (expr instanceof IVariable>) {
/**
* Propagate bare variable used in GROUP_BY clause to
* [aggregates].
*
*
* GROUP BY ?x
*
*/
final IVariable> var = (IVariable>) expr;
// Note: MUST be a binding for each groupBy var.
@SuppressWarnings({ "rawtypes", "unchecked" })
final Constant> val = new Constant(var.get(aSolution));
// Bind on [aggregates].
aggregates.set(var, val);
} else if (expr instanceof IBind>) {
/**
* Propagate BIND declared by GROUP_BY clause to
* [aggregates].
*
*
* GROUP BY (2*?y as ?x)
*
*/
final IBind> bindExpr = (IBind>) expr;
// Compute value expression.
// Note: MUST be valid since group exists.
@SuppressWarnings({ "rawtypes", "unchecked" })
final Constant> val = new Constant(
bindExpr.get(aSolution));
// Variable to be projected out by SELECT.
final IVariable> ovar = ((IBind>) expr).getVar();
// Bind on [aggregates].
aggregates.set(ovar, val);
}
} // next GROUP_BY value expression
} // if(groupBy != null)
/**
* Compute the aggregates.
*
* TODO This can be further optimized by computing the column
* projections of the different value expressions exactly once and
* then applying the aggregation functions to those column
* projections. As long as we adhere to the dependency ordering
* among those aggregates, we can compute them all in a single pass
* over the column projections.
*
* TODO DISTINCT projections of columns projections can be modeled
* in a bunch of different ways, but if we need the original column
* projection as well as the DISTINCT of that column projection then
* it makes sense to either form the DISTINCT projection while
* building the column projection or as an after action.
*/
{
final boolean nestedAggregates = groupByState.isNestedAggregates();
final Iterator, IVariable>>> itr = rewrite
.getAggExpr().entrySet().iterator();
while (itr.hasNext()) {
final Map.Entry, IVariable>> e = itr.next();
// Aggregate.
doAggregate(e.getKey(), e.getValue(), nestedAggregates,
aggregates, solutions, stats);
}
if (log.isTraceEnabled())
log.trace("aggregates: " + aggregates);
}
// Evaluate SELECT expressions.
for (IValueExpression> expr : rewrite.getSelect2()) {
try {
expr.get(aggregates);
} catch (SparqlTypeErrorException ex) {
TypeErrorLog.handleTypeError(ex, expr, stats);
continue;
} catch (IllegalArgumentException ex) {
/*
* Note: This is a hack turning an IllegalArgumentException
* which we presume is coming out of new Constant(null) into
* an (implicit) SPARQL type error so we can drop the
* binding for this SELECT expression. (Note that we are not
* trying to drop the entire group!)
*/
TypeErrorLog.handleTypeError(ex, expr, stats);
continue;
}
}
/*
* Verify optional constraint(s).
*
* TODO This could be done before fully computing the aggregates as
* we only need to have on hand those computed aggregates on which
* the HAVING clause depends.
*/
{
final boolean drop;
final IConstraint[] having2 = rewrite.getHaving2();
if (having2 != null
&& !BOpUtility.isConsistent(having2, aggregates)) {
// drop this solution.
drop = true;
} else {
drop = false;
}
if (log.isInfoEnabled())
log.info((drop ? "drop" : "keep") + " : " + aggregates);
if (drop) {
// Drop this solution.
return null;
}
}
// project out only selected variables.
final IBindingSet out = aggregates.copy(groupByState
.getSelectVars().toArray(new IVariable[0]));
return out;
}
} // GroupByTask
/**
* Apply the value expression to each solution in the group.
* <p>
* NOTE(review): the text of this method is incomplete in this copy of the
* file. It breaks off inside the second (DISTINCT) branch and runs straight
* into the tail of a following helper; see the marker in the body. The
* comments below describe only what is still visible.
*
* @param expr
* The {@link IAggregate} to be evaluated.
* @param var
* The variable on which computed value of the {@link IAggregate}
* will be bound.
* @param selectDependency
* When <code>true</code>, some aggregates bind variables which
* are relied on by other aggregates. In this case, this method
* must ensure that those bindings become visible.
* @param aggregates
* The binding set on which the results are being bound (by the
* caller).
* @param solutions
* The input solutions for a solution group across which we will
* compute the aggregate.
* @param stats
* Used to report type errors.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private static void doAggregate(//
final IAggregate> expr,//
final IVariable> var,//
final boolean selectDependency,//
final IBindingSet aggregates,//
final Iterable solutions,//
final BOpStats stats//
) {
try {
// The computed aggregate value; remains null when expr.done() yields null.
IConstant> c=null;
if (expr.isWildcard() && expr.isDistinct()) {
/**
* For a wildcard we basically need to operate on solution
* multisets. For example, COUNT(*) is the size of the solution
* multiset (aka group).
*
* Note: It is possible to optimize COUNT(*) and COUNT(DISTINCT
* *) as the cardinality of the solution multiset / solution set
* respectively. However, we can not undertake this optimization
* when COUNT() is parameterized by an {@link IValueExpression},
* even a simple {@link IVariable}, since then we need to count
* the solutions where the value expression is non-
* <code>null</code> and NOT bind the result of the COUNT() for
* the group if the evaluation of the value expression results
* in an error for any solution in that group.
*/
// Set used to impose DISTINCT on the solution multiset.
final LinkedHashSet set = new LinkedHashSet();
expr.reset();
for (IBindingSet bset : solutions) {
// set.add() is true only the first time a given solution is seen.
if (set.add(bset)) {
if (selectDependency)
propagateAggregateBindings(aggregates, bset);
// aggregate iff this is a new result.
expr.get(bset);
}
}
final Object result = expr.done();
if (result != null) {
c = new Constant(result);
}
} else if (expr.isDistinct()) {
/*
* Apply aggregate function only to the distinct values which
* its inner value expression takes on.
*/
// Set used to impose "DISTINCT" on value expression results.
final Set set = new LinkedHashSet();
expr.reset();
for (IBindingSet bset : solutions) {
// One slot per argument of the aggregate.
final Object constants[] = new Object[expr.arity()];
// NOTE(review): SOURCE TEXT IS TRUNCATED ON THE NEXT LINE. Everything
// between "for (int i=0;i" and "> itr = aggregates" is missing: the rest
// of this loop, the remainder of doAggregate(), and the header of the
// following helper (called above as propagateAggregateBindings). Restore
// the missing text from version control; this cannot compile as shown.
// The statements that follow are the tail of that helper: they copy every
// binding found in [aggregates] onto [bset].
for (int i=0;i> itr = aggregates
.iterator();
while (itr.hasNext()) {
@SuppressWarnings("rawtypes")
final Map.Entry e = itr.next();
bset.set(e.getKey(), e.getValue());
}
}
/**
 * Immutable holder for an ordered array of values with value-based
 * equality and a hash code which is computed once, at construction.
 * <p>
 * Note: This class is described as the wrapper used for as bound solutions
 * in the {@link HTree}. A similar class appears in different operators
 * which use the {@link HTree}; those classes differ in which bindings are
 * conceptually part of the key, in how the hash code under which a solution
 * is indexed is computed, and in how solutions are compared for equality.
 * <p>
 * This implementation relies on an ordered {@link IVariable}[] to define
 * the bindings that are part of the key, which permits a simpler
 * {@link #equals(Object)} than would be needed to compare entire
 * {@link IBindingSet}s (binding sets do not consider the order of the
 * bindings when testing for equality).
 */
private static class Solution implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Cached {@link java.util.Arrays#hashCode(Object[])} of {@link #vals}. */
    private final int hash;

    /** The values, in key order. The array is not copied. */
    private final Object[] vals;

    /**
     * Solution whose hash code is the hash code of the given array.
     *
     * @param vals
     *            The values.
     */
    public Solution(final Object[] vals) {
        this.vals = vals;
        this.hash = java.util.Arrays.hashCode(vals);
    }

    @Override
    public int hashCode() {
        return hash;
    }

    /**
     * Two solutions are equal iff their arrays have the same length and
     * each pair of elements is either the same reference or equal per
     * {@link Object#equals(Object)}.
     */
    @Override
    public boolean equals(final Object o) {
        if (o == this) {
            return true;
        }
        if (o instanceof Solution) {
            final Object[] mine = vals;
            final Object[] theirs = ((Solution) o).vals;
            final int n = mine.length;
            if (n != theirs.length) {
                return false;
            }
            for (int i = 0; i < n; i++) {
                final Object a = mine[i];
                final Object b = theirs[i];
                // Differ unless same reference or a is non-null and equal to b.
                if (a != b && (a == null || !a.equals(b))) {
                    return false;
                }
            }
            return true;
        }
        return false;
    }

} // class Solution
}