// com.bigdata.rdf.sparql.ast.JoinSetUtil Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Oct 21, 2011
*/
package com.bigdata.rdf.sparql.ast;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.sparql.ast.optimizers.ASTHashJoinOptimizer;
/**
* Utility class for join analysis.
*
* TODO Surely we can do some bit math which would be slimmer and faster than
* managing the IVariable sets?
*
* @author Bryan Thompson
* @version $Id$
*/
public class JoinSetUtil {
private static final Logger log = Logger
.getLogger(ASTHashJoinOptimizer.class);
/**
* The group.
*/
public final GraphPatternGroup group;
/**
* The variables known to be bound on entry to the group.
*/
public final Set> knownBound;
/**
* The set of variables which are bound by the time the last required join
* is done. This includes the {@link #knownBound} variables plus any
* variables for any required join.
*/
public final Set> eventuallyBoundVars;
/**
* An array of the vertices for the required joins in the group. The indices
* into the array are the order in which the required joins were encountered
* in the group.
*/
public final IJoinNode[] requiredJoins;
/** The #of required joins in the group (the {@link #requiredJoins} length). */
public final int requiredJoinCount;
/**
* The FILTERS that can be run on entry to the group (and which should be
* lifted into the parent).
*/
public final Set preFilters;
/**
* The FILTERS that will be attached to the required joins for this group.
*/
public final Set joinFilters;
/**
* The FILTERS that can not be run until the end of the join group.
*/
public final Set postFilters;
/**
* The set of variables which are bound be each of the vertices for the
* required joins. The indices into the array are the order in which the
* requried joins were encountered in the group.
*/
public final Set>[] vars;
// final Set>[] varsWithFilters;
/**
* Diagonal matrix for the required joins. A cells having a positive value
* gives the #of directly shared variables. This will be ZERO (0) if there
* is no direct join. Only the upper diagonal of the matrix is populated.
*/
public final int[][] canJoin;
/**
* The #of direct joins found for each vertex (summed across
* {@link #canJoin} matrix.
*/
public final int[] directJoinCount;
/**
* The distinct sets of vertices which are composed solely of (the
* transitive closure of) joins on directly shared variables. For example
*
*
* p(x,y) X p(y,z) x p(z,t)
*
*
* Would form a direct join set each join will share a variable directly.
* Even though there are some vertices which do not share variables
* directly, they can be joined directly once other joins have been
* processed. For example p(z,t)
can be joined once we have
* p(y,z)
.
*
* The number of such direct join sets varies with the query. If all
* vertices fit into one direct join set, then the query can be pipelined
* efficiently. If there is more than one such set then we need to look
* further to see if we can identify joins on indirectly shared variables
* (shared through a FILTER) which can be used to piece these join sets
* together. If not, there is an unconstrained cross product in the query.
*/
public final Set directJoinSets;
/**
* A collection of vertices and the join variables they bind.
*/
public static class VertexJoinSet {
/**
* The set of verticies in this join set. The indices are the order in
* which the vertices were encountered in the group.
*/
final public Set vertices;
/**
* The set of variables bound by at least one join in the join set.
*/
final public Set> joinvars;
public VertexJoinSet() {
vertices = new LinkedHashSet();
joinvars = new LinkedHashSet>();
}
public VertexJoinSet(final Set vertices,
final Set> joinvars) {
this.vertices = vertices;
this.joinvars = joinvars;
}
public String toString() {
return getClass().getSimpleName() + "{vertices=" + vertices
+ ",joinvars=" + joinvars + "}";
}
public int hashCode() {
return vertices.hashCode();
}
public boolean equals(final Object o) {
if (this == o)
return true;
if (!(o instanceof VertexJoinSet))
return false;
final VertexJoinSet t = (VertexJoinSet) o;
if (!vertices.equals(t.vertices))
return false;
if (!joinvars.equals(t.joinvars))
return false;
return true;
}
}
/**
*
* @param sa
* @param knownBound Any variables known to be bound on entry to the group.
* @param group The group.
*/
public JoinSetUtil(final StaticAnalysis sa,
Set> knownBound,
final GraphPatternGroup group) {
this.group = group;
if (knownBound == null) {
knownBound = new LinkedHashSet>();
}
this.knownBound = knownBound;
/*
* Create an array of the vertices for the required joins.
*/
// Locate all the required joins.
{
final List list = new LinkedList();
for (IGroupMemberNode m : group) {
if (m instanceof IJoinNode) {
final IJoinNode j = (IJoinNode) m;
if (!j.isOptional())
list.add(j);
}
}
requiredJoinCount = list.size();
requiredJoins = list.toArray(new IJoinNode[requiredJoinCount]);
}
/*
* Create an array of the variables for each of the required join
* vertices.
*/
eventuallyBoundVars = new LinkedHashSet>();
{
vars = new Set[requiredJoinCount];
for (int i = 0; i < requiredJoinCount; i++) {
final IJoinNode j = requiredJoins[i];
// anything bound by this join.
final Set> tmp = sa.getSpannedVariables((BOp) j,
new LinkedHashSet>());
tmp.addAll(knownBound); // plus anything bound on entry to the group.
eventuallyBoundVars.addAll(vars[i] = tmp);
}
}
/*
* Identify the FILTERs which can run against the required joins. This
* will include both pre-filters (which really should be lifted out) and
* join-filters (which are satisified by the time we have run all the
* required joins).
*/
{
preFilters = new LinkedHashSet();
joinFilters = new LinkedHashSet();
postFilters = new LinkedHashSet();
for (IGroupMemberNode m : group) {
if (m instanceof FilterNode) {
final FilterNode f = (FilterNode) m;
if (sa.isFullyBound(f, knownBound)) {
/*
* The variables for this filter are already bound on
* entry.
*/
preFilters.add(f);
continue;
}
if (sa.isFullyBound(f, eventuallyBoundVars)) {
/*
* The variables for this filter will be fully bound by
* the time we are done with the required joins.
*/
joinFilters.add(f);
continue;
}
/*
* The variables for this filter will not be bound by the
* end of the required joins. If there are optional joins,
* then the filter might be able to succeed if it is run
* after the last optional join which could bind a variable
* which it needs and which is not bound by a required join
* in this group.
*/
postFilters.add(f);
}
}
}
/*
* Diagonal matrix for the required joins
*
* In the first pass, we set a cell to the #of directly shared join
* variables between each pair of vertices. This will be ZERO (0) if
* there is no direct join. Only the upper diagonal of the matrix is
* populated.
*/
canJoin = new int[requiredJoinCount][requiredJoinCount];
// The #of direct joins found for each vertex (summed across canJoin
// matrix).
directJoinCount = new int[requiredJoinCount];
{
for (int i = 0; i < requiredJoinCount; i++) {
for (int j = i + 1; j < requiredJoinCount; j++) {
final Set> sharedVars = new HashSet>();
sharedVars.addAll(vars[i]);
sharedVars.retainAll(vars[j]);
if ((canJoin[i][j] = sharedVars.size()) > 0) {
// #of times there is a direct join for this vertex.
directJoinCount[i]++;
directJoinCount[j]++;
}
}
}
}
if (log.isDebugEnabled()) {
// TODO Could be moved into a toString() for this class.
log.debug("\ncanJoin:\n" + toString(canJoin));
log.debug("\ndirectJoinCount:\n" + Arrays.toString(directJoinCount));
}
/*
* The distinct sets of vertices which composed solely of (the
* transitive closure of) direct joins.
*/
directJoinSets = calcDirectJoinSets();
}
/**
* Identify the subsets of the vertices which can join directly.
*
* Start with each row and create a set containing the variables for that
* vertex and a set of the vertices with which it has joined (initially this
* contains just that row). For each column that it joins with, add in all
* variables for that column and add the column to the set of joined
* vertices. Once all columns have been processed for the initial row, we
* have the first join set of vertices.
*
* Repeat for the first row not found in that join set until all joins have
* been incorporated into a join set. This gives us the #of join sets which
* do not overlap based on directly shared variables. It would be ideal to
* have one such join set. Where there is more than one there is clearly a
* question concerning which join set we should handle first.
*/
private Set calcDirectJoinSets() {
final Set joinSets = new LinkedHashSet();
// The set of vertices which have been consumed.
final Set used = new LinkedHashSet();
for (int i = 0; i < requiredJoinCount; i++) {
if (used.contains(i))
continue;
// This join set.
final VertexJoinSet joinSet = new VertexJoinSet();
joinSet.vertices.add(i);
used.add(i);
// The initial vertex for this join set.
joinSet.joinvars.addAll(vars[i]);
expandJoinSet(i, joinSet.vertices, used, joinSet.joinvars);
joinSets.add(joinSet);
if (log.isInfoEnabled())
log.info("joinSet: " + joinSet.vertices + " on "
+ joinSet.joinvars);
}
// All vertices must have been used.
assert used.size() == requiredJoinCount : "used=" + used
+ ", but requiredJoinCount=" + requiredJoinCount;
return joinSets;
}
private void expandJoinSet(final int i, final Set joinSet,
final Set used, final Set> joinvars) {
// Build up the join set
for (int j = 0; j < requiredJoinCount; j++) {
if (i == j)
continue;
if (used.contains(j))
continue;
if ((i < j && canJoin[i][j] > 0) || (i > j && canJoin[j][i] > 0)) {
joinSet.add(j);
used.add(j);
joinvars.addAll(vars[j]);
// And visit anything which canJoin() with j (recursion).
expandJoinSet(j, joinSet, used, joinvars);
}
}
}
static private String toString(final int[][] data) {
final StringBuilder sb = new StringBuilder();
final int n = data.length;
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
if (j <= i)
sb.append("-");
else
sb.append(data[i][j]);
sb.append(" ");
}
sb.append("\n");
}
return sb.toString();
}
}