All downloads are free. The search and download functionality uses the official Maven repository.

com.bigdata.rdf.sparql.ast.JoinSetUtil Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Oct 21, 2011
 */

package com.bigdata.rdf.sparql.ast;

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.sparql.ast.optimizers.ASTHashJoinOptimizer;

/**
 * Utility class for join analysis.
 * 
 * TODO Surely we can do some bit math which would be slimmer and faster than
 * managing the IVariable sets?
 * 
 * @author Bryan Thompson
 * @version $Id$
 */
public class JoinSetUtil {

    private static final Logger log = Logger
            .getLogger(ASTHashJoinOptimizer.class);

    /**
     * The group.
     */
    public final GraphPatternGroup group;

    /**
     * The variables known to be bound on entry to the group.
     */
    public final Set> knownBound;

    /**
     * The set of variables which are bound by the time the last required join
     * is done. This includes the {@link #knownBound} variables plus any
     * variables for any required join.
     */
    public final Set> eventuallyBoundVars;
    
    /**
     * An array of the vertices for the required joins in the group. The indices
     * into the array are the order in which the required joins were encountered
     * in the group.
     */
    public final IJoinNode[] requiredJoins;

    /** The #of required joins in the group (the {@link #requiredJoins} length). */
    public final int requiredJoinCount;

    /**
     * The FILTERS that can be run on entry to the group (and which should be
     * lifted into the parent).
     */
    public final Set preFilters;

    /**
     * The FILTERS that will be attached to the required joins for this group.
     */
    public final Set joinFilters;

    /**
     * The FILTERS that can not be run until the end of the join group.
     */
    public final Set postFilters;

    /**
     * The set of variables which are bound be each of the vertices for the
     * required joins. The indices into the array are the order in which the
     * requried joins were encountered in the group.
     */
    public final Set>[] vars;

    // final Set>[] varsWithFilters;

    /**
     * Diagonal matrix for the required joins. A cells having a positive value
     * gives the #of directly shared variables. This will be ZERO (0) if there
     * is no direct join. Only the upper diagonal of the matrix is populated.
     */
    public final int[][] canJoin;

    /**
     * The #of direct joins found for each vertex (summed across
     * {@link #canJoin} matrix.
     */
    public final int[] directJoinCount;

    /**
     * The distinct sets of vertices which are composed solely of (the
     * transitive closure of) joins on directly shared variables. For example
     * 
     * 
     * p(x,y) X p(y,z) x p(z,t)
     * 
* * Would form a direct join set each join will share a variable directly. * Even though there are some vertices which do not share variables * directly, they can be joined directly once other joins have been * processed. For example p(z,t) can be joined once we have * p(y,z). *

* The number of such direct join sets varies with the query. If all * vertices fit into one direct join set, then the query can be pipelined * efficiently. If there is more than one such set then we need to look * further to see if we can identify joins on indirectly shared variables * (shared through a FILTER) which can be used to piece these join sets * together. If not, there is an unconstrained cross product in the query. */ public final Set directJoinSets; /** * A collection of vertices and the join variables they bind. */ public static class VertexJoinSet { /** * The set of verticies in this join set. The indices are the order in * which the vertices were encountered in the group. */ final public Set vertices; /** * The set of variables bound by at least one join in the join set. */ final public Set> joinvars; public VertexJoinSet() { vertices = new LinkedHashSet(); joinvars = new LinkedHashSet>(); } public VertexJoinSet(final Set vertices, final Set> joinvars) { this.vertices = vertices; this.joinvars = joinvars; } public String toString() { return getClass().getSimpleName() + "{vertices=" + vertices + ",joinvars=" + joinvars + "}"; } public int hashCode() { return vertices.hashCode(); } public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof VertexJoinSet)) return false; final VertexJoinSet t = (VertexJoinSet) o; if (!vertices.equals(t.vertices)) return false; if (!joinvars.equals(t.joinvars)) return false; return true; } } /** * * @param sa * @param knownBound Any variables known to be bound on entry to the group. * @param group The group. */ public JoinSetUtil(final StaticAnalysis sa, Set> knownBound, final GraphPatternGroup group) { this.group = group; if (knownBound == null) { knownBound = new LinkedHashSet>(); } this.knownBound = knownBound; /* * Create an array of the vertices for the required joins. */ // Locate all the required joins. 
{ final List list = new LinkedList(); for (IGroupMemberNode m : group) { if (m instanceof IJoinNode) { final IJoinNode j = (IJoinNode) m; if (!j.isOptional()) list.add(j); } } requiredJoinCount = list.size(); requiredJoins = list.toArray(new IJoinNode[requiredJoinCount]); } /* * Create an array of the variables for each of the required join * vertices. */ eventuallyBoundVars = new LinkedHashSet>(); { vars = new Set[requiredJoinCount]; for (int i = 0; i < requiredJoinCount; i++) { final IJoinNode j = requiredJoins[i]; // anything bound by this join. final Set> tmp = sa.getSpannedVariables((BOp) j, new LinkedHashSet>()); tmp.addAll(knownBound); // plus anything bound on entry to the group. eventuallyBoundVars.addAll(vars[i] = tmp); } } /* * Identify the FILTERs which can run against the required joins. This * will include both pre-filters (which really should be lifted out) and * join-filters (which are satisified by the time we have run all the * required joins). */ { preFilters = new LinkedHashSet(); joinFilters = new LinkedHashSet(); postFilters = new LinkedHashSet(); for (IGroupMemberNode m : group) { if (m instanceof FilterNode) { final FilterNode f = (FilterNode) m; if (sa.isFullyBound(f, knownBound)) { /* * The variables for this filter are already bound on * entry. */ preFilters.add(f); continue; } if (sa.isFullyBound(f, eventuallyBoundVars)) { /* * The variables for this filter will be fully bound by * the time we are done with the required joins. */ joinFilters.add(f); continue; } /* * The variables for this filter will not be bound by the * end of the required joins. If there are optional joins, * then the filter might be able to succeed if it is run * after the last optional join which could bind a variable * which it needs and which is not bound by a required join * in this group. 
*/ postFilters.add(f); } } } /* * Diagonal matrix for the required joins * * In the first pass, we set a cell to the #of directly shared join * variables between each pair of vertices. This will be ZERO (0) if * there is no direct join. Only the upper diagonal of the matrix is * populated. */ canJoin = new int[requiredJoinCount][requiredJoinCount]; // The #of direct joins found for each vertex (summed across canJoin // matrix). directJoinCount = new int[requiredJoinCount]; { for (int i = 0; i < requiredJoinCount; i++) { for (int j = i + 1; j < requiredJoinCount; j++) { final Set> sharedVars = new HashSet>(); sharedVars.addAll(vars[i]); sharedVars.retainAll(vars[j]); if ((canJoin[i][j] = sharedVars.size()) > 0) { // #of times there is a direct join for this vertex. directJoinCount[i]++; directJoinCount[j]++; } } } } if (log.isDebugEnabled()) { // TODO Could be moved into a toString() for this class. log.debug("\ncanJoin:\n" + toString(canJoin)); log.debug("\ndirectJoinCount:\n" + Arrays.toString(directJoinCount)); } /* * The distinct sets of vertices which composed solely of (the * transitive closure of) direct joins. */ directJoinSets = calcDirectJoinSets(); } /** * Identify the subsets of the vertices which can join directly. *

* Start with each row and create a set containing the variables for that * vertex and a set of the vertices with which it has joined (initially this * contains just that row). For each column that it joins with, add in all * variables for that column and add the column to the set of joined * vertices. Once all columns have been processed for the initial row, we * have the first join set of vertices. *

* Repeat for the first row not found in that join set until all joins have * been incorporated into a join set. This gives us the #of join sets which * do not overlap based on directly shared variables. It would be ideal to * have one such join set. Where there is more than one there is clearly a * question concerning which join set we should handle first. */ private Set calcDirectJoinSets() { final Set joinSets = new LinkedHashSet(); // The set of vertices which have been consumed. final Set used = new LinkedHashSet(); for (int i = 0; i < requiredJoinCount; i++) { if (used.contains(i)) continue; // This join set. final VertexJoinSet joinSet = new VertexJoinSet(); joinSet.vertices.add(i); used.add(i); // The initial vertex for this join set. joinSet.joinvars.addAll(vars[i]); expandJoinSet(i, joinSet.vertices, used, joinSet.joinvars); joinSets.add(joinSet); if (log.isInfoEnabled()) log.info("joinSet: " + joinSet.vertices + " on " + joinSet.joinvars); } // All vertices must have been used. assert used.size() == requiredJoinCount : "used=" + used + ", but requiredJoinCount=" + requiredJoinCount; return joinSets; } private void expandJoinSet(final int i, final Set joinSet, final Set used, final Set> joinvars) { // Build up the join set for (int j = 0; j < requiredJoinCount; j++) { if (i == j) continue; if (used.contains(j)) continue; if ((i < j && canJoin[i][j] > 0) || (i > j && canJoin[j][i] > 0)) { joinSet.add(j); used.add(j); joinvars.addAll(vars[j]); // And visit anything which canJoin() with j (recursion). expandJoinSet(j, joinSet, used, joinvars); } } } static private String toString(final int[][] data) { final StringBuilder sb = new StringBuilder(); final int n = data.length; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { if (j <= i) sb.append("-"); else sb.append(data[i][j]); sb.append(" "); } sb.append("\n"); } return sb.toString(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy