org.rdfhdt.hdtjena.solver.ReorderTransformationHDT Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hdt-jena Show documentation
Show all versions of hdt-jena Show documentation
Integration of HDT with Apache Jena
The newest version!
/*
* File: $HeadURL: https://hdt-java.googlecode.com/svn/trunk/hdt-jena/src/org/rdfhdt/hdtjena/solver/ReorderTransformationHDT.java $
* Revision: $Rev: 190 $
* Last modified: $Date: 2013-03-03 11:30:03 +0000 (dom, 03 mar 2013) $
* Last modified by: $Author: mario.arias $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* Contacting the authors:
* Mario Arias: [email protected]
* Javier D. Fernandez: [email protected]
* Miguel A. Martinez-Prieto: [email protected]
*/
package org.rdfhdt.hdtjena.solver;
import static org.apache.jena.sparql.engine.optimizer.reorder.PatternElements.TERM;
import static org.apache.jena.sparql.engine.optimizer.reorder.PatternElements.VAR;
import org.apache.jena.graph.GraphStatisticsHandler;
import org.apache.jena.graph.Node;
import org.apache.jena.sparql.engine.optimizer.Pattern;
import org.apache.jena.sparql.engine.optimizer.StatsMatcher;
import org.apache.jena.sparql.engine.optimizer.reorder.PatternTriple;
import org.apache.jena.sparql.engine.optimizer.reorder.ReorderTransformationSubstitution;
import org.apache.jena.sparql.graph.NodeConst;
import org.apache.jena.sparql.sse.Item;
import org.rdfhdt.hdt.dictionary.Dictionary;
import org.rdfhdt.hdtjena.HDTGraph;
/**
 * Reorders the triple patterns of a BGP using statistics fetched directly from
 * the dataset. During query optimization, when planning index joins, some
 * variables are known to be bound at some stage although their actual values
 * are not. For those positions this class falls back to predefined typical
 * behaviour for RDF, using independent estimates for S/P/O, inspired by Jena's
 * FixedReorder.
 *
 * @author mario.arias
 */
public class ReorderTransformationHDT extends ReorderTransformationSubstitution {

    /** Maximum value for a match involving two terms. */
    public static final int MultiTermMax = 100;

    /** The number of triples used for the base scale */
    public static final int MultiTermSampleSize = 10000;

    /** Marker for "no estimate available" for a triple position. */
    private static final long NO_ESTIMATE = -1;

    final long TERM_S; // Used for S ? ? if no stats
    final long TERM_P; // Used for ? P ? if no stats
    final long TERM_O; // Used for ? ? O if no stats
    final long numTriples; // Actual number of triples of the dataset.

    private final GraphStatisticsHandler stats;

    public final StatsMatcher matcher = new StatsMatcher();

    /**
     * Builds the reorder transformation for the given graph: keeps its statistics
     * handler, registers the fixed pattern weights and derives the per-position
     * fallback weights from the dictionary sizes.
     *
     * @param graph the HDT-backed graph whose statistics and dictionary are used
     */
    public ReorderTransformationHDT(HDTGraph graph) {
        this.stats = graph.getStatisticsHandler();
        numTriples = graph.size();
        initializeMatcher();

        // FIXME: Compute exactly for using the HDT
        // Fallback weight for a bound-but-unknown term: the average number of
        // triples per distinct term in that position. The previous formula divided
        // the other way round (distinct terms / triples), which truncates to 0 as
        // soon as there are more triples than distinct terms, and weight() then
        // reported such patterns as "can't match".
        Dictionary dict = graph.getHDT().getDictionary();
        TERM_S = averageTriplesPerTerm(numTriples, dict.getNsubjects());
        TERM_P = averageTriplesPerTerm(numTriples, dict.getNpredicates());
        TERM_O = averageTriplesPerTerm(numTriples, dict.getNobjects());
    }

    /**
     * Average number of triples per distinct term, using integer division.
     *
     * @param triples       total number of triples
     * @param distinctTerms number of distinct terms in one triple position
     * @return 0 when either argument is not positive (nothing to match against),
     *         otherwise the truncated quotient, but never less than 1
     */
    private static long averageTriplesPerTerm(long triples, long distinctTerms) {
        if (triples <= 0 || distinctTerms <= 0) {
            return 0;
        }
        return Math.max(triples / distinctTerms, 1);
    }

    /** Registers the fixed weights used when a pattern has substituted terms. */
    private void initializeMatcher() {
        Item type = Item.createNode(NodeConst.nodeRDFType);

        // SPO (TERM, TERM, TERM) is built-in and does not need a rule.
        // Numbers chosen as an approximation for a graph of 10K triples
        matcher.addPattern(new Pattern(5, TERM, TERM, VAR)); // SP?
        matcher.addPattern(new Pattern(1000, VAR, type, TERM)); // ? type O -- worse than ?PO
        matcher.addPattern(new Pattern(90, VAR, TERM, TERM)); // ?PO
        matcher.addPattern(new Pattern(5, TERM, VAR, TERM)); // S?O

        matcher.addPattern(new Pattern(40, TERM, VAR, VAR)); // S??
        matcher.addPattern(new Pattern(200, VAR, VAR, TERM)); // ??O
        matcher.addPattern(new Pattern(2000, VAR, TERM, VAR)); // ?P?

        matcher.addPattern(new Pattern(MultiTermSampleSize, VAR, VAR, VAR)); // ???
    }

    /**
     * Estimates the weight of a triple pattern.
     *
     * <p>When all three positions hold nodes the statistics handler is asked
     * directly. Otherwise the fixed pattern weights are tried; a result below
     * {@link #MultiTermMax} is returned as is. For the remaining patterns the
     * three positions are estimated independently and the smallest positive
     * estimate is returned.
     *
     * @param pt the pattern to weigh
     * @return the estimated weight; 0 when some position is estimated to match
     *         nothing; a negative value when no estimate could be produced
     *         (NOTE(review): presumably the superclass treats a negative weight
     *         as "no opinion" - confirm against ReorderTransformationSubstitution)
     */
    @Override
    protected double weight(PatternTriple pt) {
        // If all are nodes, there are no substitutions. We can get the exact number.
        if (pt.subject.isNode() && pt.predicate.isNode() && pt.object.isNode()) {
            return stats.getStatistic(pt.subject.getNode(), pt.predicate.getNode(), pt.object.getNode());
        }

        // Try on fixed
        double fixedWeight = matcher.match(pt);

        // If there are two fixed terms, use the fixed weighting, all of which are quite small.
        // This chooses a less optimal triple but the worse choice is still a very selective choice.
        // One case is IFPs: the multi term choice for PO is not 1.
        if (fixedWeight < MultiTermMax) {
            return fixedWeight;
        }

        // One or zero fixed terms.
        // Otherwise, assuming S / P / O independent, do an estimation.
        long s = NO_ESTIMATE;
        long p = NO_ESTIMATE;
        long o = NO_ESTIMATE;

        if (isConcreteNode(pt.subject)) {
            s = stats.getStatistic(pt.subject.getNode(), Node.ANY, Node.ANY);
        } else if (TERM.equals(pt.subject)) {
            s = TERM_S;
        }

        if (isConcreteNode(pt.predicate)) {
            p = stats.getStatistic(Node.ANY, pt.predicate.getNode(), Node.ANY);
        } else if (TERM.equals(pt.predicate)) {
            p = TERM_P;
        }

        if (isConcreteNode(pt.object)) {
            o = stats.getStatistic(Node.ANY, Node.ANY, pt.object.getNode());
        } else if (TERM.equals(pt.object)) {
            o = TERM_O;
        }

        if (s == 0 || p == 0 || o == 0) {
            // Can't match.
            return 0;
        }

        // The previous code seeded the minimum with -1 and compared "P < x", so
        // the predicate and object estimates were ignored whenever the subject
        // had no estimate. Take the real minimum over the positive values.
        return minPositive(s, p, o);
    }

    /** Tells whether the item holds a node that is not a variable. */
    private static boolean isConcreteNode(Item item) {
        return item.isNode() && !item.isVar();
    }

    /**
     * Returns the smallest strictly positive argument, or {@link #NO_ESTIMATE}
     * when none of them is positive.
     */
    private static long minPositive(long a, long b, long c) {
        long min = NO_ESTIMATE;
        for (long candidate : new long[] {a, b, c}) {
            if (candidate > 0 && (min < 0 || candidate < min)) {
                min = candidate;
            }
        }
        return min;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy