// com.bigdata.rdf.sail.webapp.VoID
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jul 24, 2012
*/
package com.bigdata.rdf.sail.webapp;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.openrdf.model.BNode;
import org.openrdf.model.Graph;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.RDF;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.NotMaterializedException;
import com.bigdata.rdf.model.BigdataResource;
import com.bigdata.rdf.model.BigdataURI;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.sail.webapp.client.ConnectOptions;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.rdf.spo.SPORelation;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.BigdataValueIterator;
import com.bigdata.rdf.store.BigdataValueIteratorImpl;
import com.bigdata.rdf.vocab.decls.DCTermsVocabularyDecl;
import com.bigdata.rdf.vocab.decls.VoidVocabularyDecl;
import com.bigdata.striterator.IChunkedIterator;
/**
* Helper class for VoID descriptions.
*
* @see Describing Linked Datasets with
* the VoID Vocabulary
*
* @author Bryan Thompson
*/
public class VoID {
/**
* The graph in which the service description is accumulated (from the
* constructor).
*/
private final Graph g;
/**
* The KB instance that is being described (from the constructor).
*/
private final AbstractTripleStore tripleStore;
/**
* The service end point(s) (from the constructor).
*/
private final String[] serviceURI;
/**
* The value factory used to create values for the service description graph
* {@link #g}.
*/
private final ValueFactory f;
// /**
// * The resource which models the service.
// */
// protected final BNode aService;
/**
* The resource which models the data set.
*/
private final Resource aDataset;
/**
* The resource which models the default graph in the data set.
*/
private final BNode aDefaultGraph;
/**
*
* @param g
* Where to assemble the description.
* @param tripleStore
* The KB instance to be described.
* @param serviceURI
* The SPARQL service end point.
* @param aDefaultDataset
* The data set identifier that will be used on the description
* (the bigdata namespace of the dataset is obtained from the
* tripleStore).
*/
public VoID(final Graph g, final AbstractTripleStore tripleStore,
final String[] serviceURI, final Resource aDataset) {
if (g == null)
throw new IllegalArgumentException();
if (tripleStore == null)
throw new IllegalArgumentException();
if (serviceURI == null)
throw new IllegalArgumentException();
if (serviceURI.length == 0)
throw new IllegalArgumentException();
for (String s : serviceURI)
if (s == null)
throw new IllegalArgumentException();
if (aDataset == null)
throw new IllegalArgumentException();
this.g = g;
this.tripleStore = tripleStore;
this.serviceURI = serviceURI;
this.f = g.getValueFactory();
this.aDataset = aDataset;
this.aDefaultGraph = f.createBNode("defaultGraph");
}
/**
* Describe the default data set (the one identified by the namespace
* associated with the {@link AbstractTripleStore}.
*
* @param describeStatistics
* When true
, the VoID description will include the
* {@link VoidVocabularyDecl#vocabulary} declarations, the
* property partition statistics, and the class partition
* statistics.
* @param describeNamedGraphs
* When true
, each named graph will also be
* described in in the same level of detail as the default graph.
* Otherwise only the default graph will be described.
*/
public void describeDataSet(final boolean describeStatistics,
final boolean describeNamedGraphs) {
final String namespace = tripleStore.getNamespace();
// This is a VoID data set.
g.add(aDataset, RDF.TYPE, VoidVocabularyDecl.Dataset);
// The namespace is used as a title for the data set.
g.add(aDataset, DCTermsVocabularyDecl.title,
f.createLiteral(namespace));
// Also present the namespace in an unambiguous manner.
g.add(aDataset, SD.KB_NAMESPACE, f.createLiteral(namespace));
/**
* Service end point for this namespace.
*
* @see
* Missing URL encoding in RemoteRepositoryManager
*/
for (String uri : serviceURI) {
g.add(aDataset,
VoidVocabularyDecl.sparqlEndpoint,
f.createURI(uri + "/" + ConnectOptions.urlEncode(namespace)
+ "/sparql"));
}
// any URI is considered to be an entity.
g.add(aDataset, VoidVocabularyDecl.uriRegexPattern,
f.createLiteral("^.*"));
if(!describeStatistics) {
// No statistics.
return;
}
// Frequency count of the predicates in the default graph.
final IVCount[] predicatePartitionCounts = predicateUsage(
tripleStore);
// Frequency count of the classes in the default graph.
final IVCount[] classPartitionCounts = classUsage(tripleStore);
// Describe vocabularies based on the predicate partitions.
describeVocabularies(predicatePartitionCounts);
// defaultGraph description.
{
// Default graph in the default data set.
g.add(aDataset, SD.defaultGraph, aDefaultGraph);
// Describe the default graph using statistics.
describeGraph(aDefaultGraph, predicatePartitionCounts,
classPartitionCounts);
} // end defaultGraph
// sb.append("termCount\t = " + tripleStore.getTermCount() + "\n");
//
// sb.append("uriCount\t = " + tripleStore.getURICount() + "\n");
//
// sb.append("literalCount\t = " + tripleStore.getLiteralCount() +
// "\n");
//
// /*
// * Note: The blank node count is only available when using the told
// * bnodes mode.
// */
// sb
// .append("bnodeCount\t = "
// + (tripleStore.getLexiconRelation()
// .isStoreBlankNodes() ? ""
// + tripleStore.getBNodeCount() : "N/A")
// + "\n");
/*
* Report for each named graph.
*/
if (describeNamedGraphs && tripleStore.isQuads()) {
final SPORelation r = tripleStore.getSPORelation();
// the index to use for distinct term scan.
final SPOKeyOrder keyOrder = SPOKeyOrder.CSPO;
// visit distinct IVs for context position on that index.
@SuppressWarnings("rawtypes")
final IChunkedIterator itr = r.distinctTermScan(keyOrder);
// resolve IVs to terms efficiently during iteration.
final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
tripleStore/* resolveTerms */, itr);
try {
while (itr2.hasNext()) {
/*
* Describe this named graph.
*
* Note: This is using the predicate and class partition
* statistics from the default graph (RDF merge) to identify
* the set of all possible predicates and classes within
* each named graph. It then tests each predicate and class
* partition against the named graph and ignores those which
* are not present in a given named graph. This is being
* done because we do not have a CPxx index.
*/
final BigdataResource graph = (BigdataResource) itr2.next();
final IVCount[] predicatePartitionCounts2 = predicateUsage(
tripleStore, graph.getIV(),
predicatePartitionCounts);
final IVCount[] classPartitionCounts2 = classUsage(
tripleStore, graph.getIV(), classPartitionCounts);
final BNode aNamedGraph = f.createBNode();
// Named graph in the default data set.
g.add(aDataset, SD.namedGraph, aNamedGraph);
// The name of that named graph.
g.add(aNamedGraph, SD.name, graph);
// Describe the named graph.
describeGraph(aNamedGraph, predicatePartitionCounts2,
classPartitionCounts2);
}
} finally {
itr2.close();
}
}
}
/**
* Describe the vocabularies which are in use in the KB based on the
* predicate partition statistics.
*
* @param predicateParitionCounts
* The predicate partition statistics.
*/
protected void describeVocabularies(final IVCount[] predicatePartitionCounts) {
// Find the distinct vocabularies in use.
final Set namespaces = new LinkedHashSet();
{
// property partitions.
for (IVCount tmp : predicatePartitionCounts) {
final URI p = (URI) tmp.getValue();
String namespace = p.getNamespace();
if (namespace.endsWith("#")) {
// Strip trailing '#' per VoID specification.
namespace = namespace.substring(0,
namespace.length() - 1);
}
namespaces.add(namespace);
}
}
// Sort into dictionary order.
final String[] a = namespaces.toArray(new String[namespaces.size()]);
Arrays.sort(a);
for (String namespace : a) {
g.add(aDataset, VoidVocabularyDecl.vocabulary,
f.createURI(namespace));
}
}
/**
* Describe a named or default graph.
*
* @param graph
* The named graph.
* @param predicatePartitionCounts
* The predicate partition statistics for that graph.
* @param classPartitionCounts
* The class partition statistics for that graph.
*/
protected void describeGraph(final Resource graph,
final IVCount[] predicatePartitionCounts,
final IVCount[] classPartitionCounts) {
// The graph is a Graph.
g.add(graph, RDF.TYPE, SD.Graph);
// #of triples in the default graph
g.add(graph, VoidVocabularyDecl.triples,
f.createLiteral(tripleStore.getStatementCount()));
// #of entities in the default graph.
g.add(graph, VoidVocabularyDecl.entities,
f.createLiteral(tripleStore.getURICount()));
// #of distinct predicates in the default graph.
g.add(graph, VoidVocabularyDecl.properties,
f.createLiteral(predicatePartitionCounts.length));
// #of distinct classes in the default graph.
g.add(graph, VoidVocabularyDecl.classes,
f.createLiteral(classPartitionCounts.length));
// property partition statistics.
for (IVCount tmp : predicatePartitionCounts) {
final BNode propertyPartition = f.createBNode();
final URI p = (URI) tmp.getValue();
g.add(graph, VoidVocabularyDecl.propertyPartition,
propertyPartition);
g.add(propertyPartition, VoidVocabularyDecl.property, p);
g.add(propertyPartition, VoidVocabularyDecl.triples,
f.createLiteral(tmp.count));
}
// class partition statistics.
{
// per class partition statistics.
for (IVCount tmp : classPartitionCounts) {
final BNode classPartition = f.createBNode();
final BigdataValue cls = tmp.getValue();
g.add(graph, VoidVocabularyDecl.classPartition,
classPartition);
g.add(classPartition, VoidVocabularyDecl.class_, cls);
g.add(classPartition, VoidVocabularyDecl.triples,
f.createLiteral(tmp.count));
}
} // end class partition statistics.
}
/**
* An {@link IV} and a counter for that {@link IV}.
*/
protected static class IVCount implements Comparable {
public final IV, ?> iv;
public final long count;
private BigdataValue val;
/**
* Return the associated {@link BigdataValue}.
*
* Note: A resolution set is necessary if you want to attach the
* {@link BigdataValue} to the {@link IV}.
*
* @throws NotMaterializedException
*/
public BigdataValue getValue() {
if(val == null)
throw new NotMaterializedException(iv.toString());
return val;
}
public void setValue(final BigdataValue val) {
if (val == null)
throw new IllegalArgumentException();
if (this.val != null && !this.val.equals(val))
throw new IllegalArgumentException();
this.val = val;
}
public IVCount(final IV,?> iv, final long count) {
if (iv == null)
throw new IllegalArgumentException();
this.iv = iv;
this.count = count;
}
/**
* Place into order by descending count.
*/
@Override
public int compareTo(IVCount arg0) {
if (count < arg0.count)
return 1;
if (count > arg0.count)
return -1;
return 0;
}
}
/**
* Return an array of the distinct predicates in the KB ordered by their
* descending frequency of use. The {@link IV}s in the returned array will
* have been resolved to the corresponding {@link BigdataURI}s which can be
* accessed using {@link IV#getValue()}.
*
* @param kb
* The KB instance.
*/
protected static IVCount[] predicateUsage(final AbstractTripleStore kb) {
final SPORelation r = kb.getSPORelation();
if (r.oneAccessPath) {
// The necessary index (POS or POCS) does not exist.
throw new UnsupportedOperationException();
}
final boolean quads = kb.isQuads();
// the index to use for distinct predicate scan.
final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS;
// visit distinct term identifiers for predicate position on that index.
@SuppressWarnings("rawtypes")
final IChunkedIterator itr = r.distinctTermScan(keyOrder);
// resolve term identifiers to terms efficiently during iteration.
final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
kb/* resolveTerms */, itr);
try {
final Set> ivs = new LinkedHashSet>();
final Map, IVCount> counts = new LinkedHashMap, IVCount>();
while (itr2.hasNext()) {
final BigdataValue term = itr2.next();
final IV,?> iv = term.getIV();
final long n = r.getAccessPath(null, iv, null, null)
.rangeCount(false/* exact */);
ivs.add(iv);
counts.put(iv, new IVCount(iv, n));
}
// Batch resolve IVs to Values
final Map, BigdataValue> x = kb.getLexiconRelation()
.getTerms(ivs);
for (Map.Entry, BigdataValue> e : x.entrySet()) {
final IVCount count = counts.get(e.getKey());
count.setValue(e.getValue());
}
final IVCount[] a = counts.values().toArray(
new IVCount[counts.size()]);
// Order by descending count.
Arrays.sort(a);
return a;
} finally {
itr2.close();
}
}
/**
* Return the predicate partition statistics for the named graph.
*
* @param kb
* The KB instance.
* @param civ
* The {@link IV} of a named graph (required).
*
* @return The predicate partition statistics for that named graph. Only
* predicate partitions which are non-empty are returned.
*/
protected static IVCount[] predicateUsage(final AbstractTripleStore kb,
final IV, ?> civ, final IVCount[] predicatePartitionCounts) {
final SPORelation r = kb.getSPORelation();
final boolean quads = kb.isQuads();
if (!quads) {
// Named graph only valid in quads mode.
throw new IllegalArgumentException();
}
// The non-zero counts.
final List counts = new LinkedList();
// Check the known non-empty predicate partitions.
for(IVCount in : predicatePartitionCounts){
final long n = r.getAccessPath(null, in.iv, null, civ).rangeCount(
false/* exact */);
if (n == 0)
continue;
final IVCount out = new IVCount(in.iv, n);
out.setValue(in.getValue());
counts.add(out);
}
final IVCount[] a = counts.toArray(new IVCount[counts.size()]);
// Order by descending count.
Arrays.sort(a);
return a;
}
/**
* Return an efficient statistical summary for the class partitions. The
* SPARQL query for this is
*
*
* SELECT ?class (COUNT(?s) AS ?count ) { ?s a ?class } GROUP BY ?class ORDER BY ?count
*
*
* However, it is much efficient to scan POS for
*
*
* rdf:type ?o ?s
*
*
* and report the range count of
*
*
* rdf:type ?o ?s
*
*
* for each distinct value of ?o
.
*
* @param kb
* The KB instance.
*
* @return The class usage statistics.
*/
protected static IVCount[] classUsage(final AbstractTripleStore kb) {
final SPORelation r = kb.getSPORelation();
if (r.oneAccessPath) {
// The necessary index (POS or POCS) does not exist.
throw new UnsupportedOperationException();
}
final boolean quads = kb.isQuads();
final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS;
// Resolve IV for rdf:type
final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE);
kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType },
1/* numTerms */, true/* readOnly */);
if (rdfType.getIV() == null) {
// No rdf:type assertions since rdf:type is unknown term.
return new IVCount[0];
}
// visit distinct term identifiers for the rdf:type predicate.
@SuppressWarnings("rawtypes")
final IChunkedIterator itr = r.distinctMultiTermScan(keyOrder,
new IV[] { rdfType.getIV() }/* knownTerms */);
// resolve term identifiers to terms efficiently during iteration.
final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
kb/* resolveTerms */, itr);
try {
final Set> ivs = new LinkedHashSet>();
final Map, IVCount> counts = new LinkedHashMap, IVCount>();
while (itr2.hasNext()) {
final BigdataValue term = itr2.next();
final IV,?> iv = term.getIV();
final long n = r.getAccessPath(null, rdfType.getIV()/* p */,
iv/* o */, null).rangeCount(false/* exact */);
ivs.add(iv);
counts.put(iv, new IVCount(iv, n));
}
// Batch resolve IVs to Values
final Map, BigdataValue> x = kb.getLexiconRelation()
.getTerms(ivs);
for (Map.Entry, BigdataValue> e : x.entrySet()) {
final IVCount count = counts.get(e.getKey());
count.setValue(e.getValue());
}
final IVCount[] a = counts.values().toArray(
new IVCount[counts.size()]);
// Order by descending count.
Arrays.sort(a);
return a;
} finally {
itr2.close();
}
}
/**
* Return the class partition statistics for the named graph.
*
* @param kb
* The KB instance.
* @param civ
* The {@link IV} of a named graph (required).
*
* @return The class partition statistics for that named graph. Only class
* partitions which are non-empty are returned.
*/
protected static IVCount[] classUsage(final AbstractTripleStore kb,
final IV, ?> civ, final IVCount[] classPartitionCounts) {
final SPORelation r = kb.getSPORelation();
final boolean quads = kb.isQuads();
if (!quads) {
// Named graph only valid in quads mode.
throw new IllegalArgumentException();
}
// Resolve IV for rdf:type
final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE);
kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType },
1/* numTerms */, true/* readOnly */);
if (rdfType.getIV() == null) {
// No rdf:type assertions since rdf:type is unknown term.
return new IVCount[0];
}
// The non-zero counts.
final List counts = new LinkedList();
// Check the known non-empty predicate partitions.
for (IVCount in : classPartitionCounts) {
final long n = r.getAccessPath(null, rdfType.getIV()/* p */,
in.iv/* o */, civ).rangeCount(false/* exact */);
if (n == 0)
continue;
final IVCount out = new IVCount(in.iv, n);
out.setValue(in.getValue());
counts.add(out);
}
final IVCount[] a = counts.toArray(new IVCount[counts.size()]);
// Order by descending count.
Arrays.sort(a);
return a;
}
}