All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.rdf.sail.webapp.VoID Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     [email protected]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Jul 24, 2012
 */
package com.bigdata.rdf.sail.webapp;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.openrdf.model.BNode;
import org.openrdf.model.Graph;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.RDF;

import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.NotMaterializedException;
import com.bigdata.rdf.model.BigdataResource;
import com.bigdata.rdf.model.BigdataURI;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.sail.webapp.client.ConnectOptions;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.rdf.spo.SPORelation;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.BigdataValueIterator;
import com.bigdata.rdf.store.BigdataValueIteratorImpl;
import com.bigdata.rdf.vocab.decls.DCTermsVocabularyDecl;
import com.bigdata.rdf.vocab.decls.VoidVocabularyDecl;
import com.bigdata.striterator.IChunkedIterator;

/**
 * Helper class for VoID descriptions.
 * 
 * @see  Describing Linked Datasets with
 *      the VoID Vocabulary 
 *      
 * @author Bryan Thompson
 */
public class VoID {

    /**
     * The graph in which the service description is accumulated (from the
     * constructor).
     */
    private final Graph g;
    
    /**
     * The KB instance that is being described (from the constructor).
     */
    private final AbstractTripleStore tripleStore;
    
    /**
     * The service end point(s) (from the constructor).
     */
    private final String[] serviceURI;

    /**
     * The value factory used to create values for the service description graph
     * {@link #g}.
     */
    private final ValueFactory f;
    
//    /**
//     * The resource which models the service.
//     */
//    protected final BNode aService;

    /**
     * The resource which models the data set.
     */
    private final Resource aDataset;

    /**
     * The resource which models the default graph in the data set.
     */
    private final BNode aDefaultGraph;

    /**
     * 
     * @param g
     *            Where to assemble the description.
     * @param tripleStore
     *            The KB instance to be described.
     * @param serviceURI
     *            The SPARQL service end point.
     * @param aDefaultDataset
     *            The data set identifier that will be used on the description
     *            (the bigdata namespace of the dataset is obtained from the
     *            tripleStore).
     */
    public VoID(final Graph g, final AbstractTripleStore tripleStore,
            final String[] serviceURI, final Resource aDataset) {

        if (g == null)
            throw new IllegalArgumentException();

        if (tripleStore == null)
            throw new IllegalArgumentException();
        
        if (serviceURI == null)
            throw new IllegalArgumentException();

        if (serviceURI.length == 0)
            throw new IllegalArgumentException();

        for (String s : serviceURI)
            if (s == null)
                throw new IllegalArgumentException();
        
        if (aDataset == null)
            throw new IllegalArgumentException();

        this.g = g;
        
        this.tripleStore = tripleStore;
        
        this.serviceURI = serviceURI;
        
        this.f = g.getValueFactory();
        
        this.aDataset = aDataset;
    
        this.aDefaultGraph = f.createBNode("defaultGraph");
        
    }
    
    /**
     * Describe the default data set (the one identified by the namespace
     * associated with the {@link AbstractTripleStore}.
     * 
     * @param describeStatistics
     *            When true, the VoID description will include the
     *            {@link VoidVocabularyDecl#vocabulary} declarations, the
     *            property partition statistics, and the class partition
     *            statistics.
     * @param describeNamedGraphs
     *            When true, each named graph will also be
     *            described in in the same level of detail as the default graph.
     *            Otherwise only the default graph will be described.
     */
    public void describeDataSet(final boolean describeStatistics,
            final boolean describeNamedGraphs) {

        final String namespace = tripleStore.getNamespace();
        
        // This is a VoID data set.
        g.add(aDataset, RDF.TYPE, VoidVocabularyDecl.Dataset);

        // The namespace is used as a title for the data set.
        g.add(aDataset, DCTermsVocabularyDecl.title,
                f.createLiteral(namespace));

        // Also present the namespace in an unambiguous manner.
        g.add(aDataset, SD.KB_NAMESPACE, f.createLiteral(namespace));

        /**
         * Service end point for this namespace.
         * 
         * @see 
         *      Missing URL encoding in RemoteRepositoryManager 
         */
        for (String uri : serviceURI) {
            g.add(aDataset,
                    VoidVocabularyDecl.sparqlEndpoint,
                    f.createURI(uri + "/" + ConnectOptions.urlEncode(namespace)
                            + "/sparql"));
        }

        // any URI is considered to be an entity.
        g.add(aDataset, VoidVocabularyDecl.uriRegexPattern,
                f.createLiteral("^.*"));

        if(!describeStatistics) {
        
            // No statistics.
            return;
            
        }
        
        // Frequency count of the predicates in the default graph.
        final IVCount[] predicatePartitionCounts = predicateUsage(
                tripleStore);

        // Frequency count of the classes in the default graph.
        final IVCount[] classPartitionCounts = classUsage(tripleStore);

        // Describe vocabularies based on the predicate partitions.
        describeVocabularies(predicatePartitionCounts);
        
        // defaultGraph description.
        {

            // Default graph in the default data set.
            g.add(aDataset, SD.defaultGraph, aDefaultGraph);
            
            // Describe the default graph using statistics.
            describeGraph(aDefaultGraph, predicatePartitionCounts,
                    classPartitionCounts);

        } // end defaultGraph

        // sb.append("termCount\t = " + tripleStore.getTermCount() + "\n");
        //
        // sb.append("uriCount\t = " + tripleStore.getURICount() + "\n");
        //
        // sb.append("literalCount\t = " + tripleStore.getLiteralCount() +
        // "\n");
        //
        // /*
        // * Note: The blank node count is only available when using the told
        // * bnodes mode.
        // */
        // sb
        // .append("bnodeCount\t = "
        // + (tripleStore.getLexiconRelation()
        // .isStoreBlankNodes() ? ""
        // + tripleStore.getBNodeCount() : "N/A")
        // + "\n");

        /*
         * Report for each named graph.
         */
        if (describeNamedGraphs && tripleStore.isQuads()) {

            final SPORelation r = tripleStore.getSPORelation();

            // the index to use for distinct term scan.
            final SPOKeyOrder keyOrder = SPOKeyOrder.CSPO;

            // visit distinct IVs for context position on that index.
            @SuppressWarnings("rawtypes")
            final IChunkedIterator itr = r.distinctTermScan(keyOrder);

            // resolve IVs to terms efficiently during iteration.
            final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
                    tripleStore/* resolveTerms */, itr);

            try {

                while (itr2.hasNext()) {

                    /*
                     * Describe this named graph.
                     * 
                     * Note: This is using the predicate and class partition
                     * statistics from the default graph (RDF merge) to identify
                     * the set of all possible predicates and classes within
                     * each named graph. It then tests each predicate and class
                     * partition against the named graph and ignores those which
                     * are not present in a given named graph. This is being
                     * done because we do not have a CPxx index.
                     */

                    final BigdataResource graph = (BigdataResource) itr2.next();

                    final IVCount[] predicatePartitionCounts2 = predicateUsage(
                            tripleStore, graph.getIV(),
                            predicatePartitionCounts);

                    final IVCount[] classPartitionCounts2 = classUsage(
                            tripleStore, graph.getIV(), classPartitionCounts);

                    final BNode aNamedGraph = f.createBNode();

                    // Named graph in the default data set.
                    g.add(aDataset, SD.namedGraph, aNamedGraph);

                    // The name of that named graph.
                    g.add(aNamedGraph, SD.name, graph);

                    // Describe the named graph.
                    describeGraph(aNamedGraph, predicatePartitionCounts2,
                            classPartitionCounts2);

                }
                
            } finally {

                itr2.close();

            }

        }

    }

    /**
     * Describe the vocabularies which are in use in the KB based on the
     * predicate partition statistics.
     * 
     * @param predicateParitionCounts
     *            The predicate partition statistics.
     */
    protected void describeVocabularies(final IVCount[] predicatePartitionCounts) {

        // Find the distinct vocabularies in use.
        final Set namespaces = new LinkedHashSet();
        {

            // property partitions.
            for (IVCount tmp : predicatePartitionCounts) {

                final URI p = (URI) tmp.getValue();

                String namespace = p.getNamespace();

                if (namespace.endsWith("#")) {

                    // Strip trailing '#' per VoID specification.
                    namespace = namespace.substring(0,
                            namespace.length() - 1);

                }

                namespaces.add(namespace);

            }

        }

        // Sort into dictionary order.
        final String[] a = namespaces.toArray(new String[namespaces.size()]);
        
        Arrays.sort(a);

        for (String namespace : a) {

            g.add(aDataset, VoidVocabularyDecl.vocabulary,
                    f.createURI(namespace));

        }

    }
    
    /**
     * Describe a named or default graph.
     * 
     * @param graph
     *            The named graph.
     * @param predicatePartitionCounts
     *            The predicate partition statistics for that graph.
     * @param classPartitionCounts
     *            The class partition statistics for that graph.
     */
    protected void describeGraph(final Resource graph,
            final IVCount[] predicatePartitionCounts,
            final IVCount[] classPartitionCounts) {

        // The graph is a Graph.
        g.add(graph, RDF.TYPE, SD.Graph);

        // #of triples in the default graph
        g.add(graph, VoidVocabularyDecl.triples,
                f.createLiteral(tripleStore.getStatementCount()));

        // #of entities in the default graph.
        g.add(graph, VoidVocabularyDecl.entities,
                f.createLiteral(tripleStore.getURICount()));

        // #of distinct predicates in the default graph.
        g.add(graph, VoidVocabularyDecl.properties,
                f.createLiteral(predicatePartitionCounts.length));

        // #of distinct classes in the default graph.
        g.add(graph, VoidVocabularyDecl.classes,
                f.createLiteral(classPartitionCounts.length));

        // property partition statistics.
        for (IVCount tmp : predicatePartitionCounts) {

            final BNode propertyPartition = f.createBNode();

            final URI p = (URI) tmp.getValue();

            g.add(graph, VoidVocabularyDecl.propertyPartition,
                    propertyPartition);

            g.add(propertyPartition, VoidVocabularyDecl.property, p);

            g.add(propertyPartition, VoidVocabularyDecl.triples,
                    f.createLiteral(tmp.count));

        }

        // class partition statistics.
        {

            // per class partition statistics.
            for (IVCount tmp : classPartitionCounts) {

                final BNode classPartition = f.createBNode();

                final BigdataValue cls = tmp.getValue();

                g.add(graph, VoidVocabularyDecl.classPartition,
                        classPartition);

                g.add(classPartition, VoidVocabularyDecl.class_, cls);

                g.add(classPartition, VoidVocabularyDecl.triples,
                        f.createLiteral(tmp.count));

            }

        } // end class partition statistics.

    }

    /**
     * An {@link IV} and a counter for that {@link IV}.
     */
    protected static class IVCount implements Comparable {

        public final IV iv;

        public final long count;

        private BigdataValue val;
        
        /**
         * Return the associated {@link BigdataValue}.
         * 

* Note: A resolution set is necessary if you want to attach the * {@link BigdataValue} to the {@link IV}. * * @throws NotMaterializedException */ public BigdataValue getValue() { if(val == null) throw new NotMaterializedException(iv.toString()); return val; } public void setValue(final BigdataValue val) { if (val == null) throw new IllegalArgumentException(); if (this.val != null && !this.val.equals(val)) throw new IllegalArgumentException(); this.val = val; } public IVCount(final IV iv, final long count) { if (iv == null) throw new IllegalArgumentException(); this.iv = iv; this.count = count; } /** * Place into order by descending count. */ @Override public int compareTo(IVCount arg0) { if (count < arg0.count) return 1; if (count > arg0.count) return -1; return 0; } } /** * Return an array of the distinct predicates in the KB ordered by their * descending frequency of use. The {@link IV}s in the returned array will * have been resolved to the corresponding {@link BigdataURI}s which can be * accessed using {@link IV#getValue()}. * * @param kb * The KB instance. */ protected static IVCount[] predicateUsage(final AbstractTripleStore kb) { final SPORelation r = kb.getSPORelation(); if (r.oneAccessPath) { // The necessary index (POS or POCS) does not exist. throw new UnsupportedOperationException(); } final boolean quads = kb.isQuads(); // the index to use for distinct predicate scan. final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS; // visit distinct term identifiers for predicate position on that index. @SuppressWarnings("rawtypes") final IChunkedIterator itr = r.distinctTermScan(keyOrder); // resolve term identifiers to terms efficiently during iteration. final BigdataValueIterator itr2 = new BigdataValueIteratorImpl( kb/* resolveTerms */, itr); try { final Set> ivs = new LinkedHashSet>(); final Map, IVCount> counts = new LinkedHashMap, IVCount>(); while (itr2.hasNext()) { final BigdataValue term = itr2.next(); final IV iv = term.getIV(); final long n = r.getAccessPath(null, iv, null, null) .rangeCount(false/* exact */); ivs.add(iv); counts.put(iv, new IVCount(iv, n)); } // Batch resolve IVs to Values final Map, BigdataValue> x = kb.getLexiconRelation() .getTerms(ivs); for (Map.Entry, BigdataValue> e : x.entrySet()) { final IVCount count = counts.get(e.getKey()); count.setValue(e.getValue()); } final IVCount[] a = counts.values().toArray( new IVCount[counts.size()]); // Order by descending count. Arrays.sort(a); return a; } finally { itr2.close(); } } /** * Return the predicate partition statistics for the named graph. * * @param kb * The KB instance. * @param civ * The {@link IV} of a named graph (required). * * @return The predicate partition statistics for that named graph. Only * predicate partitions which are non-empty are returned. */ protected static IVCount[] predicateUsage(final AbstractTripleStore kb, final IV civ, final IVCount[] predicatePartitionCounts) { final SPORelation r = kb.getSPORelation(); final boolean quads = kb.isQuads(); if (!quads) { // Named graph only valid in quads mode. throw new IllegalArgumentException(); } // The non-zero counts. final List counts = new LinkedList(); // Check the known non-empty predicate partitions. for(IVCount in : predicatePartitionCounts){ final long n = r.getAccessPath(null, in.iv, null, civ).rangeCount( false/* exact */); if (n == 0) continue; final IVCount out = new IVCount(in.iv, n); out.setValue(in.getValue()); counts.add(out); } final IVCount[] a = counts.toArray(new IVCount[counts.size()]); // Order by descending count. Arrays.sort(a); return a; } /** * Return an efficient statistical summary for the class partitions. The * SPARQL query for this is * *

     * SELECT  ?class (COUNT(?s) AS ?count ) { ?s a ?class } GROUP BY ?class ORDER BY ?count
     * 
* * However, it is much efficient to scan POS for * *
     * rdf:type ?o ?s
     * 
* * and report the range count of * *
     * rdf:type ?o ?s
     * 
* * for each distinct value of ?o. * * @param kb * The KB instance. * * @return The class usage statistics. */ protected static IVCount[] classUsage(final AbstractTripleStore kb) { final SPORelation r = kb.getSPORelation(); if (r.oneAccessPath) { // The necessary index (POS or POCS) does not exist. throw new UnsupportedOperationException(); } final boolean quads = kb.isQuads(); final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS; // Resolve IV for rdf:type final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE); kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType }, 1/* numTerms */, true/* readOnly */); if (rdfType.getIV() == null) { // No rdf:type assertions since rdf:type is unknown term. return new IVCount[0]; } // visit distinct term identifiers for the rdf:type predicate. @SuppressWarnings("rawtypes") final IChunkedIterator itr = r.distinctMultiTermScan(keyOrder, new IV[] { rdfType.getIV() }/* knownTerms */); // resolve term identifiers to terms efficiently during iteration. final BigdataValueIterator itr2 = new BigdataValueIteratorImpl( kb/* resolveTerms */, itr); try { final Set> ivs = new LinkedHashSet>(); final Map, IVCount> counts = new LinkedHashMap, IVCount>(); while (itr2.hasNext()) { final BigdataValue term = itr2.next(); final IV iv = term.getIV(); final long n = r.getAccessPath(null, rdfType.getIV()/* p */, iv/* o */, null).rangeCount(false/* exact */); ivs.add(iv); counts.put(iv, new IVCount(iv, n)); } // Batch resolve IVs to Values final Map, BigdataValue> x = kb.getLexiconRelation() .getTerms(ivs); for (Map.Entry, BigdataValue> e : x.entrySet()) { final IVCount count = counts.get(e.getKey()); count.setValue(e.getValue()); } final IVCount[] a = counts.values().toArray( new IVCount[counts.size()]); // Order by descending count. Arrays.sort(a); return a; } finally { itr2.close(); } } /** * Return the class partition statistics for the named graph. * * @param kb * The KB instance. * @param civ * The {@link IV} of a named graph (required). * * @return The class partition statistics for that named graph. Only class * partitions which are non-empty are returned. */ protected static IVCount[] classUsage(final AbstractTripleStore kb, final IV civ, final IVCount[] classPartitionCounts) { final SPORelation r = kb.getSPORelation(); final boolean quads = kb.isQuads(); if (!quads) { // Named graph only valid in quads mode. throw new IllegalArgumentException(); } // Resolve IV for rdf:type final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE); kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType }, 1/* numTerms */, true/* readOnly */); if (rdfType.getIV() == null) { // No rdf:type assertions since rdf:type is unknown term. return new IVCount[0]; } // The non-zero counts. final List counts = new LinkedList(); // Check the known non-empty predicate partitions. for (IVCount in : classPartitionCounts) { final long n = r.getAccessPath(null, rdfType.getIV()/* p */, in.iv/* o */, civ).rangeCount(false/* exact */); if (n == 0) continue; final IVCount out = new IVCount(in.iv, n); out.setValue(in.getValue()); counts.add(out); } final IVCount[] a = counts.toArray(new IVCount[counts.size()]); // Order by descending count. Arrays.sort(a); return a; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy