// com.bigdata.rdf.sail.webapp.VoID Maven / Gradle / Ivy
//
// Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     [email protected]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Jul 24, 2012
 */
package com.bigdata.rdf.sail.webapp;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.openrdf.model.BNode;
import org.openrdf.model.Graph;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.RDF;

import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.NotMaterializedException;
import com.bigdata.rdf.model.BigdataResource;
import com.bigdata.rdf.model.BigdataURI;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.sail.webapp.client.ConnectOptions;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.rdf.spo.SPORelation;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.BigdataValueIterator;
import com.bigdata.rdf.store.BigdataValueIteratorImpl;
import com.bigdata.rdf.vocab.decls.DCTermsVocabularyDecl;
import com.bigdata.rdf.vocab.decls.VoidVocabularyDecl;
import com.bigdata.striterator.IChunkedIterator;

/**
 * Helper class for VoID descriptions.
 * 
 * @see <a href="http://www.w3.org/TR/void/">Describing Linked Datasets with
 *      the VoID Vocabulary</a>
 *      
 * @author Bryan Thompson
 */
public class VoID {

    /**
     * The graph in which the service description is accumulated (from the
     * constructor). Every statement generated by this class is added to this
     * graph.
     */
    private final Graph g;
    
    /**
     * The KB instance that is being described (from the constructor). Its
     * namespace is used as the title of the data set and its indices are
     * scanned to compute the statistics.
     */
    private final AbstractTripleStore tripleStore;
    
    /**
     * The service end point(s) (from the constructor). Each entry is used as
     * a base URL: the URL encoded namespace and <code>/sparql</code> are
     * appended when the SPARQL end point of the data set is described.
     */
    private final String[] serviceURI;

    /**
     * The value factory used to create values for the service description graph
     * {@link #g}. It is obtained from {@link #g} in the constructor.
     */
    private final ValueFactory f;
    
//    /**
//     * The resource which models the service.
//     */
//    protected final BNode aService;

    /**
     * The resource which models the data set (from the constructor). It is
     * the subject of the data set level statements.
     */
    private final Resource aDataset;

    /**
     * The resource which models the default graph in the data set. This is a
     * blank node created in the constructor with the identifier
     * <code>defaultGraph</code>.
     */
    private final BNode aDefaultGraph;

    /**
     * 
     * @param g
     *            Where to assemble the description.
     * @param tripleStore
     *            The KB instance to be described.
     * @param serviceURI
     *            The SPARQL service end point.
     * @param aDefaultDataset
     *            The data set identifier that will be used on the description
     *            (the bigdata namespace of the dataset is obtained from the
     *            tripleStore).
     */
    public VoID(final Graph g, final AbstractTripleStore tripleStore,
            final String[] serviceURI, final Resource aDataset) {

        if (g == null)
            throw new IllegalArgumentException();

        if (tripleStore == null)
            throw new IllegalArgumentException();
        
        if (serviceURI == null)
            throw new IllegalArgumentException();

        if (serviceURI.length == 0)
            throw new IllegalArgumentException();

        for (String s : serviceURI)
            if (s == null)
                throw new IllegalArgumentException();
        
        if (aDataset == null)
            throw new IllegalArgumentException();

        this.g = g;
        
        this.tripleStore = tripleStore;
        
        this.serviceURI = serviceURI;
        
        this.f = g.getValueFactory();
        
        this.aDataset = aDataset;
    
        this.aDefaultGraph = f.createBNode("defaultGraph");
        
    }
    
    /**
     * Describe the default data set (the one identified by the namespace
     * associated with the {@link AbstractTripleStore}).
     * <p>
     * The type, title, namespace, SPARQL end point(s) and URI pattern of the
     * data set are always described. Everything else depends on the
     * arguments. Note that named graphs are only described when
     * <i>describeStatistics</i> is also <code>true</code> since the method
     * returns before reaching the named graphs otherwise.
     * 
     * @param describeStatistics
     *            When <code>true</code>, the VoID description will include the
     *            {@link VoidVocabularyDecl#vocabulary} declarations, the
     *            property partition statistics, and the class partition
     *            statistics.
     * @param describeNamedGraphs
     *            When <code>true</code> (and the KB is in quads mode), each
     *            named graph will also be described in the same level of
     *            detail as the default graph. Otherwise only the default graph
     *            will be described.
     * 
     * @throws UnsupportedOperationException
     *             if statistics are requested and the KB has only one access
     *             path (the predicate and class scans require the POS/POCS
     *             index).
     */
    public void describeDataSet(final boolean describeStatistics,
            final boolean describeNamedGraphs) {

        final String namespace = tripleStore.getNamespace();

        // This is a VoID data set.
        g.add(aDataset, RDF.TYPE, VoidVocabularyDecl.Dataset);

        // The namespace is used as a title for the data set.
        g.add(aDataset, DCTermsVocabularyDecl.title,
                f.createLiteral(namespace));

        // Also present the namespace in an unambiguous manner.
        g.add(aDataset, SD.KB_NAMESPACE, f.createLiteral(namespace));

        /*
         * Service end point(s) for this namespace.
         * 
         * Note: The namespace is URL encoded before it is embedded into the
         * end point URL (see the ticket "Missing URL encoding in
         * RemoteRepositoryManager").
         */
        for (String uri : serviceURI) {
            g.add(aDataset,
                    VoidVocabularyDecl.sparqlEndpoint,
                    f.createURI(uri + "/" + ConnectOptions.urlEncode(namespace)
                            + "/sparql"));
        }

        // any URI is considered to be an entity.
        g.add(aDataset, VoidVocabularyDecl.uriRegexPattern,
                f.createLiteral("^.*"));

        if (!describeStatistics) {

            // No statistics.
            return;

        }

        // Frequency count of the predicates in the default graph.
        final IVCount[] predicatePartitionCounts = predicateUsage(
                tripleStore);

        // Frequency count of the classes in the default graph.
        final IVCount[] classPartitionCounts = classUsage(tripleStore);

        // Describe vocabularies based on the predicate partitions.
        describeVocabularies(predicatePartitionCounts);

        // Default graph in the default data set.
        g.add(aDataset, SD.defaultGraph, aDefaultGraph);

        // Describe the default graph using statistics.
        describeGraph(aDefaultGraph, predicatePartitionCounts,
                classPartitionCounts);

        // Report for each named graph (quads mode only).
        if (describeNamedGraphs && tripleStore.isQuads()) {

            describeEachNamedGraph(predicatePartitionCounts,
                    classPartitionCounts);

        }

    }

    /**
     * Describe each named graph found in the context position of the CSPO
     * index.
     * <p>
     * Note: This is using the predicate and class partition statistics from
     * the default graph (RDF merge) to identify the set of all possible
     * predicates and classes within each named graph. It then tests each
     * predicate and class partition against the named graph and ignores those
     * which are not present in a given named graph. This is being done
     * because we do not have a CPxx index.
     * 
     * @param predicatePartitionCounts
     *            The predicate partition statistics of the default graph.
     * @param classPartitionCounts
     *            The class partition statistics of the default graph.
     */
    private void describeEachNamedGraph(
            final IVCount[] predicatePartitionCounts,
            final IVCount[] classPartitionCounts) {

        final SPORelation r = tripleStore.getSPORelation();

        // the index to use for distinct term scan.
        final SPOKeyOrder keyOrder = SPOKeyOrder.CSPO;

        // visit distinct IVs for context position on that index.
        @SuppressWarnings("rawtypes")
        final IChunkedIterator<IV> itr = r.distinctTermScan(keyOrder);

        // resolve IVs to terms efficiently during iteration.
        final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
                tripleStore/* resolveTerms */, itr);

        try {

            while (itr2.hasNext()) {

                final BigdataResource graph = (BigdataResource) itr2.next();

                // Keep only the partitions which are non-empty in this graph.
                final IVCount[] predicatePartitionCounts2 = predicateUsage(
                        tripleStore, graph.getIV(), predicatePartitionCounts);

                final IVCount[] classPartitionCounts2 = classUsage(
                        tripleStore, graph.getIV(), classPartitionCounts);

                final BNode aNamedGraph = f.createBNode();

                // Named graph in the default data set.
                g.add(aDataset, SD.namedGraph, aNamedGraph);

                // The name of that named graph.
                g.add(aNamedGraph, SD.name, graph);

                // Describe the named graph.
                describeGraph(aNamedGraph, predicatePartitionCounts2,
                        classPartitionCounts2);

            }

        } finally {

            // Always release the iterator, even if a description fails.
            itr2.close();

        }

    }

    /**
     * Describe the vocabularies which are in use in the KB based on the
     * predicate partition statistics. One <code>void:vocabulary</code>
     * statement is added to the data set for each distinct namespace of the
     * given predicates. The statements are added in dictionary order of the
     * namespaces.
     * 
     * @param predicatePartitionCounts
     *            The predicate partition statistics. The value of each entry
     *            must have been resolved and must be a {@link URI}.
     */
    protected void describeVocabularies(final IVCount[] predicatePartitionCounts) {

        // Find the distinct vocabularies in use.
        final Set<String> namespaces = new LinkedHashSet<String>();

        // property partitions.
        for (IVCount tmp : predicatePartitionCounts) {

            final URI p = (URI) tmp.getValue();

            String namespace = p.getNamespace();

            if (namespace.endsWith("#")) {

                // Strip trailing '#' per VoID specification.
                namespace = namespace.substring(0, namespace.length() - 1);

            }

            namespaces.add(namespace);

        }

        /*
         * Sort into dictionary order.
         * 
         * Note: The element type of the set is required here. On a raw Set,
         * toArray(T[]) is erased to Object[] and the assignment below is
         * rejected by the compiler.
         */
        final String[] a = namespaces.toArray(new String[namespaces.size()]);

        Arrays.sort(a);

        for (String namespace : a) {

            g.add(aDataset, VoidVocabularyDecl.vocabulary,
                    f.createURI(namespace));

        }

    }
    
    /**
     * Describe a named or default graph: its type, the summary counts, and
     * one blank node per property partition and per class partition.
     * <p>
     * NOTE(review): the <code>void:triples</code> and
     * <code>void:entities</code> values are read from the triple store as a
     * whole ({@link AbstractTripleStore#getStatementCount()} and
     * {@link AbstractTripleStore#getURICount()}) no matter which graph is
     * being described, so a named graph reports the same values as the
     * default graph. Confirm whether this is intended.
     * 
     * @param graph
     *            The resource which models the graph that is being described
     *            (named or default).
     * @param predicatePartitionCounts
     *            The predicate partition statistics for that graph.
     * @param classPartitionCounts
     *            The class partition statistics for that graph.
     */
    protected void describeGraph(final Resource graph,
            final IVCount[] predicatePartitionCounts,
            final IVCount[] classPartitionCounts) {

        // Declare the resource to be a graph.
        g.add(graph, RDF.TYPE, SD.Graph);

        // Summary: #of triples.
        g.add(graph, VoidVocabularyDecl.triples,
                f.createLiteral(tripleStore.getStatementCount()));

        // Summary: #of entities (URIs).
        g.add(graph, VoidVocabularyDecl.entities,
                f.createLiteral(tripleStore.getURICount()));

        // Summary: #of distinct predicates (an int valued literal).
        final int propertyCount = predicatePartitionCounts.length;

        g.add(graph, VoidVocabularyDecl.properties,
                f.createLiteral(propertyCount));

        // Summary: #of distinct classes (an int valued literal).
        final int classCount = classPartitionCounts.length;

        g.add(graph, VoidVocabularyDecl.classes,
                f.createLiteral(classCount));

        // One blank node for each property partition.
        for (int i = 0; i < propertyCount; i++) {

            final IVCount partition = predicatePartitionCounts[i];

            final BNode node = f.createBNode();

            final URI property = (URI) partition.getValue();

            g.add(graph, VoidVocabularyDecl.propertyPartition, node);

            g.add(node, VoidVocabularyDecl.property, property);

            g.add(node, VoidVocabularyDecl.triples,
                    f.createLiteral(partition.count));

        }

        // One blank node for each class partition.
        for (int i = 0; i < classCount; i++) {

            final IVCount partition = classPartitionCounts[i];

            final BNode node = f.createBNode();

            final BigdataValue type = partition.getValue();

            g.add(graph, VoidVocabularyDecl.classPartition, node);

            g.add(node, VoidVocabularyDecl.class_, type);

            g.add(node, VoidVocabularyDecl.triples,
                    f.createLiteral(partition.count));

        }

    }

    /**
     * An {@link IV} and a counter for that {@link IV}. The natural order of
     * this class is by descending {@link #count}.
     * <p>
     * Note: The natural order is not consistent with equals. Two instances
     * with the same count compare as equal even if their {@link IV}s differ,
     * so do not use this class as an element of a sorted set or as a key of a
     * sorted map.
     */
    protected static class IVCount implements Comparable<IVCount> {

        /**
         * The {@link IV} (never <code>null</code>).
         */
        public final IV<?, ?> iv;

        /**
         * The counter associated with the {@link IV}.
         */
        public final long count;

        /**
         * The resolved value for the {@link IV} and <code>null</code> until
         * it has been set using {@link #setValue(BigdataValue)}.
         */
        private BigdataValue val;

        /**
         * Return the associated {@link BigdataValue}.
         * 
         * Note: A resolution set is necessary if you want to attach the
         * {@link BigdataValue} to the {@link IV}.
         * 
         * @return The value (never <code>null</code>).
         * 
         * @throws NotMaterializedException
         *             if the value has not been set.
         */
        public BigdataValue getValue() {

            if (val == null)
                throw new NotMaterializedException(iv.toString());

            return val;

        }

        /**
         * Set the {@link BigdataValue} associated with the {@link IV}.
         * Setting an equal value again is allowed.
         * 
         * @param val
         *            The value.
         * 
         * @throws IllegalArgumentException
         *             if the argument is <code>null</code> or if a different
         *             value has already been set.
         */
        public void setValue(final BigdataValue val) {

            if (val == null)
                throw new IllegalArgumentException("val is null");

            if (this.val != null && !this.val.equals(val))
                throw new IllegalArgumentException(
                        "A different value is already set: " + this.val);

            this.val = val;

        }

        /**
         * @param iv
         *            The {@link IV} (required).
         * @param count
         *            The counter for that {@link IV}.
         * 
         * @throws IllegalArgumentException
         *             if the {@link IV} is <code>null</code>.
         */
        public IVCount(final IV<?, ?> iv, final long count) {

            if (iv == null)
                throw new IllegalArgumentException("iv is null");

            this.iv = iv;

            this.count = count;

        }

        /**
         * Place into order by descending count.
         */
        @Override
        public int compareTo(final IVCount arg0) {

            // Larger counts sort first. Compared without subtraction so the
            // result can not overflow.
            if (count < arg0.count)
                return 1;

            if (count > arg0.count)
                return -1;

            return 0;

        }

    }

    /**
     * Return an array of the distinct predicates in the KB ordered by their
     * descending frequency of use. The {@link IV}s in the returned array will
     * have been resolved to the corresponding {@link BigdataURI}s which can be
     * accessed using {@link IV#getValue()}.
     * 
     * @param kb
     *            The KB instance.
     */
    protected static IVCount[] predicateUsage(final AbstractTripleStore kb) {

        final SPORelation r = kb.getSPORelation();
        
        if (r.oneAccessPath) {

            // The necessary index (POS or POCS) does not exist.
            throw new UnsupportedOperationException();

        }

        final boolean quads = kb.isQuads();

        // the index to use for distinct predicate scan.
        final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS;

        // visit distinct term identifiers for predicate position on that index.
        @SuppressWarnings("rawtypes")
        final IChunkedIterator itr = r.distinctTermScan(keyOrder);

        // resolve term identifiers to terms efficiently during iteration.
        final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
                kb/* resolveTerms */, itr);

        try {

            final Set> ivs = new LinkedHashSet>();

            final Map, IVCount> counts = new LinkedHashMap, IVCount>();

            while (itr2.hasNext()) {

                final BigdataValue term = itr2.next();

                final IV iv = term.getIV();

                final long n = r.getAccessPath(null, iv, null, null)
                        .rangeCount(false/* exact */);

                ivs.add(iv);
                
                counts.put(iv, new IVCount(iv, n));

            }

            // Batch resolve IVs to Values
            final Map, BigdataValue> x = kb.getLexiconRelation()
                    .getTerms(ivs);

            for (Map.Entry, BigdataValue> e : x.entrySet()) {

                final IVCount count = counts.get(e.getKey());

                count.setValue(e.getValue());

            }

            final IVCount[] a = counts.values().toArray(
                    new IVCount[counts.size()]);

            // Order by descending count.
            Arrays.sort(a);

            return a;

        } finally {

            itr2.close();

        }

    }

    /**
     * Return the predicate partition statistics for the named graph.
     * <p>
     * Each of the given predicate partitions is tested against the named
     * graph using a range count for which an exact count is NOT requested.
     * 
     * @param kb
     *            The KB instance.
     * @param civ
     *            The {@link IV} of a named graph (required).
     * @param predicatePartitionCounts
     *            The candidate predicate partitions. Their values must have
     *            been resolved since they are copied onto the returned
     *            entries.
     * 
     * @return The predicate partition statistics for that named graph. Only
     *         predicate partitions which are non-empty are returned. The
     *         array is ordered by descending count.
     * 
     * @throws IllegalArgumentException
     *             if the KB is not in quads mode.
     */
    protected static IVCount[] predicateUsage(final AbstractTripleStore kb,
            final IV<?, ?> civ, final IVCount[] predicatePartitionCounts) {

        final SPORelation r = kb.getSPORelation();

        final boolean quads = kb.isQuads();

        if (!quads) {

            // Named graph only valid in quads mode.
            throw new IllegalArgumentException(
                    "Named graph only valid in quads mode.");

        }

        /*
         * The non-zero counts.
         * 
         * Note: The element type is required. On a raw List, toArray(T[]) is
         * erased to Object[] and could not be assigned to an IVCount[].
         */
        final List<IVCount> counts = new LinkedList<IVCount>();

        // Check the known non-empty predicate partitions.
        for (IVCount in : predicatePartitionCounts) {

            final long n = r.getAccessPath(null, in.iv, null, civ).rangeCount(
                    false/* exact */);

            if (n == 0)
                continue;

            // New entry: same IV and value, count restricted to this graph.
            final IVCount out = new IVCount(in.iv, n);

            out.setValue(in.getValue());

            counts.add(out);

        }

        final IVCount[] a = counts.toArray(new IVCount[counts.size()]);

        // Order by descending count.
        Arrays.sort(a);

        return a;

    }
    
    /**
     * Return an efficient statistical summary for the class partitions. The
     * SPARQL query for this is
     * 
     * 
     * SELECT  ?class (COUNT(?s) AS ?count ) { ?s a ?class } GROUP BY ?class ORDER BY ?count
     * 
     * 
     * However, it is much efficient to scan POS for
     * 
     *      * rdf:type ?o ?s
     * 
     * 
     * and report the range count of
     * 
     *      * rdf:type ?o ?s
     * 
     * 
     * for each distinct value of ?o.
     * 
     * @param kb
     *            The KB instance.
     *            
     * @return The class usage statistics.
     */
    protected static IVCount[] classUsage(final AbstractTripleStore kb) {

        final SPORelation r = kb.getSPORelation();

        if (r.oneAccessPath) {

            // The necessary index (POS or POCS) does not exist.
            throw new UnsupportedOperationException();

        }

        final boolean quads = kb.isQuads();

        final SPOKeyOrder keyOrder = quads ? SPOKeyOrder.POCS : SPOKeyOrder.POS;

        // Resolve IV for rdf:type
        final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE);

        kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType },
                1/* numTerms */, true/* readOnly */);

        if (rdfType.getIV() == null) {

            // No rdf:type assertions since rdf:type is unknown term.
            return new IVCount[0];

        }
        
        // visit distinct term identifiers for the rdf:type predicate.
        @SuppressWarnings("rawtypes")
        final IChunkedIterator itr = r.distinctMultiTermScan(keyOrder,
                new IV[] { rdfType.getIV() }/* knownTerms */);

        // resolve term identifiers to terms efficiently during iteration.
        final BigdataValueIterator itr2 = new BigdataValueIteratorImpl(
                kb/* resolveTerms */, itr);

        try {

            final Set> ivs = new LinkedHashSet>();

            final Map, IVCount> counts = new LinkedHashMap, IVCount>();

            while (itr2.hasNext()) {

                final BigdataValue term = itr2.next();

                final IV iv = term.getIV();

                final long n = r.getAccessPath(null, rdfType.getIV()/* p */,
                        iv/* o */, null).rangeCount(false/* exact */);

                ivs.add(iv);
                
                counts.put(iv, new IVCount(iv, n));

            }

            // Batch resolve IVs to Values
            final Map, BigdataValue> x = kb.getLexiconRelation()
                    .getTerms(ivs);

            for (Map.Entry, BigdataValue> e : x.entrySet()) {

                final IVCount count = counts.get(e.getKey());

                count.setValue(e.getValue());

            }

            final IVCount[] a = counts.values().toArray(
                    new IVCount[counts.size()]);

            // Order by descending count.
            Arrays.sort(a);

            return a;

        } finally {

            itr2.close();

        }

    }

    /**
     * Return the class partition statistics for the named graph.
     * <p>
     * Each of the given class partitions is tested against the named graph
     * using a range count for which an exact count is NOT requested.
     * 
     * @param kb
     *            The KB instance.
     * @param civ
     *            The {@link IV} of a named graph (required).
     * @param classPartitionCounts
     *            The candidate class partitions. Their values must have been
     *            resolved since they are copied onto the returned entries.
     * 
     * @return The class partition statistics for that named graph. Only class
     *         partitions which are non-empty are returned. The array is
     *         ordered by descending count and is empty if
     *         <code>rdf:type</code> is not a known term.
     * 
     * @throws IllegalArgumentException
     *             if the KB is not in quads mode.
     */
    protected static IVCount[] classUsage(final AbstractTripleStore kb,
            final IV<?, ?> civ, final IVCount[] classPartitionCounts) {

        final SPORelation r = kb.getSPORelation();

        final boolean quads = kb.isQuads();

        if (!quads) {

            // Named graph only valid in quads mode.
            throw new IllegalArgumentException(
                    "Named graph only valid in quads mode.");

        }

        // Resolve IV for rdf:type
        final BigdataURI rdfType = kb.getValueFactory().asValue(RDF.TYPE);

        // Note: read-only, so an unknown term is NOT written on the lexicon.
        kb.getLexiconRelation().addTerms(new BigdataValue[] { rdfType },
                1/* numTerms */, true/* readOnly */);

        if (rdfType.getIV() == null) {

            // No rdf:type assertions since rdf:type is unknown term.
            return new IVCount[0];

        }

        /*
         * The non-zero counts.
         * 
         * Note: The element type is required. On a raw List, toArray(T[]) is
         * erased to Object[] and could not be assigned to an IVCount[].
         */
        final List<IVCount> counts = new LinkedList<IVCount>();

        // Check the known non-empty class partitions.
        for (IVCount in : classPartitionCounts) {

            final long n = r.getAccessPath(null, rdfType.getIV()/* p */,
                    in.iv/* o */, civ).rangeCount(false/* exact */);

            if (n == 0)
                continue;

            // New entry: same IV and value, count restricted to this graph.
            final IVCount out = new IVCount(in.iv, n);

            out.setValue(in.getValue());

            counts.add(out);

        }

        final IVCount[] a = counts.toArray(new IVCount[counts.size()]);

        // Order by descending count.
        Arrays.sort(a);

        return a;

    }

}