// com.blazegraph.gremlin.structure.BlazeGraph (Maven / Gradle / Ivy)

/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.blazegraph.gremlin.structure;

import static com.blazegraph.gremlin.util.Lambdas.toMap;
import static java.util.stream.Collectors.toList;

import java.util.AbstractMap.SimpleEntry;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.configuration.Configuration;
import org.apache.tinkerpop.gremlin.process.computer.GraphComputer;
import org.apache.tinkerpop.gremlin.structure.Direction;
import org.apache.tinkerpop.gremlin.structure.Edge;
import org.apache.tinkerpop.gremlin.structure.Element;
import org.apache.tinkerpop.gremlin.structure.Graph;
import org.apache.tinkerpop.gremlin.structure.Graph.Features.VertexFeatures;
import org.apache.tinkerpop.gremlin.structure.Property;
import org.apache.tinkerpop.gremlin.structure.T;
import org.apache.tinkerpop.gremlin.structure.Transaction;
import org.apache.tinkerpop.gremlin.structure.Vertex;
import org.apache.tinkerpop.gremlin.structure.VertexProperty;
import org.apache.tinkerpop.gremlin.structure.VertexProperty.Cardinality;
import org.apache.tinkerpop.gremlin.structure.util.ElementHelper;
import org.apache.tinkerpop.gremlin.structure.util.StringFactory;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.query.BindingSet;
import org.openrdf.query.Query;
import org.openrdf.query.impl.MapBindingSet;
import org.openrdf.repository.RepositoryConnection;

import com.bigdata.rdf.internal.XSD;
import com.bigdata.rdf.internal.impl.extensions.DateTimeExtension;
import com.bigdata.rdf.model.BigdataBNode;
import com.bigdata.rdf.model.BigdataStatement;
import com.bigdata.rdf.model.BigdataURI;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.sail.RDRHistory;
import com.bigdata.rdf.sail.model.RunningQuery;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.blazegraph.gremlin.embedded.BasicRepositoryProvider;
import com.blazegraph.gremlin.embedded.BlazeGraphEmbedded;
import com.blazegraph.gremlin.embedded.BlazeGraphEmbedded.BlazeTransaction;
import com.blazegraph.gremlin.listener.BlazeGraphAtom;
import com.blazegraph.gremlin.listener.BlazeGraphEdit;
import com.blazegraph.gremlin.listener.BlazeGraphEdit.Action;
import com.blazegraph.gremlin.util.CloseableIterator;
import com.blazegraph.gremlin.util.Code;
import com.blazegraph.gremlin.util.LambdaLogger;
import com.blazegraph.gremlin.util.Streams;

import info.aduna.iteration.CloseableIteration;

/**
 * Blazegraph/tinkerpop3 integration.  Handles the mapping between the 
 * tinkerpop3 data model and a custom RDF/PG data model using RDF*.
 * 
 * Currently the only concrete implementation of this class is
 * {@link BlazeGraphEmbedded}, which provides an embedded (same JVM)
 * implementation of the Blazegraph/tinkerpop3 API.
 * 
 * See {@link BlazeGraphFeatures} for what tinkerpop3 features this
 * implementation supports. In addition to the tinkerpop3 features, this API
 * also provides the following:
 * <ul>
 * <li>History API - capture full or partial history of edits to the graph.</li>
 * <li>Built-in full text index and search API to find graph elements.</li>
 * <li>Automatic SPARQL to PG translation - run a SPARQL query and get your 
 *     results back in property graph form.</li>
 * <li>Query management API - list and cancel running Sparql queries.</li>
 * <li>Bulk Load API for fast setup of new graphs.</li>
 * </ul>
 * <p>
 * And an additional two features specific to the embedded implementation:
 * <ul>
 * <li>Listener API - subscribe to notifications about updates to the graph 
 *     (adds and removes of vertices/edges/properties, commits, rollbacks, etc.)</li>
 * <li>Support for MVCC concurrency model for high-concurrency read access.</li>
 * </ul>
 * 
 * @author mikepersonick
 */
@Graph.OptIn("com.blazegraph.gremlin.structure.StructureStandardSuite")
@Graph.OptIn(Graph.OptIn.SUITE_STRUCTURE_INTEGRATE)
@Graph.OptIn(Graph.OptIn.SUITE_STRUCTURE_PERFORMANCE)
@Graph.OptIn(Graph.OptIn.SUITE_GROOVY_ENVIRONMENT)
@Graph.OptIn(Graph.OptIn.SUITE_GROOVY_ENVIRONMENT_INTEGRATE)
@Graph.OptIn(Graph.OptIn.SUITE_GROOVY_ENVIRONMENT_PERFORMANCE)
public abstract class BlazeGraph implements Graph {
    
    /**
     * Logger for this class, named after {@code BlazeGraph.class}.
     */
    protected final transient static LambdaLogger log = LambdaLogger.getLogger(BlazeGraph.class);
    
    /**
     * Second logger whose name is this class's name plus the suffix
     * {@code ".SparqlLog"}, so it can be configured separately from
     * {@link #log}.
     * 
     * NOTE(review): presumably used to print generated Sparql, truncated to
     * the configured sparqlLogMax chars - confirm against the call sites.
     */
    protected final transient static LambdaLogger sparqlLog = LambdaLogger.getLogger(BlazeGraph.class.getName() + ".SparqlLog");
    
    /**
     * Options that can be specified in the graph configuration.  Every key
     * is the fully qualified name of {@link BlazeGraph} followed by a
     * dot-separated suffix.
     * 
     * @author mikepersonick
     */
    public static interface Options {

        /**
         * The {@link BlazeValueFactory} instance this graph should use.
         * Defaults to {@link BlazeValueFactory#INSTANCE}.  The configured
         * property is cast directly to {@link BlazeValueFactory}, so it must
         * be an instance, not a class name.
         */
        String VALUE_FACTORY = BlazeGraph.class.getName() + ".valueFactory";
        
        /**
         * The max query time for Sparql queries before timeout.  Read as an
         * int; defaults to infinite (0).
         * 
         * NOTE(review): the time unit is not stated here - confirm where the
         * value is applied to a query.
         */
        String MAX_QUERY_TIME = BlazeGraph.class.getName() + ".maxQueryTime";
        
        /**
         * An internal option set by the concrete implementation as a floor
         * to use when assigning list index values for Cardinality.list 
         * properties.  Read as a long.  Default is 
         * System.currentTimeMillis(), but better is to use the last commit
         * time of the database (which could be in the future in cases of
         * clock skew). 
         */
        String LIST_INDEX_FLOOR = BlazeGraph.class.getName() + ".listIndexFloor";

        /**
         * Maximum number of chars to print through the SparqlLogger.  Read as
         * an int; defaults to {@link #DEFAULT_SPARQL_LOG_MAX}.
         */
        String SPARQL_LOG_MAX = BlazeGraph.class.getName() + ".sparqlLogMax";
        
        /**
         * Default value for {@link #SPARQL_LOG_MAX}: 10k chars.
         */
        int DEFAULT_SPARQL_LOG_MAX = 10000;

    }
    
    /**
     * Enum used by the full text search API to select how the terms of a
     * search string are matched.
     *  
     * @author mikepersonick
     */
    public static enum Match {
        
        /**
         * Match any terms in the search string (OR).
         */
        ANY,
        
        /**
         * Match all terms in the search string (AND).
         */
        ALL,
        
        /**
         * Match the search string exactly using a regex filter. Most expensive
         * option - use {@link #ALL} instead if possible.
         */
        EXACT;
        
    }
    
    /**
     * Value factory for round-tripping between property graph values 
     * (ids, labels, keys, and values) and RDF values (URIs and Literals).
     * Taken from the {@link Options#VALUE_FACTORY} configuration property
     * when present, otherwise {@link BlazeValueFactory#INSTANCE}.
     */
    private final BlazeValueFactory vf;
    
    /**
     * Configuration of this graph instance, as handed to the constructor.
     */
    protected final Configuration config;
    
    /**
     * Sparql query string generator, constructed from {@link #vf}.
     */
    private final SparqlGenerator sparql;
    
    /**
     * Counter for Cardinality.list vertex property ids, which also serve as 
     * their list index.  The counter is seeded from the
     * {@link Options#LIST_INDEX_FLOOR} configuration property, which falls
     * back to System.currentTimeMillis() when not set.  With the fallback we
     * are guaranteed to have monotonically increasing ids/indices
     * for a given vertex/key, assuming no system clock skew.  
     * 
     * A concrete implementation can supply the last commit time on the
     * underlying journal as the floor instead, which makes us impervious to
     * bad system clock times.
     */
    private final AtomicLong vpIdFactory;
    
    /**
     * Max Query Time used to globally set the query timeout.  Read from
     * {@link Options#MAX_QUERY_TIME}.
     * 
     * Default is 0 (unlimited)
     */
    private final int maxQueryTime;
    
    /**
     * URI used for labeling elements.  Obtained from {@link #vf}.
     */
    private final URI TYPE;
    
    /**
     * URI used for list item values.  Obtained from {@link #vf}.
     */
    private final URI VALUE;
    
    /**
     * Datatype URI for list index for Cardinality.list vertex properties. 
     * Obtained from {@link #vf}.
     */
    private final URI LI_DATATYPE;

    /**
     * Transform functions for converting from RDF query results to property
     * graph results.
     */
    protected final Transforms transforms;
    
    /**
     * Maximum number of chars to print through the SparqlLogger.  Read from
     * {@link Options#SPARQL_LOG_MAX}; defaults to 10k
     * ({@link Options#DEFAULT_SPARQL_LOG_MAX}).
     */
    protected final int sparqlLogMax;
    
    /**
     * When this is set to true, disables any implicit reads/removes that are
     * interleaved with the process of adding new data. This means no checking
     * on vertex and edge id reuse and no cleaning of old property values
     * (applies to all properties on Edges and VertexProperties and 
     * Cardinality.single properties on Vertices).
     * 
     * Starts out false (incremental update mode).
     */
    private transient volatile boolean bulkLoad = false;
    
    /**
     * Build a graph instance from the supplied configuration.
     * 
     * The configuration is consulted for the keys declared in
     * {@link Options}: the value factory (falling back to
     * {@link BlazeValueFactory#INSTANCE}), the list index floor (falling back
     * to the current system time), the max query time (falling back to 0) and
     * the Sparql log limit (falling back to
     * {@link Options#DEFAULT_SPARQL_LOG_MAX}).
     * 
     * @param config
     *          the graph configuration; retained as {@link #config}
     */
    protected BlazeGraph(final Configuration config) {
        this.config = config;

        // An explicitly configured factory wins; otherwise use the default.
        final BlazeValueFactory configuredFactory =
                (BlazeValueFactory) config.getProperty(Options.VALUE_FACTORY);
        final BlazeValueFactory defaultFactory = BlazeValueFactory.INSTANCE;
        final BlazeValueFactory factory =
                (configuredFactory != null) ? configuredFactory : defaultFactory;
        this.vf = factory;

        // Seed for Cardinality.list vertex property ids / list indices.
        final long now = System.currentTimeMillis();
        final long floor = config.getLong(Options.LIST_INDEX_FLOOR, now);
        this.vpIdFactory = new AtomicLong(floor);

        this.maxQueryTime = config.getInt(Options.MAX_QUERY_TIME, 0);

        final int defaultLogMax = Options.DEFAULT_SPARQL_LOG_MAX;
        this.sparqlLogMax = config.getInt(Options.SPARQL_LOG_MAX, defaultLogMax);

        // Well-known URIs supplied by the value factory.
        this.TYPE = factory.type();
        this.VALUE = factory.value();
        this.LI_DATATYPE = factory.liDatatype();

        this.sparql = new SparqlGenerator(factory);
        this.transforms = new Transforms();
    }
    
    /**
     * Accessor for the {@link BlazeValueFactory} this graph uses to
     * round-trip between Tinkerpop values and RDF values.
     * 
     * @return the value factory selected when this graph was constructed
     */
    public BlazeValueFactory valueFactory() {
        return this.vf;
    }
    
    /**
     * RDF value factory for Sesame model objects.  Abstract: each concrete
     * subclass supplies its own.
     * 
     * @return the {@link BigdataValueFactory} for this graph
     */
    public abstract BigdataValueFactory rdfValueFactory();

    /**
     * Provide a connection to the SAIL repository for read and write
     * operations.  Abstract: each concrete subclass supplies its own.
     * 
     * @return the {@link RepositoryConnection} to read from and write to
     */
    protected abstract RepositoryConnection cxn();
    
    /**
     * Report the current write mode of the graph.  Incremental update is the
     * default mode.
     * 
     * @return true when the graph is in bulkLoad mode, false when it is in
     *         incremental update mode
     * 
     * @see #setBulkLoad(boolean)
     */
    public boolean isBulkLoad() {
        return this.bulkLoad;
    }
    
    /**
     * When this is set to true, disables any implicit reads/removes that are
     * interleaved with the process of adding new data. This means no checking
     * on vertex and edge id reuse and no cleaning of old property values
     * (applies to all properties on Edges and VertexProperties and
     * Cardinality.single properties on Vertices). This results in considerably
     * greater write throughput and is suitable for loading a new data set into
     * an empty graph or loading data that does not overlap with any data
     * already in an existing graph.
     * 
     * Default is incremental update (bulkLoad = false).
     * 
     * @param bulkLoad
     *          true to switch to bulk load mode, false to switch back to
     *          incremental update mode
     * 
     * @see #isBulkLoad()
     */
    public void setBulkLoad(final boolean bulkLoad) {
        this.bulkLoad = bulkLoad;
    }
    
    /**
     * Run the supplied code fragment with the graph in bulk load mode.
     * 
     * If the graph is already in bulk load mode the code is simply run and
     * the mode is left untouched.  Otherwise bulk load mode is switched on
     * first, and a callback that switches it back off is handed to
     * {@link Code#wrapThrow} together with the code.
     * 
     * @param code
     *          the code fragment to execute
     * 
     * @see #setBulkLoad(boolean)
     */
    public void bulkLoad(final Code code) {
        final boolean alreadyBulkLoading = isBulkLoad();
        if (alreadyBulkLoading) {
            // Nested call: whoever enabled bulk load owns the reset.
            Code.wrapThrow(code);
            return;
        }
        setBulkLoad(true);
        Code.wrapThrow(code, () -> setBulkLoad(false));
    }
    
    /**
     * Bulk load a Graph (TinkerGraph or otherwise), asking the source graph's
     * own vertex features for the cardinality of each VertexProperty key.
     * Vertex and Edge ids will be toString()-ed, VertexProperty ids will be
     * ignored.
     * 
     * @param g
     *          the graph whose contents should be loaded into this graph
     */
    public void bulkLoad(final Graph g) {
        // Looked up once, before loading starts, and reused for every key.
        final VertexFeatures sourceVertexFeatures = g.features().vertex();
        bulkLoad(g, propertyKey -> sourceVertexFeatures.getCardinality(propertyKey));
    }
    
    /**
     * Bulk load a Graph (TinkerGraph or otherwise), using the supplied
     * function to determine VertexProperty key cardinality.  Vertex and Edge
     * ids will be toString()-ed, VertexProperty ids will be ignored.
     */
    public void bulkLoad(final Graph g, 
            final Function getCardinality) {
        bulkLoad(() -> {
            g.vertices().forEachRemaining(v -> {
                final String id = v.id().toString();
                final BlazeVertex bv = this.addVertex(T.id, id, T.label, v.label());
                v.properties().forEachRemaining(vp -> {
                    final String key = vp.key();
                    final Object val = vp.value();
                    final BlazeVertexProperty