/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jun 25, 2008
*/
package com.bigdata.rdf.rules;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.log4j.Logger;
import com.bigdata.bop.Constant;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.IVariableOrConstant;
import com.bigdata.bop.Var;
import com.bigdata.bop.joinGraph.IEvaluationPlanFactory;
import com.bigdata.bop.joinGraph.IRangeCountFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.ISortKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.journal.IIndexManager;
import com.bigdata.rdf.inf.Justification;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.relation.rule.BindingSetSortKeyBuilder;
import com.bigdata.rdf.spo.SPO;
import com.bigdata.rdf.spo.SPORelation;
import com.bigdata.rdf.spo.SPOSortKeyBuilder;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.relation.IMutableRelation;
import com.bigdata.relation.IRelation;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.UnsynchronizedArrayBuffer;
import com.bigdata.relation.rule.IAccessPathExpander;
import com.bigdata.relation.rule.IProgram;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.IStep;
import com.bigdata.relation.rule.eval.AbstractJoinNexus;
import com.bigdata.relation.rule.eval.AbstractSolutionBuffer;
import com.bigdata.relation.rule.eval.ActionEnum;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.IRuleState;
import com.bigdata.relation.rule.eval.IRuleStatisticsFactory;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.relation.rule.eval.RuleStats;
import com.bigdata.striterator.ChunkedArrayIterator;
import com.bigdata.striterator.IChunkedIterator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
/**
* {@link IProgram} execution support for the RDF DB.
*
* The rules have potential parallelism when performing closure. Each join has
* potential parallelism as well for subqueries. We could even define a PARALLEL
* iterator flag and have parallelism across index partitions for a
* read-historical iterator since the data service locators are immutable for
* historical reads.
*
* Rule-level parallelism (for fix point closure of a rule set) and join
* subquery-level parallelism could be distributed to available workers in a
* cluster. In a similar way, high-level queries could be distributed to workers
* in a cluster for evaluation. Such distribution would increase the practical
* parallelism beyond what a single machine could support as long as the total
* parallelism does not overload the cluster.
*
* There is a pragmatic limit on the #of concurrent threads for a single host.
* When those threads target a blocking queue, then thread contention becomes
* very high and throughput drops dramatically. We can reduce this problem by
* allocating a distinct {@link UnsynchronizedArrayBuffer} to each task. The
* task collects a 'chunk' in the {@link UnsynchronizedArrayBuffer}. When full,
* the buffer propagates its chunk onto a thread-safe buffer of chunks, which
* either flushes onto an {@link IMutableRelation} (mutation) or feeds an
* {@link IAsynchronousIterator} (high-level query). It is chunks themselves
* that accumulate in this thread-safe buffer, so each add() on that buffer may
* cause the thread to yield, but the return for yielding is an entire chunk in
* the buffer, not just a single element.
*
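* As an illustration only (plain JDK types and hypothetical names, not the
* bigdata buffer classes named above), the per-task chunking pattern amounts
* to:
*
* <pre>{@code
* // A producer task batches elements into a private chunk and performs one
* // synchronized hand-off per chunk rather than one per element.
* static void produce(final Iterator<Object> src,
*         final BlockingQueue<Object[]> sink, final int chunkCapacity)
*         throws InterruptedException {
*     Object[] chunk = new Object[chunkCapacity];
*     int n = 0;
*     while (src.hasNext()) {
*         chunk[n++] = src.next();
*         if (n == chunkCapacity) {
*             sink.put(chunk); // may block, but moves an entire chunk
*             chunk = new Object[chunkCapacity];
*             n = 0;
*         }
*     }
*     if (n > 0) // flush the partial chunk
*         sink.put(java.util.Arrays.copyOf(chunk, n));
* }
* }</pre>
*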
* There is one high-level buffer factory corresponding to each of the kinds of
* {@link ActionEnum}: {@link #newQueryBuffer()};
* {@link #newInsertBuffer(IMutableRelation)}; and
* {@link #newDeleteBuffer(IMutableRelation)}. In addition there is one for
* {@link UnsynchronizedArrayBuffer}s -- this is a buffer that is NOT
* thread-safe and that is designed to store a single chunk of elements, e.g.,
* in an array E[N].
*
* @author Bryan Thompson
* @version $Id$
*/
public class RDFJoinNexus extends AbstractJoinNexus implements IJoinNexus {
protected final static transient Logger log = Logger.getLogger(RDFJoinNexus.class);
private final RDFJoinNexusFactory joinNexusFactory;
private final boolean justify;
/**
* When <code>true</code>, the backchainer will be enabled for access path
* reads.
*/
private final boolean backchain;
private final IRuleStatisticsFactory ruleStatisticsFactory = new IRuleStatisticsFactory() {
public RuleStats newInstance(IStep step) {
return new RDFRuleStats(step);
}
public RuleStats newInstance(IRuleState ruleState) {
return new RDFRuleStats(null, getReadTimestamp(), ruleState);
}
// /**
// * Factory will resolve term identifiers in {@link IPredicate}s in the
// * tail of the {@link IRule} to {@link BigdataValue}s unless the
// * {@link IIndexManager} is an {@link IBigdataFederation}.
// *
// * @todo translation of term identifiers is disabled. someone is
// * interrupting the thread logging the {@link RuleStats}. until i
// * can figure out who that is, you will see term identifiers
// * rather than {@link BigdataValue}s.
// */
// public RuleStats newInstancex(IRuleState ruleState) {
//
// return new RDFRuleStats(
// (indexManager instanceof IBigdataFederation<?> ? null
// : indexManager), //
// getReadTimestamp(), //
// ruleState//
// );
//
// }
};
/**
* Extends {@link RuleStats} to translate the tail predicates back into
* RDF by resolving the term identifiers to {@link BigdataValue}s.
*/
private static class RDFRuleStats extends RuleStats {
private final IIndexManager indexManager;
private final long timestamp;
public RDFRuleStats(IStep step) {
super(step);
indexManager = null;
timestamp = 0L; // ignored.
}
/**
*
* @param indexManager
* When non-<code>null</code>, this is used to resolve
* the term identifiers in the {@link IPredicate}s in the
* tail of the rule to {@link BigdataValue}s.
*
* @param ruleState
*/
public RDFRuleStats(final IIndexManager indexManager,
final long timestamp, final IRuleState ruleState) {
super(ruleState);
this.indexManager = indexManager;
this.timestamp = timestamp;
}
@Override
@SuppressWarnings("unchecked")
protected String toString(final IPredicate pred) {
if (indexManager == null) {
return pred.toString().replace(", ", " ");
}
final SPORelation spoRelation = (SPORelation) indexManager
.getResourceLocator().locate(pred.getRelationName(0),
timestamp);
final AbstractTripleStore db = spoRelation.getContainer();
final Object s, p, o;
try {
{
final IVariableOrConstant<IV> t = pred.get(0);
if (t.isVar())
s = t.getName();
else
s = db.toString(t.get());
}
{
final IVariableOrConstant<IV> t = pred.get(1);
if (t.isVar())
p = t.getName();
else
p = db.toString(t.get());
}
{
final IVariableOrConstant<IV> t = pred.get(2);
if (t.isVar())
o = t.getName();
else
o = db.toString(t.get());
}
} catch (Throwable t) {
/*
* @todo It appears that someone is interrupting the thread in
* which the logging data is being generated. You can see this
* if you enable translation of term identifiers above in the
* factory that produces instances of this class.
*/
throw new RuntimeException("pred=" + pred + ", timestamp="
+ timestamp + ", indexManager=" + indexManager
+ ", db=" + db, t);
}
return "(" + s + " " + p + " " + o + ")";
}
}
/**
* @param joinNexusFactory
* The object used to create this instance and which can be used
* to create other instances as necessary for distributed rule
* execution.
* @param indexManager
* The object used to resolve indices, relations, etc.
*/
public RDFJoinNexus(final RDFJoinNexusFactory joinNexusFactory,
final IIndexManager indexManager) {
super(joinNexusFactory, indexManager);
this.joinNexusFactory = joinNexusFactory;
this.justify = joinNexusFactory.justify;
this.backchain = joinNexusFactory.backchain;
}
@Override
public IRuleStatisticsFactory getRuleStatisticsFactory() {
return ruleStatisticsFactory;
}
/**
* When {@link #backchain} is <code>true</code> and the tail predicate is
* reading on the {@link SPORelation}, then the {@link IAccessPath} is
* wrapped so that the iterator will visit the backchained inferences as
* well. On the other hand, if {@link IPredicate#getPartitionId()} is
* defined (not <code>-1</code>) then the returned access path will be for
* the specified shard using the data service local index manager (
* {@link #indexManager} MUST be the data service local index manager for
* this case) and expanders WILL NOT be applied (they require a view of the
* total relation, not just a shard).
*
* @see InferenceEngine
* @see BackchainAccessPath
*
* @todo consider encapsulating the {@link IRangeCountFactory} in the
* returned access path for non-exact range count requests. this will
* make it slightly harder to write the unit tests for the
* {@link IEvaluationPlanFactory}
*/
@Override
@SuppressWarnings("unchecked")
public IAccessPath getTailAccessPath(final IRelation relation,
final IPredicate predicate) {
// if (predicate.getPartitionId() != -1) {
//
// /*
// * Note: This handles a read against a local index partition. For
// * scale-out, the [indexManager] will be the data service's local
// * index manager.
// *
// * Note: Expanders ARE NOT applied in this code path. Expanders
// * require a total view of the relation, which is not available
// * during scale-out pipeline joins. Likewise, the [backchain]
// * property will be ignored since it is handled by an expander.
// *
// * @todo If getAccessPathForIndexPartition() is raised into the
// * IRelation interface, then we can get rid of the cast to the
// * SPORelation implementation.
// */
//
//// return ((SPORelation) relation).getAccessPathForIndexPartition(
//// indexManager, predicate);
// return relation.getAccessPath(indexManager, relation
// .getKeyOrder(predicate), predicate);
//
// }
//
// // Find the best access path for the predicate for that relation.
// IAccessPath accessPath = relation.getAccessPath(predicate);
////
//// if (predicate.getPartitionId() != -1) {
////
//// /*
//// * Note: The expander can not run against a shard since it assumes
//// * access to the full key range of the index. Expanders are
//// * convenient and work well for stand alone indices, but they should
//// * be replaced by rule rewrites for scale-out.
//// */
////
//// return accessPath;
////
//// }
//
final IKeyOrder keyOrder = relation.getKeyOrder(predicate);
IAccessPath accessPath = relation.getAccessPath(
indexManager/* localIndexManager */, keyOrder, predicate);
final IAccessPathExpander expander = predicate.getAccessPathExpander();
//
// if (expander != null) {
//
// // allow the predicate to wrap the access path
// accessPath = expander.getAccessPath(accessPath);
//
// }
// @todo raise into SPORelation#getAccessPath/3?
// @see https://sourceforge.net/apps/trac/bigdata/ticket/231
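/*
* Wrap the access path so that its iterator also visits the backchained
* inferences, unless the predicate's expander (if any) does not permit
* backchaining.
*/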
if(backchain && relation instanceof SPORelation) {
if (expander == null || expander.backchain()) {
final SPORelation spoRelation = (SPORelation)relation;
accessPath = new BackchainAccessPath(
spoRelation.getContainer(), accessPath,
joinNexusFactory.isOwlSameAsUsed ? Boolean.TRUE
: Boolean.FALSE);
}
}
// return that access path.
return accessPath;
}
// @SuppressWarnings("unchecked")
// public boolean bind(final IRule rule, final int index, final Object e,
// final IBindingSet bindings) {
//
// // propagate bindings from the visited object into the binding set.
// copyValues((IElement) e, rule.getTail(index), bindings);
//
// // verify constraints.
// return rule.isConsistent(bindings);
//
// }
//
// public boolean bind(final IPredicate<?> pred, final IConstraint constraint,
// final Object e, final IBindingSet bindings) {
//
// // propagate bindings from the visited object into the binding set.
// copyValues((IElement) e, pred, bindings);
//
// if (constraint != null) {
//
// // verify constraint.
// return constraint.accept(bindings);
//
// }
//
// // no constraint.
// return true;
//
// }
// @SuppressWarnings("unchecked")
// private void copyValues(final IElement e, final IPredicate<?> pred,
// final IBindingSet bindingSet) {
//
// for (int i = 0; i < pred.arity(); i++) {
//
// final IVariableOrConstant<?> t = pred.get(i);
//
// if (t.isVar()) {
//
// final IVariable<?> var = (IVariable<?>) t;
//
// final Constant<?> newval = new Constant(e.get(i));
//
// bindingSet.set(var, newval);
//
// }
//
// }
//
// }
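/**
* Note: the same mock {@link TermId} constant ({@link #fakeTermId}) is
* returned regardless of the predicate or variable.
*/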
public IConstant fakeBinding(IPredicate pred, Var var) {
return fakeTermId;
}
final private static transient IConstant fakeTermId =
new Constant(TermId.mockIV(VTE.URI));
/**
* FIXME unit tests for DISTINCT with a head and ELEMENT, with bindings and
* a head, with bindings but no head, and with a head but no bindings
* (error). See {@link #runQuery(IStep)}
*
* FIXME unit tests for SORT with and without DISTINCT and with the various
* combinations used in the unit tests for DISTINCT. Note that SORT, unlike
* DISTINCT, requires that all solutions are materialized before any
* solutions can be returned to the caller. A lot of optimization can be
* done for SORT implementations, including merge sort of large blocks (ala
* map/reduce), using compressed sort keys or word sort keys with 2nd stage
* disambiguation, etc.
*
* FIXME Add property for sort {ascending,descending,none} to {@link IRule}.
* The sort order can also be specified in terms of a sequence of variables.
* The choice of the variable order should be applied here.
*
* FIXME The properties that govern the Unicode collator for the generated
* sort keys should be configured by the {@link RDFJoinNexusFactory}. In
* particular, Unicode should be handled however it is handled for the
* {@link LexiconRelation}.
*/
public ISortKeyBuilder newBindingSetSortKeyBuilder(final IRule rule) {
final IKeyBuilder keyBuilder = KeyBuilder.newUnicodeInstance();
final int nvars = rule.getVariableCount();
final IVariable[] vars = new IVariable[nvars];
{
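// Collect the rule's variables into an array, preserving their iteration order.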
final Iterator<IVariable> itr = rule.getVariables();
int i = 0;
while (itr.hasNext()) {
vars[i++] = itr.next();
}
}
// @todo this class has RDF specific stuff in it.
return new BindingSetSortKeyBuilder(keyBuilder, vars);
}
@Override
protected ISortKeyBuilder<?> newSortKeyBuilder(final IPredicate<?> head) {
return new SPOSortKeyBuilder(head.arity());
}
/**
* Buffer writes on {@link IMutableRelation#insert(IChunkedIterator)} when it is
* {@link #flush() flushed}.
*
* @author Bryan Thompson
* @version $Id$
* @param <E>
*/
public static class InsertSPOAndJustificationBuffer<E> extends AbstractSolutionBuffer<E> {
/**
* @param capacity
* @param relation
*/
public InsertSPOAndJustificationBuffer(final int capacity,
final IMutableRelation relation) {
super(capacity, relation);
}
@Override
protected long flush(final IChunkedOrderedIterator<ISolution<E>> itr) {
try {
/*
* The mutation count is the #of SPOs written (there is one
* justification written per solution generated, but the
* mutation count does not reflect duplicate justifications -
* only duplicate statements).
*
* Note: the optional filter for the ctor was already applied.
* If an element/solution was rejected, then it is not in the
* buffer and we will never see it during flush().
*/
long mutationCount = 0;
while (itr.hasNext()) {
final ISolution[] chunk = itr.nextChunk();
mutationCount += writeChunk(chunk);
}
return mutationCount;
} finally {
itr.close();
}
}
private long writeChunk(final ISolution[] chunk) {
final int n = chunk.length;
if(log.isDebugEnabled())
log.debug("chunkSize="+n);
final long begin = System.currentTimeMillis();
final SPO[] a = new SPO[ n ];
final Justification[] b = new Justification[ n ];
for (int i = 0; i < chunk.length; i++) {
if(log.isDebugEnabled()) {
log.debug("chunk["+i+"] = "+chunk[i]);
}
final ISolution<SPO> solution = (ISolution<SPO>) chunk[i];
a[i] = solution.get();
b[i] = new Justification(solution);
}
final SPORelation r = (SPORelation) (IMutableRelation) getRelation();
/*
* Use a thread pool to write out the statements and the
* justifications concurrently. This dramatically reduces the
* latency when also writing justifications.
*/
final List<Callable<Long>> tasks = new ArrayList<Callable<Long>>(2);
/*
* Note: we reject using the filter before stmts or justifications
* make it into the buffer so we do not need to apply the filter
* again here.
*/
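// Task: write the statements (SPOs) onto the statement indices.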
tasks.add(new Callable<Long>(){
public Long call() {
return r.insert(a,a.length,null/*filter*/);
}
});
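// Task: write the justifications for the newly computed solutions.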
tasks.add(new Callable<Long>(){
public Long call() {
return r
.addJustifications(new ChunkedArrayIterator<Justification>(
b.length, b, null/* keyOrder */));
}
});
final List<Future<Long>> futures;
/*
* @todo The timings for the tasks that we run here are not being
* reported up to this point.
*/
final long mutationCount;
try {
futures = r.getExecutorService().invokeAll(tasks);
mutationCount = futures.get(0).get();
futures.get(1).get();
} catch (InterruptedException ex) {
throw new RuntimeException(ex);
} catch (ExecutionException ex) {
throw new RuntimeException(ex);
}
final long elapsed = System.currentTimeMillis() - begin;
if (log.isInfoEnabled())
log.info("Wrote " + mutationCount
+ " statements and justifications in "
+ elapsed + "ms");
return mutationCount;
}
}
/**
* Overridden to handle justifications when using truth maintenance.
*
* {@inheritDoc}
*/
@Override
@SuppressWarnings("unchecked")
public IBuffer<ISolution[]> newInsertBuffer(final IMutableRelation relation) {
if (getAction() != ActionEnum.Insert)
throw new IllegalStateException();
if (log.isDebugEnabled()) {
log.debug("relation=" + relation);
}
if(justify) {
/*
* Buffer knows how to write the computed elements on the statement
* indices and the computed binding sets on the justifications
* indices.
*/
return new InsertSPOAndJustificationBuffer(chunkOfChunksCapacity,
relation);
}
/*
* Buffer resolves the computed elements and writes them on the
* statement indices.
*/
return new AbstractSolutionBuffer.InsertSolutionBuffer(
chunkOfChunksCapacity, relation);
}
}