/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 21, 2007
*/
package com.bigdata.rdf.lexicon;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.log4j.Logger;
import com.bigdata.btree.ICounter;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.AbstractLocalSplitResultAggregator;
import com.bigdata.btree.proc.IParallelizableIndexProcedure;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.btree.proc.SplitValuePair;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.LongPacker;
import com.bigdata.io.ShortPacker;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.lexicon.Term2IdWriteProc.Result;
import com.bigdata.relation.IMutableRelationIndexWriteProcedure;
import com.bigdata.service.Split;
import com.bigdata.util.BytesUtil;
/**
* This unisolated operation inserts terms into the term:id index,
* assigning identifiers to terms as a side-effect. The use of this operation
* MUST be followed by the use of {@link Id2TermWriteProc} to ensure that
* the reverse mapping from id to term is defined before any statements are
* inserted using the assigned term identifiers. The client MUST NOT make
* assertions using the assigned term identifiers until the corresponding
* {@link Id2TermWriteProc} operation has succeeded.
*
* In order for the lexicon to remain consistent if the client fails for any
* reason after the forward mapping has been made restart-safe and before the
* reverse mapping has been made restart-safe, clients MUST always use a
* successful {@link Term2IdWriteProc} followed by a successful
* {@link Id2TermWriteProc} before inserting statements using term identifiers
* into the statement indices. In particular, a client MUST NOT treat lookup
* against the terms index as satisfactory evidence that the term also exists
* in the reverse mapping.
*
* Note that it is perfectly possible that a concurrent client will overlap in
* the terms being inserted. The results will always be fully consistent if the
* rules of the road are observed since (a) unisolated operations are
* single-threaded; (b) term identifiers are assigned in an unisolated atomic
* operation by {@link Term2IdWriteProc}; and (c) the reverse mapping is made
* consistent with the assignments made/discovered by the forward mapping.
*
* Note: The {@link Term2IdWriteProc} and {@link Id2TermWriteProc} operations
* may be analyzed as a batch variant of the following pseudo code.
 *
 * <pre>
 * for each term:
 *
 *     termId = null;
 *
 *     synchronized (ndx) {
 *
 *         counter = ndx.getCounter();
 *
 *         termId = ndx.lookup(term.key);
 *
 *         if (termId == null) {
 *
 *             termId = counter.inc();
 *
 *             ndx.insert(term.key, termId);
 *
 *         }
 *
 *     }
 * </pre>
 *
* In addition, the actual operations against scale-out indices are performed on
* index partitions rather than on the whole index.
*
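 * Note: The following is a minimal client-side sketch of the required
 * ordering. The <code>lexicon</code> object and its methods are
 * illustrative placeholders, not part of this class:
 *
 * <pre>
 * // 1. Make the forward (term:id) mapping restart-safe, collecting the
 * // discovered / assigned term identifiers.
 * final Term2IdWriteProc.Result result = lexicon.writeTerm2Id(terms);
 *
 * // 2. Make the reverse (id:term) mapping restart-safe BEFORE using the ids.
 * lexicon.writeId2Term(result.ivs, terms);
 *
 * // 3. Only now may statements be asserted using those term identifiers.
 * lexicon.writeStatements(statements);
 * </pre>
 *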
* @author Bryan Thompson
*/
public class Term2IdWriteProc extends AbstractKeyArrayIndexProcedure<Result>
        implements IParallelizableIndexProcedure<Result>,
        IMutableRelationIndexWriteProcedure {
private static final Logger log = Logger.getLogger(Term2IdWriteProc.class);
// static {
// if(DEBUG) {
//
// log.removeAllAppenders();
//
// try {
// log.addAppender(new FileAppender(new SimpleLayout(),"Term2IdWriteProc.log"));
// } catch (IOException e) {
// e.printStackTrace();
// }
//
// }
// }
/**
 * Flag enables optional ground truth verification. Verification runs only
 * when DEBUG logging is enabled AND this flag is ALSO set.
 *
 * WARNING: This IS NOT scalable!
 *
 * WARNING: This option IS NOT safe when using more than one triple
 * store either concurrently or in sequence! For example, you can use it to
 * examine a single unit test for inconsistencies, but not a sequence of unit
 * tests, since the data are kept in the same global maps and would hence
 * confound the tests!
*/
private static boolean enableGroundTruth = false;
private static ConcurrentHashMap<Long, byte[]> groundTruthId2Term;
private static ConcurrentHashMap<byte[], Long> groundTruthTerm2Id;
static {
if (log.isDebugEnabled() && enableGroundTruth) {
log.warn("Will track ground truth assignments");
// note: use a large initial capacity. default concurrency level is 16.
groundTruthId2Term = new ConcurrentHashMap<Long, byte[]>(500000);
groundTruthTerm2Id = new ConcurrentHashMap<byte[], Long>(500000);
}
}
/**
*
*/
private static final long serialVersionUID = -4736465754523655679L;
/**
 * Serialized as extended metadata. When <code>true</code>, unknown terms
 * are NOT added to the database.
*/
private boolean readOnly;
@Override
public final boolean isReadOnly() {
return readOnly;
}
/**
 * Serialized as extended metadata. When <code>true</code>, blank nodes
 * are stored in the lexicon's forward index.
*/
private boolean storeBlankNodes;
public final boolean isStoreBlankNodes() {
return storeBlankNodes;
}
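/**
 * The #of bits of the assigned term identifier which the
 * {@link TermIdEncoder} will reverse in scale-out in order to spread the
 * assigned identifiers across the key range (ZERO disables the encoder).
 */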
private int scaleOutTermIdBitsToReverse;
/**
* De-serialization constructor.
*/
public Term2IdWriteProc() {
}
protected Term2IdWriteProc(IRabaCoder keySer, int fromIndex,
int toIndex, byte[][] keys, boolean readOnly,
boolean storeBlankNodes, int scaleOutTermIdBitsToReverse) {
super(keySer, null, fromIndex, toIndex, keys, null /* vals */);
this.readOnly = readOnly;
this.storeBlankNodes = storeBlankNodes;
this.scaleOutTermIdBitsToReverse = scaleOutTermIdBitsToReverse;
}
public static class Term2IdWriteProcConstructor extends
        AbstractKeyArrayIndexProcedureConstructor<Term2IdWriteProc> {
private final boolean readOnly;
private final boolean storeBlankNodes;
private final int scaleOutTermIdBitsToReverse;
/**
* Values ARE NOT sent.
*/
@Override
public final boolean sendValues() {
return false;
}
public Term2IdWriteProcConstructor(final boolean readOnly,
final boolean storeBlankNodes,
final int scaleOutTermIdBitsToReverse) {
this.readOnly = readOnly;
this.storeBlankNodes = storeBlankNodes;
this.scaleOutTermIdBitsToReverse = scaleOutTermIdBitsToReverse;
}
@Override
public Term2IdWriteProc newInstance(final IRabaCoder keySer,
final IRabaCoder valSer, final int fromIndex,
final int toIndex, final byte[][] keys, final byte[][] vals) {
assert vals == null;
if(log.isInfoEnabled())
log.info("TERM2ID Proc Ctor: ntuples=" + (toIndex-fromIndex));
return new Term2IdWriteProc(keySer, fromIndex, toIndex, keys,
readOnly, storeBlankNodes, scaleOutTermIdBitsToReverse);
}
}
/**
 * For each term whose serialized key is mapped to the current index
 * partition, look up the term in the terms index. If it is there,
 * then note its assigned termId. Otherwise, use the partition-local counter
 * to assign the term identifier, note the assignment so that it can be
 * communicated back to the client, and insert the {term, termId} entry into
 * the terms index.
*
* @param ndx
* The terms index.
*
* @return The {@link Result}, which contains the discovered / assigned
* term identifiers.
*
* TODO no point sending bnodes when readOnly.
*/
@Override
public Result applyOnce(final IIndex ndx, final IRaba keys, final IRaba vals) {
final boolean DEBUG = log.isDebugEnabled();
final int numTerms = keys.size();
assert numTerms > 0 : "numTerms="+numTerms;
// used to store the discovered / assigned term identifiers.
@SuppressWarnings("rawtypes")
final IV[] ivs = new IV[numTerms];
// used to assign term identifiers.
final ICounter counter = ndx.getCounter();
// // true iff this is an unpartitioned index.
// final boolean scaleOut = counter instanceof BTree.PartitionedCounter;
// used to serialize term identifiers.
@SuppressWarnings("resource")
final DataOutputBuffer idbuf = new DataOutputBuffer();
final TermIdEncoder encoder = readOnly ? null
: scaleOutTermIdBitsToReverse == 0 ? null : new TermIdEncoder(
scaleOutTermIdBitsToReverse);
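/*
 * Note: In scale-out, the TermIdEncoder permutes (reverses) some bits of
 * the partition-local counter value so that the assigned term identifiers
 * (and hence the subsequent writes on the id:term index) are spread across
 * the index partitions instead of clustering at the tail of the key range.
 */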
// final DataOutputBuffer kbuf = new DataOutputBuffer(128);
// #of new terms (#of writes on the index).
int nnew = 0;
for (int i = 0; i < numTerms; i++) {
// Note: Copying the key into a buffer does not help since we need
// it in its own byte[] to do lookup against the index.
// getKeys().copy(i, kbuf.reset());
final byte[] key = keys.get(i);
// this byte encodes the kind of term (URI, Literal, BNode, etc.)
final byte code = key[0];//KeyBuilder.decodeByte(key[0]);
if (!storeBlankNodes && code == ITermIndexCodes.TERM_CODE_BND) {
/*
* Do not enter blank nodes into the forward index.
*
* For this case, we just assign a term identifier and leave it
* at that. If two different documents by some chance happen to
* specify the same blank node ID they will still be assigned
* distinct term identifiers. The only way that you can get the
* same term identifier for a blank node is to have the blank
* node ID matched in a canonicalizing map of blank nodes by the
* client. That map, of course, should be scoped to the document
* in which the blank node IDs appear.
*/
if (readOnly) {
// blank nodes can not be resolved by the index.
ivs[i] = null;
} else {
/*
* Assign a term identifier.
*
* Note: The TermIdEncoder is ONLY used in scale-out.
*/
final long ctr = counter.incrementAndGet();
final long termId = encoder == null ? ctr : encoder
.encode(ctr);
ivs[i] = new TermId(VTE(code), termId);
}
} else {
/*
* Lookup in the forward index (URIs, Literals, and SIDs)
*
* Note: Also handles BNodes iff storeBlankNodes is true
*
* @todo reuse Tuple for lookups to reduce allocation (will
* reuse an internal buffer).
*/
final byte[] tmp = ndx.lookup(key);
if (tmp == null) {
// not found.
if(readOnly) {
// not found - will not be assigned.
ivs[i] = null;
} else {
/*
* Assign a term identifier.
*
* Note: The TermIdEncoder is ONLY used in scale-out.
*/
final long ctr = counter.incrementAndGet();
final long termId = encoder == null ? ctr : encoder
.encode(ctr);
@SuppressWarnings("rawtypes")
final TermId<?> iv = new TermId(VTE(code), termId);
if (DEBUG && enableGroundTruth) {
groundTruthTest(key, termId, ndx, counter);
}
final byte[] bytes = iv
.encode(KeyBuilder.newInstance()).getKey();
idbuf.reset().write(bytes);
// insert into index.
if (ndx.insert(key, idbuf.toByteArray()) != null) {
throw new AssertionError();
}
nnew++;
ivs[i] = iv;
}
} else { // found.
ivs[i] = IVUtility.decode(tmp);
}
}
}
/*
* Note: this is for debugging. It does not rely on ground truth, but
* only logs information. It was originally used to track down a lost
* update problem.
*/
// if (enableGroundTruth && ndx.getIndexMetadata().getPartitionMetadata() != null) {
//
// final long v = counter.get();
// final int pid = (int) v >>> 32;
// final int ctr = (int) v;
//
// // note: the mutable btree - accessed here for debugging only.
// final BTree btree;
// if (ndx instanceof AbstractBTree) {
// btree = (BTree) ndx;
// } else {
// btree = (BTree) ((FusedView) ndx).getSources()[0];
// }
//
// log.warn("after task"+
// ": nnew="+nnew+//
// ", partitionId="+ndx.getIndexMetadata().getPartitionMetadata().getPartitionId()+//
// ", pid="+pid+//
// ", ctr="+ctr+//
// ", counter="+counter.getClass().getName()+//
// ", sourceCheckpoint="+btree.getCheckpoint()// btree was loaded from here.
// );
//
// }
return new Result(ivs);
}
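/**
 * Ground truth verification (debug only). Verifies against the global maps
 * that the termId assigned to a key is stable (the same key is never
 * assigned a different termId) and unique (the same termId is never
 * assigned to a different key), throwing an {@link AssertionError} if
 * either invariant is violated.
 *
 * @param key
 *            The sort key for the term.
 * @param termId
 *            The term identifier assigned to that key.
 * @param ndx
 *            The terms index (used for error reporting).
 * @param counter
 *            The counter (used for error reporting).
 */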
private void groundTruthTest(final byte[] key, final long termId, final IIndex ndx,
final ICounter counter) {
if(groundTruthId2Term.isEmpty()) {
log.warn("Ground truth testing enabled.");
}
/*
* Note: add to map if not present. returns the value
* already stored in the map (and null if there was no value
* in the map).
*/
// remember the termId assigned to that key.
final Long oldId = groundTruthTerm2Id.putIfAbsent(key, termId);
if( oldId != null && oldId.longValue() != termId ) {
/*
* The assignment of the term identifier to the key is
* not stable.
*/
throw new AssertionError("different termId assigned"+//
": oldId=" + oldId + //
", newId=" + termId + //
", key=" + BytesUtil.toString(key)+//
", pmd="+ndx.getIndexMetadata().getPartitionMetadata());
}
// remember the key to which we assigned that termId.
final byte[] oldKey = groundTruthId2Term.putIfAbsent(termId, key);
if (oldKey != null && !BytesUtil.bytesEqual(oldKey, key)) {
/*
* The assignment of the term identifier to the key is
* not unique.
*/
// the partition identifier (assuming index is partitioned).
// final long pid = id0 >> 32;
// final long mask = 0xffffffffL;
// final int ctr = (int) (id0 & mask);
throw new AssertionError("assignment not unique"+//
": termId=" + termId +//
", oldKey=" + BytesUtil.toString(oldKey) + //
", newKey=" + BytesUtil.toString(key)+//
", pmd="+ndx.getIndexMetadata().getPartitionMetadata()+//
// ", pid="+pid+", ctr="+ctr+//
", counter="+counter+//
", counter="+counter.getClass().getName());
}
}
@Override
protected void readMetadata(final ObjectInput in) throws IOException, ClassNotFoundException {
super.readMetadata(in);
readOnly = in.readBoolean();
// scaleOutTermIds = in.readBoolean();
scaleOutTermIdBitsToReverse = (int) in.readByte();
}
/**
* Writes metadata (not the keys or values, but just other metadata used by
* the procedure).
*
 * The default implementation writes <code>toIndex - fromIndex</code>,
 * which is the #of keys.
*
* @param out
*
* @throws IOException
*/
@Override
protected void writeMetadata(final ObjectOutput out) throws IOException {
super.writeMetadata(out);
out.writeBoolean(readOnly);
// out.writeBoolean(scaleOutTermIds);
out.writeByte((byte) scaleOutTermIdBitsToReverse);
}
final public static VTE VTE(final byte code) {
switch(code) {
case ITermIndexCodes.TERM_CODE_URI:
return VTE.URI;
case ITermIndexCodes.TERM_CODE_BND:
return VTE.BNODE;
// case ITermIndexCodes.TERM_CODE_STMT:
// return VTE.STATEMENT;
case ITermIndexCodes.TERM_CODE_DTL:
// case ITermIndexCodes.TERM_CODE_DTL2:
case ITermIndexCodes.TERM_CODE_LCL:
case ITermIndexCodes.TERM_CODE_LIT:
return VTE.LITERAL;
default:
throw new IllegalArgumentException("code=" + code);
}
}
/**
* Object encapsulates the discovered / assigned term identifiers and
* provides efficient serialization for communication of those data to the
* client.
*
* @author Bryan Thompson
*/
public static class Result implements Externalizable {
public IV[] ivs;
private static final long serialVersionUID = -8307927320589290348L;
/**
* De-serialization constructor.
*/
public Result() {
}
public Result(final IV[] ivs) {
assert ivs != null;
assert ivs.length > 0;
this.ivs = ivs;
}
private final static transient short VERSION0 = 0x0;
@Override
public void readExternal(final ObjectInput in) throws IOException, ClassNotFoundException {
final short version = ShortPacker.unpackShort(in);
if(version!=VERSION0) {
throw new IOException("Unknown version: "+version);
}
final int n = (int) LongPacker.unpackLong(in);
ivs = new IV[n];
for (int i = 0; i < n; i++) {
// ids[i] = LongPacker.unpackLong(in);
ivs[i] = (IV) in.readObject();
}
}
@Override
public void writeExternal(final ObjectOutput out) throws IOException {
final int n = ivs.length;
ShortPacker.packShort(out, VERSION0);
LongPacker.packLong(out,n);
for (int i = 0; i < n; i++) {
// LongPacker.packLong(out, ids[i]);
out.writeObject(ivs[i]);
}
}
}
/**
* {@link Split}-wise aggregation followed by combining the results across
* those splits in order to return an aggregated result whose iv[] is 1:1
* with the original keys[][].
*/
@Override
protected IResultHandler<Result, Result> newAggregator() {
return new TermResultAggregator(getKeys().size());
}
/**
 * Aggregator collects the per-split results in an internal ordered map and
 * assembles the final aggregated result from those individual results when
 * it is requested. With this approach there is no overhead or contention
 * while the results are being produced in parallel, and they can be
 * combined efficiently within a single thread in {@link #getResult()}.
*
* @author bryan
*/
private class TermResultAggregator extends AbstractLocalSplitResultAggregator<Result> {
/**
*
* @param size
* The #of elements in the request (which is the same as the
* cardinality of the aggregated result).
*/
public TermResultAggregator(final int size) {
super(size);
}
@Override
protected Result newResult(final int size, final SplitValuePair<Split, Result>[] a) {
@SuppressWarnings("rawtypes")
final IV[] ivs = new IV[size];
for (int i = 0; i < a.length; i++) {
final Split split = a[i].key;
final Result tmp = a[i].val;
System.arraycopy(tmp.ivs/* src */, 0/* srcPos */, ivs/* dest */, split.fromIndex/* destPos */,
split.ntuples/* length */);
}
/*
* Return the aggregated result.
*/
final Result r = new Result(ivs);
return r;
}
} // TermResultAggregator
}