/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 21, 2007
*/
package com.bigdata.rdf.lexicon;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import org.apache.log4j.Logger;
import org.openrdf.model.Value;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.AbstractLocalSplitResultAggregator;
import com.bigdata.btree.proc.IParallelizableIndexProcedure;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.btree.proc.SplitValuePair;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.codec.IRabaCoder;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.io.LongPacker;
import com.bigdata.io.ShortPacker;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.AbstractIV;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.lexicon.BlobsWriteProc.Result;
import com.bigdata.relation.IMutableRelationIndexWriteProcedure;
import com.bigdata.service.Split;
/**
* This unisolated operation inserts {@link Value}s into the
* {termCode,hash(Value),counter}:Value index, assigning {@link IV}s to
* {@link Value}s as a side-effect.
*
* Note that it is perfectly possible that a concurrent client will overlap in
* the terms being inserted. The results will always be fully consistent if the
* rules of the road are observed since (a) unisolated operations are
* single-threaded; and (b) {@link IV}s are assigned in an unisolated atomic
* operation by {@link BlobsWriteProc}.
*
* @author Bryan Thompson
*/
public class BlobsWriteProc extends AbstractKeyArrayIndexProcedure<Result> implements
IParallelizableIndexProcedure<Result>, IMutableRelationIndexWriteProcedure {
private static final Logger log = Logger.getLogger(BlobsWriteProc.class);
/**
*
*/
private static final long serialVersionUID = 1L;
/**
* Serialized as extended metadata. When <code>true</code>, unknown terms
* are NOT added to the database.
*/
private boolean readOnly;
@Override
public final boolean isReadOnly() {
return readOnly;
}
/**
* Serialized as extended metadata. When <code>true</code>, blank nodes
* are stored in the lexicon's forward index.
*/
private boolean toldBNodes;
public final boolean isToldBNodes() {
return toldBNodes;
}
/**
* De-serialization constructor.
*/
public BlobsWriteProc() {
}
protected BlobsWriteProc(final IRabaCoder keySer, final IRabaCoder valSer,
final int fromIndex, final int toIndex, final byte[][] keys,
final byte[][] vals, final boolean readOnly,
final boolean storeBlankNodes) {
super(keySer, valSer, fromIndex, toIndex, keys, vals);
this.readOnly = readOnly;
this.toldBNodes = storeBlankNodes;
}
public static class BlobsWriteProcConstructor extends
AbstractKeyArrayIndexProcedureConstructor<BlobsWriteProc> {
private final boolean readOnly;
private final boolean toldBNodes;
@Override
public final boolean sendValues() {
return true;
}
public BlobsWriteProcConstructor(final boolean readOnly,
final boolean toldBNodes) {
this.readOnly = readOnly;
this.toldBNodes = toldBNodes;
}
@Override
public BlobsWriteProc newInstance(final IRabaCoder keySer,
final IRabaCoder valSer, final int fromIndex,
final int toIndex, final byte[][] keys, final byte[][] vals) {
if(log.isInfoEnabled())
log.info("ntuples=" + (toIndex - fromIndex));
return new BlobsWriteProc(keySer, valSer, fromIndex, toIndex, keys,
vals, readOnly, toldBNodes);
}
} // class BlobsWriteProcConstructor
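/*
* Note: A minimal sketch of how a caller might submit this procedure against
* the TERMS index using the constructor above. The variable names
* (termsIndex, keys, vals, resultHandler) are illustrative only; the actual
* driver logic lives in the lexicon's blobs write task.
*
*   final int numTerms = keys.length;
*
*   termsIndex.submit(0, numTerms, keys, vals,
*       new BlobsWriteProcConstructor(false, false), // readOnly, toldBNodes
*       resultHandler);
*/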
/**
* For each term whose serialized key is mapped to the current index
* partition, lookup the term in the terms index. If it is there
* then wrap its key as its {@link IV}. Otherwise, note the #of terms in the
* collision bucket and insert a {hash(term),counter}:term entry into the
* terms index.
*
* @param ndx
* The terms index.
*
* @return The {@link Result}, which contains the discovered / assigned term
* identifiers.
*
* TODO There is no point sending bnodes when readOnly and NOT in
* told bnodes mode because the caller is unable to unify a blank
* node with an entry in the index.
*/
@Override
public Result applyOnce(final IIndex ndx, final IRaba keys, final IRaba vals) {
final int numTerms = keys.size();
assert numTerms > 0 : "numTerms="+numTerms;
// Helper class for index operations.
final BlobsIndexHelper helper = new BlobsIndexHelper();
// used to format the keys for the TERMS index.
final IKeyBuilder keyBuilder = helper.newKeyBuilder();
// used to store the discovered / assigned hash collision counters.
final int[] counters = new int[numTerms];
/*
* Note: The baseKey is shorter than the full key (it does not include
* the hash collision counter).
*/
final byte[] baseKey = new byte[keyBuilder.capacity()
- BlobsIndexHelper.SIZEOF_COUNTER];
// Temporary buffer used to format the [toKey] for the bucket scan.
final byte[] tmp = new byte[BlobsIndexHelper.SIZEOF_PREFIX_KEY];
// // Used to report the size of each collision bucket.
// final AtomicInteger bucketSize = new AtomicInteger(0);
// Incremented by the size of each collision bucket probed.
long totalBucketSize = 0L;
// The size of the largest collision bucket.
int maxBucketSize = 0;
final DataOutputBuffer kbuf = new DataOutputBuffer(
0/* existingDataLength */, baseKey);
for (int i = 0; i < numTerms; i++) {
// Copy key into reused buffer to reduce allocation.
// final byte[] baseKey = getKey(i);
keys.copy(i, kbuf.reset());
// decode the VTE from the flags.
final VTE vte = AbstractIV
.getVTE(KeyBuilder.decodeByte(baseKey[0]));
final int counter;
if (!toldBNodes && vte == VTE.BNODE) {
/*
* Do not enter blank nodes into the TERMS index.
*
* For this case, we just assign a term identifier and leave it
* at that. If two different documents by some chance happen to
* specify the same blank node ID they will still be assigned
* distinct term identifiers. The only way that you can get the
* same term identifier for a blank node is to have the blank
* node ID matched in a canonicalizing map of blank nodes by the
* client. That map, of course, should be scoped to the document
* in which the blank node IDs appear.
*/
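/*
* Note: A rough sketch (client-side, not part of this procedure) of the
* canonicalizing blank node map described above, scoped to the document
* being loaded. The map and the assignIV() helper are hypothetical.
*
*   final Map<String, IV> bnodes = new HashMap<String, IV>();
*
*   IV iv = bnodes.get(bnodeId);
*   if (iv == null) {
*       iv = assignIV(bnodeId); // e.g., via a write on this procedure
*       bnodes.put(bnodeId, iv);
*   }
*/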
if (readOnly) {
// blank nodes can not be resolved by the index.
counter = BlobsIndexHelper.NOT_FOUND;
/*
* FIXME Use this to track down people who pass in a blank
* node on a read-only request when we are not using told
* bnodes. Under these conditions we can not unify the blank
* node with the TERMS index so the node should not have
* been passed in at all (for efficiency reasons).
*/
//throw new UnsupportedOperationException();
} else {
/*
* We are not in a told bnode mode and this is not a
* read-only request. The TERMS index will be used to assign
* a unique counter to complete the blank node's key. That
* counter is just the current size of the collision bucket
* at the time that we check the index. The collision bucket
* is increased by ONE since we insert the blank node into
* the index.
*/
// The size of the collision bucket (aka the assigned ctr).
counter = helper.addBNode(ndx, keyBuilder, baseKey,
vals.get(i), tmp);
}
} else {
/*
* The serialized BigdataValue object.
*
* TODO Avoid materialization of this, preferring to operate on
* streams in the source IRaba. We will need to compare it with
* the value[] on any other tuples in the collision bucket. We
* also need to decode the byte which represents the termCode
* and can be used to derive the VTE of the Value.
*/
final byte[] val = vals.get(i);
counter = helper.resolveOrAddValue(ndx, readOnly,
keyBuilder, baseKey, val, tmp, null/* bucketSize */);
}
if (!readOnly && counter < 0)
throw new AssertionError("counter=" + counter);
counters[i] = counter;
if (counter != BlobsIndexHelper.NOT_FOUND) {
/*
* TODO This does not update the bucketSize when the Value was
* not found in the index. We could do this by changing the
* return value of resolveOrAddValue() to -rangeCount and
* casting to an (int). The (-rangeCount) could then be
* normalized to a marker as we pass the information back to the
* client. [Or just enable the bucketSize argument above.]
*/
if (maxBucketSize < counter) {
maxBucketSize = counter;
}
totalBucketSize += counter;
}
}
return new Result(totalBucketSize, maxBucketSize, counters);
} // applyOnce(ndx, keys, vals)
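/*
* Note: An illustrative sketch (not the actual BlobsIndexHelper API) of how a
* full TERMS index key is laid out once a collision counter has been
* discovered or assigned: a flags/termCode byte, the int32 hash of the Value,
* and a short collision counter. The variables (flags, hashCode, ctr) are
* hypothetical.
*
*   final IKeyBuilder kb = helper.newKeyBuilder();
*
*   final byte[] fullKey = kb.reset()
*       .appendSigned(flags)  // VTE + termCode byte
*       .append(hashCode)     // int32 hash(Value)
*       .append((short) ctr)  // hash collision counter
*       .getKey();
*/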
@Override
protected void readMetadata(final ObjectInput in) throws IOException,
ClassNotFoundException {
super.readMetadata(in);
readOnly = in.readBoolean();
}
/**
* Writes metadata (not the keys or values, but just other metadata used by
* the procedure).
*
* The default implementation writes toIndex - fromIndex,
* which is the #of keys.
*
* @param out
*
* @throws IOException
*/
@Override
protected void writeMetadata(final ObjectOutput out) throws IOException {
super.writeMetadata(out);
out.writeBoolean(readOnly);
}
/**
* Object encapsulates the discovered / assigned {@link IV}s and provides
* efficient serialization for communication of those data to the client.
*
* @author Bryan Thompson
*/
public static class Result implements Externalizable {
/**
* The total size of the hash collision buckets examined across all
* {@link Value}s in the request. Each time a {@link Value} is resolved
* to a hash collision bucket, the size of that bucket is added to this
* field. Thus it will double count a bucket if the same
* bucket is visited more than once for the request.
*/
public long totalBucketSize;
/**
* The size of the largest hash collision bucket examined across all the
* {@link Value}s in the request.
*/
public int maxBucketSize;
/**
* The counters assigned to each {@link Value} in the request. The
* indices of this array are correlated with the indices of the array
* provided to the request.
*
* Note: The actual counter values are SHORTs, not INTs. However,
* {@link BlobsIndexHelper#NOT_FOUND} is an INT value used to indicate
* that the desired {@link BlobIV} was not discovered in the index. That
* means that we need to interchange and represent the counters as an
* int[].
*/
public int[] counters;
private static final long serialVersionUID = 1L;
/**
* De-serialization constructor.
*/
public Result() {
}
/**
*
* @param totalBucketSize
* The total bucket size across all buckets examined.
* @param maxBucketSize
* The size of the largest collision bucket examined.
* @param counters
* The assigned/resolved collision counters.
*/
public Result(final long totalBucketSize, final int maxBucketSize,
final int[] counters) {
if(counters == null)
throw new IllegalArgumentException();
this.totalBucketSize = totalBucketSize;
this.maxBucketSize = maxBucketSize;
this.counters = counters;
}
private final static transient short VERSION0 = 0x0;
@Override
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
final short version = ShortPacker.unpackShort(in);
if (version != VERSION0) {
throw new IOException("Unknown version: " + version);
}
final int n = (int) LongPacker.unpackLong(in);
totalBucketSize = LongPacker.unpackLong(in);
maxBucketSize = LongPacker.unpackInt(in);
counters = new int[n];
for (int i = 0; i < n; i++) {
final short tmp = ShortPacker.unpackShort(in);
counters[i] = tmp == Short.MAX_VALUE ? BlobsIndexHelper.NOT_FOUND
: tmp;
}
}
@Override
public void writeExternal(final ObjectOutput out) throws IOException {
final int n = counters.length;
ShortPacker.packShort(out, VERSION0);
// The #of results.
LongPacker.packLong(out, n);
// The total bucket size across all buckets examined.
LongPacker.packLong(out, totalBucketSize);
// The size of the largest collision bucket examined.
LongPacker.packLong(out, maxBucketSize);
/*
* Write out the assigned/resolved collision counters.
*
* Note: This uses a packed short encoding for the collision
* counters. If we see the marker for an unresolved collision
* counter (NOT_FOUND) then it is replaced with [Short.MAX_VALUE].
* This is fine as long as the collision counter is a byte. Since
* the [short] value is in [0:Short.MAX_VALUE] we can then pack it
* into the output stream.
*/
for (int i = 0; i < n; i++) {
final int c = counters[i];
final short tmp = c == BlobsIndexHelper.NOT_FOUND ? Short.MAX_VALUE
: (short) c;
ShortPacker.packShort(out, tmp);
}
}
} // class Result
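/*
* Note: A rough sketch of how a client might unify a returned collision
* counter with the corresponding Value to (re)construct its BlobIV, assuming
* the counter was resolved and that BlobIV exposes a (VTE, hash, counter)
* constructor. The variables (result, vte, hash, i) are illustrative.
*
*   if (result.counters[i] != BlobsIndexHelper.NOT_FOUND) {
*
*       final BlobIV iv = new BlobIV(vte, hash, (short) result.counters[i]);
*
*   }
*/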
/**
* {@link Split}-wise aggregation followed by combining the results across
* those splits in order to return an aggregated result whose counters[] is
* 1:1 with the original keys[][].
*/
@Override
protected IResultHandler newAggregator() {
return new BlobResultAggregator(getKeys().size());
}
/**
* Aggregator collects the individual results in an internal ordered map and
* assembles the final result from those individual results when it is
* requested. With this approach there is no overhead or contention when the
* results are being produced in parallel and they can be combined
* efficiently within a single thread in {@link #getResult()}.
*
* @author bryan
*/
private class BlobResultAggregator extends AbstractLocalSplitResultAggregator<Result> {
/**
*
* @param size
* The #of elements in the request (which is the same as the
* cardinality of the aggregated result).
*/
public BlobResultAggregator(final int size) {
super(size);
}
@Override
protected Result newResult(final int size, SplitValuePair<Split, Result>[] a) {
long totalBucketSize = 0;
int maxBucketSize = 0;
final int[] counters = new int[size];
for (int i = 0; i < a.length; i++) {
final Split split = a[i].key;
final Result tmp = a[i].val;
totalBucketSize += tmp.totalBucketSize;
maxBucketSize = Math.max(maxBucketSize, tmp.maxBucketSize);
System.arraycopy(tmp.counters/* src */, 0/* srcPos */, counters/* dest */, split.fromIndex/* destPos */,
split.ntuples/* length */);
}
/*
* Return the aggregated result.
*/
final Result r = new Result(totalBucketSize, maxBucketSize, counters);
return r;
}
} // class BlobResultAggregator
}