com.bigdata.rdf.lexicon.BlobsWriteTask Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bigdata-core Show documentation
Blazegraph(TM) DB Core Platform. It contains all Blazegraph DB dependencies other than Blueprints.
There is a newer version: 2.1.4
package com.bigdata.rdf.lexicon;

import java.util.Arrays;
import java.util.concurrent.Callable;

import org.apache.log4j.Logger;

import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.lexicon.BlobsWriteProc.BlobsWriteProcConstructor;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.service.Split;
import com.bigdata.service.ndx.pipeline.KVOList;

/**
 * Synchronous RPC write on the TERMS index.
 * 
 * @author Bryan Thompson
 */
public class BlobsWriteTask implements Callable[]> {

    private static transient final Logger log = Logger.getLogger(BlobsWriteTask.class);
            
//    private final LexiconRelation r;
    final IIndex ndx;
    final BigdataValueFactory valueFactory;
    private final boolean readOnly;
    private final boolean storeBlankNodes;
    private final int numTerms;
    private final BigdataValue[] terms;
    private final WriteTaskStats stats;
    
    public BlobsWriteTask(final IIndex ndx,
            final BigdataValueFactory valueFactory, final boolean readOnly,
            final boolean storeBlankNodes, final int numTerms,
            final BigdataValue[] terms, final WriteTaskStats stats) {

        if (ndx == null)
            throw new IllegalArgumentException();

        if (valueFactory == null)
            throw new IllegalArgumentException();

        if (terms == null)
            throw new IllegalArgumentException();

        if (numTerms < 0 || numTerms > terms.length)
            throw new IllegalArgumentException();

        if (stats == null)
            throw new IllegalArgumentException();

//        this.r = r;
        
        this.ndx = ndx;
        
        this.valueFactory = valueFactory;

        this.readOnly = readOnly;
        
        this.storeBlankNodes = storeBlankNodes;
        
        this.numTerms = numTerms;
        
        this.terms = terms;
        
        this.stats = stats;
        
    }

    /**
     * Unify the {@link BigdataValue}s with the TERMS index, setting the
     * {@link IV}s on the {@link BigdataValue}s as a side-effect.
     * 
     * @return A dense {@link KVO}[] chunk consisting of only those distinct
     *         {@link BigdataValue}s whose {@link IV}s were not already known.
     *         (This may be used to write on the full text index).
     * 
     * @throws Exception
     */
    public KVO[] call() throws Exception {

		/*
		 * Insert into the TERMS index ({termCode,hash(Value),counter} ->
		 * Value). This will set the IV on the BigdataValue. If the Value was
		 * not in the lexicon, then a new entry is created in the TERMS index
		 * for the Value and the key for that entry is wrapped as its IV. If the
		 * Value is in the lexicon, then the key for the existing entry is
		 * wrapped as its IV.
		 * 
		 * Note: The code has to scan the "collision bucket" comprised of each
		 * Value having the same hash code. In practice, collisions are quite
		 * rare and the #of tuples in each "collision bucket" is quite small.
		 * 
		 * Note: TERMS index shards must not split "collision buckets".
		 */
        
        // The #of distinct terms lacking a pre-assigned term identifier in [a].
        int ndistinct = 0;

        // A dense array of correlated tuples.
        final KVO[] a;
        {
            
            final KVO[] b;

            /*
             * Make sure that each term has an assigned sort key.
             * 
             * Note: The caller SHOULD first remove anything with an
             * pre-assigned IV. That will let us avoid any further costs
             * associated with those Values (LexiconRelation is doing this.)
             */
            {

                final long _begin = System.currentTimeMillis();
                
                b = new BlobsIndexHelper().generateKVOs(valueFactory
                        .getValueSerializer(), terms, numTerms);

                stats.keyGenTime.add(System.currentTimeMillis() - _begin);

            }

            /*
             * Sort by the assigned sort key. This places the array into the
             * natural order for the term:id index.
             */
            {

                final long _begin = System.currentTimeMillis();

                Arrays.sort(b);

                stats.keySortTime.add(System.currentTimeMillis() - _begin);

            }

            /*
             * For each distinct term that does not have a pre-assigned IV, add
             * it to a remote unisolated batch operation that assigns IVs.
             * 
             * Note: Both duplicate term references and terms with their IVs
             * already assigned are dropped out in this step.
             * 
             * FIXME Caller SHOULD first remove any duplicates, which is what we
             * are doing here [Since LexiconRelation guarantees that we do not
             * have to do this here.]
             */
            {

                final long _begin = System.currentTimeMillis();

                /*
                 * Create a key buffer holding the sort keys. This does not
                 * allocate new storage for the sort keys, but rather aligns the
                 * data structures for the call to splitKeys(). This also makes
                 * a[] into a dense copy of the references in b[], but without
                 * duplicates and without terms that already have assigned term
                 * identifiers. Note that keys[] and a[] are correlated.
                 * 
                 * @todo Could be restated as an IDuplicateRemover, but note
                 * that this case is specialized since it can drop terms whose
                 * term identifier is known (they do not need to be written on
                 * T2ID, but they still need to be written on the reverse index
                 * to ensure a robust and consistent mapping).
                 */
                final byte[][] keys = new byte[numTerms][];
                final byte[][] vals = new byte[numTerms][];
                a = new KVO[numTerms];
                {

                    for (int i = 0; i < numTerms; i++) {

                        if (b[i].obj.getIV() != null) {
                            
                            if (log.isDebugEnabled())
                                log.debug("IV already assigned: "
                                        + b[i].obj);
                            
                            // IV is already assigned.
                            continue;
                            
                        }
                        
                        if (i > 0 && b[i - 1].obj == b[i].obj) {

                            if (log.isDebugEnabled())
                                log.debug("duplicate reference: "
                                        + b[i].obj);
                            
                            // duplicate reference.
                            continue;
                            
                        }

                        // assign to a[] (dense variant of b[]).
                        a[ndistinct] = b[i];
                        
                        // assign to keys[]/vals[] (dense; correlated with a[]).
                        keys[ndistinct] = b[i].key;
                        vals[ndistinct] = b[i].val;
                        
                        ndistinct++;

                    }

                }

                if (ndistinct == 0) {
                    
                    /*
                     * Nothing to be written.
                     */
                    
                    return new KVO[0];
                    
                }
                
                final AbstractKeyArrayIndexProcedureConstructor ctor = new BlobsWriteProcConstructor(
                        readOnly, storeBlankNodes);

				// run the procedure.
                ndx.submit(0/* fromIndex */, ndistinct/* toIndex */, keys,
                        vals, ctor, new BlobsWriteProcResultHandler(a,
                                readOnly, stats));

                stats.indexTime.addAndGet(stats.termsIndexTime = System
                        .currentTimeMillis()
                        - _begin);

            }

        }
        
        stats.ndistinct.addAndGet( ndistinct );

        return KVO.dense(a, ndistinct);
        
    } // call

    /**
     * Class applies the term identifiers assigned by the
     * {@link Term2IdWriteProc} to the {@link BigdataValue} references in the
     * {@link KVO} correlated with each {@link Split} of data processed by that
     * procedure.
     * 
     * Note: Of necessity, this requires access to the {@link BigdataValue}s
     * whose term identifiers are being resolved. This implementation presumes
     * that the array specified to the ctor and the array returned for each
     * chunk that is processed have correlated indices and that the offset into
     * {@link #a} is given by {@link Split#fromIndex}.
     * 
     * @author Bryan Thompson
     */
	static class BlobsWriteProcResultHandler implements
			IResultHandler {

        private final KVO[] a;
        private final boolean readOnly;
        
        /**
         * @todo this could be the value returned by {@link #getResult()} which
         *       would make the API simpler.
         */ 
        private final WriteTaskStats stats;

		/**
		 * 
		 * @param a
		 *            A dense array of {@link KVO}s.
		 * @param readOnly
		 *            if readOnly was specified for the {@link Term2IdWriteProc}
		 *            .
		 * @param stats
		 *            Various atomic fields are updated as a side effect.
		 */
		public BlobsWriteProcResultHandler(final KVO[] a,
				final boolean readOnly, final WriteTaskStats stats) {

            if (a == null)
                throw new IllegalArgumentException();

            if (stats == null)
                throw new IllegalArgumentException();

            this.a = a;

            this.readOnly = readOnly;
            
            this.stats = stats;
            
        }

        /**
         * Copy the assigned / discovered term identifiers onto the
         * corresponding elements of the terms[].
         */
		@Override
        public void aggregate(final BlobsWriteProc.Result result,
                final Split split) {

			stats.totalBucketSize.add(result.totalBucketSize);

			/*
			 * Update the maximum bucket size. There is a data race here, but
			 * conflicts should be rare and this loop will eventually resolve
			 * any conflict.
			 */
			while (true) {
				final int tmp = stats.maxBucketSize.get();
				if (tmp < result.maxBucketSize) {
					if (!stats.maxBucketSize.compareAndSet(tmp/* expect */,
							result.maxBucketSize/* newValue */)) {
						continue;
					}
				}
				break;
			}

            for (int i = split.fromIndex, j = 0; i < split.toIndex; i++, j++) {

                final int counter = result.counters[j];

                if (counter == BlobsIndexHelper.NOT_FOUND) {

                    if (!readOnly)
                        throw new AssertionError();

                    stats.nunknown.incrementAndGet();

				} else {

					// The value whose IV we have discovered/asserted.
					final BigdataValue value = a[i].obj;

					// Rebuild the IV.
					final BlobIV iv = new BlobIV(VTE.valueOf(value), value
							.hashCode(), (short) counter);

					// assign the term identifier.
					value.setIV(iv);

                    if(a[i] instanceof KVOList) {
                        
                        final KVOList tmp = (KVOList) a[i];

                        if (!tmp.isDuplicateListEmpty()) {

                            // assign the term identifier to the duplicates.
                            tmp.map(new AssignTermId(iv));

                        }
                        
                    }
                    
					if (log.isDebugEnabled())
						log.debug("termId=" + iv + ", term=" + a[i].obj);

                }

            }

        }

		@Override
        public Void getResult() {

            return null;

        }

    }
    
}