All downloads are free. Search and download functionality uses the official Maven repository.

com.bigdata.rdf.lexicon.BlobsWriteTask Maven / Gradle / Ivy

Go to download

Blazegraph(TM) DB Core Platform. It contains all Blazegraph DB dependencies other than Blueprints.

There is a newer version: 2.1.4
Show newest version
package com.bigdata.rdf.lexicon;

import java.util.Arrays;
import java.util.concurrent.Callable;

import org.apache.log4j.Logger;

import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.lexicon.BlobsWriteProc.BlobsWriteProcConstructor;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.service.Split;
import com.bigdata.service.ndx.pipeline.KVOList;

/**
 * Synchronous RPC write on the TERMS index.
 * 
 * @author Bryan Thompson
 */
public class BlobsWriteTask implements Callable[]> {

    private static transient final Logger log = Logger.getLogger(BlobsWriteTask.class);
            
//    private final LexiconRelation r;
    final IIndex ndx;
    final BigdataValueFactory valueFactory;
    private final boolean readOnly;
    private final boolean storeBlankNodes;
    private final int numTerms;
    private final BigdataValue[] terms;
    private final WriteTaskStats stats;
    
    public BlobsWriteTask(final IIndex ndx,
            final BigdataValueFactory valueFactory, final boolean readOnly,
            final boolean storeBlankNodes, final int numTerms,
            final BigdataValue[] terms, final WriteTaskStats stats) {

        if (ndx == null)
            throw new IllegalArgumentException();

        if (valueFactory == null)
            throw new IllegalArgumentException();

        if (terms == null)
            throw new IllegalArgumentException();

        if (numTerms < 0 || numTerms > terms.length)
            throw new IllegalArgumentException();

        if (stats == null)
            throw new IllegalArgumentException();

//        this.r = r;
        
        this.ndx = ndx;
        
        this.valueFactory = valueFactory;

        this.readOnly = readOnly;
        
        this.storeBlankNodes = storeBlankNodes;
        
        this.numTerms = numTerms;
        
        this.terms = terms;
        
        this.stats = stats;
        
    }

    /**
     * Unify the {@link BigdataValue}s with the TERMS index, setting the
     * {@link IV}s on the {@link BigdataValue}s as a side-effect.
     * 
     * @return A dense {@link KVO}[] chunk consisting of only those distinct
     *         {@link BigdataValue}s whose {@link IV}s were not already known.
     *         (This may be used to write on the full text index).
     * 
     * @throws Exception
     */
    public KVO[] call() throws Exception {

		/*
		 * Insert into the TERMS index ({termCode,hash(Value),counter} ->
		 * Value). This will set the IV on the BigdataValue. If the Value was
		 * not in the lexicon, then a new entry is created in the TERMS index
		 * for the Value and the key for that entry is wrapped as its IV. If the
		 * Value is in the lexicon, then the key for the existing entry is
		 * wrapped as its IV.
		 * 
		 * Note: The code has to scan the "collision bucket" comprised of each
		 * Value having the same hash code. In practice, collisions are quite
		 * rare and the #of tuples in each "collision bucket" is quite small.
		 * 
		 * Note: TERMS index shards must not split "collision buckets".
		 */
        
        // The #of distinct terms lacking a pre-assigned term identifier in [a].
        int ndistinct = 0;

        // A dense array of correlated tuples.
        final KVO[] a;
        {
            
            final KVO[] b;

            /*
             * Make sure that each term has an assigned sort key.
             * 
             * Note: The caller SHOULD first remove anything with an
             * pre-assigned IV. That will let us avoid any further costs
             * associated with those Values (LexiconRelation is doing this.)
             */
            {

                final long _begin = System.currentTimeMillis();
                
                b = new BlobsIndexHelper().generateKVOs(valueFactory
                        .getValueSerializer(), terms, numTerms);

                stats.keyGenTime.add(System.currentTimeMillis() - _begin);

            }

            /*
             * Sort by the assigned sort key. This places the array into the
             * natural order for the term:id index.
             */
            {

                final long _begin = System.currentTimeMillis();

                Arrays.sort(b);

                stats.keySortTime.add(System.currentTimeMillis() - _begin);

            }

            /*
             * For each distinct term that does not have a pre-assigned IV, add
             * it to a remote unisolated batch operation that assigns IVs.
             * 
             * Note: Both duplicate term references and terms with their IVs
             * already assigned are dropped out in this step.
             * 
             * FIXME Caller SHOULD first remove any duplicates, which is what we
             * are doing here [Since LexiconRelation guarantees that we do not
             * have to do this here.]
             */
            {

                final long _begin = System.currentTimeMillis();

                /*
                 * Create a key buffer holding the sort keys. This does not
                 * allocate new storage for the sort keys, but rather aligns the
                 * data structures for the call to splitKeys(). This also makes
                 * a[] into a dense copy of the references in b[], but without
                 * duplicates and without terms that already have assigned term
                 * identifiers. Note that keys[] and a[] are correlated.
                 * 
                 * @todo Could be restated as an IDuplicateRemover, but note
                 * that this case is specialized since it can drop terms whose
                 * term identifier is known (they do not need to be written on
                 * T2ID, but they still need to be written on the reverse index
                 * to ensure a robust and consistent mapping).
                 */
                final byte[][] keys = new byte[numTerms][];
                final byte[][] vals = new byte[numTerms][];
                a = new KVO[numTerms];
                {

                    for (int i = 0; i < numTerms; i++) {

                        if (b[i].obj.getIV() != null) {
                            
                            if (log.isDebugEnabled())
                                log.debug("IV already assigned: "
                                        + b[i].obj);
                            
                            // IV is already assigned.
                            continue;
                            
                        }
                        
                        if (i > 0 && b[i - 1].obj == b[i].obj) {

                            if (log.isDebugEnabled())
                                log.debug("duplicate reference: "
                                        + b[i].obj);
                            
                            // duplicate reference.
                            continue;
                            
                        }

                        // assign to a[] (dense variant of b[]).
                        a[ndistinct] = b[i];
                        
                        // assign to keys[]/vals[] (dense; correlated with a[]).
                        keys[ndistinct] = b[i].key;
                        vals[ndistinct] = b[i].val;
                        
                        ndistinct++;

                    }

                }

                if (ndistinct == 0) {
                    
                    /*
                     * Nothing to be written.
                     */
                    
                    return new KVO[0];
                    
                }
                
                final AbstractKeyArrayIndexProcedureConstructor ctor = new BlobsWriteProcConstructor(
                        readOnly, storeBlankNodes);

				// run the procedure.
                ndx.submit(0/* fromIndex */, ndistinct/* toIndex */, keys,
                        vals, ctor, new BlobsWriteProcResultHandler(a,
                                readOnly, stats));

                stats.indexTime.addAndGet(stats.termsIndexTime = System
                        .currentTimeMillis()
                        - _begin);

            }

        }
        
        stats.ndistinct.addAndGet( ndistinct );

        return KVO.dense(a, ndistinct);
        
    } // call

    /**
     * Class applies the term identifiers assigned by the
     * {@link Term2IdWriteProc} to the {@link BigdataValue} references in the
     * {@link KVO} correlated with each {@link Split} of data processed by that
     * procedure.
     * 

* Note: Of necessity, this requires access to the {@link BigdataValue}s * whose term identifiers are being resolved. This implementation presumes * that the array specified to the ctor and the array returned for each * chunk that is processed have correlated indices and that the offset into * {@link #a} is given by {@link Split#fromIndex}. * * @author Bryan Thompson */ static class BlobsWriteProcResultHandler implements IResultHandler { private final KVO[] a; private final boolean readOnly; /** * @todo this could be the value returned by {@link #getResult()} which * would make the API simpler. */ private final WriteTaskStats stats; /** * * @param a * A dense array of {@link KVO}s. * @param readOnly * if readOnly was specified for the {@link Term2IdWriteProc} * . * @param stats * Various atomic fields are updated as a side effect. */ public BlobsWriteProcResultHandler(final KVO[] a, final boolean readOnly, final WriteTaskStats stats) { if (a == null) throw new IllegalArgumentException(); if (stats == null) throw new IllegalArgumentException(); this.a = a; this.readOnly = readOnly; this.stats = stats; } /** * Copy the assigned / discovered term identifiers onto the * corresponding elements of the terms[]. */ @Override public void aggregate(final BlobsWriteProc.Result result, final Split split) { stats.totalBucketSize.add(result.totalBucketSize); /* * Update the maximum bucket size. There is a data race here, but * conflicts should be rare and this loop will eventually resolve * any conflict. 
*/ while (true) { final int tmp = stats.maxBucketSize.get(); if (tmp < result.maxBucketSize) { if (!stats.maxBucketSize.compareAndSet(tmp/* expect */, result.maxBucketSize/* newValue */)) { continue; } } break; } for (int i = split.fromIndex, j = 0; i < split.toIndex; i++, j++) { final int counter = result.counters[j]; if (counter == BlobsIndexHelper.NOT_FOUND) { if (!readOnly) throw new AssertionError(); stats.nunknown.incrementAndGet(); } else { // The value whose IV we have discovered/asserted. final BigdataValue value = a[i].obj; // Rebuild the IV. final BlobIV iv = new BlobIV(VTE.valueOf(value), value .hashCode(), (short) counter); // assign the term identifier. value.setIV(iv); if(a[i] instanceof KVOList) { final KVOList tmp = (KVOList) a[i]; if (!tmp.isDuplicateListEmpty()) { // assign the term identifier to the duplicates. tmp.map(new AssignTermId(iv)); } } if (log.isDebugEnabled()) log.debug("termId=" + iv + ", term=" + a[i].obj); } } } @Override public Void getResult() { return null; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy