All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.rdf.lexicon.Term2IdWriteTask Maven / Gradle / Ivy

/*

 Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

 Contact:
 SYSTAP, LLC DBA Blazegraph
 2501 Calvert ST NW #106
 Washington, DC 20008
 [email protected]

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; version 2 of the License.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 */
package com.bigdata.rdf.lexicon;

import java.util.Arrays;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.log4j.Logger;

import com.bigdata.btree.IIndex;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor;
import com.bigdata.btree.proc.IResultHandler;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.lexicon.Term2IdWriteProc.Term2IdWriteProcConstructor;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.service.Split;
import com.bigdata.service.ndx.pipeline.KVOList;

/**
 * Synchronous RPC write on the TERM2ID index.
 * 
 * @author Bryan Thompson
 */
public class Term2IdWriteTask implements
        Callable[]> {

    private static transient final Logger log = Logger
            .getLogger(Term2IdWriteTask.class);

//    private final LexiconRelation r;
    private final IIndex termIdIndex;
    private final boolean readOnly;
    private final boolean storeBlankNodes;
    private final int termIdBitsToReverse;
    private final int numTerms;
    private final BigdataValue[] terms;
    private final WriteTaskStats stats;
    
    public Term2IdWriteTask(final IIndex termIdIndex, final boolean readOnly,
            final boolean storeBlankNodes, final int termIdBitsToReverse,
            final int numTerms, final BigdataValue[] terms,
            final WriteTaskStats stats) {

        if (termIdIndex == null)
            throw new IllegalArgumentException();

        if (terms == null)
            throw new IllegalArgumentException();

        if (numTerms < 0 || numTerms > terms.length)
            throw new IllegalArgumentException();

        if (stats == null)
            throw new IllegalArgumentException();

//        this.r = r;
        
        this.termIdIndex = termIdIndex;

        this.readOnly = readOnly;

        this.storeBlankNodes = storeBlankNodes;
        
        this.termIdBitsToReverse = termIdBitsToReverse;
        
        this.numTerms = numTerms;
        
        this.terms = terms;
        
        this.stats = stats;
        
    }
    
    /**
     * Unify the {@link BigdataValue}s with the TERM2ID index, setting the
     * term identifiers (TIDs) on those values as a side-effect.
     * 
     * @return A dense {@link KVO}[] chunk consisting of only those
     *         distinct terms whose term identifier was not already known.
     *         (This may be used to write on the reverse index).
     * 
     * @throws Exception
     */
    public KVO[] call() throws Exception {
        
        /*
         * Insert into the forward index (term -> id). This will either assign a
         * termId or return the existing termId if the term is already in the
         * lexicon.
         */
        
        // The #of distinct terms lacking a pre-assigned term identifier in [a].
        int ndistinct = 0;

        // A dense array of correlated tuples.
        final KVO[] a;
        {
            
            final KVO[] b;

            /*
             * First make sure that each term has an assigned sort key.
             */
            {

                final long _begin = System.currentTimeMillis();
                
                final Term2IdTupleSerializer tupleSer = (Term2IdTupleSerializer) termIdIndex
                        .getIndexMetadata().getTupleSerializer();

                // may contain duplicates and/or terms with pre-assigned term
                // identifiers.
                b = generateSortKeys(tupleSer.getLexiconKeyBuilder(), terms,
                        numTerms);

                stats.keyGenTime.add(System.currentTimeMillis() - _begin);

            }

            /*
             * Sort by the assigned sort key. This places the array into the
             * natural order for the term:id index.
             */
            {

                final long _begin = System.currentTimeMillis();

                Arrays.sort(b);

                stats.keySortTime.add(System.currentTimeMillis() - _begin);

            }

            /*
             * For each distinct term that does not have a pre-assigned term
             * identifier, add it to a remote unisolated batch operation that
             * assigns term identifiers.
             * 
             * Note: Both duplicate term references and terms with their term
             * identifiers already assigned are dropped out in this step.
             */
            {

                final long _begin = System.currentTimeMillis();

                /*
                 * Create a key buffer holding the sort keys. This does not
                 * allocate new storage for the sort keys, but rather aligns the
                 * data structures for the call to splitKeys(). This also makes
                 * a[] into a dense copy of the references in b[], but without
                 * duplicates and without terms that already have assigned term
                 * identifiers. Note that keys[] and a[] are correlated.
                 * 
                 * @todo Could be restated as an IDuplicateRemover, but note
                 * that this case is specialized since it can drop terms whose
                 * term identifier is known (they do not need to be written on
                 * T2ID, but they still need to be written on the reverse index
                 * to ensure a robust and consistent mapping).
                 */
                final byte[][] keys = new byte[numTerms][];
                a = new KVO[numTerms];
                {

                    for (int i = 0; i < numTerms; i++) {

                        if (b[i].obj.getIV() != null) {
                            
                            if (log.isDebugEnabled())
                                log.debug("term identifier already assigned: "
                                        + b[i].obj);
                            
                            // term identifier already assigned.
                            continue;
                            
                        }
                        
                        if (i > 0 && b[i - 1].obj == b[i].obj) {

                            if (log.isDebugEnabled())
                                log.debug("duplicate term reference: "
                                        + b[i].obj);
                            
                            // duplicate reference.
                            continue;
                            
                        }

                        // assign to a[] (dense variant of b[]).
                        a[ndistinct] = b[i];
                        
                        // assign to keys[] (dense and correlated with a[]).
                        keys[ndistinct] = b[i].key;
                        
                        ndistinct++;

                    }

                }

                if (ndistinct == 0) {
                    
                    /*
                     * Nothing to be written.
                     */
                    
                    return new KVO[0];
                    
                }
                
                final AbstractKeyArrayIndexProcedureConstructor ctor =
                    new Term2IdWriteProcConstructor(
                            readOnly, storeBlankNodes, termIdBitsToReverse);
                
                // run the procedure.
                termIdIndex.submit(0/* fromIndex */, ndistinct/* toIndex */,
                        keys, null/* vals */, ctor,
                        new Term2IdWriteProcResultHandler(a, readOnly,
                                stats.nunknown));

                stats.indexTime.addAndGet(stats.forwardIndexTime = System
                        .currentTimeMillis()
                        - _begin);

            }

        }

        stats.ndistinct.addAndGet(ndistinct);

        return KVO.dense(a, ndistinct);
        
    } // call

    /**
     * Class applies the term identifiers assigned by the
     * {@link Term2IdWriteProc} to the {@link BigdataValue} references in the
     * {@link KVO} correlated with each {@link Split} of data processed by that
     * procedure.
     * 

* Note: Of necessity, this requires access to the {@link BigdataValue}s * whose term identifiers are being resolved. This implementation presumes * that the array specified to the ctor and the array returned for each * chunk that is processed have correlated indices and that the offset into * {@link #a} is given by {@link Split#fromIndex}. * * @author Bryan Thompson */ static private class Term2IdWriteProcResultHandler implements IResultHandler { private final KVO[] a; private final boolean readOnly; /** * @todo this could be the value returned by {@link #getResult()} which * would make the API simpler. */ private final AtomicInteger nunknown; /** * * @param a * A dense array of {@link KVO}s. * @param readOnly * if readOnly was specified for the {@link Term2IdWriteProc}. * @param nunknown * Incremented as a side effect for each terms that could not * be resolved (iff readOnly == true). */ public Term2IdWriteProcResultHandler(final KVO[] a, final boolean readOnly, final AtomicInteger nunknown) { if (a == null) throw new IllegalArgumentException(); if (nunknown == null) throw new IllegalArgumentException(); this.a = a; this.readOnly = readOnly; this.nunknown = nunknown; } /** * Copy the assigned / discovered term identifiers onto the * corresponding elements of the terms[]. */ @Override public void aggregate(final Term2IdWriteProc.Result result, final Split split) { for (int i = split.fromIndex, j = 0; i < split.toIndex; i++, j++) { final IV termId = result.ivs[j]; if (termId == null) { if (!readOnly) throw new AssertionError(); nunknown.incrementAndGet(); } else { // assign the term identifier. a[i].obj.setIV(termId); if(a[i] instanceof KVOList) { final KVOList tmp = (KVOList) a[i]; if (!tmp.isDuplicateListEmpty()) { // assign the term identifier to the duplicates. tmp.map(new AssignTermId(termId)); } } if (log.isDebugEnabled()) { log.debug("termId=" + termId + ", term=" + a[i].obj); } } } } @Override public Void getResult() { return null; } } /** * Generate the sort keys for the terms. * * @param keyBuilder * The object used to generate the sort keys. * @param terms * The terms whose sort keys will be generated. * @param numTerms * The #of terms in that array. * * @return An array of correlated key-value-object tuples. *

* Note that {@link KVO#val} is null until we know * that we need to write it on the reverse index. * * @see LexiconKeyBuilder */ @SuppressWarnings("unchecked") final private KVO[] generateSortKeys( final LexiconKeyBuilder keyBuilder, final BigdataValue[] terms, final int numTerms) { final KVO[] a = new KVO[numTerms]; for (int i = 0; i < numTerms; i++) { final BigdataValue term = terms[i]; a[i] = new KVO(keyBuilder.value2Key(term), null/* val */, term); } return a; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy